From 0fab8f2eb9b145e2eedb820a46d3db1aa8fc5885 Mon Sep 17 00:00:00 2001 From: Lewis Date: Sun, 19 Apr 2026 23:50:27 +0300 Subject: [PATCH] feat(tranquil-store): tranquil-gauntlet CLI, config overrides, profiles Lewis: May this revision serve well! --- .config/nextest.toml | 21 + crates/tranquil-store/Cargo.toml | 8 + .../src/bin/tranquil_gauntlet.rs | 698 ++++++++++++++++++ crates/tranquil-store/src/gauntlet/farm.rs | 55 +- .../tranquil-store/src/gauntlet/overrides.rs | 117 +++ .../tranquil-store/src/gauntlet/scenarios.rs | 87 +++ justfile | 15 + 7 files changed, 996 insertions(+), 5 deletions(-) create mode 100644 crates/tranquil-store/src/bin/tranquil_gauntlet.rs create mode 100644 crates/tranquil-store/src/gauntlet/overrides.rs diff --git a/.config/nextest.toml b/.config/nextest.toml index aa8a6ed..923cc0d 100644 --- a/.config/nextest.toml +++ b/.config/nextest.toml @@ -25,6 +25,27 @@ fail-fast = false test-threads = "num-cpus" slow-timeout = { period = "300s", terminate-after = 2 } +[profile.gauntlet-pr] +retries = 0 +fail-fast = true +test-threads = "num-cpus" +slow-timeout = { period = "60s", terminate-after = 5 } + +[[profile.gauntlet-pr.overrides]] +filter = "binary(gauntlet_smoke)" +slow-timeout = { period = "300s", terminate-after = 2 } + +[profile.gauntlet-nightly] +retries = 0 +fail-fast = false +test-threads = "num-cpus" +slow-timeout = { period = "600s", terminate-after = 1 } + +[profile.gauntlet-soak] +retries = 0 +fail-fast = false +test-threads = 1 + [test-groups] serial-env-tests = { max-threads = 1 } heavy-load-tests = { max-threads = 4 } diff --git a/crates/tranquil-store/Cargo.toml b/crates/tranquil-store/Cargo.toml index 3f2b4dc..cc7d30e 100644 --- a/crates/tranquil-store/Cargo.toml +++ b/crates/tranquil-store/Cargo.toml @@ -35,9 +35,17 @@ rayon = "1" smallvec = "1" uuid = { workspace = true } tempfile = { version = "3", optional = true } +clap = { workspace = true, optional = true } +toml = { version = "0.8", optional = true } [features] 
test-harness = ["dep:tempfile"] +gauntlet-cli = ["test-harness", "dep:clap", "dep:toml"] + +[[bin]] +name = "tranquil-gauntlet" +path = "src/bin/tranquil_gauntlet.rs" +required-features = ["gauntlet-cli"] [dev-dependencies] tranquil-store = { path = ".", features = ["test-harness"] } diff --git a/crates/tranquil-store/src/bin/tranquil_gauntlet.rs b/crates/tranquil-store/src/bin/tranquil_gauntlet.rs new file mode 100644 index 0000000..4e0822e --- /dev/null +++ b/crates/tranquil-store/src/bin/tranquil_gauntlet.rs @@ -0,0 +1,698 @@ +use std::io::{self, Write}; +use std::path::{Path, PathBuf}; +use std::process::ExitCode; +use std::sync::Arc; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::time::{Duration, Instant}; + +use clap::{Parser, Subcommand}; +use serde::{Deserialize, Serialize}; +use tokio::runtime::Runtime; +use tranquil_store::gauntlet::{ + ConfigOverrides, Gauntlet, GauntletReport, InvariantViolation, OpStream, RegressionRecord, + Scenario, Seed, config_for, farm, + shrink::{DEFAULT_MAX_SHRINK_ITERATIONS, shrink_failure}, +}; + +const MAX_HOURS: f64 = 1.0e6; + +/// Deterministic storage-engine gauntlet: scenario fuzzing, shrinking, regression replay. +/// +/// Writes one NDJSON record per seed to stdout; `farm` adds a final summary record. +/// Progress, batch stats, interrupt notices, and errors go to stderr. +/// Exits 0 on success, 1 on invariant violation, 2 on argument or runtime error. +/// In `farm`, the first SIGINT stops after the current batch; a second aborts with exit code 130. +/// +/// Hopefully we'll catch super complicated tranquil-store bugs with this!! +#[derive(Debug, Parser)] +#[command(name = "tranquil-gauntlet", version)] +struct Cli { + #[command(subcommand)] + cmd: Cmd, +} + +#[derive(Debug, Subcommand)] +enum Cmd { + /// Run a scenario across many seeds in parallel. + /// + /// With --hours, the command loops batches of --seeds until the deadline passes. + /// Without --hours, a single batch runs and the command exits. 
+ /// The last stdout line is always a `"type":"summary"` record. + Farm { + /// Scenario to run. + #[arg(long, value_enum, required_unless_present = "config")] + scenario: Option, + + /// First seed in the batch range. Default 0. + #[arg(long)] + seed_start: Option, + + /// Number of seeds per batch. Default 256. Must be > 0. + #[arg(long)] + seeds: Option, + + /// Wall-clock budget in hours; batches repeat until the deadline elapses. + #[arg(long)] + hours: Option, + + /// Directory to dump regression JSON on failure. + #[arg(long)] + dump_regressions: Option, + + /// TOML config with any of the above fields plus an `[overrides]` table. + #[arg(long)] + config: Option, + + /// Skip shrinking when dumping regressions. + #[arg(long)] + no_shrink: bool, + + /// Max shrink attempts per failing seed. + #[arg(long, default_value_t = DEFAULT_MAX_SHRINK_ITERATIONS, conflicts_with = "no_shrink")] + shrink_budget: usize, + }, + /// Replay a single seed or a saved regression file. + /// + /// With --from, replays a regression JSON produced by `farm --dump-regressions`. + /// Otherwise supply --scenario and --seed, or a --config that sets them. + /// Writes one NDJSON record to stdout. + Repro { + /// Scenario to replay. Conflicts with --from. + #[arg(long, value_enum, conflicts_with = "from", required_unless_present_any = ["config", "from"])] + scenario: Option, + + /// Seed to replay. Conflicts with --from. + #[arg(long, conflicts_with = "from", required_unless_present_any = ["config", "from"])] + seed: Option, + + /// TOML config with optional scenario, seed, and overrides. + #[arg(long, conflicts_with = "from")] + config: Option, + + /// Replay a saved regression JSON from `farm --dump-regressions`. + #[arg(long)] + from: Option, + + /// Directory to dump regression JSON if replay fails. + #[arg(long)] + dump_regressions: Option, + + /// Skip shrinking when dumping regressions. 
+ #[arg(long)] + no_shrink: bool, + + /// Max shrink attempts when dumping regressions. + #[arg(long, default_value_t = DEFAULT_MAX_SHRINK_ITERATIONS, conflicts_with = "no_shrink")] + shrink_budget: usize, + }, +} + +#[derive(Debug, Deserialize)] +#[serde(deny_unknown_fields)] +struct ConfigFile { + #[serde(default)] + scenario: Option, + #[serde(default)] + seed: Option, + #[serde(default)] + seed_start: Option, + #[serde(default)] + seeds: Option, + #[serde(default)] + hours: Option, + #[serde(default)] + dump_regressions: Option, + #[serde(default)] + overrides: ConfigOverrides, +} + +fn load_config_file(path: &Path) -> Result { + let raw = std::fs::read_to_string(path).map_err(|e| format!("read {}: {e}", path.display()))?; + toml::from_str(&raw).map_err(|e| format!("parse {}: {e}", path.display())) +} + +#[derive(Debug, Serialize)] +struct NdjsonResult { + scenario: &'static str, + seed: u64, + ops_executed: usize, + op_errors: usize, + restarts: usize, + clean: bool, + violations: Vec, + wall_ms: u64, + ops_in_stream: usize, +} + +#[derive(Debug, Serialize)] +struct NdjsonViolation { + invariant: &'static str, + detail: String, +} + +#[derive(Debug, Serialize)] +struct NdjsonSummary { + #[serde(rename = "type")] + kind: &'static str, + scenario: &'static str, + seeds_run: u64, + clean: u64, + failed: u64, + total_ops: u64, + wall_ms: u64, + interrupted: bool, +} + +fn emit_summary(summary: &NdjsonSummary) { + let line = match serde_json::to_string(summary) { + Ok(s) => s, + Err(e) => { + eprintln!("summary serialize failed: {e}"); + return; + } + }; + let stdout = io::stdout(); + let mut w = stdout.lock(); + if let Err(e) = writeln!(w, "{line}").and_then(|()| w.flush()) + && e.kind() != io::ErrorKind::BrokenPipe + { + eprintln!("summary emit failed: {e}"); + } +} + +fn emit(scenario: Scenario, report: &GauntletReport, elapsed: Duration) -> io::Result<()> { + let result = NdjsonResult { + scenario: scenario.cli_name(), + seed: report.seed.0, + ops_executed: 
report.ops_executed.0, + op_errors: report.op_errors.0, + restarts: report.restarts.0, + clean: report.is_clean(), + violations: report + .violations + .iter() + .map(|v: &InvariantViolation| NdjsonViolation { + invariant: v.invariant, + detail: v.detail.clone(), + }) + .collect(), + wall_ms: u64::try_from(elapsed.as_millis()).unwrap_or(u64::MAX), + ops_in_stream: report.ops.len(), + }; + let line = serde_json::to_string(&result).map_err(io::Error::other)?; + let stdout = io::stdout(); + let mut w = stdout.lock(); + writeln!(w, "{line}")?; + w.flush() +} + +fn emit_or_log(scenario: Scenario, report: &GauntletReport, elapsed: Duration) { + if let Err(e) = emit(scenario, report, elapsed) + && e.kind() != io::ErrorKind::BrokenPipe + { + eprintln!("ndjson emit failed: {e}"); + } +} + +struct FarmPlan { + scenario: Scenario, + seed_start: u64, + seeds: u64, + hours: Option, + dump_regressions: Option, + overrides: ConfigOverrides, + shrink: bool, + shrink_budget: usize, +} + +#[allow(clippy::too_many_arguments)] +fn resolve_farm( + scenario: Option, + seed_start: Option, + seeds: Option, + hours: Option, + dump_regressions: Option, + config: Option, + shrink: bool, + shrink_budget: usize, +) -> Result { + let file: Option = config.as_ref().map(|p| load_config_file(p)).transpose()?; + let scenario = scenario + .or_else(|| file.as_ref().and_then(|f| f.scenario)) + .ok_or("must pass --scenario or set `scenario` in --config")?; + let seed_start = seed_start + .or_else(|| file.as_ref().and_then(|f| f.seed_start)) + .unwrap_or(0); + let seeds = seeds + .or_else(|| file.as_ref().and_then(|f| f.seeds)) + .unwrap_or(256); + if seeds == 0 { + return Err("--seeds must be greater than zero".to_string()); + } + let hours = hours.or_else(|| file.as_ref().and_then(|f| f.hours)); + if let Some(h) = hours { + validate_hours(h)?; + } + if shrink && shrink_budget == 0 { + return Err("--shrink-budget must be greater than zero".to_string()); + } + let dump_regressions = + 
dump_regressions.or_else(|| file.as_ref().and_then(|f| f.dump_regressions.clone())); + let overrides = file.map(|f| f.overrides).unwrap_or_default(); + Ok(FarmPlan { + scenario, + seed_start, + seeds, + hours, + dump_regressions, + overrides, + shrink, + shrink_budget, + }) +} + +fn validate_hours(h: f64) -> Result<(), String> { + if !h.is_finite() || h <= 0.0 { + return Err(format!("invalid --hours={h}: must be positive and finite")); + } + if h > MAX_HOURS { + return Err(format!("invalid --hours={h}: must not exceed {MAX_HOURS}")); + } + Ok(()) +} + +enum ReproPlan { + FromFile { + record: RegressionRecord, + dump_regressions: Option, + shrink: bool, + shrink_budget: usize, + }, + FromSeed { + scenario: Scenario, + seed: Seed, + overrides: ConfigOverrides, + dump_regressions: Option, + shrink: bool, + shrink_budget: usize, + }, +} + +#[allow(clippy::too_many_arguments)] +fn resolve_repro( + scenario: Option, + seed: Option, + config: Option, + from: Option, + dump_regressions: Option, + shrink: bool, + shrink_budget: usize, +) -> Result { + if shrink && shrink_budget == 0 { + return Err("--shrink-budget must be greater than zero".to_string()); + } + if let Some(path) = from { + let record = RegressionRecord::load(&path).map_err(|e| e.to_string())?; + return Ok(ReproPlan::FromFile { + record, + dump_regressions, + shrink, + shrink_budget, + }); + } + let file: Option = config.as_ref().map(|p| load_config_file(p)).transpose()?; + let scenario = scenario + .or_else(|| file.as_ref().and_then(|f| f.scenario)) + .ok_or("must pass --scenario, set `scenario` in --config, or use --from")?; + let seed = seed + .or_else(|| file.as_ref().and_then(|f| f.seed)) + .ok_or("must pass --seed, set `seed` in --config, or use --from")?; + let overrides = file.map(|f| f.overrides).unwrap_or_default(); + Ok(ReproPlan::FromSeed { + scenario, + seed: Seed(seed), + overrides, + dump_regressions, + shrink, + shrink_budget, + }) +} + +fn build_runtime() -> Result { + 
Runtime::new().map_err(|e| { + eprintln!("failed to build tokio runtime: {e}"); + ExitCode::from(2) + }) +} + +fn install_interrupt(rt: &Runtime) -> Arc { + let flag = Arc::new(AtomicBool::new(false)); + let f = flag.clone(); + rt.spawn(async move { + if tokio::signal::ctrl_c().await.is_err() { + return; + } + f.store(true, Ordering::Relaxed); + eprintln!( + "interrupt received, stopping after current batch; press Ctrl-C again to abort" + ); + if tokio::signal::ctrl_c().await.is_ok() { + eprintln!("second interrupt, aborting"); + std::process::exit(130); + } + }); + flag +} + +fn main() -> ExitCode { + let cli = Cli::parse(); + match cli.cmd { + Cmd::Farm { + scenario, + seed_start, + seeds, + hours, + dump_regressions, + config, + no_shrink, + shrink_budget, + } => { + let plan = match resolve_farm( + scenario, + seed_start, + seeds, + hours, + dump_regressions, + config, + !no_shrink, + shrink_budget, + ) { + Ok(p) => p, + Err(e) => { + eprintln!("{e}"); + return ExitCode::from(2); + } + }; + let rt = match build_runtime() { + Ok(rt) => rt, + Err(code) => return code, + }; + let interrupt = install_interrupt(&rt); + run_farm(plan, &rt, interrupt) + } + Cmd::Repro { + scenario, + seed, + config, + from, + dump_regressions, + no_shrink, + shrink_budget, + } => { + let plan = match resolve_repro( + scenario, + seed, + config, + from, + dump_regressions, + !no_shrink, + shrink_budget, + ) { + Ok(p) => p, + Err(e) => { + eprintln!("{e}"); + return ExitCode::from(2); + } + }; + let rt = match build_runtime() { + Ok(rt) => rt, + Err(code) => return code, + }; + run_repro(plan, &rt) + } + } +} + +fn run_farm(plan: FarmPlan, rt: &Runtime, interrupt: Arc) -> ExitCode { + let FarmPlan { + scenario, + seed_start, + seeds, + hours, + dump_regressions, + overrides, + shrink, + shrink_budget, + } = plan; + let deadline = hours.map(|h| Instant::now() + Duration::from_secs_f64(h * 3600.0)); + let run_start = Instant::now(); + let mut any_failed = false; + let mut next_seed = 
seed_start; + let mut total_seeds: u64 = 0; + let mut total_clean: u64 = 0; + let mut total_failed: u64 = 0; + let mut total_ops: u64 = 0; + + loop { + if interrupt.load(Ordering::Relaxed) { + break; + } + if let Some(d) = deadline + && Instant::now() >= d + { + break; + } + let end = match next_seed.checked_add(seeds) { + Some(e) => e, + None => { + eprintln!("seed range overflowed u64: seed_start={next_seed} seeds={seeds}"); + break; + } + }; + let overrides_ref = &overrides; + let batch_start = Instant::now(); + let reports = farm::run_many_timed( + |s| { + let mut cfg = config_for(scenario, s); + overrides_ref.apply_to(&mut cfg); + cfg + }, + (next_seed..end).map(Seed), + ); + let batch_wall = batch_start.elapsed(); + let batch_failed = reports.iter().filter(|(r, _)| !r.is_clean()).count(); + let batch_clean = reports.len().saturating_sub(batch_failed); + let batch_ops: u64 = reports + .iter() + .map(|(r, _)| r.ops_executed.0 as u64) + .sum(); + reports.iter().for_each(|(r, elapsed)| { + if !r.is_clean() { + any_failed = true; + if let Some(root) = &dump_regressions { + dump_regression(scenario, r, root, &overrides, shrink, shrink_budget, rt); + } + } + emit_or_log(scenario, r, *elapsed); + }); + total_seeds += reports.len() as u64; + total_clean += batch_clean as u64; + total_failed += batch_failed as u64; + total_ops += batch_ops; + let wall_secs = batch_wall.as_secs_f64(); + let ops_per_sec_display: String = if wall_secs > 0.0 { + format!("{:.0} ops/s", batch_ops as f64 / wall_secs) + } else { + "n/a ops/s".to_string() + }; + eprintln!( + "batch {next_seed}..{end}: {batch_clean} clean, {batch_failed} failed, {wall_secs:.1}s, {ops_per_sec_display}", + ); + if deadline.is_none() { + break; + } + next_seed = end; + } + + let wall_ms = u64::try_from(run_start.elapsed().as_millis()).unwrap_or(u64::MAX); + emit_summary(&NdjsonSummary { + kind: "summary", + scenario: scenario.cli_name(), + seeds_run: total_seeds, + clean: total_clean, + failed: total_failed, + 
total_ops, + wall_ms, + interrupted: interrupt.load(Ordering::Relaxed), + }); + + if any_failed { + ExitCode::from(1) + } else { + ExitCode::SUCCESS + } +} + +fn dump_regression( + scenario: Scenario, + report: &GauntletReport, + root: &Path, + overrides: &ConfigOverrides, + shrink: bool, + shrink_budget: usize, + rt: &Runtime, +) { + let original_len = report.ops.len(); + let (final_ops, final_report) = if shrink && original_len > 0 { + let mut cfg = config_for(scenario, report.seed); + overrides.apply_to(&mut cfg); + let outcome = rt.block_on(shrink_failure( + cfg, + report.ops.clone(), + report.clone(), + shrink_budget, + )); + eprintln!( + "shrank {} -> {} ops for seed {:016x} in {} runs", + original_len, + outcome.ops.len(), + report.seed.0, + outcome.iterations, + ); + (outcome.ops, outcome.report) + } else { + (report.ops.clone(), report.clone()) + }; + let record = RegressionRecord::from_report( + scenario, + overrides.clone(), + &final_report, + original_len, + final_ops, + ); + match record.write_to(root) { + Ok(path) => eprintln!("wrote regression to {}", path.display()), + Err(e) => eprintln!("regression dump failed: {e}"), + } +} + +fn run_repro(plan: ReproPlan, rt: &Runtime) -> ExitCode { + match plan { + ReproPlan::FromFile { + record, + dump_regressions, + shrink, + shrink_budget, + } => run_repro_from_record(record, dump_regressions, shrink, shrink_budget, rt), + ReproPlan::FromSeed { + scenario, + seed, + overrides, + dump_regressions, + shrink, + shrink_budget, + } => { + let mut cfg = config_for(scenario, seed); + overrides.apply_to(&mut cfg); + let start = Instant::now(); + let gauntlet = match Gauntlet::new(cfg) { + Ok(g) => g, + Err(e) => { + eprintln!("gauntlet init failed: {e}"); + return ExitCode::from(2); + } + }; + let report = rt.block_on(gauntlet.run()); + let elapsed = start.elapsed(); + if !report.is_clean() + && let Some(root) = &dump_regressions + { + dump_regression( + scenario, + &report, + root, + &overrides, + shrink, + 
shrink_budget, + rt, + ); + } + emit_or_log(scenario, &report, elapsed); + if report.is_clean() { + ExitCode::SUCCESS + } else { + ExitCode::from(1) + } + } + } +} + +fn run_repro_from_record( + record: RegressionRecord, + dump_regressions: Option, + shrink: bool, + shrink_budget: usize, + rt: &Runtime, +) -> ExitCode { + let scenario = match record.scenario_enum() { + Ok(s) => s, + Err(e) => { + eprintln!("{e}"); + return ExitCode::from(2); + } + }; + let cfg = match record.build_config() { + Ok(c) => c, + Err(e) => { + eprintln!("{e}"); + return ExitCode::from(2); + } + }; + let shrunk_from = if record.original_ops_len > record.ops.len() { + format!(", shrunk from {}", record.original_ops_len) + } else { + String::new() + }; + eprintln!( + "replay {} seed {:016x}: {} ops{}, {} recorded violations", + scenario.cli_name(), + record.seed.0, + record.ops.len(), + shrunk_from, + record.violations.len(), + ); + record.violations.iter().for_each(|v| { + eprintln!("violation {}: {}", v.invariant, v.detail); + }); + let overrides = record.overrides.clone(); + let ops: OpStream = record.op_stream(); + let start = Instant::now(); + let gauntlet = match Gauntlet::new(cfg) { + Ok(g) => g, + Err(e) => { + eprintln!("build gauntlet: {e}"); + return ExitCode::from(2); + } + }; + let report = rt.block_on(gauntlet.run_with_ops(ops)); + let elapsed = start.elapsed(); + if !report.is_clean() + && let Some(root) = &dump_regressions + { + dump_regression( + scenario, + &report, + root, + &overrides, + shrink, + shrink_budget, + rt, + ); + } + emit_or_log(scenario, &report, elapsed); + if report.is_clean() { + ExitCode::SUCCESS + } else { + ExitCode::from(1) + } +} diff --git a/crates/tranquil-store/src/gauntlet/farm.rs b/crates/tranquil-store/src/gauntlet/farm.rs index dd3e791..9d3a6f0 100644 --- a/crates/tranquil-store/src/gauntlet/farm.rs +++ b/crates/tranquil-store/src/gauntlet/farm.rs @@ -1,10 +1,15 @@ use std::cell::RefCell; +use std::panic::{AssertUnwindSafe, catch_unwind}; +use 
std::time::{Duration, Instant}; use rayon::prelude::*; use tokio::runtime::Runtime; -use super::op::Seed; -use super::runner::{Gauntlet, GauntletConfig, GauntletReport}; +use super::invariants::InvariantViolation; +use super::op::{OpStream, Seed}; +use super::runner::{ + Gauntlet, GauntletConfig, GauntletReport, OpErrorCount, OpsExecuted, RestartCount, +}; thread_local! { static RUNTIME: RefCell> = const { RefCell::new(None) }; @@ -26,6 +31,19 @@ fn with_runtime(f: impl FnOnce(&Runtime) -> R) -> R { } pub fn run_many(make_config: F, seeds: impl IntoIterator) -> Vec +where + F: Fn(Seed) -> GauntletConfig + Sync + Send, +{ + run_many_timed(make_config, seeds) + .into_iter() + .map(|(r, _)| r) + .collect() +} + +pub fn run_many_timed( + make_config: F, + seeds: impl IntoIterator, +) -> Vec<(GauntletReport, Duration)> where F: Fn(Seed) -> GauntletConfig + Sync + Send, { @@ -33,9 +51,36 @@ where seeds .into_par_iter() .map(|s| { - let cfg = make_config(s); - let gauntlet = Gauntlet::new(cfg).expect("build gauntlet"); - with_runtime(|rt| rt.block_on(gauntlet.run())) + let start = Instant::now(); + let outcome = catch_unwind(AssertUnwindSafe(|| { + let cfg = make_config(s); + let gauntlet = Gauntlet::new(cfg).expect("build gauntlet"); + with_runtime(|rt| rt.block_on(gauntlet.run())) + })); + let report = outcome.unwrap_or_else(|payload| { + RUNTIME.with(|cell| cell.borrow_mut().take()); + panic_report(s, payload) + }); + (report, start.elapsed()) }) .collect() } + +fn panic_report(seed: Seed, payload: Box) -> GauntletReport { + let msg = payload + .downcast_ref::<&'static str>() + .map(|s| (*s).to_string()) + .or_else(|| payload.downcast_ref::().cloned()) + .unwrap_or_else(|| "non-string panic payload".to_string()); + GauntletReport { + seed, + ops_executed: OpsExecuted(0), + op_errors: OpErrorCount(0), + restarts: RestartCount(0), + violations: vec![InvariantViolation { + invariant: "FarmPanic", + detail: msg, + }], + ops: OpStream::empty(), + } +} diff --git 
a/crates/tranquil-store/src/gauntlet/overrides.rs b/crates/tranquil-store/src/gauntlet/overrides.rs new file mode 100644 index 0000000..e37475c --- /dev/null +++ b/crates/tranquil-store/src/gauntlet/overrides.rs @@ -0,0 +1,117 @@ +use serde::{Deserialize, Serialize}; + +use super::runner::{GauntletConfig, MaxFileSize, RunLimits, ShardCount, WallMs}; +use super::workload::OpCount; + +#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)] +#[serde(deny_unknown_fields)] +pub struct ConfigOverrides { + #[serde(default, skip_serializing_if = "Option::is_none")] + pub op_count: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub max_wall_ms: Option, + #[serde(default, skip_serializing_if = "StoreOverrides::is_empty")] + pub store: StoreOverrides, +} + +#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)] +#[serde(deny_unknown_fields)] +pub struct StoreOverrides { + #[serde(default, skip_serializing_if = "Option::is_none")] + pub max_file_size: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub shard_count: Option, + #[serde(default, skip_serializing_if = "GroupCommitOverrides::is_empty")] + pub group_commit: GroupCommitOverrides, +} + +impl StoreOverrides { + pub fn is_empty(&self) -> bool { + self.max_file_size.is_none() && self.shard_count.is_none() && self.group_commit.is_empty() + } +} + +#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)] +#[serde(deny_unknown_fields)] +pub struct GroupCommitOverrides { + #[serde(default, skip_serializing_if = "Option::is_none")] + pub max_batch_size: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub channel_capacity: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub checkpoint_interval_ms: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub checkpoint_write_threshold: Option, +} + +impl GroupCommitOverrides { + pub fn is_empty(&self) -> bool { + 
self.max_batch_size.is_none() + && self.channel_capacity.is_none() + && self.checkpoint_interval_ms.is_none() + && self.checkpoint_write_threshold.is_none() + } +} + +impl ConfigOverrides { + pub fn apply_to(&self, cfg: &mut GauntletConfig) { + if let Some(n) = self.op_count { + cfg.op_count = OpCount(n); + } + if let Some(ms) = self.max_wall_ms { + cfg.limits = RunLimits { + max_wall_ms: Some(WallMs(ms)), + }; + } + if let Some(n) = self.store.max_file_size { + cfg.store.max_file_size = MaxFileSize(n); + } + if let Some(n) = self.store.shard_count { + cfg.store.shard_count = ShardCount(n); + } + let gc = &self.store.group_commit; + if let Some(n) = gc.max_batch_size { + cfg.store.group_commit.max_batch_size = n; + } + if let Some(n) = gc.channel_capacity { + cfg.store.group_commit.channel_capacity = n; + } + if let Some(n) = gc.checkpoint_interval_ms { + cfg.store.group_commit.checkpoint_interval_ms = n; + } + if let Some(n) = gc.checkpoint_write_threshold { + cfg.store.group_commit.checkpoint_write_threshold = n; + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn default_overrides_serialize_empty() { + let o = ConfigOverrides::default(); + let json = serde_json::to_string(&o).unwrap(); + assert_eq!(json, "{}"); + } + + #[test] + fn round_trip_preserves_set_fields() { + let o = ConfigOverrides { + op_count: Some(42), + store: StoreOverrides { + max_file_size: Some(4096), + group_commit: GroupCommitOverrides { + max_batch_size: Some(16), + ..GroupCommitOverrides::default() + }, + ..StoreOverrides::default() + }, + ..ConfigOverrides::default() + }; + let json = serde_json::to_string(&o).unwrap(); + let back: ConfigOverrides = serde_json::from_str(&json).unwrap(); + assert_eq!(o, back); + } +} diff --git a/crates/tranquil-store/src/gauntlet/scenarios.rs b/crates/tranquil-store/src/gauntlet/scenarios.rs index 4bacba4..9ab9152 100644 --- a/crates/tranquil-store/src/gauntlet/scenarios.rs +++ b/crates/tranquil-store/src/gauntlet/scenarios.rs @@ 
-53,10 +53,64 @@ impl Scenario { } } + pub const fn cli_name(self) -> &'static str { + match self { + Self::SmokePR => "smoke-pr", + Self::MstChurn => "mst-churn", + Self::MstRestartChurn => "mst-restart-churn", + Self::FullStackRestart => "full-stack-restart", + Self::CatastrophicChurn => "catastrophic-churn", + Self::HugeValues => "huge-values", + Self::TinyBatches => "tiny-batches", + Self::GiantBatches => "giant-batches", + Self::ManyFiles => "many-files", + Self::ModerateFaults => "moderate-faults", + Self::AggressiveFaults => "aggressive-faults", + Self::TornPages => "torn-pages", + Self::Fsyncgate => "fsyncgate", + Self::FirehoseFanout => "firehose-fanout", + Self::ContendedReaders => "contended-readers", + Self::ContendedWriters => "contended-writers", + } + } + + pub const fn description(self) -> &'static str { + match self { + Self::SmokePR => "60s canary, 10k ops, core invariants. Default PR gate.", + Self::MstChurn => "100k churn, no restart. Refcount + reachability focus.", + Self::MstRestartChurn => "100k churn with Poisson restart bursts every ~5k ops.", + Self::FullStackRestart => "5k ops, deterministic restart every 500 ops.", + Self::CatastrophicChurn => { + "1M ops, phase-2 invariants, Poisson restart. 30 min budget." + } + Self::HugeValues => "Heavy-tail values up to 16 MiB. 32 MiB file cap.", + Self::TinyBatches => "Group-commit batch size 1, tight checkpoints, 4 KiB files.", + Self::GiantBatches => "Group-commit batch size 100k, 16 MiB files.", + Self::ManyFiles => "256-byte file cap, many segments, delete-heavy.", + Self::ModerateFaults => { + "Simulated IO with moderate fault config. CrashAtSyscall restarts." + } + Self::AggressiveFaults => { + "Simulated IO with aggressive fault config. CrashAtSyscall restarts." 
+ } + Self::TornPages => "Torn-page faults only, 20k ops.", + Self::Fsyncgate => "Fsync-drop faults only, 10k ops.", + Self::FirehoseFanout => { + "Eventlog-heavy workload with FSYNC_ORDERING / MONOTONIC_SEQ / TOMBSTONE_BOUND invariants." + } + Self::ContendedReaders => "60% reads, 64 writer tasks, simulated moderate faults.", + Self::ContendedWriters => "Add/delete heavy, 32 writer tasks, simulated moderate faults.", + } + } + pub fn from_name(name: &str) -> Option { Self::ALL.iter().copied().find(|s| s.name() == name) } + pub fn from_cli_name(name: &str) -> Option { + Self::ALL.iter().copied().find(|s| s.cli_name() == name) + } + pub const ALL: &'static [Scenario] = &[ Self::SmokePR, Self::MstChurn, @@ -77,6 +131,39 @@ impl Scenario { ]; } +impl serde::Serialize for Scenario { + fn serialize(&self, serializer: S) -> Result { + serializer.serialize_str(self.cli_name()) + } +} + +impl<'de> serde::Deserialize<'de> for Scenario { + fn deserialize>(deserializer: D) -> Result { + let s = >::deserialize(deserializer)?; + Self::from_cli_name(&s).ok_or_else(|| { + serde::de::Error::custom(format!( + "unknown scenario {s:?}; expected one of {}", + Self::ALL + .iter() + .map(|s| s.cli_name()) + .collect::>() + .join(", ") + )) + }) + } +} + +#[cfg(feature = "gauntlet-cli")] +impl clap::ValueEnum for Scenario { + fn value_variants<'a>() -> &'a [Self] { + Self::ALL + } + + fn to_possible_value(&self) -> Option { + Some(clap::builder::PossibleValue::new(self.cli_name()).help(self.description())) + } +} + impl std::fmt::Display for Scenario { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.write_str(self.name()) diff --git a/justfile b/justfile index 59db959..c12b083 100644 --- a/justfile +++ b/justfile @@ -25,6 +25,21 @@ test-store: test-store-sim-nightly: SQLX_OFFLINE=true TRANQUIL_SIM_SEEDS=10000 cargo nextest run -p tranquil-store --features tranquil-store/test-harness --profile sim-nightly +gauntlet-pr: + SQLX_OFFLINE=true cargo nextest run -p 
tranquil-store --features tranquil-store/test-harness --profile gauntlet-pr --test gauntlet_smoke + +gauntlet-nightly HOURS="6": + SQLX_OFFLINE=true GAUNTLET_DURATION_HOURS={{HOURS}} cargo nextest run -p tranquil-store --features tranquil-store/test-harness --profile gauntlet-nightly --test gauntlet_smoke --run-ignored all + +gauntlet-farm SCENARIO HOURS="6" DUMP="proptest-regressions": + SQLX_OFFLINE=true cargo run --release -p tranquil-store --bin tranquil-gauntlet --features tranquil-store/gauntlet-cli -- farm --scenario {{SCENARIO}} --hours {{HOURS}} --dump-regressions {{DUMP}} + +gauntlet-repro SEED SCENARIO="smoke-pr": + SQLX_OFFLINE=true cargo run --release -p tranquil-store --bin tranquil-gauntlet --features tranquil-store/gauntlet-cli -- repro --scenario {{SCENARIO}} --seed {{SEED}} + +gauntlet-repro-from FILE: + SQLX_OFFLINE=true cargo run --release -p tranquil-store --bin tranquil-gauntlet --features tranquil-store/gauntlet-cli -- repro --from {{FILE}} + test-unit: SQLX_OFFLINE=true cargo test --test dpop_unit --test validation_edge_cases --test scope_edge_cases