mirror of
https://tangled.org/tranquil.farm/tranquil-pds
synced 2026-04-24 18:30:31 +00:00
feat(tranquil-store): tranquil-gauntlet CLI, config overrides, profiles
Lewis: May this revision serve well! <lu5a@proton.me>
This commit is contained in:
@@ -25,6 +25,27 @@ fail-fast = false
|
||||
test-threads = "num-cpus"
|
||||
slow-timeout = { period = "300s", terminate-after = 2 }
|
||||
|
||||
[profile.gauntlet-pr]
|
||||
retries = 0
|
||||
fail-fast = true
|
||||
test-threads = "num-cpus"
|
||||
slow-timeout = { period = "60s", terminate-after = 5 }
|
||||
|
||||
[[profile.gauntlet-pr.overrides]]
|
||||
filter = "binary(gauntlet_smoke)"
|
||||
slow-timeout = { period = "300s", terminate-after = 2 }
|
||||
|
||||
[profile.gauntlet-nightly]
|
||||
retries = 0
|
||||
fail-fast = false
|
||||
test-threads = "num-cpus"
|
||||
slow-timeout = { period = "600s", terminate-after = 1 }
|
||||
|
||||
[profile.gauntlet-soak]
|
||||
retries = 0
|
||||
fail-fast = false
|
||||
test-threads = 1
|
||||
|
||||
[test-groups]
|
||||
serial-env-tests = { max-threads = 1 }
|
||||
heavy-load-tests = { max-threads = 4 }
|
||||
|
||||
@@ -35,9 +35,17 @@ rayon = "1"
|
||||
smallvec = "1"
|
||||
uuid = { workspace = true }
|
||||
tempfile = { version = "3", optional = true }
|
||||
clap = { workspace = true, optional = true }
|
||||
toml = { version = "0.8", optional = true }
|
||||
|
||||
[features]
|
||||
test-harness = ["dep:tempfile"]
|
||||
gauntlet-cli = ["test-harness", "dep:clap", "dep:toml"]
|
||||
|
||||
[[bin]]
|
||||
name = "tranquil-gauntlet"
|
||||
path = "src/bin/tranquil_gauntlet.rs"
|
||||
required-features = ["gauntlet-cli"]
|
||||
|
||||
[dev-dependencies]
|
||||
tranquil-store = { path = ".", features = ["test-harness"] }
|
||||
|
||||
698
crates/tranquil-store/src/bin/tranquil_gauntlet.rs
Normal file
698
crates/tranquil-store/src/bin/tranquil_gauntlet.rs
Normal file
@@ -0,0 +1,698 @@
|
||||
use std::io::{self, Write};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::process::ExitCode;
|
||||
use std::sync::Arc;
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use clap::{Parser, Subcommand};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::runtime::Runtime;
|
||||
use tranquil_store::gauntlet::{
|
||||
ConfigOverrides, Gauntlet, GauntletReport, InvariantViolation, OpStream, RegressionRecord,
|
||||
Scenario, Seed, config_for, farm,
|
||||
shrink::{DEFAULT_MAX_SHRINK_ITERATIONS, shrink_failure},
|
||||
};
|
||||
|
||||
/// Largest `--hours` budget that `validate_hours` accepts.
const MAX_HOURS: f64 = 1_000_000.0;
|
||||
|
||||
/// Deterministic storage-engine gauntlet: scenario fuzzing, shrinking, regression replay.
|
||||
///
|
||||
/// Writes one NDjson record per seed to stdout; `farm` adds a final summary record.
|
||||
/// Progress, batch stats, interrupt notices, and errors go to stderr.
|
||||
/// Exits 0 on success, 1 on invariant violation, 2 on argument or runtime error.
|
||||
/// First SIGINT stops after the current batch; a second press aborts.
|
||||
///
|
||||
/// Hopefully we'll catch super complicated tranquil-store bugs with this!!
|
||||
#[derive(Debug, Parser)]
|
||||
#[command(name = "tranquil-gauntlet", version)]
|
||||
struct Cli {
|
||||
#[command(subcommand)]
|
||||
cmd: Cmd,
|
||||
}
|
||||
|
||||
#[derive(Debug, Subcommand)]
|
||||
enum Cmd {
|
||||
/// Run a scenario across many seeds in parallel.
|
||||
///
|
||||
/// With --hours, the command loops batches of --seeds until the deadline passes.
|
||||
/// Without --hours, a single batch runs and the command exits.
|
||||
/// The last stdout line is always a `"type":"summary"` record.
|
||||
Farm {
|
||||
/// Scenario to run.
|
||||
#[arg(long, value_enum, required_unless_present = "config")]
|
||||
scenario: Option<Scenario>,
|
||||
|
||||
/// First seed in the batch range. Default 0.
|
||||
#[arg(long)]
|
||||
seed_start: Option<u64>,
|
||||
|
||||
/// Number of seeds per batch. Default 256. Must be > 0.
|
||||
#[arg(long)]
|
||||
seeds: Option<u64>,
|
||||
|
||||
/// Wall-clock budget in hours; batches repeat until the deadline elapses.
|
||||
#[arg(long)]
|
||||
hours: Option<f64>,
|
||||
|
||||
/// Directory to dump regression Json on failure.
|
||||
#[arg(long)]
|
||||
dump_regressions: Option<PathBuf>,
|
||||
|
||||
/// Toml config with any of the above fields plus an `[overrides]` table.
|
||||
#[arg(long)]
|
||||
config: Option<PathBuf>,
|
||||
|
||||
/// Skip shrinking when dumping regressions.
|
||||
#[arg(long)]
|
||||
no_shrink: bool,
|
||||
|
||||
/// Max shrink attempts per failing seed.
|
||||
#[arg(long, default_value_t = DEFAULT_MAX_SHRINK_ITERATIONS, conflicts_with = "no_shrink")]
|
||||
shrink_budget: usize,
|
||||
},
|
||||
/// Replay a single seed or a saved regression file.
|
||||
///
|
||||
/// With --from, replays a regression Json produced by `farm --dump-regressions`.
|
||||
/// Otherwise supply --scenario and --seed, or a --config that sets them.
|
||||
/// Writes one NDjson record to stdout.
|
||||
Repro {
|
||||
/// Scenario to replay. Ignored when --from is set.
|
||||
#[arg(long, value_enum, conflicts_with = "from", required_unless_present_any = ["config", "from"])]
|
||||
scenario: Option<Scenario>,
|
||||
|
||||
/// Seed to replay. Ignored when --from is set.
|
||||
#[arg(long, conflicts_with = "from", required_unless_present_any = ["config", "from"])]
|
||||
seed: Option<u64>,
|
||||
|
||||
/// Toml config with optional scenario, seed, and overrides.
|
||||
#[arg(long, conflicts_with = "from")]
|
||||
config: Option<PathBuf>,
|
||||
|
||||
/// Replay a saved regression Json from `farm --dump-regressions`.
|
||||
#[arg(long)]
|
||||
from: Option<PathBuf>,
|
||||
|
||||
/// Directory to dump regression Json if replay fails.
|
||||
#[arg(long)]
|
||||
dump_regressions: Option<PathBuf>,
|
||||
|
||||
/// Skip shrinking when dumping regressions.
|
||||
#[arg(long)]
|
||||
no_shrink: bool,
|
||||
|
||||
/// Max shrink attempts when dumping regressions.
|
||||
#[arg(long, default_value_t = DEFAULT_MAX_SHRINK_ITERATIONS, conflicts_with = "no_shrink")]
|
||||
shrink_budget: usize,
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
struct ConfigFile {
|
||||
#[serde(default)]
|
||||
scenario: Option<Scenario>,
|
||||
#[serde(default)]
|
||||
seed: Option<u64>,
|
||||
#[serde(default)]
|
||||
seed_start: Option<u64>,
|
||||
#[serde(default)]
|
||||
seeds: Option<u64>,
|
||||
#[serde(default)]
|
||||
hours: Option<f64>,
|
||||
#[serde(default)]
|
||||
dump_regressions: Option<PathBuf>,
|
||||
#[serde(default)]
|
||||
overrides: ConfigOverrides,
|
||||
}
|
||||
|
||||
fn load_config_file(path: &Path) -> Result<ConfigFile, String> {
|
||||
let raw = std::fs::read_to_string(path).map_err(|e| format!("read {}: {e}", path.display()))?;
|
||||
toml::from_str(&raw).map_err(|e| format!("parse {}: {e}", path.display()))
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize)]
|
||||
struct NdjsonResult {
|
||||
scenario: &'static str,
|
||||
seed: u64,
|
||||
ops_executed: usize,
|
||||
op_errors: usize,
|
||||
restarts: usize,
|
||||
clean: bool,
|
||||
violations: Vec<NdjsonViolation>,
|
||||
wall_ms: u64,
|
||||
ops_in_stream: usize,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize)]
|
||||
struct NdjsonViolation {
|
||||
invariant: &'static str,
|
||||
detail: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize)]
|
||||
struct NdjsonSummary {
|
||||
#[serde(rename = "type")]
|
||||
kind: &'static str,
|
||||
scenario: &'static str,
|
||||
seeds_run: u64,
|
||||
clean: u64,
|
||||
failed: u64,
|
||||
total_ops: u64,
|
||||
wall_ms: u64,
|
||||
interrupted: bool,
|
||||
}
|
||||
|
||||
fn emit_summary(summary: &NdjsonSummary) {
|
||||
let line = match serde_json::to_string(summary) {
|
||||
Ok(s) => s,
|
||||
Err(e) => {
|
||||
eprintln!("summary serialize failed: {e}");
|
||||
return;
|
||||
}
|
||||
};
|
||||
let stdout = io::stdout();
|
||||
let mut w = stdout.lock();
|
||||
if let Err(e) = writeln!(w, "{line}").and_then(|()| w.flush())
|
||||
&& e.kind() != io::ErrorKind::BrokenPipe
|
||||
{
|
||||
eprintln!("summary emit failed: {e}");
|
||||
}
|
||||
}
|
||||
|
||||
fn emit(scenario: Scenario, report: &GauntletReport, elapsed: Duration) -> io::Result<()> {
|
||||
let result = NdjsonResult {
|
||||
scenario: scenario.cli_name(),
|
||||
seed: report.seed.0,
|
||||
ops_executed: report.ops_executed.0,
|
||||
op_errors: report.op_errors.0,
|
||||
restarts: report.restarts.0,
|
||||
clean: report.is_clean(),
|
||||
violations: report
|
||||
.violations
|
||||
.iter()
|
||||
.map(|v: &InvariantViolation| NdjsonViolation {
|
||||
invariant: v.invariant,
|
||||
detail: v.detail.clone(),
|
||||
})
|
||||
.collect(),
|
||||
wall_ms: u64::try_from(elapsed.as_millis()).unwrap_or(u64::MAX),
|
||||
ops_in_stream: report.ops.len(),
|
||||
};
|
||||
let line = serde_json::to_string(&result).map_err(io::Error::other)?;
|
||||
let stdout = io::stdout();
|
||||
let mut w = stdout.lock();
|
||||
writeln!(w, "{line}")?;
|
||||
w.flush()
|
||||
}
|
||||
|
||||
fn emit_or_log(scenario: Scenario, report: &GauntletReport, elapsed: Duration) {
|
||||
if let Err(e) = emit(scenario, report, elapsed)
|
||||
&& e.kind() != io::ErrorKind::BrokenPipe
|
||||
{
|
||||
eprintln!("ndjson emit failed: {e}");
|
||||
}
|
||||
}
|
||||
|
||||
struct FarmPlan {
|
||||
scenario: Scenario,
|
||||
seed_start: u64,
|
||||
seeds: u64,
|
||||
hours: Option<f64>,
|
||||
dump_regressions: Option<PathBuf>,
|
||||
overrides: ConfigOverrides,
|
||||
shrink: bool,
|
||||
shrink_budget: usize,
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn resolve_farm(
|
||||
scenario: Option<Scenario>,
|
||||
seed_start: Option<u64>,
|
||||
seeds: Option<u64>,
|
||||
hours: Option<f64>,
|
||||
dump_regressions: Option<PathBuf>,
|
||||
config: Option<PathBuf>,
|
||||
shrink: bool,
|
||||
shrink_budget: usize,
|
||||
) -> Result<FarmPlan, String> {
|
||||
let file: Option<ConfigFile> = config.as_ref().map(|p| load_config_file(p)).transpose()?;
|
||||
let scenario = scenario
|
||||
.or_else(|| file.as_ref().and_then(|f| f.scenario))
|
||||
.ok_or("must pass --scenario or set `scenario` in --config")?;
|
||||
let seed_start = seed_start
|
||||
.or_else(|| file.as_ref().and_then(|f| f.seed_start))
|
||||
.unwrap_or(0);
|
||||
let seeds = seeds
|
||||
.or_else(|| file.as_ref().and_then(|f| f.seeds))
|
||||
.unwrap_or(256);
|
||||
if seeds == 0 {
|
||||
return Err("--seeds must be greater than zero".to_string());
|
||||
}
|
||||
let hours = hours.or_else(|| file.as_ref().and_then(|f| f.hours));
|
||||
if let Some(h) = hours {
|
||||
validate_hours(h)?;
|
||||
}
|
||||
if shrink && shrink_budget == 0 {
|
||||
return Err("--shrink-budget must be greater than zero".to_string());
|
||||
}
|
||||
let dump_regressions =
|
||||
dump_regressions.or_else(|| file.as_ref().and_then(|f| f.dump_regressions.clone()));
|
||||
let overrides = file.map(|f| f.overrides).unwrap_or_default();
|
||||
Ok(FarmPlan {
|
||||
scenario,
|
||||
seed_start,
|
||||
seeds,
|
||||
hours,
|
||||
dump_regressions,
|
||||
overrides,
|
||||
shrink,
|
||||
shrink_budget,
|
||||
})
|
||||
}
|
||||
|
||||
fn validate_hours(h: f64) -> Result<(), String> {
|
||||
if !h.is_finite() || h <= 0.0 {
|
||||
return Err(format!("invalid --hours={h}: must be positive and finite"));
|
||||
}
|
||||
if h > MAX_HOURS {
|
||||
return Err(format!("invalid --hours={h}: must not exceed {MAX_HOURS}"));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
enum ReproPlan {
|
||||
FromFile {
|
||||
record: RegressionRecord,
|
||||
dump_regressions: Option<PathBuf>,
|
||||
shrink: bool,
|
||||
shrink_budget: usize,
|
||||
},
|
||||
FromSeed {
|
||||
scenario: Scenario,
|
||||
seed: Seed,
|
||||
overrides: ConfigOverrides,
|
||||
dump_regressions: Option<PathBuf>,
|
||||
shrink: bool,
|
||||
shrink_budget: usize,
|
||||
},
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn resolve_repro(
|
||||
scenario: Option<Scenario>,
|
||||
seed: Option<u64>,
|
||||
config: Option<PathBuf>,
|
||||
from: Option<PathBuf>,
|
||||
dump_regressions: Option<PathBuf>,
|
||||
shrink: bool,
|
||||
shrink_budget: usize,
|
||||
) -> Result<ReproPlan, String> {
|
||||
if shrink && shrink_budget == 0 {
|
||||
return Err("--shrink-budget must be greater than zero".to_string());
|
||||
}
|
||||
if let Some(path) = from {
|
||||
let record = RegressionRecord::load(&path).map_err(|e| e.to_string())?;
|
||||
return Ok(ReproPlan::FromFile {
|
||||
record,
|
||||
dump_regressions,
|
||||
shrink,
|
||||
shrink_budget,
|
||||
});
|
||||
}
|
||||
let file: Option<ConfigFile> = config.as_ref().map(|p| load_config_file(p)).transpose()?;
|
||||
let scenario = scenario
|
||||
.or_else(|| file.as_ref().and_then(|f| f.scenario))
|
||||
.ok_or("must pass --scenario, set `scenario` in --config, or use --from")?;
|
||||
let seed = seed
|
||||
.or_else(|| file.as_ref().and_then(|f| f.seed))
|
||||
.ok_or("must pass --seed, set `seed` in --config, or use --from")?;
|
||||
let overrides = file.map(|f| f.overrides).unwrap_or_default();
|
||||
Ok(ReproPlan::FromSeed {
|
||||
scenario,
|
||||
seed: Seed(seed),
|
||||
overrides,
|
||||
dump_regressions,
|
||||
shrink,
|
||||
shrink_budget,
|
||||
})
|
||||
}
|
||||
|
||||
fn build_runtime() -> Result<Runtime, ExitCode> {
|
||||
Runtime::new().map_err(|e| {
|
||||
eprintln!("failed to build tokio runtime: {e}");
|
||||
ExitCode::from(2)
|
||||
})
|
||||
}
|
||||
|
||||
fn install_interrupt(rt: &Runtime) -> Arc<AtomicBool> {
|
||||
let flag = Arc::new(AtomicBool::new(false));
|
||||
let f = flag.clone();
|
||||
rt.spawn(async move {
|
||||
if tokio::signal::ctrl_c().await.is_err() {
|
||||
return;
|
||||
}
|
||||
f.store(true, Ordering::Relaxed);
|
||||
eprintln!(
|
||||
"interrupt received, stopping after current batch; press Ctrl-C again to abort"
|
||||
);
|
||||
if tokio::signal::ctrl_c().await.is_ok() {
|
||||
eprintln!("second interrupt, aborting");
|
||||
std::process::exit(130);
|
||||
}
|
||||
});
|
||||
flag
|
||||
}
|
||||
|
||||
fn main() -> ExitCode {
|
||||
let cli = Cli::parse();
|
||||
match cli.cmd {
|
||||
Cmd::Farm {
|
||||
scenario,
|
||||
seed_start,
|
||||
seeds,
|
||||
hours,
|
||||
dump_regressions,
|
||||
config,
|
||||
no_shrink,
|
||||
shrink_budget,
|
||||
} => {
|
||||
let plan = match resolve_farm(
|
||||
scenario,
|
||||
seed_start,
|
||||
seeds,
|
||||
hours,
|
||||
dump_regressions,
|
||||
config,
|
||||
!no_shrink,
|
||||
shrink_budget,
|
||||
) {
|
||||
Ok(p) => p,
|
||||
Err(e) => {
|
||||
eprintln!("{e}");
|
||||
return ExitCode::from(2);
|
||||
}
|
||||
};
|
||||
let rt = match build_runtime() {
|
||||
Ok(rt) => rt,
|
||||
Err(code) => return code,
|
||||
};
|
||||
let interrupt = install_interrupt(&rt);
|
||||
run_farm(plan, &rt, interrupt)
|
||||
}
|
||||
Cmd::Repro {
|
||||
scenario,
|
||||
seed,
|
||||
config,
|
||||
from,
|
||||
dump_regressions,
|
||||
no_shrink,
|
||||
shrink_budget,
|
||||
} => {
|
||||
let plan = match resolve_repro(
|
||||
scenario,
|
||||
seed,
|
||||
config,
|
||||
from,
|
||||
dump_regressions,
|
||||
!no_shrink,
|
||||
shrink_budget,
|
||||
) {
|
||||
Ok(p) => p,
|
||||
Err(e) => {
|
||||
eprintln!("{e}");
|
||||
return ExitCode::from(2);
|
||||
}
|
||||
};
|
||||
let rt = match build_runtime() {
|
||||
Ok(rt) => rt,
|
||||
Err(code) => return code,
|
||||
};
|
||||
run_repro(plan, &rt)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn run_farm(plan: FarmPlan, rt: &Runtime, interrupt: Arc<AtomicBool>) -> ExitCode {
|
||||
let FarmPlan {
|
||||
scenario,
|
||||
seed_start,
|
||||
seeds,
|
||||
hours,
|
||||
dump_regressions,
|
||||
overrides,
|
||||
shrink,
|
||||
shrink_budget,
|
||||
} = plan;
|
||||
let deadline = hours.map(|h| Instant::now() + Duration::from_secs_f64(h * 3600.0));
|
||||
let run_start = Instant::now();
|
||||
let mut any_failed = false;
|
||||
let mut next_seed = seed_start;
|
||||
let mut total_seeds: u64 = 0;
|
||||
let mut total_clean: u64 = 0;
|
||||
let mut total_failed: u64 = 0;
|
||||
let mut total_ops: u64 = 0;
|
||||
|
||||
loop {
|
||||
if interrupt.load(Ordering::Relaxed) {
|
||||
break;
|
||||
}
|
||||
if let Some(d) = deadline
|
||||
&& Instant::now() >= d
|
||||
{
|
||||
break;
|
||||
}
|
||||
let end = match next_seed.checked_add(seeds) {
|
||||
Some(e) => e,
|
||||
None => {
|
||||
eprintln!("seed range overflowed u64: seed_start={next_seed} seeds={seeds}");
|
||||
break;
|
||||
}
|
||||
};
|
||||
let overrides_ref = &overrides;
|
||||
let batch_start = Instant::now();
|
||||
let reports = farm::run_many_timed(
|
||||
|s| {
|
||||
let mut cfg = config_for(scenario, s);
|
||||
overrides_ref.apply_to(&mut cfg);
|
||||
cfg
|
||||
},
|
||||
(next_seed..end).map(Seed),
|
||||
);
|
||||
let batch_wall = batch_start.elapsed();
|
||||
let batch_failed = reports.iter().filter(|(r, _)| !r.is_clean()).count();
|
||||
let batch_clean = reports.len().saturating_sub(batch_failed);
|
||||
let batch_ops: u64 = reports
|
||||
.iter()
|
||||
.map(|(r, _)| r.ops_executed.0 as u64)
|
||||
.sum();
|
||||
reports.iter().for_each(|(r, elapsed)| {
|
||||
if !r.is_clean() {
|
||||
any_failed = true;
|
||||
if let Some(root) = &dump_regressions {
|
||||
dump_regression(scenario, r, root, &overrides, shrink, shrink_budget, rt);
|
||||
}
|
||||
}
|
||||
emit_or_log(scenario, r, *elapsed);
|
||||
});
|
||||
total_seeds += reports.len() as u64;
|
||||
total_clean += batch_clean as u64;
|
||||
total_failed += batch_failed as u64;
|
||||
total_ops += batch_ops;
|
||||
let wall_secs = batch_wall.as_secs_f64();
|
||||
let ops_per_sec_display: String = if wall_secs > 0.0 {
|
||||
format!("{:.0} ops/s", batch_ops as f64 / wall_secs)
|
||||
} else {
|
||||
"n/a ops/s".to_string()
|
||||
};
|
||||
eprintln!(
|
||||
"batch {next_seed}..{end}: {batch_clean} clean, {batch_failed} failed, {wall_secs:.1}s, {ops_per_sec_display}",
|
||||
);
|
||||
if deadline.is_none() {
|
||||
break;
|
||||
}
|
||||
next_seed = end;
|
||||
}
|
||||
|
||||
let wall_ms = u64::try_from(run_start.elapsed().as_millis()).unwrap_or(u64::MAX);
|
||||
emit_summary(&NdjsonSummary {
|
||||
kind: "summary",
|
||||
scenario: scenario.cli_name(),
|
||||
seeds_run: total_seeds,
|
||||
clean: total_clean,
|
||||
failed: total_failed,
|
||||
total_ops,
|
||||
wall_ms,
|
||||
interrupted: interrupt.load(Ordering::Relaxed),
|
||||
});
|
||||
|
||||
if any_failed {
|
||||
ExitCode::from(1)
|
||||
} else {
|
||||
ExitCode::SUCCESS
|
||||
}
|
||||
}
|
||||
|
||||
fn dump_regression(
|
||||
scenario: Scenario,
|
||||
report: &GauntletReport,
|
||||
root: &Path,
|
||||
overrides: &ConfigOverrides,
|
||||
shrink: bool,
|
||||
shrink_budget: usize,
|
||||
rt: &Runtime,
|
||||
) {
|
||||
let original_len = report.ops.len();
|
||||
let (final_ops, final_report) = if shrink && original_len > 0 {
|
||||
let mut cfg = config_for(scenario, report.seed);
|
||||
overrides.apply_to(&mut cfg);
|
||||
let outcome = rt.block_on(shrink_failure(
|
||||
cfg,
|
||||
report.ops.clone(),
|
||||
report.clone(),
|
||||
shrink_budget,
|
||||
));
|
||||
eprintln!(
|
||||
"shrank {} -> {} ops for seed {:016x} in {} runs",
|
||||
original_len,
|
||||
outcome.ops.len(),
|
||||
report.seed.0,
|
||||
outcome.iterations,
|
||||
);
|
||||
(outcome.ops, outcome.report)
|
||||
} else {
|
||||
(report.ops.clone(), report.clone())
|
||||
};
|
||||
let record = RegressionRecord::from_report(
|
||||
scenario,
|
||||
overrides.clone(),
|
||||
&final_report,
|
||||
original_len,
|
||||
final_ops,
|
||||
);
|
||||
match record.write_to(root) {
|
||||
Ok(path) => eprintln!("wrote regression to {}", path.display()),
|
||||
Err(e) => eprintln!("regression dump failed: {e}"),
|
||||
}
|
||||
}
|
||||
|
||||
fn run_repro(plan: ReproPlan, rt: &Runtime) -> ExitCode {
|
||||
match plan {
|
||||
ReproPlan::FromFile {
|
||||
record,
|
||||
dump_regressions,
|
||||
shrink,
|
||||
shrink_budget,
|
||||
} => run_repro_from_record(record, dump_regressions, shrink, shrink_budget, rt),
|
||||
ReproPlan::FromSeed {
|
||||
scenario,
|
||||
seed,
|
||||
overrides,
|
||||
dump_regressions,
|
||||
shrink,
|
||||
shrink_budget,
|
||||
} => {
|
||||
let mut cfg = config_for(scenario, seed);
|
||||
overrides.apply_to(&mut cfg);
|
||||
let start = Instant::now();
|
||||
let gauntlet = match Gauntlet::new(cfg) {
|
||||
Ok(g) => g,
|
||||
Err(e) => {
|
||||
eprintln!("gauntlet init failed: {e}");
|
||||
return ExitCode::from(2);
|
||||
}
|
||||
};
|
||||
let report = rt.block_on(gauntlet.run());
|
||||
let elapsed = start.elapsed();
|
||||
if !report.is_clean()
|
||||
&& let Some(root) = &dump_regressions
|
||||
{
|
||||
dump_regression(
|
||||
scenario,
|
||||
&report,
|
||||
root,
|
||||
&overrides,
|
||||
shrink,
|
||||
shrink_budget,
|
||||
rt,
|
||||
);
|
||||
}
|
||||
emit_or_log(scenario, &report, elapsed);
|
||||
if report.is_clean() {
|
||||
ExitCode::SUCCESS
|
||||
} else {
|
||||
ExitCode::from(1)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn run_repro_from_record(
|
||||
record: RegressionRecord,
|
||||
dump_regressions: Option<PathBuf>,
|
||||
shrink: bool,
|
||||
shrink_budget: usize,
|
||||
rt: &Runtime,
|
||||
) -> ExitCode {
|
||||
let scenario = match record.scenario_enum() {
|
||||
Ok(s) => s,
|
||||
Err(e) => {
|
||||
eprintln!("{e}");
|
||||
return ExitCode::from(2);
|
||||
}
|
||||
};
|
||||
let cfg = match record.build_config() {
|
||||
Ok(c) => c,
|
||||
Err(e) => {
|
||||
eprintln!("{e}");
|
||||
return ExitCode::from(2);
|
||||
}
|
||||
};
|
||||
let shrunk_from = if record.original_ops_len > record.ops.len() {
|
||||
format!(", shrunk from {}", record.original_ops_len)
|
||||
} else {
|
||||
String::new()
|
||||
};
|
||||
eprintln!(
|
||||
"replay {} seed {:016x}: {} ops{}, {} recorded violations",
|
||||
scenario.cli_name(),
|
||||
record.seed.0,
|
||||
record.ops.len(),
|
||||
shrunk_from,
|
||||
record.violations.len(),
|
||||
);
|
||||
record.violations.iter().for_each(|v| {
|
||||
eprintln!("violation {}: {}", v.invariant, v.detail);
|
||||
});
|
||||
let overrides = record.overrides.clone();
|
||||
let ops: OpStream = record.op_stream();
|
||||
let start = Instant::now();
|
||||
let gauntlet = match Gauntlet::new(cfg) {
|
||||
Ok(g) => g,
|
||||
Err(e) => {
|
||||
eprintln!("build gauntlet: {e}");
|
||||
return ExitCode::from(2);
|
||||
}
|
||||
};
|
||||
let report = rt.block_on(gauntlet.run_with_ops(ops));
|
||||
let elapsed = start.elapsed();
|
||||
if !report.is_clean()
|
||||
&& let Some(root) = &dump_regressions
|
||||
{
|
||||
dump_regression(
|
||||
scenario,
|
||||
&report,
|
||||
root,
|
||||
&overrides,
|
||||
shrink,
|
||||
shrink_budget,
|
||||
rt,
|
||||
);
|
||||
}
|
||||
emit_or_log(scenario, &report, elapsed);
|
||||
if report.is_clean() {
|
||||
ExitCode::SUCCESS
|
||||
} else {
|
||||
ExitCode::from(1)
|
||||
}
|
||||
}
|
||||
@@ -1,10 +1,15 @@
|
||||
use std::cell::RefCell;
|
||||
use std::panic::{AssertUnwindSafe, catch_unwind};
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use rayon::prelude::*;
|
||||
use tokio::runtime::Runtime;
|
||||
|
||||
use super::op::Seed;
|
||||
use super::runner::{Gauntlet, GauntletConfig, GauntletReport};
|
||||
use super::invariants::InvariantViolation;
|
||||
use super::op::{OpStream, Seed};
|
||||
use super::runner::{
|
||||
Gauntlet, GauntletConfig, GauntletReport, OpErrorCount, OpsExecuted, RestartCount,
|
||||
};
|
||||
|
||||
thread_local! {
|
||||
static RUNTIME: RefCell<Option<Runtime>> = const { RefCell::new(None) };
|
||||
@@ -26,6 +31,19 @@ fn with_runtime<R>(f: impl FnOnce(&Runtime) -> R) -> R {
|
||||
}
|
||||
|
||||
pub fn run_many<F>(make_config: F, seeds: impl IntoIterator<Item = Seed>) -> Vec<GauntletReport>
|
||||
where
|
||||
F: Fn(Seed) -> GauntletConfig + Sync + Send,
|
||||
{
|
||||
run_many_timed(make_config, seeds)
|
||||
.into_iter()
|
||||
.map(|(r, _)| r)
|
||||
.collect()
|
||||
}
|
||||
|
||||
pub fn run_many_timed<F>(
|
||||
make_config: F,
|
||||
seeds: impl IntoIterator<Item = Seed>,
|
||||
) -> Vec<(GauntletReport, Duration)>
|
||||
where
|
||||
F: Fn(Seed) -> GauntletConfig + Sync + Send,
|
||||
{
|
||||
@@ -33,9 +51,36 @@ where
|
||||
seeds
|
||||
.into_par_iter()
|
||||
.map(|s| {
|
||||
let cfg = make_config(s);
|
||||
let gauntlet = Gauntlet::new(cfg).expect("build gauntlet");
|
||||
with_runtime(|rt| rt.block_on(gauntlet.run()))
|
||||
let start = Instant::now();
|
||||
let outcome = catch_unwind(AssertUnwindSafe(|| {
|
||||
let cfg = make_config(s);
|
||||
let gauntlet = Gauntlet::new(cfg).expect("build gauntlet");
|
||||
with_runtime(|rt| rt.block_on(gauntlet.run()))
|
||||
}));
|
||||
let report = outcome.unwrap_or_else(|payload| {
|
||||
RUNTIME.with(|cell| cell.borrow_mut().take());
|
||||
panic_report(s, payload)
|
||||
});
|
||||
(report, start.elapsed())
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn panic_report(seed: Seed, payload: Box<dyn std::any::Any + Send>) -> GauntletReport {
|
||||
let msg = payload
|
||||
.downcast_ref::<&'static str>()
|
||||
.map(|s| (*s).to_string())
|
||||
.or_else(|| payload.downcast_ref::<String>().cloned())
|
||||
.unwrap_or_else(|| "non-string panic payload".to_string());
|
||||
GauntletReport {
|
||||
seed,
|
||||
ops_executed: OpsExecuted(0),
|
||||
op_errors: OpErrorCount(0),
|
||||
restarts: RestartCount(0),
|
||||
violations: vec![InvariantViolation {
|
||||
invariant: "FarmPanic",
|
||||
detail: msg,
|
||||
}],
|
||||
ops: OpStream::empty(),
|
||||
}
|
||||
}
|
||||
|
||||
117
crates/tranquil-store/src/gauntlet/overrides.rs
Normal file
117
crates/tranquil-store/src/gauntlet/overrides.rs
Normal file
@@ -0,0 +1,117 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use super::runner::{GauntletConfig, MaxFileSize, RunLimits, ShardCount, WallMs};
|
||||
use super::workload::OpCount;
|
||||
|
||||
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
pub struct ConfigOverrides {
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub op_count: Option<usize>,
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub max_wall_ms: Option<u64>,
|
||||
#[serde(default, skip_serializing_if = "StoreOverrides::is_empty")]
|
||||
pub store: StoreOverrides,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
pub struct StoreOverrides {
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub max_file_size: Option<u64>,
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub shard_count: Option<u8>,
|
||||
#[serde(default, skip_serializing_if = "GroupCommitOverrides::is_empty")]
|
||||
pub group_commit: GroupCommitOverrides,
|
||||
}
|
||||
|
||||
impl StoreOverrides {
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.max_file_size.is_none() && self.shard_count.is_none() && self.group_commit.is_empty()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
pub struct GroupCommitOverrides {
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub max_batch_size: Option<usize>,
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub channel_capacity: Option<usize>,
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub checkpoint_interval_ms: Option<u64>,
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub checkpoint_write_threshold: Option<u64>,
|
||||
}
|
||||
|
||||
impl GroupCommitOverrides {
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.max_batch_size.is_none()
|
||||
&& self.channel_capacity.is_none()
|
||||
&& self.checkpoint_interval_ms.is_none()
|
||||
&& self.checkpoint_write_threshold.is_none()
|
||||
}
|
||||
}
|
||||
|
||||
impl ConfigOverrides {
|
||||
pub fn apply_to(&self, cfg: &mut GauntletConfig) {
|
||||
if let Some(n) = self.op_count {
|
||||
cfg.op_count = OpCount(n);
|
||||
}
|
||||
if let Some(ms) = self.max_wall_ms {
|
||||
cfg.limits = RunLimits {
|
||||
max_wall_ms: Some(WallMs(ms)),
|
||||
};
|
||||
}
|
||||
if let Some(n) = self.store.max_file_size {
|
||||
cfg.store.max_file_size = MaxFileSize(n);
|
||||
}
|
||||
if let Some(n) = self.store.shard_count {
|
||||
cfg.store.shard_count = ShardCount(n);
|
||||
}
|
||||
let gc = &self.store.group_commit;
|
||||
if let Some(n) = gc.max_batch_size {
|
||||
cfg.store.group_commit.max_batch_size = n;
|
||||
}
|
||||
if let Some(n) = gc.channel_capacity {
|
||||
cfg.store.group_commit.channel_capacity = n;
|
||||
}
|
||||
if let Some(n) = gc.checkpoint_interval_ms {
|
||||
cfg.store.group_commit.checkpoint_interval_ms = n;
|
||||
}
|
||||
if let Some(n) = gc.checkpoint_write_threshold {
|
||||
cfg.store.group_commit.checkpoint_write_threshold = n;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// With nothing set, every field is skipped and the JSON form is `{}`.
    #[test]
    fn default_overrides_serialize_empty() {
        let encoded = serde_json::to_string(&ConfigOverrides::default()).unwrap();
        assert_eq!(encoded, "{}");
    }

    /// Overrides set at each nesting level survive a JSON round trip.
    #[test]
    fn round_trip_preserves_set_fields() {
        let group_commit = GroupCommitOverrides {
            max_batch_size: Some(16),
            ..GroupCommitOverrides::default()
        };
        let store = StoreOverrides {
            max_file_size: Some(4096),
            group_commit,
            ..StoreOverrides::default()
        };
        let original = ConfigOverrides {
            op_count: Some(42),
            store,
            ..ConfigOverrides::default()
        };
        let encoded = serde_json::to_string(&original).unwrap();
        let decoded: ConfigOverrides = serde_json::from_str(&encoded).unwrap();
        assert_eq!(original, decoded);
    }
}
|
||||
@@ -53,10 +53,64 @@ impl Scenario {
|
||||
}
|
||||
}
|
||||
|
||||
pub const fn cli_name(self) -> &'static str {
|
||||
match self {
|
||||
Self::SmokePR => "smoke-pr",
|
||||
Self::MstChurn => "mst-churn",
|
||||
Self::MstRestartChurn => "mst-restart-churn",
|
||||
Self::FullStackRestart => "full-stack-restart",
|
||||
Self::CatastrophicChurn => "catastrophic-churn",
|
||||
Self::HugeValues => "huge-values",
|
||||
Self::TinyBatches => "tiny-batches",
|
||||
Self::GiantBatches => "giant-batches",
|
||||
Self::ManyFiles => "many-files",
|
||||
Self::ModerateFaults => "moderate-faults",
|
||||
Self::AggressiveFaults => "aggressive-faults",
|
||||
Self::TornPages => "torn-pages",
|
||||
Self::Fsyncgate => "fsyncgate",
|
||||
Self::FirehoseFanout => "firehose-fanout",
|
||||
Self::ContendedReaders => "contended-readers",
|
||||
Self::ContendedWriters => "contended-writers",
|
||||
}
|
||||
}
|
||||
|
||||
pub const fn description(self) -> &'static str {
|
||||
match self {
|
||||
Self::SmokePR => "60s canary, 10k ops, core invariants. Default PR gate.",
|
||||
Self::MstChurn => "100k churn, no restart. Refcount + reachability focus.",
|
||||
Self::MstRestartChurn => "100k churn with Poisson restart bursts every ~5k ops.",
|
||||
Self::FullStackRestart => "5k ops, deterministic restart every 500 ops.",
|
||||
Self::CatastrophicChurn => {
|
||||
"1M ops, phase-2 invariants, Poisson restart. 30 min budget."
|
||||
}
|
||||
Self::HugeValues => "Heavy-tail values up to 16 MiB. 32 MiB file cap.",
|
||||
Self::TinyBatches => "Group-commit batch size 1, tight checkpoints, 4 KiB files.",
|
||||
Self::GiantBatches => "Group-commit batch size 100k, 16 MiB files.",
|
||||
Self::ManyFiles => "256-byte file cap, many segments, delete-heavy.",
|
||||
Self::ModerateFaults => {
|
||||
"Simulated IO with moderate fault config. CrashAtSyscall restarts."
|
||||
}
|
||||
Self::AggressiveFaults => {
|
||||
"Simulated IO with aggressive fault config. CrashAtSyscall restarts."
|
||||
}
|
||||
Self::TornPages => "Torn-page faults only, 20k ops.",
|
||||
Self::Fsyncgate => "Fsync-drop faults only, 10k ops.",
|
||||
Self::FirehoseFanout => {
|
||||
"Eventlog-heavy workload with FSYNC_ORDERING / MONOTONIC_SEQ / TOMBSTONE_BOUND invariants."
|
||||
}
|
||||
Self::ContendedReaders => "60% reads, 64 writer tasks, simulated moderate faults.",
|
||||
Self::ContendedWriters => "Add/delete heavy, 32 writer tasks, simulated moderate faults.",
|
||||
}
|
||||
}
|
||||
|
||||
pub fn from_name(name: &str) -> Option<Self> {
|
||||
Self::ALL.iter().copied().find(|s| s.name() == name)
|
||||
}
|
||||
|
||||
pub fn from_cli_name(name: &str) -> Option<Self> {
|
||||
Self::ALL.iter().copied().find(|s| s.cli_name() == name)
|
||||
}
|
||||
|
||||
pub const ALL: &'static [Scenario] = &[
|
||||
Self::SmokePR,
|
||||
Self::MstChurn,
|
||||
@@ -77,6 +131,39 @@ impl Scenario {
|
||||
];
|
||||
}
|
||||
|
||||
/// Scenarios serialize as their CLI name string.
impl serde::Serialize for Scenario {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        let name = self.cli_name();
        serializer.serialize_str(name)
    }
}
|
||||
|
||||
impl<'de> serde::Deserialize<'de> for Scenario {
|
||||
fn deserialize<D: serde::Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
|
||||
let s = <std::borrow::Cow<'de, str>>::deserialize(deserializer)?;
|
||||
Self::from_cli_name(&s).ok_or_else(|| {
|
||||
serde::de::Error::custom(format!(
|
||||
"unknown scenario {s:?}; expected one of {}",
|
||||
Self::ALL
|
||||
.iter()
|
||||
.map(|s| s.cli_name())
|
||||
.collect::<Vec<_>>()
|
||||
.join(", ")
|
||||
))
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Lets clap accept a scenario by its CLI name, with the scenario
/// description as per-value help. Only built with the `gauntlet-cli` feature.
#[cfg(feature = "gauntlet-cli")]
impl clap::ValueEnum for Scenario {
    fn value_variants<'a>() -> &'a [Self] {
        Self::ALL
    }

    fn to_possible_value(&self) -> Option<clap::builder::PossibleValue> {
        let value = clap::builder::PossibleValue::new(self.cli_name()).help(self.description());
        Some(value)
    }
}
|
||||
|
||||
impl std::fmt::Display for Scenario {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.write_str(self.name())
|
||||
|
||||
15
justfile
15
justfile
@@ -25,6 +25,21 @@ test-store:
|
||||
test-store-sim-nightly:
|
||||
SQLX_OFFLINE=true TRANQUIL_SIM_SEEDS=10000 cargo nextest run -p tranquil-store --features tranquil-store/test-harness --profile sim-nightly
|
||||
|
||||
gauntlet-pr:
|
||||
SQLX_OFFLINE=true cargo nextest run -p tranquil-store --features tranquil-store/test-harness --profile gauntlet-pr --test gauntlet_smoke
|
||||
|
||||
gauntlet-nightly HOURS="6":
|
||||
SQLX_OFFLINE=true GAUNTLET_DURATION_HOURS={{HOURS}} cargo nextest run -p tranquil-store --features tranquil-store/test-harness --profile gauntlet-nightly --test gauntlet_smoke --run-ignored all
|
||||
|
||||
gauntlet-farm SCENARIO HOURS="6" DUMP="proptest-regressions":
|
||||
SQLX_OFFLINE=true cargo run --release -p tranquil-store --bin tranquil-gauntlet --features tranquil-store/gauntlet-cli -- farm --scenario {{SCENARIO}} --hours {{HOURS}} --dump-regressions {{DUMP}}
|
||||
|
||||
gauntlet-repro SEED SCENARIO="smoke-pr":
|
||||
SQLX_OFFLINE=true cargo run --release -p tranquil-store --bin tranquil-gauntlet --features tranquil-store/gauntlet-cli -- repro --scenario {{SCENARIO}} --seed {{SEED}}
|
||||
|
||||
gauntlet-repro-from FILE:
|
||||
SQLX_OFFLINE=true cargo run --release -p tranquil-store --bin tranquil-gauntlet --features tranquil-store/gauntlet-cli -- repro --from {{FILE}}
|
||||
|
||||
test-unit:
|
||||
SQLX_OFFLINE=true cargo test --test dpop_unit --test validation_edge_cases --test scope_edge_cases
|
||||
|
||||
|
||||
Reference in New Issue
Block a user