# claude-skills-reference/engineering/autoresearch-agent/evaluators/benchmark_speed.py

#!/usr/bin/env python3
"""Measure execution speed of a target function or command.
DO NOT MODIFY after experiment starts — this is the fixed evaluator."""

import statistics
import subprocess
import sys
import time

# --- CONFIGURE THESE ---
# Settings read by the benchmark below. Per the module docstring, set them
# before an experiment starts and leave them alone afterwards.
COMMAND = "python src/module.py"  # Shell command line to benchmark (run with shell=True)
RUNS = 5                          # Number of timed runs feeding the p50/p95 statistics
WARMUP = 1                        # Untimed warmup runs executed before the timed runs
# --- END CONFIG ---

TIMEOUT_SECONDS = 120  # Per-invocation limit handed to subprocess.run


def nearest_rank_percentile(samples, pct):
    """Return the nearest-rank percentile of ``samples``.

    The nearest-rank percentile is the smallest sample such that at least
    ``pct`` percent of all samples are less than or equal to it.

    Args:
        samples: Non-empty iterable of comparable numbers.
        pct: Percentile to report, between 0 and 100.

    Returns:
        One of the values from ``samples`` (no interpolation is performed).

    Raises:
        ValueError: If ``samples`` is empty.
    """
    ordered = sorted(samples)
    if not ordered:
        raise ValueError("percentile of an empty sample set is undefined")
    # Ceiling division: rank = ceil(n * pct / 100). Truncating instead of
    # rounding up selects one element too high whenever n * pct / 100 is a
    # whole number (e.g. it would report the maximum as p95 for 20 samples).
    rank = int(-(-len(ordered) * pct // 100))
    # Clamp so pct=0 maps to the minimum and rounding never overruns the list.
    rank = min(max(rank, 1), len(ordered))
    return ordered[rank - 1]


def run_command():
    """Run COMMAND once through the shell and time it.

    Returns:
        A ``(completed, elapsed_ms)`` tuple: the ``CompletedProcess`` from
        ``subprocess.run`` and the wall-clock duration in milliseconds.

    Exits the process with status 1 if the command does not finish within
    TIMEOUT_SECONDS, instead of surfacing a raw traceback.
    """
    t0 = time.perf_counter()
    try:
        completed = subprocess.run(
            COMMAND, shell=True, capture_output=True, timeout=TIMEOUT_SECONDS
        )
    except subprocess.TimeoutExpired:
        print(
            f"Command timed out after {TIMEOUT_SECONDS}s: {COMMAND}",
            file=sys.stderr,
        )
        sys.exit(1)
    elapsed_ms = (time.perf_counter() - t0) * 1000  # ms
    return completed, elapsed_ms


def main():
    """Benchmark COMMAND and print p50/p95 timings in milliseconds.

    Performs WARMUP untimed runs followed by RUNS timed runs. On success it
    prints ``p50_ms``, ``p95_ms`` and ``runs`` lines to stdout. If any timed
    run exits non-zero, the failure is reported on stderr and the process
    exits with status 1.
    """
    if RUNS < 1:
        # Without at least one sample the median/percentile are undefined.
        print("RUNS must be at least 1", file=sys.stderr)
        sys.exit(1)

    # Warmup: results and timings are intentionally discarded.
    for _ in range(WARMUP):
        run_command()

    # Benchmark
    times = []
    for i in range(RUNS):
        result, elapsed = run_command()

        if result.returncode != 0:
            print(f"Run {i+1} failed (exit {result.returncode})", file=sys.stderr)
            # errors="replace" keeps the failure report from crashing on
            # stderr bytes that are not valid UTF-8.
            stderr_text = result.stderr.decode(errors="replace")
            print(f"stderr: {stderr_text[:200]}", file=sys.stderr)
            sys.exit(1)

        times.append(elapsed)

    p50 = statistics.median(times)
    p95 = nearest_rank_percentile(times, 95)

    print(f"p50_ms: {p50:.2f}")
    print(f"p95_ms: {p95:.2f}")
    print(f"runs: {RUNS}")


if __name__ == "__main__":
    main()