refactor: autoresearch-agent v2.0 — multi-experiment, multi-domain, real-world evaluators

Major rewrite based on deep study of Karpathy's autoresearch repo.

Architecture changes:
- Multi-experiment support: .autoresearch/{domain}/{name}/ structure
- Domain categories: engineering, marketing, content, prompts, custom
- Project-level (git-tracked, shareable) or user-level (~/.autoresearch/) scope
- User chooses scope during setup, not installation

New evaluators (8 ready-to-use):
- Free: benchmark_speed, benchmark_size, test_pass_rate, build_speed, memory_usage
- LLM judge (uses existing subscription): llm_judge_content, llm_judge_prompt, llm_judge_copy
- LLM judges call the user's CLI tool (claude/codex/gemini) — no extra API keys needed

Script improvements:
- setup_experiment.py: --domain, --scope, --evaluator, --list, --list-evaluators
- run_experiment.py: --experiment domain/name, --resume, --loop, --single
- log_results.py: --dashboard, --domain, --format csv|markdown|terminal, --output

Results export:
- Terminal (default), CSV, and Markdown formats
- Per-experiment, per-domain, or cross-experiment dashboard view

SKILL.md rewritten:
- Clear activation triggers (when the skill should activate)
- Practical examples for each domain
- Evaluator documentation with cost transparency
- Simplified loop protocol matching Karpathy's original philosophy
2026-03-13 08:22:14 +01:00
parent c834d71a44
commit 12591282da
13 changed files with 1744 additions and 702 deletions
--- a/engineering/autoresearch-agent/scripts/run_experiment.py
+++ b/engineering/autoresearch-agent/scripts/run_experiment.py
@@ -2,17 +2,15 @@
 """
 autoresearch-agent: Experiment Runner

-Executes the autonomous experiment loop:
- Reads .autoresearch.cfg for project config
- Runs the target evaluation
- Keeps improvements (git commit) or discards failures (git reset)
- Logs everything to results.tsv
- Loops indefinitely until interrupted
+Executes the autonomous experiment loop for a specific experiment.
+Reads config from .autoresearch/{domain}/{name}/config.cfg.

 Usage:
-    python scripts/run_experiment.py --loop      # Run forever
-    python scripts/run_experiment.py --single    # Run one experiment
-    python scripts/run_experiment.py --dry-run   # Show what would happen
+    python scripts/run_experiment.py --experiment engineering/api-speed --loop
+    python scripts/run_experiment.py --experiment engineering/api-speed --single
+    python scripts/run_experiment.py --experiment marketing/medium-ctr --loop
+    python scripts/run_experiment.py --resume --loop
+    python scripts/run_experiment.py --experiment engineering/api-speed --dry-run
 """

 import argparse
@@ -25,11 +23,22 @@ from datetime import datetime
 from pathlib import Path


-def load_config(path):
-    """Load .autoresearch.cfg"""
-    cfg_file = Path(path) / ".autoresearch.cfg"
+def find_autoresearch_root():
+    """Find .autoresearch/ in project or user home."""
+    project_root = Path(".").resolve() / ".autoresearch"
+    if project_root.exists():
+        return project_root
+    user_root = Path.home() / ".autoresearch"
+    if user_root.exists():
+        return user_root
+    return None
+
+
+def load_config(experiment_dir):
+    """Load config.cfg from experiment directory."""
+    cfg_file = experiment_dir / "config.cfg"
    if not cfg_file.exists():
-        print("✗ No .autoresearch.cfg found. Run setup_experiment.py first.")
+        print(f"  Error: no config.cfg in {experiment_dir}")
        sys.exit(1)
    config = {}
    for line in cfg_file.read_text().splitlines():
@@ -49,239 +58,293 @@ def run_cmd(cmd, cwd=None, timeout=None):


 def get_current_commit(path):
+    """Get short hash of current HEAD."""
    _, commit, _ = run_cmd("git rev-parse --short HEAD", cwd=path)
    return commit


-def get_current_metric(path, metric_grep):
-    """Read the last recorded metric from results.tsv."""
-    tsv = Path(path) / "results.tsv"
+def get_best_metric(experiment_dir, direction):
+    """Read the best metric from results.tsv."""
+    tsv = experiment_dir / "results.tsv"
    if not tsv.exists():
        return None
-    lines = [l for l in tsv.read_text().splitlines() if "\tkeep\t" in l]
+    lines = [l for l in tsv.read_text().splitlines()[1:] if "\tkeep\t" in l]
    if not lines:
        return None
-    last = lines[-1].split("\t")
-    try:
-        return float(last[1])
-    except (ValueError, IndexError):
+    metrics = []
+    for line in lines:
+        parts = line.split("\t")
+        try:
+            if parts[1] != "N/A":
+                metrics.append(float(parts[1]))
+        except (ValueError, IndexError):
+            continue
+    if not metrics:
        return None
+    return min(metrics) if direction == "lower" else max(metrics)


-def run_evaluation(path, evaluate_cmd, time_budget_minutes):
-    """Run evaluation with time limit."""
-    hard_limit = time_budget_minutes * 60 * 2.5  # 2.5x as hard timeout
+def run_evaluation(project_root, eval_cmd, time_budget_minutes, log_file):
+    """Run evaluation with time limit. Output goes to log_file."""
+    hard_limit = time_budget_minutes * 60 * 2.5
    t0 = time.time()
    try:
        code, _, _ = run_cmd(
-            f"{evaluate_cmd} > run.log 2>&1",
-            cwd=path,
+            f"{eval_cmd} > {log_file} 2>&1",
+            cwd=str(project_root),
            timeout=hard_limit
        )
        elapsed = time.time() - t0
        return code, elapsed
    except subprocess.TimeoutExpired:
        elapsed = time.time() - t0
-        return -1, elapsed  # -1 = timeout
+        return -1, elapsed


-def extract_metric(path, metric_grep):
-    """Extract metric value from run.log."""
-    code, out, _ = run_cmd(
-        f"grep '{metric_grep}' run.log | tail -1",
-        cwd=path
-    )
-    if not out:
-        return None
-    try:
-        return float(out.split(":")[-1].strip())
-    except ValueError:
+def extract_metric(log_file, metric_grep):
+    """Extract metric value from log file."""
+    log_path = Path(log_file)
+    if not log_path.exists():
        return None
+    for line in reversed(log_path.read_text().splitlines()):
+        stripped = line.strip()
+        if stripped.startswith(metric_grep.lstrip("^")):
+            try:
+                return float(stripped.split(":")[-1].strip())
+            except ValueError:
+                continue
+    return None


 def is_improvement(new_val, old_val, direction):
    """Check if new result is better than old."""
    if old_val is None:
-        return True  # First run always "improves"
+        return True
    if direction == "lower":
        return new_val < old_val
-    else:
-        return new_val > old_val
+    return new_val > old_val


-def log_result(path, commit, metric_val, status, description):
+def log_result(experiment_dir, commit, metric_val, status, description):
    """Append result to results.tsv."""
-    tsv = Path(path) / "results.tsv"
+    tsv = experiment_dir / "results.tsv"
    metric_str = f"{metric_val:.6f}" if metric_val is not None else "N/A"
    with open(tsv, "a") as f:
        f.write(f"{commit}\t{metric_str}\t{status}\t{description}\n")


-def get_experiment_count(path):
+def get_experiment_count(experiment_dir):
    """Count experiments run so far."""
-    tsv = Path(path) / "results.tsv"
+    tsv = experiment_dir / "results.tsv"
    if not tsv.exists():
        return 0
-    lines = tsv.read_text().splitlines()
-    return max(0, len(lines) - 1)  # subtract header
+    return max(0, len(tsv.read_text().splitlines()) - 1)


-def run_single_experiment(path, config, exp_num, dry_run=False):
+def get_last_active(root):
+    """Find the most recently modified experiment."""
+    latest = None
+    latest_time = 0
+    for domain_dir in root.iterdir():
+        if not domain_dir.is_dir() or domain_dir.name.startswith("."):
+            continue
+        for exp_dir in domain_dir.iterdir():
+            if not exp_dir.is_dir():
+                continue
+            cfg = exp_dir / "config.cfg"
+            if cfg.exists() and cfg.stat().st_mtime > latest_time:
+                latest_time = cfg.stat().st_mtime
+                latest = f"{domain_dir.name}/{exp_dir.name}"
+    return latest
+
+
+def run_single(project_root, experiment_dir, config, exp_num, dry_run=False):
    """Run one experiment iteration."""
    direction = config.get("metric_direction", "lower")
    metric_grep = config.get("metric_grep", "^metric:")
-    evaluate_cmd = config.get("evaluate_cmd", "python evaluate.py")
+    eval_cmd = config.get("evaluate_cmd", "python evaluate.py")
    time_budget = int(config.get("time_budget_minutes", 5))
    metric_name = config.get("metric", "metric")
+    log_file = str(experiment_dir / "run.log")

-    best_so_far = get_current_metric(path, metric_grep)
+    best = get_best_metric(experiment_dir, direction)
    ts = datetime.now().strftime("%H:%M:%S")

    print(f"\n[{ts}] Experiment #{exp_num}")
-    print(f"  Best {metric_name} so far: {best_so_far}")
+    print(f"  Best {metric_name}: {best}")

    if dry_run:
        print("  [DRY RUN] Would run evaluation and check metric")
        return "dry_run"

-    # Save pre-experiment state for rollback
-    code, pre_commit, _ = run_cmd("git rev-parse HEAD", cwd=path)
+    # Save state for rollback
+    code, pre_commit, _ = run_cmd("git rev-parse HEAD", cwd=str(project_root))
    if code != 0:
-        print("  ✗ Can't get git state. Is this a git repo with commits?")
+        print("  Error: can't get git state")
        return "error"

    # Run evaluation
-    print(f"  Running: {evaluate_cmd} (budget: {time_budget} min)")
-    ret_code, elapsed = run_evaluation(path, evaluate_cmd, time_budget)
+    print(f"  Running: {eval_cmd} (budget: {time_budget}m)")
+    ret_code, elapsed = run_evaluation(project_root, eval_cmd, time_budget, log_file)

-    # Handle timeout
+    commit = get_current_commit(str(project_root))
+
+    # Timeout
    if ret_code == -1:
-        print(f"  ✗ TIMEOUT after {elapsed:.0f}s — discarding")
-        run_cmd("git checkout -- .", cwd=path)  # revert uncommitted changes
-        # Commit was already made by the agent before evaluation
-        run_cmd(f"git reset --hard {pre_commit}", cwd=path)
-        curr_commit = get_current_commit(path)
-        log_result(path, curr_commit, None, "crash", f"timeout after {elapsed:.0f}s")
+        print(f"  TIMEOUT after {elapsed:.0f}s — discarding")
+        run_cmd("git checkout -- .", cwd=str(project_root))
+        run_cmd(f"git reset --hard {pre_commit}", cwd=str(project_root))
+        log_result(experiment_dir, commit, None, "crash", f"timeout_{elapsed:.0f}s")
        return "crash"

-    # Handle non-zero exit
+    # Crash
    if ret_code != 0:
-        # Check if it crashed
-        code, tail, _ = run_cmd("tail -n 5 run.log", cwd=path)
-        print(f"  ✗ CRASH (exit {ret_code}) after {elapsed:.0f}s")
+        _, tail, _ = run_cmd(f"tail -5 {log_file}", cwd=str(project_root))
+        print(f"  CRASH (exit {ret_code}) after {elapsed:.0f}s")
        print(f"  Last output: {tail[:200]}")
-        run_cmd(f"git reset --hard {pre_commit}", cwd=path)
-        curr_commit = get_current_commit(path)
-        log_result(path, curr_commit, None, "crash", f"exit_code_{ret_code}")
+        run_cmd(f"git reset --hard {pre_commit}", cwd=str(project_root))
+        log_result(experiment_dir, commit, None, "crash", f"exit_{ret_code}")
        return "crash"

    # Extract metric
-    metric_val = extract_metric(path, metric_grep)
+    metric_val = extract_metric(log_file, metric_grep)
    if metric_val is None:
-        print(f"  ✗ Could not parse metric from run.log")
-        run_cmd(f"git reset --hard {pre_commit}", cwd=path)
-        curr_commit = get_current_commit(path)
-        log_result(path, curr_commit, None, "crash", "metric_parse_failed")
+        print(f"  Could not parse {metric_name} from run.log")
+        run_cmd(f"git reset --hard {pre_commit}", cwd=str(project_root))
+        log_result(experiment_dir, commit, None, "crash", "metric_parse_failed")
        return "crash"

-    curr_commit = get_current_commit(path)
    delta = ""
-    if best_so_far is not None:
-        diff = metric_val - best_so_far
-        delta = f" (Δ{diff:+.4f})"
+    if best is not None:
+        diff = metric_val - best
+        delta = f" (delta {diff:+.4f})"

    print(f"  {metric_name}: {metric_val:.6f}{delta} in {elapsed:.0f}s")

    # Keep or discard
-    if is_improvement(metric_val, best_so_far, direction):
-        print(f"  ✅ KEEP — improvement confirmed")
-        log_result(path, curr_commit, metric_val, "keep",
-                   f"improvement_{metric_name}_{metric_val:.4f}")
+    if is_improvement(metric_val, best, direction):
+        print(f"  KEEP — improvement")
+        log_result(experiment_dir, commit, metric_val, "keep",
+                   f"improved_{metric_name}_{metric_val:.4f}")
        return "keep"
    else:
-        print(f"  ❌ DISCARD — no improvement")
-        run_cmd(f"git reset --hard {pre_commit}", cwd=path)
-        curr_commit = get_current_commit(path)
-        log_result(path, curr_commit, metric_val, "discard",
-                   f"no_improvement_{metric_val:.4f}_vs_{best_so_far:.4f}")
+        print(f"  DISCARD — no improvement")
+        run_cmd(f"git reset --hard {pre_commit}", cwd=str(project_root))
+        best_str = f"{best:.4f}" if best else "?"
+        log_result(experiment_dir, commit, metric_val, "discard",
+                   f"no_improvement_{metric_val:.4f}_vs_{best_str}")
        return "discard"


-def print_summary(path):
-    """Print experiment summary."""
-    tsv = Path(path) / "results.tsv"
+def print_summary(experiment_dir, config):
+    """Print session summary."""
+    tsv = experiment_dir / "results.tsv"
    if not tsv.exists():
        return
-    lines = tsv.read_text().splitlines()[1:]  # skip header
+    lines = tsv.read_text().splitlines()[1:]
    if not lines:
        return

    keeps = [l for l in lines if "\tkeep\t" in l]
    discards = [l for l in lines if "\tdiscard\t" in l]
    crashes = [l for l in lines if "\tcrash\t" in l]
+    metric_name = config.get("metric", "metric")
+    direction = config.get("metric_direction", "lower")

-    print(f"\n{'='*50}")
-    print(f"  Session Summary")
+    print(f"\n{'=' * 55}")
+    print(f"  autoresearch — Session Summary")
    print(f"  Experiments: {len(lines)} total")
-    print(f"  ✅ Keep: {len(keeps)} | ❌ Discard: {len(discards)} | 💥 Crash: {len(crashes)}")
+    print(f"  Keep: {len(keeps)} | Discard: {len(discards)} | Crash: {len(crashes)}")

    if keeps:
        try:
-            first_metric = float(keeps[0].split("\t")[1])
-            last_metric = float(keeps[-1].split("\t")[1])
-            direction = "↓" if last_metric < first_metric else "↑"
-            print(f"  Best progress: {first_metric:.6f} → {last_metric:.6f} {direction}")
+            valid = []
+            for l in keeps:
+                parts = l.split("\t")
+                if parts[1] != "N/A":
+                    valid.append(float(parts[1]))
+            if len(valid) >= 2:
+                first, last = valid[0], valid[-1]
+                best = min(valid) if direction == "lower" else max(valid)
+                pct = ((first - best) / first * 100) if direction == "lower" else ((best - first) / first * 100)
+                print(f"  {metric_name}: {first:.6f} -> {best:.6f} ({pct:+.1f}%)")
        except (ValueError, IndexError):
            pass
-    print(f"{'='*50}\n")
+    print(f"{'=' * 55}\n")


 def main():
    parser = argparse.ArgumentParser(description="autoresearch-agent runner")
+    parser.add_argument("--experiment", help="Experiment path: domain/name (e.g. engineering/api-speed)")
+    parser.add_argument("--resume", action="store_true", help="Resume last active experiment")
    parser.add_argument("--loop", action="store_true", help="Run forever")
    parser.add_argument("--single", action="store_true", help="Run one experiment")
-    parser.add_argument("--dry-run", action="store_true", help="Dry run only")
+    parser.add_argument("--dry-run", action="store_true", help="Show what would happen")
+    parser.add_argument("--max-experiments", type=int, default=0, help="Max experiments (0 = unlimited)")
    parser.add_argument("--path", default=".", help="Project root")
-    parser.add_argument("--max-experiments", type=int, default=0,
-                        help="Max experiments (0 = unlimited)")
    args = parser.parse_args()

-    path = Path(args.path).resolve()
-    config = load_config(path)
+    project_root = Path(args.path).resolve()
+    root = find_autoresearch_root()

-    print(f"\n🔬 autoresearch-agent")
-    print(f"   Project: {path}")
-    print(f"   Target: {config.get('target', '?')}")
-    print(f"   Metric: {config.get('metric', '?')} ({config.get('metric_direction', '?')} is better)")
-    print(f"   Budget: {config.get('time_budget_minutes', '?')} min/experiment")
-    print(f"   Mode: {'loop' if args.loop else 'single'}")
+    if root is None:
+        print("No .autoresearch/ found. Run setup_experiment.py first.")
+        sys.exit(1)

-    if args.single:
-        exp_num = get_experiment_count(path) + 1
-        run_single_experiment(path, config, exp_num, args.dry_run)
+    # Resolve experiment
+    experiment_path = args.experiment
+    if args.resume:
+        experiment_path = get_last_active(root)
+        if not experiment_path:
+            print("No experiments found to resume.")
+            sys.exit(1)
+        print(f"Resuming: {experiment_path}")
+
+    if not experiment_path:
+        print("Specify --experiment domain/name or --resume")
+        sys.exit(1)
+
+    experiment_dir = root / experiment_path
+    if not experiment_dir.exists():
+        print(f"Experiment not found: {experiment_dir}")
+        print("Run: python scripts/setup_experiment.py --list")
+        sys.exit(1)
+
+    config = load_config(experiment_dir)
+
+    domain, name = experiment_path.split("/", 1)
+    print(f"\n  autoresearch-agent")
+    print(f"  Experiment: {experiment_path}")
+    print(f"  Target: {config.get('target', '?')}")
+    print(f"  Metric: {config.get('metric', '?')} ({config.get('metric_direction', '?')} is better)")
+    print(f"  Budget: {config.get('time_budget_minutes', '?')} min/experiment")
+    print(f"  Mode: {'loop' if args.loop else 'single'}")
+
+    if args.single or args.dry_run:
+        exp_num = get_experiment_count(experiment_dir) + 1
+        run_single(project_root, experiment_dir, config, exp_num, args.dry_run)
        return

-    if not args.loop and not args.dry_run:
+    if not args.loop:
        print("\nSpecify --loop (forever) or --single (one experiment)")
        sys.exit(1)

-    # Setup graceful shutdown
+    # Graceful shutdown
    def handle_interrupt(sig, frame):
-        print_summary(path)
-        print("\n⏹ Stopped by user.")
+        print_summary(experiment_dir, config)
+        print("\nStopped by user.")
        sys.exit(0)

    signal.signal(signal.SIGINT, handle_interrupt)
    signal.signal(signal.SIGTERM, handle_interrupt)

-    # Main loop
    consecutive_crashes = 0
-    exp_num = get_experiment_count(path) + 1
+    exp_num = get_experiment_count(experiment_dir) + 1

-    print(f"\nStarting loop. Ctrl+C to stop and print summary.\n")
+    print(f"\nStarting loop. Ctrl+C to stop.\n")

    while True:
-        result = run_single_experiment(path, config, exp_num, args.dry_run)
+        result = run_single(project_root, experiment_dir, config, exp_num, False)
        exp_num += 1

        if result == "crash":
@@ -289,21 +352,16 @@ def main():
        else:
            consecutive_crashes = 0

-        # Bail if 5 consecutive crashes
        if consecutive_crashes >= 5:
-            print("\n⚠ 5 consecutive crashes. Pausing for investigation.")
-            print("  Check run.log for the last error.")
+            print("\n  5 consecutive crashes. Pausing.")
+            print("  Check .autoresearch/{}/run.log".format(experiment_path))
            break

-        # Check max experiments
-        if args.max_experiments > 0 and exp_num > args.max_experiments:
-            print(f"\n✓ Reached max experiments ({args.max_experiments})")
+        if 0 < args.max_experiments < exp_num:
+            print(f"\n  Reached max experiments ({args.max_experiments})")
            break

-        if args.single:
-            break
-
-    print_summary(path)
+    print_summary(experiment_dir, config)


 if __name__ == "__main__":