feat: add autoresearch-agent — autonomous experiment loop for ML, prompt, code & skill optimization

Inspired by Karpathy's autoresearch. The agent modifies a target file, runs a fixed evaluation, keeps improvements (git commit), discards failures (git reset), and loops indefinitely — no human in the loop. Includes: - SKILL.md with setup wizard, 4 domain configs, experiment loop protocol - 3 stdlib-only Python scripts (setup, run, log — 687 lines) - Reference docs: experiment domains guide, program.md templates Domains: ML training (val_bpb), prompt engineering (eval_score), code performance (p50_ms), agent skill optimization (pass_rate). Cherry-picked from feat/autoresearch-agent and rebased onto dev. Fixes: timeout inconsistency (2x→2.5x), results.tsv tracking clarity, zero-metric edge case, installation section aligned with multi-tool support.
2026-03-13 07:21:44 +01:00
parent 9cc5d51d4a
commit a799d8bdb8
6 changed files with 1282 additions and 0 deletions
--- a/engineering/autoresearch-agent/scripts/setup_experiment.py
+++ b/engineering/autoresearch-agent/scripts/setup_experiment.py
@@ -0,0 +1,255 @@
+#!/usr/bin/env python3
+"""
+autoresearch-agent: Setup Wizard
+
+Initializes a new research run:
+1. Validates the project structure
+2. Creates a git branch
+3. Runs the baseline experiment
+4. Initializes results.tsv
+
+Usage:
+    python scripts/setup_experiment.py [--config experiment.yaml]
+    python scripts/setup_experiment.py --domain ml|prompt|code|skill
+"""
+
+import argparse
+import os
+import subprocess
+import sys
+import time
+from datetime import datetime
+from pathlib import Path
+
+
+DOMAINS = {
+    "ml": {
+        "target": "train.py",
+        "evaluate_cmd": "uv run train.py",
+        "metric": "val_bpb",
+        "metric_direction": "lower",
+        "time_budget_minutes": 5,
+        "metric_grep": "^val_bpb:",
+    },
+    "prompt": {
+        "target": "prompt.md",
+        "evaluate_cmd": "python evaluate.py",
+        "metric": "eval_score",
+        "metric_direction": "higher",
+        "time_budget_minutes": 2,
+        "metric_grep": "^eval_score:",
+    },
+    "code": {
+        "target": "src/module.py",
+        "evaluate_cmd": "python benchmark.py",
+        "metric": "p50_ms",
+        "metric_direction": "lower",
+        "time_budget_minutes": 10,
+        "metric_grep": "^p50_ms:",
+    },
+    "skill": {
+        "target": "SKILL.md",
+        "evaluate_cmd": "python scripts/skill_evaluator.py",
+        "metric": "pass_rate",
+        "metric_direction": "higher",
+        "time_budget_minutes": 5,
+        "metric_grep": "^pass_rate:",
+    },
+}
+
+
+def run_cmd(cmd, cwd=None, timeout=None):
+    """Run a shell command and return (returncode, stdout, stderr)."""
+    result = subprocess.run(
+        cmd, shell=True, capture_output=True, text=True,
+        cwd=cwd, timeout=timeout
+    )
+    return result.returncode, result.stdout.strip(), result.stderr.strip()
+
+
+def check_git_repo(path):
+    """Verify we're in a git repo."""
+    code, out, err = run_cmd("git rev-parse --is-inside-work-tree", cwd=path)
+    if code != 0:
+        print("✗ Not a git repository. Run: git init && git add . && git commit -m 'initial'")
+        return False
+    print("✓ Git repository found")
+    return True
+
+
+def check_program_md(path):
+    """Check program.md exists and has content."""
+    pm = Path(path) / "program.md"
+    if not pm.exists():
+        print("⚠ program.md not found. Creating template...")
+        return False
+    content = pm.read_text()
+    if len(content) < 100:
+        print("⚠ program.md looks empty. Fill it out before running experiments.")
+        return False
+    print(f"✓ program.md found ({len(content)} chars)")
+    return True
+
+
+def check_target_file(path, target):
+    """Check target file exists."""
+    tf = Path(path) / target
+    if not tf.exists():
+        print(f"✗ Target file not found: {target}")
+        return False
+    print(f"✓ Target file found: {target}")
+    return True
+
+
+def check_evaluate_script(path):
+    """Check evaluate.py exists."""
+    ev = Path(path) / "evaluate.py"
+    if not ev.exists():
+        print("⚠ evaluate.py not found. You need a fixed evaluation function.")
+        print("  Create evaluate.py that outputs: metric_name: <value>")
+        return False
+    print("✓ evaluate.py found")
+    return True
+
+
+def create_branch(path, tag):
+    """Create and checkout the experiment branch."""
+    branch = f"autoresearch/{tag}"
+    code, out, err = run_cmd(f"git checkout -b {branch}", cwd=path)
+    if code != 0:
+        if "already exists" in err:
+            print(f"✗ Branch '{branch}' already exists. Use a different tag.")
+        else:
+            print(f"✗ Failed to create branch: {err}")
+        return None
+    print(f"✓ Created branch: {branch}")
+    return branch
+
+
+def init_results_tsv(path):
+    """Create results.tsv with header."""
+    tsv = Path(path) / "results.tsv"
+    if tsv.exists():
+        print(f"✓ results.tsv already exists ({tsv.stat().st_size} bytes)")
+        return
+    tsv.write_text("commit\tmetric\tstatus\tdescription\n")
+    print("✓ Created results.tsv")
+
+
+def run_baseline(path, evaluate_cmd, metric_grep, time_budget_minutes):
+    """Run the baseline experiment."""
+    print(f"\nRunning baseline experiment (~{time_budget_minutes} min)...")
+    timeout = time_budget_minutes * 60 * 2.5  # 2.5x budget as hard limit
+
+    t0 = time.time()
+    code, out, err = run_cmd(
+        f"{evaluate_cmd} > run.log 2>&1",
+        cwd=path,
+        timeout=timeout
+    )
+    elapsed = time.time() - t0
+
+    if code != 0:
+        print(f"✗ Baseline run failed after {elapsed:.0f}s. Check run.log")
+        return None
+
+    # Extract metric
+    grep_code, grep_out, _ = run_cmd(
+        f"grep '{metric_grep}' run.log | tail -1",
+        cwd=path
+    )
+    if not grep_out:
+        print("✗ Could not extract metric from run.log. Check metric_grep pattern.")
+        return None
+
+    metric_value = grep_out.split(":")[-1].strip()
+    print(f"✓ Baseline complete in {elapsed:.0f}s — metric: {metric_value}")
+    return metric_value
+
+
+def main():
+    parser = argparse.ArgumentParser(description="autoresearch-agent setup")
+    parser.add_argument("--domain", choices=list(DOMAINS.keys()), help="Experiment domain")
+    parser.add_argument("--target", help="Target file to optimize")
+    parser.add_argument("--evaluate-cmd", help="Evaluation command")
+    parser.add_argument("--metric", help="Metric name")
+    parser.add_argument("--direction", choices=["lower", "higher"], default="lower")
+    parser.add_argument("--budget", type=int, default=5, help="Time budget in minutes")
+    parser.add_argument("--tag", help="Run tag (used in branch name)")
+    parser.add_argument("--path", default=".", help="Project root path")
+    parser.add_argument("--skip-baseline", action="store_true")
+    args = parser.parse_args()
+
+    path = Path(args.path).resolve()
+    print(f"\n🔬 autoresearch-agent setup")
+    print(f"   Project: {path}")
+    print(f"   Time: {datetime.now().strftime('%Y-%m-%d %H:%M')}\n")
+
+    # Get config from domain or args
+    if args.domain:
+        config = DOMAINS[args.domain].copy()
+    else:
+        config = {
+            "target": args.target or "target.py",
+            "evaluate_cmd": args.evaluate_cmd or "python evaluate.py",
+            "metric": args.metric or "score",
+            "metric_direction": args.direction,
+            "time_budget_minutes": args.budget,
+            "metric_grep": f"^{args.metric or 'score'}:",
+        }
+
+    tag = args.tag or datetime.now().strftime("%b%d").lower()
+
+    # Validation checks
+    checks = [
+        check_git_repo(path),
+        check_program_md(path),
+        check_target_file(path, config["target"]),
+        check_evaluate_script(path),
+    ]
+
+    if not all(checks):
+        print("\n⚠ Fix the above issues before running experiments.")
+        sys.exit(1)
+
+    # Create branch
+    branch = create_branch(path, tag)
+    if not branch:
+        sys.exit(1)
+
+    # Init results TSV
+    init_results_tsv(path)
+
+    # Save config for run_experiment.py
+    config_content = "\n".join(f"{k}: {v}" for k, v in config.items())
+    (path / ".autoresearch.cfg").write_text(config_content + "\n")
+    print("✓ Saved .autoresearch.cfg")
+
+    # Run baseline
+    if not args.skip_baseline:
+        baseline = run_baseline(
+            path,
+            config["evaluate_cmd"],
+            config["metric_grep"],
+            config["time_budget_minutes"]
+        )
+        if baseline:
+            # Log baseline to TSV
+            code, commit, _ = run_cmd("git rev-parse --short HEAD", cwd=path)
+            with open(path / "results.tsv", "a") as f:
+                f.write(f"{commit}\t{baseline}\tkeep\tbaseline\n")
+            print(f"✓ Baseline logged to results.tsv")
+
+    print(f"\n✅ Setup complete!")
+    print(f"   Branch: {branch}")
+    print(f"   Target: {config['target']}")
+    print(f"   Metric: {config['metric']} ({config['metric_direction']} is better)")
+    print(f"   Budget: {config['time_budget_minutes']} min/experiment")
+    print(f"\nTo start the autonomous loop:")
+    print(f"   python scripts/run_experiment.py --loop")
+    print(f"\nOr run a single experiment:")
+    print(f"   python scripts/run_experiment.py --single")
+
+
+if __name__ == "__main__":
+    main()