Files
claude-skills-reference/engineering/autoresearch-agent/scripts/setup_experiment.py
Leo a799d8bdb8 feat: add autoresearch-agent — autonomous experiment loop for ML, prompt, code & skill optimization
Inspired by Karpathy's autoresearch. The agent modifies a target file, runs a
fixed evaluation, keeps improvements (git commit), discards failures (git reset),
and loops indefinitely — no human in the loop.

Includes:
- SKILL.md with setup wizard, 4 domain configs, experiment loop protocol
- 3 stdlib-only Python scripts (setup, run, log — 687 lines)
- Reference docs: experiment domains guide, program.md templates

Domains: ML training (val_bpb), prompt engineering (eval_score),
code performance (p50_ms), agent skill optimization (pass_rate).

Cherry-picked from feat/autoresearch-agent and rebased onto dev.
Fixes: timeout inconsistency (2x→2.5x), results.tsv tracking clarity,
zero-metric edge case, installation section aligned with multi-tool support.
2026-03-13 07:21:44 +01:00

256 lines
8.0 KiB
Python

#!/usr/bin/env python3
"""
autoresearch-agent: Setup Wizard
Initializes a new research run:
1. Validates the project structure
2. Creates a git branch
3. Runs the baseline experiment
4. Initializes results.tsv
Usage:
python scripts/setup_experiment.py --target FILE --evaluate-cmd CMD --metric NAME [--direction lower|higher] [--budget MINUTES]
python scripts/setup_experiment.py --domain ml|prompt|code|skill
"""
import argparse
import os
import subprocess
import sys
import time
from datetime import datetime
from pathlib import Path
# Built-in experiment presets, keyed by the value accepted by --domain.
# Every preset has the same six keys, in the same order:
#   target               file the experiments modify
#   evaluate_cmd         shell command that runs the evaluation
#   metric               name of the reported metric
#   metric_direction     "lower" or "higher", i.e. which direction is better
#   time_budget_minutes  expected duration of one experiment
#   metric_grep          grep pattern selecting the metric line in run.log
DOMAINS = {
    # ML training
    "ml": {
        "target": "train.py",
        "evaluate_cmd": "uv run train.py",
        "metric": "val_bpb",
        "metric_direction": "lower",
        "time_budget_minutes": 5,
        "metric_grep": "^val_bpb:",
    },
    # Prompt engineering
    "prompt": {
        "target": "prompt.md",
        "evaluate_cmd": "python evaluate.py",
        "metric": "eval_score",
        "metric_direction": "higher",
        "time_budget_minutes": 2,
        "metric_grep": "^eval_score:",
    },
    # Code performance
    "code": {
        "target": "src/module.py",
        "evaluate_cmd": "python benchmark.py",
        "metric": "p50_ms",
        "metric_direction": "lower",
        "time_budget_minutes": 10,
        "metric_grep": "^p50_ms:",
    },
    # Agent skill optimization
    "skill": {
        "target": "SKILL.md",
        "evaluate_cmd": "python scripts/skill_evaluator.py",
        "metric": "pass_rate",
        "metric_direction": "higher",
        "time_budget_minutes": 5,
        "metric_grep": "^pass_rate:",
    },
}
def run_cmd(cmd, cwd=None, timeout=None):
    """Execute ``cmd`` through the shell and collect its outcome.

    Returns a ``(returncode, stdout, stderr)`` tuple in which both output
    streams are decoded text with surrounding whitespace stripped.
    """
    completed = subprocess.run(
        cmd,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True,
        cwd=cwd,
        timeout=timeout,
    )
    stdout_text = completed.stdout.strip()
    stderr_text = completed.stderr.strip()
    return completed.returncode, stdout_text, stderr_text
def check_git_repo(path):
    """Report whether ``path`` is accepted by git as being inside a work tree.

    Prints a status line and returns True when ``git rev-parse`` exits with
    status 0, False otherwise.
    """
    status, _, _ = run_cmd("git rev-parse --is-inside-work-tree", cwd=path)
    inside_repo = status == 0
    if inside_repo:
        print("✓ Git repository found")
    else:
        print("✗ Not a git repository. Run: git init && git add . && git commit -m 'initial'")
    return inside_repo
def check_program_md(path):
    """Check that program.md exists under ``path`` and has real content.

    Args:
        path: Project root directory.

    Returns:
        True when program.md is a regular file holding at least 100
        characters; False (after printing a warning) otherwise.
    """
    pm = Path(path) / "program.md"
    # is_file() rather than exists(): a directory named program.md would
    # otherwise reach read_text() and raise.
    if not pm.is_file():
        # This function creates nothing, so the message must not claim that
        # a template is being written.
        print("⚠ program.md not found. Create it before running experiments.")
        return False
    # Decode explicitly as UTF-8 and tolerate stray bytes: only the length is
    # inspected, so a decoding problem should not abort the setup.
    content = pm.read_text(encoding="utf-8", errors="replace")
    if len(content) < 100:
        print("⚠ program.md looks empty. Fill it out before running experiments.")
        return False
    print(f"✓ program.md found ({len(content)} chars)")
    return True
def check_target_file(path, target):
    """Confirm that ``target`` is present under the project root ``path``.

    Prints a status line and returns True when the path exists, else False.
    """
    found = (Path(path) / target).exists()
    if found:
        print(f"✓ Target file found: {target}")
    else:
        print(f"✗ Target file not found: {target}")
    return found
def check_evaluate_script(path):
    """Confirm that evaluate.py is present under the project root ``path``.

    Prints a status line (plus a hint when the script is missing) and returns
    True when the path exists, else False.
    """
    present = (Path(path) / "evaluate.py").exists()
    if present:
        print("✓ evaluate.py found")
        return True
    for line in (
        "⚠ evaluate.py not found. You need a fixed evaluation function.",
        " Create evaluate.py that outputs: metric_name: <value>",
    ):
        print(line)
    return False
def create_branch(path, tag):
    """Create and check out the experiment branch ``autoresearch/<tag>``.

    Args:
        path: Directory inside the git repository; used as the command's cwd.
        tag: Run tag appended to the ``autoresearch/`` prefix.

    Returns:
        The branch name on success, or None (after printing the reason) when
        git exits with a non-zero status.
    """
    import shlex  # local import: only needed here to quote the branch name

    branch = f"autoresearch/{tag}"
    # Quote the name so a tag containing spaces or shell metacharacters reaches
    # git as a single argument instead of being re-parsed by the shell. Names
    # made only of safe characters are passed through unchanged.
    code, _, err = run_cmd(f"git checkout -b {shlex.quote(branch)}", cwd=path)
    if code != 0:
        if "already exists" in err:
            print(f"✗ Branch '{branch}' already exists. Use a different tag.")
        else:
            print(f"✗ Failed to create branch: {err}")
        return None
    print(f"✓ Created branch: {branch}")
    return branch
def init_results_tsv(path):
    """Ensure results.tsv exists under ``path``.

    A missing file is created containing only the column header; an existing
    file is left untouched and its size is reported.
    """
    results_file = Path(path) / "results.tsv"
    if not results_file.exists():
        results_file.write_text("commit\tmetric\tstatus\tdescription\n")
        print("✓ Created results.tsv")
        return
    size = results_file.stat().st_size
    print(f"✓ results.tsv already exists ({size} bytes)")
def run_baseline(path, evaluate_cmd, metric_grep, time_budget_minutes):
    """Run the evaluation once and read the baseline metric from its log.

    The command's combined stdout/stderr is redirected to ``run.log`` inside
    ``path``; the metric is taken from the last line of that log matching
    ``metric_grep``.

    Args:
        path: Project root; used as the working directory.
        evaluate_cmd: Shell command that runs the evaluation.
        metric_grep: grep pattern selecting the metric line, e.g. ``^val_bpb:``.
        time_budget_minutes: Expected run time; 2.5x this is the hard timeout.

    Returns:
        The metric value as a string (the text after the last ``:`` on the
        matched line), or None when the run fails, times out, or no metric
        value can be found.
    """
    import shlex  # local import: only needed here to quote the grep pattern

    print(f"\nRunning baseline experiment (~{time_budget_minutes} min)...")
    timeout = time_budget_minutes * 60 * 2.5  # 2.5x budget as hard limit
    # monotonic() is unaffected by wall-clock adjustments during a long run.
    t0 = time.monotonic()
    try:
        code, _, _ = run_cmd(
            f"{evaluate_cmd} > run.log 2>&1",
            cwd=path,
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        # Hitting the hard limit is an expected failure mode, not a crash:
        # report it like any other failed baseline.
        elapsed = time.monotonic() - t0
        print(
            f"✗ Baseline run timed out after {elapsed:.0f}s "
            f"(limit {timeout:.0f}s). Check run.log"
        )
        return None
    elapsed = time.monotonic() - t0
    if code != 0:
        print(f"✗ Baseline run failed after {elapsed:.0f}s. Check run.log")
        return None

    # Extract metric. The pattern is shell-quoted so quotes or other special
    # characters in it cannot break the command; -e keeps a pattern starting
    # with "-" from being read as an option.
    _, grep_out, _ = run_cmd(
        f"grep -e {shlex.quote(metric_grep)} run.log | tail -1",
        cwd=path,
    )
    if not grep_out:
        print("✗ Could not extract metric from run.log. Check metric_grep pattern.")
        return None
    metric_value = grep_out.split(":")[-1].strip()
    if not metric_value:
        # A matching line without a value (e.g. "val_bpb:") is not a result.
        print("✗ Metric line in run.log has no value. Check the evaluation output.")
        return None
    print(f"✓ Baseline complete in {elapsed:.0f}s — metric: {metric_value}")
    return metric_value
def _resolve_config(args):
    """Build the experiment config from a domain preset plus CLI overrides."""
    if args.domain:
        config = DOMAINS[args.domain].copy()
    else:
        config = {
            "target": "target.py",
            "evaluate_cmd": "python evaluate.py",
            "metric": "score",
            "metric_direction": "lower",
            "time_budget_minutes": 5,
            "metric_grep": "^score:",
        }
    # Options given explicitly on the command line win over the preset.
    # update() keeps the existing key order, so the saved config layout is
    # unchanged.
    overrides = {
        "target": args.target,
        "evaluate_cmd": args.evaluate_cmd,
        "metric": args.metric,
        "metric_direction": args.direction,
        "time_budget_minutes": args.budget,
    }
    config.update(
        {key: value for key, value in overrides.items() if value not in (None, "")}
    )
    if args.metric:
        # The grep pattern has to follow a renamed metric.
        config["metric_grep"] = f"^{args.metric}:"
    return config


def main():
    """Command-line entry point for the setup wizard.

    Resolves the experiment config, validates the project, creates the
    experiment branch, initializes results.tsv, saves .autoresearch.cfg and,
    unless --skip-baseline is given, runs and logs the baseline.

    Exits with status 1 when validation fails, the branch cannot be created,
    or the baseline run does not produce a metric.
    """
    parser = argparse.ArgumentParser(description="autoresearch-agent setup")
    parser.add_argument("--domain", choices=list(DOMAINS.keys()), help="Experiment domain")
    parser.add_argument("--target", help="Target file to optimize")
    parser.add_argument("--evaluate-cmd", help="Evaluation command")
    parser.add_argument("--metric", help="Metric name")
    # direction/budget default to None so an explicit value can be told apart
    # from "not given"; the effective defaults are applied in _resolve_config.
    parser.add_argument(
        "--direction",
        choices=["lower", "higher"],
        default=None,
        help="Which direction is better (default: lower, or the domain preset)",
    )
    parser.add_argument(
        "--budget",
        type=int,
        default=None,
        help="Time budget in minutes (default: 5, or the domain preset)",
    )
    parser.add_argument("--tag", help="Run tag (used in branch name)")
    parser.add_argument("--path", default=".", help="Project root path")
    parser.add_argument("--skip-baseline", action="store_true")
    args = parser.parse_args()

    path = Path(args.path).resolve()
    print("\n🔬 autoresearch-agent setup")
    print(f" Project: {path}")
    print(f" Time: {datetime.now().strftime('%Y-%m-%d %H:%M')}\n")

    # Get config from domain and/or args
    config = _resolve_config(args)
    tag = args.tag or datetime.now().strftime("%b%d").lower()

    # Validation checks. Every check runs so the user sees all problems at once.
    checks = [
        check_git_repo(path),
        check_program_md(path),
        check_target_file(path, config["target"]),
    ]
    # evaluate.py is only a prerequisite when the evaluation command uses it;
    # several presets evaluate through a different script.
    if "evaluate.py" in config["evaluate_cmd"]:
        checks.append(check_evaluate_script(path))
    if not all(checks):
        print("\n⚠ Fix the above issues before running experiments.")
        sys.exit(1)

    # Create branch
    branch = create_branch(path, tag)
    if not branch:
        sys.exit(1)

    # Init results TSV
    init_results_tsv(path)

    # Save config for run_experiment.py
    config_content = "\n".join(f"{k}: {v}" for k, v in config.items())
    (path / ".autoresearch.cfg").write_text(config_content + "\n")
    print("✓ Saved .autoresearch.cfg")

    # Run baseline
    baseline_ok = True
    if not args.skip_baseline:
        baseline = run_baseline(
            path,
            config["evaluate_cmd"],
            config["metric_grep"],
            config["time_budget_minutes"],
        )
        if baseline:
            # Log baseline to TSV
            _, commit, _ = run_cmd("git rev-parse --short HEAD", cwd=path)
            with open(path / "results.tsv", "a") as f:
                # HEAD cannot be resolved in a repository without commits;
                # keep the column non-empty so the row stays well-formed.
                f.write(f"{commit or 'unknown'}\t{baseline}\tkeep\tbaseline\n")
            print("✓ Baseline logged to results.tsv")
        else:
            baseline_ok = False

    if baseline_ok:
        print("\n✅ Setup complete!")
    else:
        # Do not report success when the requested baseline did not run.
        print("\n⚠ Setup finished, but the baseline run failed.")
    print(f" Branch: {branch}")
    print(f" Target: {config['target']}")
    print(f" Metric: {config['metric']} ({config['metric_direction']} is better)")
    print(f" Budget: {config['time_budget_minutes']} min/experiment")
    if not baseline_ok:
        print("\nFix the evaluation command and record a baseline before starting the loop.")
        sys.exit(1)
    print("\nTo start the autonomous loop:")
    print(" python scripts/run_experiment.py --loop")
    print("\nOr run a single experiment:")
    print(" python scripts/run_experiment.py --single")


if __name__ == "__main__":
    main()