feat: add autoresearch-agent — autonomous experiment loop for ML, prompt, code & skill optimization

Inspired by Karpathy's autoresearch. The agent modifies a target file, runs a fixed evaluation, keeps improvements (git commit), discards failures (git reset), and loops indefinitely — no human in the loop.

Includes:
- SKILL.md with setup wizard, 4 domain configs, experiment loop protocol
- 3 stdlib-only Python scripts (setup, run, log — 687 lines)
- Reference docs: experiment domains guide, program.md templates

Domains: ML training (val_bpb), prompt engineering (eval_score), code performance (p50_ms), agent skill optimization (pass_rate).

Cherry-picked from feat/autoresearch-agent and rebased onto dev.

Fixes: timeout inconsistency (2x→2.5x), results.tsv tracking clarity, zero-metric edge case, installation section aligned with multi-tool support.
2026-03-13 07:21:44 +01:00
parent 9cc5d51d4a
commit a799d8bdb8
6 changed files with 1282 additions and 0 deletions
--- a/engineering/autoresearch-agent/scripts/log_results.py
+++ b/engineering/autoresearch-agent/scripts/log_results.py
@@ -0,0 +1,126 @@
+#!/usr/bin/env python3
+"""
+autoresearch-agent: Results Logger
+
+View and analyze experiment results from results.tsv.
+
+Usage:
+    python scripts/log_results.py --summary          # Print progress table
+    python scripts/log_results.py --best             # Show best result
+    python scripts/log_results.py --history          # Full experiment history
+    python scripts/log_results.py --record commit val status desc  # Add entry manually
+"""
+
+import argparse
+import sys
+from pathlib import Path
+
+
+def load_results(path):
+    tsv = Path(path) / "results.tsv"
+    if not tsv.exists():
+        return []
+    lines = tsv.read_text().splitlines()[1:]  # skip header
+    results = []
+    for line in lines:
+        parts = line.split("\t")
+        if len(parts) >= 4:
+            try:
+                metric_val = float(parts[1]) if parts[1] != "N/A" else None
+            except ValueError:
+                metric_val = None
+            results.append({
+                "commit": parts[0],
+                "metric": metric_val,
+                "status": parts[2],
+                "description": parts[3]
+            })
+    return results
+
+
+def print_summary(results, metric_name="metric", direction="lower"):
+    if not results:
+        print("No experiments logged yet.")
+        return
+
+    keeps = [r for r in results if r["status"] == "keep"]
+    discards = [r for r in results if r["status"] == "discard"]
+    crashes = [r for r in results if r["status"] == "crash"]
+
+    print(f"\n{'─'*60}")
+    print(f"  autoresearch-agent — Results Summary")
+    print(f"{'─'*60}")
+    print(f"  Total experiments: {len(results)}")
+    print(f"  ✅ Keep:    {len(keeps):3d} ({len(keeps)/max(len(results),1)*100:.0f}%)")
+    print(f"  ❌ Discard: {len(discards):3d} ({len(discards)/max(len(results),1)*100:.0f}%)")
+    print(f"  💥 Crash:   {len(crashes):3d} ({len(crashes)/max(len(results),1)*100:.0f}%)")
+
+    if keeps:
+        valid = [r for r in keeps if r["metric"] is not None]
+        if valid:
+            baseline = valid[0]["metric"]
+            best = min(r["metric"] for r in valid) if direction == "lower" else max(r["metric"] for r in valid)
+            best_run = next(r for r in valid if r["metric"] == best)
+            improvement = ((baseline - best) / baseline * 100) if direction == "lower" else ((best - baseline) / baseline * 100)
+
+            print(f"\n  {metric_name}:")
+            print(f"    Baseline: {baseline:.6f}")
+            print(f"    Best:     {best:.6f}  (commit: {best_run['commit']})")
+            print(f"    Change:   {improvement:+.2f}%")
+
+    print(f"{'─'*60}\n")
+
+
+def print_history(results):
+    if not results:
+        print("No experiments logged yet.")
+        return
+
+    print(f"\n{'COMMIT':8} {'METRIC':10} {'STATUS':8} DESCRIPTION")
+    print("─" * 60)
+    for r in results:
+        metric_str = f"{r['metric']:.6f}" if r['metric'] is not None else "crash   "
+        status_icon = {"keep": "✅", "discard": "❌", "crash": "💥"}.get(r["status"], "?")
+        print(f"{r['commit']:8} {metric_str:10} {status_icon} {r['description'][:40]}")
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--summary", action="store_true")
+    parser.add_argument("--best", action="store_true")
+    parser.add_argument("--history", action="store_true")
+    parser.add_argument("--record", nargs=4, metavar=("COMMIT", "METRIC", "STATUS", "DESC"))
+    parser.add_argument("--path", default=".")
+    parser.add_argument("--metric", default="metric")
+    parser.add_argument("--direction", default="lower", choices=["lower", "higher"])
+    args = parser.parse_args()
+
+    path = Path(args.path).resolve()
+
+    if args.record:
+        commit, metric, status, desc = args.record
+        tsv = path / "results.tsv"
+        if not tsv.exists():
+            tsv.write_text("commit\tmetric\tstatus\tdescription\n")
+        with open(tsv, "a") as f:
+            f.write(f"{commit}\t{metric}\t{status}\t{desc}\n")
+        print(f"✓ Logged: {commit} {metric} {status}")
+        return
+
+    results = load_results(path)
+
+    if args.history:
+        print_history(results)
+    elif args.best:
+        keeps = [r for r in results if r["status"] == "keep" and r["metric"]]
+        if not keeps:
+            print("No successful experiments yet.")
+            return
+        best = min(keeps, key=lambda r: r["metric"]) if args.direction == "lower" else max(keeps, key=lambda r: r["metric"])
+        print(f"Best: {best['metric']:.6f} (commit: {best['commit']}) — {best['description']}")
+    else:
+        print_summary(results, args.metric, args.direction)
+
+
+# Run the CLI only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+    main()
--- a/engineering/autoresearch-agent/scripts/run_experiment.py
+++ b/engineering/autoresearch-agent/scripts/run_experiment.py
@@ -0,0 +1,310 @@
+#!/usr/bin/env python3
+"""
+autoresearch-agent: Experiment Runner
+
+Executes the autonomous experiment loop:
+- Reads .autoresearch.cfg for project config
+- Runs the target evaluation
+- Keeps improvements (git commit) or discards failures (git reset)
+- Logs everything to results.tsv
+- Loops indefinitely until interrupted
+
+Usage:
+    python scripts/run_experiment.py --loop      # Run forever
+    python scripts/run_experiment.py --single    # Run one experiment
+    python scripts/run_experiment.py --dry-run   # Show what would happen
+"""
+
+import argparse
+import os
+import signal
+import subprocess
+import sys
+import time
+from datetime import datetime
+from pathlib import Path
+
+
+def load_config(path):
+    """Load .autoresearch.cfg"""
+    cfg_file = Path(path) / ".autoresearch.cfg"
+    if not cfg_file.exists():
+        print("✗ No .autoresearch.cfg found. Run setup_experiment.py first.")
+        sys.exit(1)
+    config = {}
+    for line in cfg_file.read_text().splitlines():
+        if ":" in line:
+            k, v = line.split(":", 1)
+            config[k.strip()] = v.strip()
+    return config
+
+
+def run_cmd(cmd, cwd=None, timeout=None):
+    """Run shell command, return (returncode, stdout, stderr)."""
+    result = subprocess.run(
+        cmd, shell=True, capture_output=True, text=True,
+        cwd=cwd, timeout=timeout
+    )
+    return result.returncode, result.stdout.strip(), result.stderr.strip()
+
+
+def get_current_commit(path):
+    _, commit, _ = run_cmd("git rev-parse --short HEAD", cwd=path)
+    return commit
+
+
+def get_current_metric(path, metric_grep):
+    """Read the last recorded metric from results.tsv."""
+    tsv = Path(path) / "results.tsv"
+    if not tsv.exists():
+        return None
+    lines = [l for l in tsv.read_text().splitlines() if "\tkeep\t" in l]
+    if not lines:
+        return None
+    last = lines[-1].split("\t")
+    try:
+        return float(last[1])
+    except (ValueError, IndexError):
+        return None
+
+
+def run_evaluation(path, evaluate_cmd, time_budget_minutes):
+    """Run evaluation with time limit."""
+    hard_limit = time_budget_minutes * 60 * 2.5  # 2.5x as hard timeout
+    t0 = time.time()
+    try:
+        code, _, _ = run_cmd(
+            f"{evaluate_cmd} > run.log 2>&1",
+            cwd=path,
+            timeout=hard_limit
+        )
+        elapsed = time.time() - t0
+        return code, elapsed
+    except subprocess.TimeoutExpired:
+        elapsed = time.time() - t0
+        return -1, elapsed  # -1 = timeout
+
+
+def extract_metric(path, metric_grep):
+    """Extract metric value from run.log."""
+    code, out, _ = run_cmd(
+        f"grep '{metric_grep}' run.log | tail -1",
+        cwd=path
+    )
+    if not out:
+        return None
+    try:
+        return float(out.split(":")[-1].strip())
+    except ValueError:
+        return None
+
+
+def is_improvement(new_val, old_val, direction):
+    """Check if new result is better than old."""
+    if old_val is None:
+        return True  # First run always "improves"
+    if direction == "lower":
+        return new_val < old_val
+    else:
+        return new_val > old_val
+
+
+def log_result(path, commit, metric_val, status, description):
+    """Append result to results.tsv."""
+    tsv = Path(path) / "results.tsv"
+    metric_str = f"{metric_val:.6f}" if metric_val is not None else "N/A"
+    with open(tsv, "a") as f:
+        f.write(f"{commit}\t{metric_str}\t{status}\t{description}\n")
+
+
+def get_experiment_count(path):
+    """Count experiments run so far."""
+    tsv = Path(path) / "results.tsv"
+    if not tsv.exists():
+        return 0
+    lines = tsv.read_text().splitlines()
+    return max(0, len(lines) - 1)  # subtract header
+
+
+def run_single_experiment(path, config, exp_num, dry_run=False):
+    """Run one experiment iteration and record its outcome in results.tsv.
+
+    Runs the configured evaluation command in ``path``, reads the metric from
+    run.log, compares it with the metric of the latest "keep" row, and either
+    keeps the current state or hard-resets the working tree.
+
+    Args:
+        path: Project root (git working tree holding results.tsv and run.log).
+        config: Mapping loaded from .autoresearch.cfg; missing keys fall back
+            to the defaults passed to ``config.get`` below.
+        exp_num: Experiment number shown in the progress output.
+        dry_run: When true, print the current best and return without running
+            the evaluation or touching git.
+
+    Returns:
+        One of "dry_run", "error", "crash", "keep" or "discard".
+    """
+    direction = config.get("metric_direction", "lower")
+    metric_grep = config.get("metric_grep", "^metric:")
+    evaluate_cmd = config.get("evaluate_cmd", "python evaluate.py")
+    # Config values are strings; int() raises ValueError on a non-integer budget.
+    time_budget = int(config.get("time_budget_minutes", 5))
+    metric_name = config.get("metric", "metric")
+
+    # Metric of the most recent "keep" row, or None if there is none yet.
+    best_so_far = get_current_metric(path, metric_grep)
+    ts = datetime.now().strftime("%H:%M:%S")
+
+    print(f"\n[{ts}] Experiment #{exp_num}")
+    print(f"  Best {metric_name} so far: {best_so_far}")
+
+    if dry_run:
+        print("  [DRY RUN] Would run evaluation and check metric")
+        return "dry_run"
+
+    # Save pre-experiment state for rollback
+    # NOTE(review): pre_commit is HEAD at the moment this function is called.
+    # If the change under test was already committed before this point (as the
+    # comment in the timeout branch says), resetting to pre_commit only drops
+    # uncommitted edits and leaves that commit on the branch — confirm this
+    # matches the intended discard behavior.
+    code, pre_commit, _ = run_cmd("git rev-parse HEAD", cwd=path)
+    if code != 0:
+        print("  ✗ Can't get git state. Is this a git repo with commits?")
+        return "error"
+
+    # Run evaluation
+    print(f"  Running: {evaluate_cmd} (budget: {time_budget} min)")
+    ret_code, elapsed = run_evaluation(path, evaluate_cmd, time_budget)
+
+    # Handle timeout
+    # run_evaluation reports an exceeded hard limit as return code -1.
+    if ret_code == -1:
+        print(f"  ✗ TIMEOUT after {elapsed:.0f}s — discarding")
+        run_cmd("git checkout -- .", cwd=path)  # revert uncommitted changes
+        # Commit was already made by the agent before evaluation
+        run_cmd(f"git reset --hard {pre_commit}", cwd=path)
+        curr_commit = get_current_commit(path)
+        log_result(path, curr_commit, None, "crash", f"timeout after {elapsed:.0f}s")
+        return "crash"
+
+    # Handle non-zero exit
+    if ret_code != 0:
+        # Check if it crashed
+        # Show the tail of run.log so the failure is visible in the console.
+        code, tail, _ = run_cmd("tail -n 5 run.log", cwd=path)
+        print(f"  ✗ CRASH (exit {ret_code}) after {elapsed:.0f}s")
+        print(f"  Last output: {tail[:200]}")
+        run_cmd(f"git reset --hard {pre_commit}", cwd=path)
+        curr_commit = get_current_commit(path)
+        log_result(path, curr_commit, None, "crash", f"exit_code_{ret_code}")
+        return "crash"
+
+    # Extract metric
+    # A run that exits 0 but prints no parsable metric is also logged as a crash.
+    metric_val = extract_metric(path, metric_grep)
+    if metric_val is None:
+        print(f"  ✗ Could not parse metric from run.log")
+        run_cmd(f"git reset --hard {pre_commit}", cwd=path)
+        curr_commit = get_current_commit(path)
+        log_result(path, curr_commit, None, "crash", "metric_parse_failed")
+        return "crash"
+
+    curr_commit = get_current_commit(path)
+    # Signed difference against the previous best, shown only when one exists.
+    delta = ""
+    if best_so_far is not None:
+        diff = metric_val - best_so_far
+        delta = f" (Δ{diff:+.4f})"
+
+    print(f"  {metric_name}: {metric_val:.6f}{delta} in {elapsed:.0f}s")
+
+    # Keep or discard
+    if is_improvement(metric_val, best_so_far, direction):
+        print(f"  ✅ KEEP — improvement confirmed")
+        log_result(path, curr_commit, metric_val, "keep",
+                   f"improvement_{metric_name}_{metric_val:.4f}")
+        return "keep"
+    else:
+        # best_so_far is never None here: is_improvement returns True for a
+        # missing previous value, so formatting it with :.4f is safe.
+        print(f"  ❌ DISCARD — no improvement")
+        run_cmd(f"git reset --hard {pre_commit}", cwd=path)
+        curr_commit = get_current_commit(path)
+        log_result(path, curr_commit, metric_val, "discard",
+                   f"no_improvement_{metric_val:.4f}_vs_{best_so_far:.4f}")
+        return "discard"
+
+
+def print_summary(path):
+    """Print experiment summary."""
+    tsv = Path(path) / "results.tsv"
+    if not tsv.exists():
+        return
+    lines = tsv.read_text().splitlines()[1:]  # skip header
+    if not lines:
+        return
+
+    keeps = [l for l in lines if "\tkeep\t" in l]
+    discards = [l for l in lines if "\tdiscard\t" in l]
+    crashes = [l for l in lines if "\tcrash\t" in l]
+
+    print(f"\n{'='*50}")
+    print(f"  Session Summary")
+    print(f"  Experiments: {len(lines)} total")
+    print(f"  ✅ Keep: {len(keeps)} | ❌ Discard: {len(discards)} | 💥 Crash: {len(crashes)}")
+
+    if keeps:
+        try:
+            first_metric = float(keeps[0].split("\t")[1])
+            last_metric = float(keeps[-1].split("\t")[1])
+            direction = "↓" if last_metric < first_metric else "↑"
+            print(f"  Best progress: {first_metric:.6f} → {last_metric:.6f} {direction}")
+        except (ValueError, IndexError):
+            pass
+    print(f"{'='*50}\n")
+
+
+def main():
+    parser = argparse.ArgumentParser(description="autoresearch-agent runner")
+    parser.add_argument("--loop", action="store_true", help="Run forever")
+    parser.add_argument("--single", action="store_true", help="Run one experiment")
+    parser.add_argument("--dry-run", action="store_true", help="Dry run only")
+    parser.add_argument("--path", default=".", help="Project root")
+    parser.add_argument("--max-experiments", type=int, default=0,
+                        help="Max experiments (0 = unlimited)")
+    args = parser.parse_args()
+
+    path = Path(args.path).resolve()
+    config = load_config(path)
+
+    print(f"\n🔬 autoresearch-agent")
+    print(f"   Project: {path}")
+    print(f"   Target: {config.get('target', '?')}")
+    print(f"   Metric: {config.get('metric', '?')} ({config.get('metric_direction', '?')} is better)")
+    print(f"   Budget: {config.get('time_budget_minutes', '?')} min/experiment")
+    print(f"   Mode: {'loop' if args.loop else 'single'}")
+
+    if args.single:
+        exp_num = get_experiment_count(path) + 1
+        run_single_experiment(path, config, exp_num, args.dry_run)
+        return
+
+    if not args.loop and not args.dry_run:
+        print("\nSpecify --loop (forever) or --single (one experiment)")
+        sys.exit(1)
+
+    # Setup graceful shutdown
+    def handle_interrupt(sig, frame):
+        print_summary(path)
+        print("\n⏹ Stopped by user.")
+        sys.exit(0)
+
+    signal.signal(signal.SIGINT, handle_interrupt)
+    signal.signal(signal.SIGTERM, handle_interrupt)
+
+    # Main loop
+    consecutive_crashes = 0
+    exp_num = get_experiment_count(path) + 1
+
+    print(f"\nStarting loop. Ctrl+C to stop and print summary.\n")
+
+    while True:
+        result = run_single_experiment(path, config, exp_num, args.dry_run)
+        exp_num += 1
+
+        if result == "crash":
+            consecutive_crashes += 1
+        else:
+            consecutive_crashes = 0
+
+        # Bail if 5 consecutive crashes
+        if consecutive_crashes >= 5:
+            print("\n⚠ 5 consecutive crashes. Pausing for investigation.")
+            print("  Check run.log for the last error.")
+            break
+
+        # Check max experiments
+        if args.max_experiments > 0 and exp_num > args.max_experiments:
+            print(f"\n✓ Reached max experiments ({args.max_experiments})")
+            break
+
+        if args.single:
+            break
+
+    print_summary(path)
+
+
+# Run the CLI only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+    main()
--- a/engineering/autoresearch-agent/scripts/setup_experiment.py
+++ b/engineering/autoresearch-agent/scripts/setup_experiment.py
@@ -0,0 +1,255 @@
+#!/usr/bin/env python3
+"""
+autoresearch-agent: Setup Wizard
+
+Initializes a new research run:
+1. Validates the project structure
+2. Creates a git branch
+3. Runs the baseline experiment
+4. Initializes results.tsv
+
+Usage:
+    python scripts/setup_experiment.py --target FILE --evaluate-cmd CMD --metric NAME
+    python scripts/setup_experiment.py --domain ml|prompt|code|skill
+"""
+
+import argparse
+import os
+import subprocess
+import sys
+import time
+from datetime import datetime
+from pathlib import Path
+
+
+# Built-in presets selectable with --domain. Each preset provides:
+#   target              file the experiments modify
+#   evaluate_cmd        shell command run for the evaluation
+#   metric              display name of the metric
+#   metric_direction    "lower" or "higher" — which way is better
+#   time_budget_minutes per-experiment budget (hard timeout is 2.5x this)
+#   metric_grep         grep pattern that finds the metric line in run.log
+DOMAINS = {
+    # ML training
+    "ml": {
+        "target": "train.py",
+        "evaluate_cmd": "uv run train.py",
+        "metric": "val_bpb",
+        "metric_direction": "lower",
+        "time_budget_minutes": 5,
+        "metric_grep": "^val_bpb:",
+    },
+    # Prompt engineering
+    "prompt": {
+        "target": "prompt.md",
+        "evaluate_cmd": "python evaluate.py",
+        "metric": "eval_score",
+        "metric_direction": "higher",
+        "time_budget_minutes": 2,
+        "metric_grep": "^eval_score:",
+    },
+    # Code performance
+    "code": {
+        "target": "src/module.py",
+        "evaluate_cmd": "python benchmark.py",
+        "metric": "p50_ms",
+        "metric_direction": "lower",
+        "time_budget_minutes": 10,
+        "metric_grep": "^p50_ms:",
+    },
+    # Agent skill optimization
+    "skill": {
+        "target": "SKILL.md",
+        "evaluate_cmd": "python scripts/skill_evaluator.py",
+        "metric": "pass_rate",
+        "metric_direction": "higher",
+        "time_budget_minutes": 5,
+        "metric_grep": "^pass_rate:",
+    },
+}
+
+
+def run_cmd(cmd, cwd=None, timeout=None):
+    """Run a shell command and return (returncode, stdout, stderr)."""
+    result = subprocess.run(
+        cmd, shell=True, capture_output=True, text=True,
+        cwd=cwd, timeout=timeout
+    )
+    return result.returncode, result.stdout.strip(), result.stderr.strip()
+
+
+def check_git_repo(path):
+    """Verify we're in a git repo."""
+    code, out, err = run_cmd("git rev-parse --is-inside-work-tree", cwd=path)
+    if code != 0:
+        print("✗ Not a git repository. Run: git init && git add . && git commit -m 'initial'")
+        return False
+    print("✓ Git repository found")
+    return True
+
+
+def check_program_md(path):
+    """Check program.md exists and has content."""
+    pm = Path(path) / "program.md"
+    if not pm.exists():
+        print("⚠ program.md not found. Creating template...")
+        return False
+    content = pm.read_text()
+    if len(content) < 100:
+        print("⚠ program.md looks empty. Fill it out before running experiments.")
+        return False
+    print(f"✓ program.md found ({len(content)} chars)")
+    return True
+
+
+def check_target_file(path, target):
+    """Check target file exists."""
+    tf = Path(path) / target
+    if not tf.exists():
+        print(f"✗ Target file not found: {target}")
+        return False
+    print(f"✓ Target file found: {target}")
+    return True
+
+
+def check_evaluate_script(path):
+    """Check evaluate.py exists."""
+    ev = Path(path) / "evaluate.py"
+    if not ev.exists():
+        print("⚠ evaluate.py not found. You need a fixed evaluation function.")
+        print("  Create evaluate.py that outputs: metric_name: <value>")
+        return False
+    print("✓ evaluate.py found")
+    return True
+
+
+def create_branch(path, tag):
+    """Create and checkout the experiment branch."""
+    branch = f"autoresearch/{tag}"
+    code, out, err = run_cmd(f"git checkout -b {branch}", cwd=path)
+    if code != 0:
+        if "already exists" in err:
+            print(f"✗ Branch '{branch}' already exists. Use a different tag.")
+        else:
+            print(f"✗ Failed to create branch: {err}")
+        return None
+    print(f"✓ Created branch: {branch}")
+    return branch
+
+
+def init_results_tsv(path):
+    """Create results.tsv with header."""
+    tsv = Path(path) / "results.tsv"
+    if tsv.exists():
+        print(f"✓ results.tsv already exists ({tsv.stat().st_size} bytes)")
+        return
+    tsv.write_text("commit\tmetric\tstatus\tdescription\n")
+    print("✓ Created results.tsv")
+
+
+def run_baseline(path, evaluate_cmd, metric_grep, time_budget_minutes):
+    """Run the baseline experiment."""
+    print(f"\nRunning baseline experiment (~{time_budget_minutes} min)...")
+    timeout = time_budget_minutes * 60 * 2.5  # 2.5x budget as hard limit
+
+    t0 = time.time()
+    code, out, err = run_cmd(
+        f"{evaluate_cmd} > run.log 2>&1",
+        cwd=path,
+        timeout=timeout
+    )
+    elapsed = time.time() - t0
+
+    if code != 0:
+        print(f"✗ Baseline run failed after {elapsed:.0f}s. Check run.log")
+        return None
+
+    # Extract metric
+    grep_code, grep_out, _ = run_cmd(
+        f"grep '{metric_grep}' run.log | tail -1",
+        cwd=path
+    )
+    if not grep_out:
+        print("✗ Could not extract metric from run.log. Check metric_grep pattern.")
+        return None
+
+    metric_value = grep_out.split(":")[-1].strip()
+    print(f"✓ Baseline complete in {elapsed:.0f}s — metric: {metric_value}")
+    return metric_value
+
+
+def main():
+    parser = argparse.ArgumentParser(description="autoresearch-agent setup")
+    parser.add_argument("--domain", choices=list(DOMAINS.keys()), help="Experiment domain")
+    parser.add_argument("--target", help="Target file to optimize")
+    parser.add_argument("--evaluate-cmd", help="Evaluation command")
+    parser.add_argument("--metric", help="Metric name")
+    parser.add_argument("--direction", choices=["lower", "higher"], default="lower")
+    parser.add_argument("--budget", type=int, default=5, help="Time budget in minutes")
+    parser.add_argument("--tag", help="Run tag (used in branch name)")
+    parser.add_argument("--path", default=".", help="Project root path")
+    parser.add_argument("--skip-baseline", action="store_true")
+    args = parser.parse_args()
+
+    path = Path(args.path).resolve()
+    print(f"\n🔬 autoresearch-agent setup")
+    print(f"   Project: {path}")
+    print(f"   Time: {datetime.now().strftime('%Y-%m-%d %H:%M')}\n")
+
+    # Get config from domain or args
+    if args.domain:
+        config = DOMAINS[args.domain].copy()
+    else:
+        config = {
+            "target": args.target or "target.py",
+            "evaluate_cmd": args.evaluate_cmd or "python evaluate.py",
+            "metric": args.metric or "score",
+            "metric_direction": args.direction,
+            "time_budget_minutes": args.budget,
+            "metric_grep": f"^{args.metric or 'score'}:",
+        }
+
+    tag = args.tag or datetime.now().strftime("%b%d").lower()
+
+    # Validation checks
+    checks = [
+        check_git_repo(path),
+        check_program_md(path),
+        check_target_file(path, config["target"]),
+        check_evaluate_script(path),
+    ]
+
+    if not all(checks):
+        print("\n⚠ Fix the above issues before running experiments.")
+        sys.exit(1)
+
+    # Create branch
+    branch = create_branch(path, tag)
+    if not branch:
+        sys.exit(1)
+
+    # Init results TSV
+    init_results_tsv(path)
+
+    # Save config for run_experiment.py
+    config_content = "\n".join(f"{k}: {v}" for k, v in config.items())
+    (path / ".autoresearch.cfg").write_text(config_content + "\n")
+    print("✓ Saved .autoresearch.cfg")
+
+    # Run baseline
+    if not args.skip_baseline:
+        baseline = run_baseline(
+            path,
+            config["evaluate_cmd"],
+            config["metric_grep"],
+            config["time_budget_minutes"]
+        )
+        if baseline:
+            # Log baseline to TSV
+            code, commit, _ = run_cmd("git rev-parse --short HEAD", cwd=path)
+            with open(path / "results.tsv", "a") as f:
+                f.write(f"{commit}\t{baseline}\tkeep\tbaseline\n")
+            print(f"✓ Baseline logged to results.tsv")
+
+    print(f"\n✅ Setup complete!")
+    print(f"   Branch: {branch}")
+    print(f"   Target: {config['target']}")
+    print(f"   Metric: {config['metric']} ({config['metric_direction']} is better)")
+    print(f"   Budget: {config['time_budget_minutes']} min/experiment")
+    print(f"\nTo start the autonomous loop:")
+    print(f"   python scripts/run_experiment.py --loop")
+    print(f"\nOr run a single experiment:")
+    print(f"   python scripts/run_experiment.py --single")
+
+
+# Run the CLI only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+    main()