claude-skills-reference/engineering/autoresearch-agent/scripts/run_experiment.py

#!/usr/bin/env python3
"""
autoresearch-agent: Experiment Runner

Executes the autonomous experiment loop:
- Reads .autoresearch.cfg for project config
- Runs the target evaluation
- Keeps improvements (git commit) or discards failures (git reset)
- Logs everything to results.tsv
- Loops indefinitely until interrupted

Usage:
    python scripts/run_experiment.py --loop      # Run forever
    python scripts/run_experiment.py --single    # Run one experiment
    python scripts/run_experiment.py --dry-run   # Show what would happen
"""

import argparse
import os
import signal
import subprocess
import sys
import time
from datetime import datetime
from pathlib import Path


def load_config(path):
    """Load .autoresearch.cfg"""
    cfg_file = Path(path) / ".autoresearch.cfg"
    if not cfg_file.exists():
        print("✗ No .autoresearch.cfg found. Run setup_experiment.py first.")
        sys.exit(1)
    config = {}
    for line in cfg_file.read_text().splitlines():
        if ":" in line:
            k, v = line.split(":", 1)
            config[k.strip()] = v.strip()
    return config


def run_cmd(cmd, cwd=None, timeout=None):
    """Run shell command, return (returncode, stdout, stderr)."""
    result = subprocess.run(
        cmd, shell=True, capture_output=True, text=True,
        cwd=cwd, timeout=timeout
    )
    return result.returncode, result.stdout.strip(), result.stderr.strip()


def get_current_commit(path):
    _, commit, _ = run_cmd("git rev-parse --short HEAD", cwd=path)
    return commit


def get_current_metric(path, metric_grep):
    """Read the last recorded metric from results.tsv."""
    tsv = Path(path) / "results.tsv"
    if not tsv.exists():
        return None
    lines = [l for l in tsv.read_text().splitlines() if "\tkeep\t" in l]
    if not lines:
        return None
    last = lines[-1].split("\t")
    try:
        return float(last[1])
    except (ValueError, IndexError):
        return None


def run_evaluation(path, evaluate_cmd, time_budget_minutes):
    """Run evaluation with time limit."""
    hard_limit = time_budget_minutes * 60 * 2.5  # 2.5x as hard timeout
    t0 = time.time()
    try:
        code, _, _ = run_cmd(
            f"{evaluate_cmd} > run.log 2>&1",
            cwd=path,
            timeout=hard_limit
        )
        elapsed = time.time() - t0
        return code, elapsed
    except subprocess.TimeoutExpired:
        elapsed = time.time() - t0
        return -1, elapsed  # -1 = timeout


def extract_metric(path, metric_grep):
    """Extract metric value from run.log."""
    code, out, _ = run_cmd(
        f"grep '{metric_grep}' run.log | tail -1",
        cwd=path
    )
    if not out:
        return None
    try:
        return float(out.split(":")[-1].strip())
    except ValueError:
        return None


def is_improvement(new_val, old_val, direction):
    """Check if new result is better than old."""
    if old_val is None:
        return True  # First run always "improves"
    if direction == "lower":
        return new_val < old_val
    else:
        return new_val > old_val


def log_result(path, commit, metric_val, status, description):
    """Append result to results.tsv."""
    tsv = Path(path) / "results.tsv"
    metric_str = f"{metric_val:.6f}" if metric_val is not None else "N/A"
    with open(tsv, "a") as f:
        f.write(f"{commit}\t{metric_str}\t{status}\t{description}\n")


def get_experiment_count(path):
    """Count experiments run so far."""
    tsv = Path(path) / "results.tsv"
    if not tsv.exists():
        return 0
    lines = tsv.read_text().splitlines()
    return max(0, len(lines) - 1)  # subtract header


def run_single_experiment(path, config, exp_num, dry_run=False):
    """Run one experiment iteration."""
    direction = config.get("metric_direction", "lower")
    metric_grep = config.get("metric_grep", "^metric:")
    evaluate_cmd = config.get("evaluate_cmd", "python evaluate.py")
    time_budget = int(config.get("time_budget_minutes", 5))
    metric_name = config.get("metric", "metric")

    best_so_far = get_current_metric(path, metric_grep)
    ts = datetime.now().strftime("%H:%M:%S")

    print(f"\n[{ts}] Experiment #{exp_num}")
    print(f"  Best {metric_name} so far: {best_so_far}")

    if dry_run:
        print("  [DRY RUN] Would run evaluation and check metric")
        return "dry_run"

    # Save pre-experiment state for rollback
    code, pre_commit, _ = run_cmd("git rev-parse HEAD", cwd=path)
    if code != 0:
        print("  ✗ Can't get git state. Is this a git repo with commits?")
        return "error"

    # Run evaluation
    print(f"  Running: {evaluate_cmd} (budget: {time_budget} min)")
    ret_code, elapsed = run_evaluation(path, evaluate_cmd, time_budget)

    # Handle timeout
    if ret_code == -1:
        print(f"  ✗ TIMEOUT after {elapsed:.0f}s — discarding")
        run_cmd("git checkout -- .", cwd=path)  # revert uncommitted changes
        # Commit was already made by the agent before evaluation
        run_cmd(f"git reset --hard {pre_commit}", cwd=path)
        curr_commit = get_current_commit(path)
        log_result(path, curr_commit, None, "crash", f"timeout after {elapsed:.0f}s")
        return "crash"

    # Handle non-zero exit
    if ret_code != 0:
        # Check if it crashed
        code, tail, _ = run_cmd("tail -n 5 run.log", cwd=path)
        print(f"  ✗ CRASH (exit {ret_code}) after {elapsed:.0f}s")
        print(f"  Last output: {tail[:200]}")
        run_cmd(f"git reset --hard {pre_commit}", cwd=path)
        curr_commit = get_current_commit(path)
        log_result(path, curr_commit, None, "crash", f"exit_code_{ret_code}")
        return "crash"

    # Extract metric
    metric_val = extract_metric(path, metric_grep)
    if metric_val is None:
        print(f"  ✗ Could not parse metric from run.log")
        run_cmd(f"git reset --hard {pre_commit}", cwd=path)
        curr_commit = get_current_commit(path)
        log_result(path, curr_commit, None, "crash", "metric_parse_failed")
        return "crash"

    curr_commit = get_current_commit(path)
    delta = ""
    if best_so_far is not None:
        diff = metric_val - best_so_far
        delta = f" (Δ{diff:+.4f})"

    print(f"  {metric_name}: {metric_val:.6f}{delta} in {elapsed:.0f}s")

    # Keep or discard
    if is_improvement(metric_val, best_so_far, direction):
        print(f"  ✅ KEEP — improvement confirmed")
        log_result(path, curr_commit, metric_val, "keep",
                   f"improvement_{metric_name}_{metric_val:.4f}")
        return "keep"
    else:
        print(f"  ❌ DISCARD — no improvement")
        run_cmd(f"git reset --hard {pre_commit}", cwd=path)
        curr_commit = get_current_commit(path)
        log_result(path, curr_commit, metric_val, "discard",
                   f"no_improvement_{metric_val:.4f}_vs_{best_so_far:.4f}")
        return "discard"


def print_summary(path):
    """Print experiment summary."""
    tsv = Path(path) / "results.tsv"
    if not tsv.exists():
        return
    lines = tsv.read_text().splitlines()[1:]  # skip header
    if not lines:
        return

    keeps = [l for l in lines if "\tkeep\t" in l]
    discards = [l for l in lines if "\tdiscard\t" in l]
    crashes = [l for l in lines if "\tcrash\t" in l]

    print(f"\n{'='*50}")
    print(f"  Session Summary")
    print(f"  Experiments: {len(lines)} total")
    print(f"  ✅ Keep: {len(keeps)} | ❌ Discard: {len(discards)} | 💥 Crash: {len(crashes)}")

    if keeps:
        try:
            first_metric = float(keeps[0].split("\t")[1])
            last_metric = float(keeps[-1].split("\t")[1])
            direction = "↓" if last_metric < first_metric else "↑"
            print(f"  Best progress: {first_metric:.6f} → {last_metric:.6f} {direction}")
        except (ValueError, IndexError):
            pass
    print(f"{'='*50}\n")


def main():
    parser = argparse.ArgumentParser(description="autoresearch-agent runner")
    parser.add_argument("--loop", action="store_true", help="Run forever")
    parser.add_argument("--single", action="store_true", help="Run one experiment")
    parser.add_argument("--dry-run", action="store_true", help="Dry run only")
    parser.add_argument("--path", default=".", help="Project root")
    parser.add_argument("--max-experiments", type=int, default=0,
                        help="Max experiments (0 = unlimited)")
    args = parser.parse_args()

    path = Path(args.path).resolve()
    config = load_config(path)

    print(f"\n🔬 autoresearch-agent")
    print(f"   Project: {path}")
    print(f"   Target: {config.get('target', '?')}")
    print(f"   Metric: {config.get('metric', '?')} ({config.get('metric_direction', '?')} is better)")
    print(f"   Budget: {config.get('time_budget_minutes', '?')} min/experiment")
    print(f"   Mode: {'loop' if args.loop else 'single'}")

    if args.single:
        exp_num = get_experiment_count(path) + 1
        run_single_experiment(path, config, exp_num, args.dry_run)
        return

    if not args.loop and not args.dry_run:
        print("\nSpecify --loop (forever) or --single (one experiment)")
        sys.exit(1)

    # Setup graceful shutdown
    def handle_interrupt(sig, frame):
        print_summary(path)
        print("\n⏹ Stopped by user.")
        sys.exit(0)

    signal.signal(signal.SIGINT, handle_interrupt)
    signal.signal(signal.SIGTERM, handle_interrupt)

    # Main loop
    consecutive_crashes = 0
    exp_num = get_experiment_count(path) + 1

    print(f"\nStarting loop. Ctrl+C to stop and print summary.\n")

    while True:
        result = run_single_experiment(path, config, exp_num, args.dry_run)
        exp_num += 1

        if result == "crash":
            consecutive_crashes += 1
        else:
            consecutive_crashes = 0

        # Bail if 5 consecutive crashes
        if consecutive_crashes >= 5:
            print("\n⚠ 5 consecutive crashes. Pausing for investigation.")
            print("  Check run.log for the last error.")
            break

        # Check max experiments
        if args.max_experiments > 0 and exp_num > args.max_experiments:
            print(f"\n✓ Reached max experiments ({args.max_experiments})")
            break

        if args.single:
            break

    print_summary(path)


if __name__ == "__main__":
    main()