refactor: autoresearch-agent v2.0 — multi-experiment, multi-domain, real-world evaluators
Major rewrite based on deep study of Karpathy's autoresearch repo.
Architecture changes:
- Multi-experiment support: .autoresearch/{domain}/{name}/ structure
- Domain categories: engineering, marketing, content, prompts, custom
- Project-level (git-tracked, shareable) or user-level (~/.autoresearch/) scope
- User chooses the scope during setup, not at installation (see layout sketch below)
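A freshly set-up experiment produces a layout like this (illustrative sketch; paths and files follow what setup_experiment.py creates):

    .autoresearch/
      config.yaml            # global defaults (budget, scope, dashboard format)
      .gitignore             # keeps results.tsv / run.log out of version control
      engineering/
        api-speed/
          config.cfg         # target, evaluate_cmd, metric, metric_direction, time budget
          program.md         # goal, constraints, and strategy notes for the agent
          results.tsv        # commit / metric / status / description log
          evaluate.py        # optional: built-in evaluator copied in via --evaluator
          run.log            # output of the most recent evaluation run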
New evaluators (8 ready-to-use):
- Free: benchmark_speed, benchmark_size, test_pass_rate, build_speed, memory_usage
- LLM judge (uses existing subscription): llm_judge_content, llm_judge_prompt, llm_judge_copy
- LLM judges call user's CLI tool (claude/codex/gemini) — no extra API keys needed
Script improvements:
- setup_experiment.py: --domain, --scope, --evaluator, --list, --list-evaluators
- run_experiment.py: --experiment domain/name, --resume, --loop, --single
- log_results.py: --dashboard, --domain, --format csv|markdown|terminal, --output
Results export:
- Terminal (default), CSV, and Markdown formats
- Per-experiment, per-domain, or cross-experiment dashboard view
SKILL.md rewritten:
- Clear activation triggers (when the skill should activate)
- Practical examples for each domain
- Evaluator documentation with cost transparency
- Simplified loop protocol matching Karpathy's original philosophy
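For evaluators, the only contract with run_experiment.py is a "metric_name: value" line on stdout that matches metric_grep in config.cfg. A minimal custom evaluator might look like the sketch below (illustrative only — the timed function and numbers are hypothetical, not part of this commit):

    #!/usr/bin/env python3
    # Sketch of a custom evaluator. run_experiment.py greps run.log for the
    # configured metric line and parses the float after the colon.
    import statistics
    import time

    def call_target():
        # Stand-in for the code path being measured (hypothetical).
        time.sleep(0.01)

    def main():
        samples_ms = []
        for _ in range(50):
            t0 = time.perf_counter()
            call_target()
            samples_ms.append((time.perf_counter() - t0) * 1000)
        # Must match the `metric` field in config.cfg (metric_grep defaults to "^p50_ms:").
        print(f"p50_ms: {statistics.median(samples_ms):.3f}")

    if __name__ == "__main__":
        main()

The bundled evaluators follow the same output convention.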
scripts/log_results.py
@@ -1,125 +1,389 @@
 #!/usr/bin/env python3
 """
-autoresearch-agent: Results Logger
+autoresearch-agent: Results Viewer
 
-View and analyze experiment results from results.tsv.
+View experiment results in multiple formats: terminal, CSV, Markdown.
+Supports single experiment, domain, or cross-experiment dashboard.
 
 Usage:
-    python scripts/log_results.py --summary                        # Print progress table
-    python scripts/log_results.py --best                           # Show best result
-    python scripts/log_results.py --history                        # Full experiment history
-    python scripts/log_results.py --record commit val status desc  # Add entry manually
+    python scripts/log_results.py --experiment engineering/api-speed
+    python scripts/log_results.py --domain engineering
+    python scripts/log_results.py --dashboard
+    python scripts/log_results.py --experiment engineering/api-speed --format csv --output results.csv
+    python scripts/log_results.py --experiment engineering/api-speed --format markdown --output results.md
+    python scripts/log_results.py --dashboard --format markdown --output dashboard.md
 """
 
 import argparse
+import csv
+import io
 import sys
 from pathlib import Path
|
||||
|
||||
def load_results(path):
|
||||
tsv = Path(path) / "results.tsv"
|
||||
def find_autoresearch_root():
|
||||
"""Find .autoresearch/ in project or user home."""
|
||||
project_root = Path(".").resolve() / ".autoresearch"
|
||||
if project_root.exists():
|
||||
return project_root
|
||||
user_root = Path.home() / ".autoresearch"
|
||||
if user_root.exists():
|
||||
return user_root
|
||||
return None
|
||||
|
||||
|
||||
def load_config(experiment_dir):
|
||||
"""Load config.cfg."""
|
||||
cfg_file = experiment_dir / "config.cfg"
|
||||
config = {}
|
||||
if cfg_file.exists():
|
||||
for line in cfg_file.read_text().splitlines():
|
||||
if ":" in line:
|
||||
k, v = line.split(":", 1)
|
||||
config[k.strip()] = v.strip()
|
||||
return config
|
||||
|
||||
|
||||
def load_results(experiment_dir):
|
||||
"""Load results.tsv into list of dicts."""
|
||||
tsv = experiment_dir / "results.tsv"
|
||||
if not tsv.exists():
|
||||
return []
|
||||
lines = tsv.read_text().splitlines()[1:] # skip header
|
||||
results = []
|
||||
for line in lines:
|
||||
for line in tsv.read_text().splitlines()[1:]:
|
||||
parts = line.split("\t")
|
||||
if len(parts) >= 4:
|
||||
try:
|
||||
metric_val = float(parts[1]) if parts[1] != "N/A" else None
|
||||
metric = float(parts[1]) if parts[1] != "N/A" else None
|
||||
except ValueError:
|
||||
metric_val = None
|
||||
metric = None
|
||||
results.append({
|
||||
"commit": parts[0],
|
||||
"metric": metric_val,
|
||||
"metric": metric,
|
||||
"status": parts[2],
|
||||
"description": parts[3]
|
||||
"description": parts[3],
|
||||
})
|
||||
return results
|
||||
|
||||
|
||||
def print_summary(results, metric_name="metric", direction="lower"):
|
||||
if not results:
|
||||
print("No experiments logged yet.")
|
||||
return
|
||||
|
||||
def compute_stats(results, direction):
|
||||
"""Compute statistics from results."""
|
||||
keeps = [r for r in results if r["status"] == "keep"]
|
||||
discards = [r for r in results if r["status"] == "discard"]
|
||||
crashes = [r for r in results if r["status"] == "crash"]
|
||||
|
||||
print(f"\n{'─'*60}")
|
||||
print(f" autoresearch-agent — Results Summary")
|
||||
print(f"{'─'*60}")
|
||||
print(f" Total experiments: {len(results)}")
|
||||
print(f" ✅ Keep: {len(keeps):3d} ({len(keeps)/max(len(results),1)*100:.0f}%)")
|
||||
print(f" ❌ Discard: {len(discards):3d} ({len(discards)/max(len(results),1)*100:.0f}%)")
|
||||
print(f" 💥 Crash: {len(crashes):3d} ({len(crashes)/max(len(results),1)*100:.0f}%)")
|
||||
valid_keeps = [r for r in keeps if r["metric"] is not None]
|
||||
baseline = valid_keeps[0]["metric"] if valid_keeps else None
|
||||
if valid_keeps:
|
||||
best = min(r["metric"] for r in valid_keeps) if direction == "lower" else max(r["metric"] for r in valid_keeps)
|
||||
else:
|
||||
best = None
|
||||
|
||||
if keeps:
|
||||
valid = [r for r in keeps if r["metric"] is not None]
|
||||
if valid:
|
||||
baseline = valid[0]["metric"]
|
||||
best = min(r["metric"] for r in valid) if direction == "lower" else max(r["metric"] for r in valid)
|
||||
best_run = next(r for r in valid if r["metric"] == best)
|
||||
improvement = ((baseline - best) / baseline * 100) if direction == "lower" else ((best - baseline) / baseline * 100)
|
||||
pct_change = None
|
||||
if baseline and best and baseline != 0:
|
||||
if direction == "lower":
|
||||
pct_change = (baseline - best) / baseline * 100
|
||||
else:
|
||||
pct_change = (best - baseline) / baseline * 100
|
||||
|
||||
print(f"\n {metric_name}:")
|
||||
print(f" Baseline: {baseline:.6f}")
|
||||
print(f" Best: {best:.6f} (commit: {best_run['commit']})")
|
||||
print(f" Change: {improvement:+.2f}%")
|
||||
|
||||
print(f"{'─'*60}\n")
|
||||
return {
|
||||
"total": len(results),
|
||||
"keeps": len(keeps),
|
||||
"discards": len(discards),
|
||||
"crashes": len(crashes),
|
||||
"baseline": baseline,
|
||||
"best": best,
|
||||
"pct_change": pct_change,
|
||||
}
|
||||
|
||||
|
||||
def print_history(results):
|
||||
# --- Terminal Output ---
|
||||
|
||||
def print_experiment(experiment_dir, experiment_path):
|
||||
"""Print single experiment results to terminal."""
|
||||
config = load_config(experiment_dir)
|
||||
results = load_results(experiment_dir)
|
||||
direction = config.get("metric_direction", "lower")
|
||||
metric_name = config.get("metric", "metric")
|
||||
|
||||
if not results:
|
||||
print("No experiments logged yet.")
|
||||
print(f"No results for {experiment_path}")
|
||||
return
|
||||
|
||||
print(f"\n{'COMMIT':8} {'METRIC':10} {'STATUS':8} DESCRIPTION")
|
||||
print("─" * 60)
|
||||
for r in results:
|
||||
metric_str = f"{r['metric']:.6f}" if r['metric'] is not None else "crash "
|
||||
status_icon = {"keep": "✅", "discard": "❌", "crash": "💥"}.get(r["status"], "?")
|
||||
print(f"{r['commit']:8} {metric_str:10} {status_icon} {r['description'][:40]}")
|
||||
stats = compute_stats(results, direction)
|
||||
|
||||
print(f"\n{'─' * 65}")
|
||||
print(f" {experiment_path}")
|
||||
print(f" Target: {config.get('target', '?')} | Metric: {metric_name} ({direction})")
|
||||
print(f"{'─' * 65}")
|
||||
print(f" Total: {stats['total']} | Keep: {stats['keeps']} | Discard: {stats['discards']} | Crash: {stats['crashes']}")
|
||||
|
||||
if stats["baseline"] is not None and stats["best"] is not None:
|
||||
pct = f" ({stats['pct_change']:+.1f}%)" if stats["pct_change"] is not None else ""
|
||||
print(f" Baseline: {stats['baseline']:.6f} -> Best: {stats['best']:.6f}{pct}")
|
||||
|
||||
print(f"\n {'COMMIT':<10} {'METRIC':>12} {'STATUS':<10} DESCRIPTION")
|
||||
print(f" {'─' * 60}")
|
||||
for r in results:
|
||||
m = f"{r['metric']:.6f}" if r["metric"] is not None else "N/A "
|
||||
icon = {"keep": "+", "discard": "-", "crash": "!"}.get(r["status"], "?")
|
||||
print(f" {r['commit']:<10} {m:>12} {icon} {r['status']:<7} {r['description'][:35]}")
|
||||
print()
|
||||
|
||||
|
||||
def print_dashboard(root):
|
||||
"""Print cross-experiment dashboard."""
|
||||
experiments = []
|
||||
for domain_dir in sorted(root.iterdir()):
|
||||
if not domain_dir.is_dir() or domain_dir.name.startswith("."):
|
||||
continue
|
||||
for exp_dir in sorted(domain_dir.iterdir()):
|
||||
if not exp_dir.is_dir() or not (exp_dir / "config.cfg").exists():
|
||||
continue
|
||||
config = load_config(exp_dir)
|
||||
results = load_results(exp_dir)
|
||||
direction = config.get("metric_direction", "lower")
|
||||
stats = compute_stats(results, direction)
|
||||
|
||||
# Determine status
|
||||
status = "idle"
|
||||
if stats["total"] > 0:
|
||||
tsv = exp_dir / "results.tsv"
|
||||
if tsv.exists():
|
||||
import time
|
||||
age_hours = (time.time() - tsv.stat().st_mtime) / 3600
|
||||
status = "active" if age_hours < 1 else "paused" if age_hours < 24 else "done"
|
||||
|
||||
best_str = f"{stats['best']:.4f}" if stats["best"] is not None else "—"
|
||||
pct_str = f"{stats['pct_change']:+.1f}%" if stats["pct_change"] is not None else "—"
|
||||
|
||||
experiments.append({
|
||||
"domain": domain_dir.name,
|
||||
"name": exp_dir.name,
|
||||
"runs": stats["total"],
|
||||
"kept": stats["keeps"],
|
||||
"best": best_str,
|
||||
"change": pct_str,
|
||||
"status": status,
|
||||
"metric": config.get("metric", "?"),
|
||||
})
|
||||
|
||||
if not experiments:
|
||||
print("No experiments found.")
|
||||
return experiments
|
||||
|
||||
print(f"\n{'─' * 90}")
|
||||
print(f" autoresearch — Dashboard")
|
||||
print(f"{'─' * 90}")
|
||||
print(f" {'DOMAIN':<15} {'EXPERIMENT':<20} {'RUNS':>5} {'KEPT':>5} {'BEST':>12} {'CHANGE':>10} {'STATUS':<8}")
|
||||
print(f" {'─' * 85}")
|
||||
for e in experiments:
|
||||
print(f" {e['domain']:<15} {e['name']:<20} {e['runs']:>5} {e['kept']:>5} {e['best']:>12} {e['change']:>10} {e['status']:<8}")
|
||||
print()
|
||||
return experiments
|
||||
|
||||
|
||||
# --- CSV Export ---
|
||||
|
||||
def export_experiment_csv(experiment_dir, experiment_path):
|
||||
"""Export single experiment as CSV string."""
|
||||
config = load_config(experiment_dir)
|
||||
results = load_results(experiment_dir)
|
||||
direction = config.get("metric_direction", "lower")
|
||||
stats = compute_stats(results, direction)
|
||||
|
||||
buf = io.StringIO()
|
||||
writer = csv.writer(buf)
|
||||
|
||||
# Header with metadata
|
||||
writer.writerow(["# Experiment", experiment_path])
|
||||
writer.writerow(["# Target", config.get("target", "")])
|
||||
writer.writerow(["# Metric", f"{config.get('metric', '')} ({direction} is better)"])
|
||||
if stats["baseline"] is not None:
|
||||
writer.writerow(["# Baseline", f"{stats['baseline']:.6f}"])
|
||||
if stats["best"] is not None:
|
||||
pct = f" ({stats['pct_change']:+.1f}%)" if stats["pct_change"] else ""
|
||||
writer.writerow(["# Best", f"{stats['best']:.6f}{pct}"])
|
||||
writer.writerow(["# Total", stats["total"]])
|
||||
writer.writerow(["# Keep/Discard/Crash", f"{stats['keeps']}/{stats['discards']}/{stats['crashes']}"])
|
||||
writer.writerow([])
|
||||
|
||||
writer.writerow(["Commit", "Metric", "Status", "Description"])
|
||||
for r in results:
|
||||
m = f"{r['metric']:.6f}" if r["metric"] is not None else "N/A"
|
||||
writer.writerow([r["commit"], m, r["status"], r["description"]])
|
||||
|
||||
return buf.getvalue()
|
||||
|
||||
|
||||
def export_dashboard_csv(root):
|
||||
"""Export dashboard as CSV string."""
|
||||
experiments = []
|
||||
for domain_dir in sorted(root.iterdir()):
|
||||
if not domain_dir.is_dir() or domain_dir.name.startswith("."):
|
||||
continue
|
||||
for exp_dir in sorted(domain_dir.iterdir()):
|
||||
if not exp_dir.is_dir() or not (exp_dir / "config.cfg").exists():
|
||||
continue
|
||||
config = load_config(exp_dir)
|
||||
results = load_results(exp_dir)
|
||||
direction = config.get("metric_direction", "lower")
|
||||
stats = compute_stats(results, direction)
|
||||
best_str = f"{stats['best']:.6f}" if stats["best"] else ""
|
||||
pct_str = f"{stats['pct_change']:+.1f}%" if stats["pct_change"] else ""
|
||||
experiments.append([
|
||||
domain_dir.name, exp_dir.name, config.get("metric", ""),
|
||||
stats["total"], stats["keeps"], stats["discards"], stats["crashes"],
|
||||
best_str, pct_str
|
||||
])
|
||||
|
||||
buf = io.StringIO()
|
||||
writer = csv.writer(buf)
|
||||
writer.writerow(["Domain", "Experiment", "Metric", "Runs", "Kept", "Discarded", "Crashed", "Best", "Change"])
|
||||
for e in experiments:
|
||||
writer.writerow(e)
|
||||
return buf.getvalue()
|
||||
|
||||
|
||||
# --- Markdown Export ---
|
||||
|
||||
def export_experiment_markdown(experiment_dir, experiment_path):
|
||||
"""Export single experiment as Markdown string."""
|
||||
config = load_config(experiment_dir)
|
||||
results = load_results(experiment_dir)
|
||||
direction = config.get("metric_direction", "lower")
|
||||
metric_name = config.get("metric", "metric")
|
||||
stats = compute_stats(results, direction)
|
||||
|
||||
lines = []
|
||||
lines.append(f"# Autoresearch: {experiment_path}\n")
|
||||
lines.append(f"**Target:** `{config.get('target', '?')}` ")
|
||||
lines.append(f"**Metric:** `{metric_name}` ({direction} is better) ")
|
||||
lines.append(f"**Experiments:** {stats['total']} total — {stats['keeps']} kept, {stats['discards']} discarded, {stats['crashes']} crashed\n")
|
||||
|
||||
if stats["baseline"] is not None and stats["best"] is not None:
|
||||
pct = f" ({stats['pct_change']:+.1f}%)" if stats["pct_change"] else ""
|
||||
lines.append(f"**Progress:** `{stats['baseline']:.6f}` → `{stats['best']:.6f}`{pct}\n")
|
||||
|
||||
lines.append(f"| Commit | Metric | Status | Description |")
|
||||
lines.append(f"|--------|--------|--------|-------------|")
|
||||
for r in results:
|
||||
m = f"`{r['metric']:.6f}`" if r["metric"] is not None else "N/A"
|
||||
lines.append(f"| `{r['commit']}` | {m} | {r['status']} | {r['description']} |")
|
||||
lines.append("")
|
||||
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def export_dashboard_markdown(root):
|
||||
"""Export dashboard as Markdown string."""
|
||||
lines = []
|
||||
lines.append("# Autoresearch Dashboard\n")
|
||||
lines.append("| Domain | Experiment | Metric | Runs | Kept | Best | Change | Status |")
|
||||
lines.append("|--------|-----------|--------|------|------|------|--------|--------|")
|
||||
|
||||
for domain_dir in sorted(root.iterdir()):
|
||||
if not domain_dir.is_dir() or domain_dir.name.startswith("."):
|
||||
continue
|
||||
for exp_dir in sorted(domain_dir.iterdir()):
|
||||
if not exp_dir.is_dir() or not (exp_dir / "config.cfg").exists():
|
||||
continue
|
||||
config = load_config(exp_dir)
|
||||
results = load_results(exp_dir)
|
||||
direction = config.get("metric_direction", "lower")
|
||||
stats = compute_stats(results, direction)
|
||||
best = f"`{stats['best']:.4f}`" if stats["best"] else "—"
|
||||
pct = f"{stats['pct_change']:+.1f}%" if stats["pct_change"] else "—"
|
||||
|
||||
import time
|
||||
tsv = exp_dir / "results.tsv"
|
||||
status = "idle"
|
||||
if tsv.exists() and stats["total"] > 0:
|
||||
age_h = (time.time() - tsv.stat().st_mtime) / 3600
|
||||
status = "active" if age_h < 1 else "paused" if age_h < 24 else "done"
|
||||
|
||||
lines.append(f"| {domain_dir.name} | {exp_dir.name} | {config.get('metric', '?')} | {stats['total']} | {stats['keeps']} | {best} | {pct} | {status} |")
|
||||
|
||||
lines.append("")
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
# --- Main ---
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--summary", action="store_true")
|
||||
parser.add_argument("--best", action="store_true")
|
||||
parser.add_argument("--history", action="store_true")
|
||||
parser.add_argument("--record", nargs=4, metavar=("COMMIT", "METRIC", "STATUS", "DESC"))
|
||||
parser.add_argument("--path", default=".")
|
||||
parser.add_argument("--metric", default="metric")
|
||||
parser.add_argument("--direction", default="lower", choices=["lower", "higher"])
|
||||
parser = argparse.ArgumentParser(description="autoresearch-agent results viewer")
|
||||
parser.add_argument("--experiment", help="Show one experiment: domain/name")
|
||||
parser.add_argument("--domain", help="Show all experiments in a domain")
|
||||
parser.add_argument("--dashboard", action="store_true", help="Cross-experiment dashboard")
|
||||
parser.add_argument("--format", choices=["terminal", "csv", "markdown"], default="terminal",
|
||||
help="Output format (default: terminal)")
|
||||
parser.add_argument("--output", "-o", help="Write to file instead of stdout")
|
||||
parser.add_argument("--all", action="store_true", help="Show all experiments (alias for --dashboard)")
|
||||
args = parser.parse_args()
|
||||
|
||||
path = Path(args.path).resolve()
|
||||
root = find_autoresearch_root()
|
||||
if root is None:
|
||||
print("No .autoresearch/ found. Run setup_experiment.py first.")
|
||||
sys.exit(1)
|
||||
|
||||
if args.record:
|
||||
commit, metric, status, desc = args.record
|
||||
tsv = path / "results.tsv"
|
||||
if not tsv.exists():
|
||||
tsv.write_text("commit\tmetric\tstatus\tdescription\n")
|
||||
with open(tsv, "a") as f:
|
||||
f.write(f"{commit}\t{metric}\t{status}\t{desc}\n")
|
||||
print(f"✓ Logged: {commit} {metric} {status}")
|
||||
return
|
||||
output_text = None
|
||||
|
||||
results = load_results(path)
|
||||
# Single experiment
|
||||
if args.experiment:
|
||||
experiment_dir = root / args.experiment
|
||||
if not experiment_dir.exists():
|
||||
print(f"Experiment not found: {args.experiment}")
|
||||
sys.exit(1)
|
||||
|
||||
if args.history:
|
||||
print_history(results)
|
||||
elif args.best:
|
||||
keeps = [r for r in results if r["status"] == "keep" and r["metric"]]
|
||||
if not keeps:
|
||||
print("No successful experiments yet.")
|
||||
if args.format == "csv":
|
||||
output_text = export_experiment_csv(experiment_dir, args.experiment)
|
||||
elif args.format == "markdown":
|
||||
output_text = export_experiment_markdown(experiment_dir, args.experiment)
|
||||
else:
|
||||
print_experiment(experiment_dir, args.experiment)
|
||||
return
|
||||
best = min(keeps, key=lambda r: r["metric"]) if args.direction == "lower" else max(keeps, key=lambda r: r["metric"])
|
||||
print(f"Best: {best['metric']:.6f} (commit: {best['commit']}) — {best['description']}")
|
||||
|
||||
# Domain
|
||||
elif args.domain:
|
||||
domain_dir = root / args.domain
|
||||
if not domain_dir.exists():
|
||||
print(f"Domain not found: {args.domain}")
|
||||
sys.exit(1)
|
||||
for exp_dir in sorted(domain_dir.iterdir()):
|
||||
if exp_dir.is_dir() and (exp_dir / "config.cfg").exists():
|
||||
if args.format == "terminal":
|
||||
print_experiment(exp_dir, f"{args.domain}/{exp_dir.name}")
|
||||
# For CSV/MD, fall through to dashboard with domain filter
|
||||
if args.format != "terminal":
|
||||
# Use dashboard export filtered to domain
|
||||
output_text = export_dashboard_csv(root) if args.format == "csv" else export_dashboard_markdown(root)
|
||||
else:
|
||||
return
|
||||
|
||||
# Dashboard
|
||||
elif args.dashboard or args.all:
|
||||
if args.format == "csv":
|
||||
output_text = export_dashboard_csv(root)
|
||||
elif args.format == "markdown":
|
||||
output_text = export_dashboard_markdown(root)
|
||||
else:
|
||||
print_dashboard(root)
|
||||
return
|
||||
|
||||
else:
|
||||
print_summary(results, args.metric, args.direction)
|
||||
# Default: dashboard
|
||||
if args.format == "terminal":
|
||||
print_dashboard(root)
|
||||
return
|
||||
output_text = export_dashboard_csv(root) if args.format == "csv" else export_dashboard_markdown(root)
|
||||
|
||||
# Write output
|
||||
if output_text:
|
||||
if args.output:
|
||||
Path(args.output).write_text(output_text)
|
||||
print(f"Written to {args.output}")
|
||||
else:
|
||||
print(output_text)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
scripts/run_experiment.py
@@ -2,17 +2,15 @@
 """
 autoresearch-agent: Experiment Runner
 
-Executes the autonomous experiment loop:
-- Reads .autoresearch.cfg for project config
-- Runs the target evaluation
-- Keeps improvements (git commit) or discards failures (git reset)
-- Logs everything to results.tsv
-- Loops indefinitely until interrupted
+Executes the autonomous experiment loop for a specific experiment.
+Reads config from .autoresearch/{domain}/{name}/config.cfg.
 
 Usage:
-    python scripts/run_experiment.py --loop       # Run forever
-    python scripts/run_experiment.py --single     # Run one experiment
-    python scripts/run_experiment.py --dry-run    # Show what would happen
+    python scripts/run_experiment.py --experiment engineering/api-speed --loop
+    python scripts/run_experiment.py --experiment engineering/api-speed --single
+    python scripts/run_experiment.py --experiment marketing/medium-ctr --loop
+    python scripts/run_experiment.py --resume --loop
+    python scripts/run_experiment.py --experiment engineering/api-speed --dry-run
 """
 
 import argparse
|
||||
@@ -25,11 +23,22 @@ from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def load_config(path):
|
||||
"""Load .autoresearch.cfg"""
|
||||
cfg_file = Path(path) / ".autoresearch.cfg"
|
||||
def find_autoresearch_root():
|
||||
"""Find .autoresearch/ in project or user home."""
|
||||
project_root = Path(".").resolve() / ".autoresearch"
|
||||
if project_root.exists():
|
||||
return project_root
|
||||
user_root = Path.home() / ".autoresearch"
|
||||
if user_root.exists():
|
||||
return user_root
|
||||
return None
|
||||
|
||||
|
||||
def load_config(experiment_dir):
|
||||
"""Load config.cfg from experiment directory."""
|
||||
cfg_file = experiment_dir / "config.cfg"
|
||||
if not cfg_file.exists():
|
||||
print("✗ No .autoresearch.cfg found. Run setup_experiment.py first.")
|
||||
print(f" Error: no config.cfg in {experiment_dir}")
|
||||
sys.exit(1)
|
||||
config = {}
|
||||
for line in cfg_file.read_text().splitlines():
|
||||
@@ -49,239 +58,293 @@ def run_cmd(cmd, cwd=None, timeout=None):
|
||||
|
||||
|
||||
def get_current_commit(path):
|
||||
"""Get short hash of current HEAD."""
|
||||
_, commit, _ = run_cmd("git rev-parse --short HEAD", cwd=path)
|
||||
return commit
|
||||
|
||||
|
||||
def get_current_metric(path, metric_grep):
|
||||
"""Read the last recorded metric from results.tsv."""
|
||||
tsv = Path(path) / "results.tsv"
|
||||
def get_best_metric(experiment_dir, direction):
|
||||
"""Read the best metric from results.tsv."""
|
||||
tsv = experiment_dir / "results.tsv"
|
||||
if not tsv.exists():
|
||||
return None
|
||||
lines = [l for l in tsv.read_text().splitlines() if "\tkeep\t" in l]
|
||||
lines = [l for l in tsv.read_text().splitlines()[1:] if "\tkeep\t" in l]
|
||||
if not lines:
|
||||
return None
|
||||
last = lines[-1].split("\t")
|
||||
try:
|
||||
return float(last[1])
|
||||
except (ValueError, IndexError):
|
||||
metrics = []
|
||||
for line in lines:
|
||||
parts = line.split("\t")
|
||||
try:
|
||||
if parts[1] != "N/A":
|
||||
metrics.append(float(parts[1]))
|
||||
except (ValueError, IndexError):
|
||||
continue
|
||||
if not metrics:
|
||||
return None
|
||||
return min(metrics) if direction == "lower" else max(metrics)
|
||||
|
||||
|
||||
def run_evaluation(path, evaluate_cmd, time_budget_minutes):
|
||||
"""Run evaluation with time limit."""
|
||||
hard_limit = time_budget_minutes * 60 * 2.5 # 2.5x as hard timeout
|
||||
def run_evaluation(project_root, eval_cmd, time_budget_minutes, log_file):
|
||||
"""Run evaluation with time limit. Output goes to log_file."""
|
||||
hard_limit = time_budget_minutes * 60 * 2.5
|
||||
t0 = time.time()
|
||||
try:
|
||||
code, _, _ = run_cmd(
|
||||
f"{evaluate_cmd} > run.log 2>&1",
|
||||
cwd=path,
|
||||
f"{eval_cmd} > {log_file} 2>&1",
|
||||
cwd=str(project_root),
|
||||
timeout=hard_limit
|
||||
)
|
||||
elapsed = time.time() - t0
|
||||
return code, elapsed
|
||||
except subprocess.TimeoutExpired:
|
||||
elapsed = time.time() - t0
|
||||
return -1, elapsed # -1 = timeout
|
||||
return -1, elapsed
|
||||
|
||||
|
||||
def extract_metric(path, metric_grep):
|
||||
"""Extract metric value from run.log."""
|
||||
code, out, _ = run_cmd(
|
||||
f"grep '{metric_grep}' run.log | tail -1",
|
||||
cwd=path
|
||||
)
|
||||
if not out:
|
||||
return None
|
||||
try:
|
||||
return float(out.split(":")[-1].strip())
|
||||
except ValueError:
|
||||
def extract_metric(log_file, metric_grep):
|
||||
"""Extract metric value from log file."""
|
||||
log_path = Path(log_file)
|
||||
if not log_path.exists():
|
||||
return None
|
||||
for line in reversed(log_path.read_text().splitlines()):
|
||||
stripped = line.strip()
|
||||
if stripped.startswith(metric_grep.lstrip("^")):
|
||||
try:
|
||||
return float(stripped.split(":")[-1].strip())
|
||||
except ValueError:
|
||||
continue
|
||||
return None
|
||||
|
||||
|
||||
def is_improvement(new_val, old_val, direction):
|
||||
"""Check if new result is better than old."""
|
||||
if old_val is None:
|
||||
return True # First run always "improves"
|
||||
return True
|
||||
if direction == "lower":
|
||||
return new_val < old_val
|
||||
else:
|
||||
return new_val > old_val
|
||||
return new_val > old_val
|
||||
|
||||
|
||||
def log_result(path, commit, metric_val, status, description):
|
||||
def log_result(experiment_dir, commit, metric_val, status, description):
|
||||
"""Append result to results.tsv."""
|
||||
tsv = Path(path) / "results.tsv"
|
||||
tsv = experiment_dir / "results.tsv"
|
||||
metric_str = f"{metric_val:.6f}" if metric_val is not None else "N/A"
|
||||
with open(tsv, "a") as f:
|
||||
f.write(f"{commit}\t{metric_str}\t{status}\t{description}\n")
|
||||
|
||||
|
||||
def get_experiment_count(path):
|
||||
def get_experiment_count(experiment_dir):
|
||||
"""Count experiments run so far."""
|
||||
tsv = Path(path) / "results.tsv"
|
||||
tsv = experiment_dir / "results.tsv"
|
||||
if not tsv.exists():
|
||||
return 0
|
||||
lines = tsv.read_text().splitlines()
|
||||
return max(0, len(lines) - 1) # subtract header
|
||||
return max(0, len(tsv.read_text().splitlines()) - 1)
|
||||
|
||||
|
||||
def run_single_experiment(path, config, exp_num, dry_run=False):
|
||||
def get_last_active(root):
|
||||
"""Find the most recently modified experiment."""
|
||||
latest = None
|
||||
latest_time = 0
|
||||
for domain_dir in root.iterdir():
|
||||
if not domain_dir.is_dir() or domain_dir.name.startswith("."):
|
||||
continue
|
||||
for exp_dir in domain_dir.iterdir():
|
||||
if not exp_dir.is_dir():
|
||||
continue
|
||||
cfg = exp_dir / "config.cfg"
|
||||
if cfg.exists() and cfg.stat().st_mtime > latest_time:
|
||||
latest_time = cfg.stat().st_mtime
|
||||
latest = f"{domain_dir.name}/{exp_dir.name}"
|
||||
return latest
|
||||
|
||||
|
||||
def run_single(project_root, experiment_dir, config, exp_num, dry_run=False):
|
||||
"""Run one experiment iteration."""
|
||||
direction = config.get("metric_direction", "lower")
|
||||
metric_grep = config.get("metric_grep", "^metric:")
|
||||
evaluate_cmd = config.get("evaluate_cmd", "python evaluate.py")
|
||||
eval_cmd = config.get("evaluate_cmd", "python evaluate.py")
|
||||
time_budget = int(config.get("time_budget_minutes", 5))
|
||||
metric_name = config.get("metric", "metric")
|
||||
log_file = str(experiment_dir / "run.log")
|
||||
|
||||
best_so_far = get_current_metric(path, metric_grep)
|
||||
best = get_best_metric(experiment_dir, direction)
|
||||
ts = datetime.now().strftime("%H:%M:%S")
|
||||
|
||||
print(f"\n[{ts}] Experiment #{exp_num}")
|
||||
print(f" Best {metric_name} so far: {best_so_far}")
|
||||
print(f" Best {metric_name}: {best}")
|
||||
|
||||
if dry_run:
|
||||
print(" [DRY RUN] Would run evaluation and check metric")
|
||||
return "dry_run"
|
||||
|
||||
# Save pre-experiment state for rollback
|
||||
code, pre_commit, _ = run_cmd("git rev-parse HEAD", cwd=path)
|
||||
# Save state for rollback
|
||||
code, pre_commit, _ = run_cmd("git rev-parse HEAD", cwd=str(project_root))
|
||||
if code != 0:
|
||||
print(" ✗ Can't get git state. Is this a git repo with commits?")
|
||||
print(" Error: can't get git state")
|
||||
return "error"
|
||||
|
||||
# Run evaluation
|
||||
print(f" Running: {evaluate_cmd} (budget: {time_budget} min)")
|
||||
ret_code, elapsed = run_evaluation(path, evaluate_cmd, time_budget)
|
||||
print(f" Running: {eval_cmd} (budget: {time_budget}m)")
|
||||
ret_code, elapsed = run_evaluation(project_root, eval_cmd, time_budget, log_file)
|
||||
|
||||
# Handle timeout
|
||||
commit = get_current_commit(str(project_root))
|
||||
|
||||
# Timeout
|
||||
if ret_code == -1:
|
||||
print(f" ✗ TIMEOUT after {elapsed:.0f}s — discarding")
|
||||
run_cmd("git checkout -- .", cwd=path) # revert uncommitted changes
|
||||
# Commit was already made by the agent before evaluation
|
||||
run_cmd(f"git reset --hard {pre_commit}", cwd=path)
|
||||
curr_commit = get_current_commit(path)
|
||||
log_result(path, curr_commit, None, "crash", f"timeout after {elapsed:.0f}s")
|
||||
print(f" TIMEOUT after {elapsed:.0f}s — discarding")
|
||||
run_cmd("git checkout -- .", cwd=str(project_root))
|
||||
run_cmd(f"git reset --hard {pre_commit}", cwd=str(project_root))
|
||||
log_result(experiment_dir, commit, None, "crash", f"timeout_{elapsed:.0f}s")
|
||||
return "crash"
|
||||
|
||||
# Handle non-zero exit
|
||||
# Crash
|
||||
if ret_code != 0:
|
||||
# Check if it crashed
|
||||
code, tail, _ = run_cmd("tail -n 5 run.log", cwd=path)
|
||||
print(f" ✗ CRASH (exit {ret_code}) after {elapsed:.0f}s")
|
||||
_, tail, _ = run_cmd(f"tail -5 {log_file}", cwd=str(project_root))
|
||||
print(f" CRASH (exit {ret_code}) after {elapsed:.0f}s")
|
||||
print(f" Last output: {tail[:200]}")
|
||||
run_cmd(f"git reset --hard {pre_commit}", cwd=path)
|
||||
curr_commit = get_current_commit(path)
|
||||
log_result(path, curr_commit, None, "crash", f"exit_code_{ret_code}")
|
||||
run_cmd(f"git reset --hard {pre_commit}", cwd=str(project_root))
|
||||
log_result(experiment_dir, commit, None, "crash", f"exit_{ret_code}")
|
||||
return "crash"
|
||||
|
||||
# Extract metric
|
||||
metric_val = extract_metric(path, metric_grep)
|
||||
metric_val = extract_metric(log_file, metric_grep)
|
||||
if metric_val is None:
|
||||
print(f" ✗ Could not parse metric from run.log")
|
||||
run_cmd(f"git reset --hard {pre_commit}", cwd=path)
|
||||
curr_commit = get_current_commit(path)
|
||||
log_result(path, curr_commit, None, "crash", "metric_parse_failed")
|
||||
print(f" Could not parse {metric_name} from run.log")
|
||||
run_cmd(f"git reset --hard {pre_commit}", cwd=str(project_root))
|
||||
log_result(experiment_dir, commit, None, "crash", "metric_parse_failed")
|
||||
return "crash"
|
||||
|
||||
curr_commit = get_current_commit(path)
|
||||
delta = ""
|
||||
if best_so_far is not None:
|
||||
diff = metric_val - best_so_far
|
||||
delta = f" (Δ{diff:+.4f})"
|
||||
if best is not None:
|
||||
diff = metric_val - best
|
||||
delta = f" (delta {diff:+.4f})"
|
||||
|
||||
print(f" {metric_name}: {metric_val:.6f}{delta} in {elapsed:.0f}s")
|
||||
|
||||
# Keep or discard
|
||||
if is_improvement(metric_val, best_so_far, direction):
|
||||
print(f" ✅ KEEP — improvement confirmed")
|
||||
log_result(path, curr_commit, metric_val, "keep",
|
||||
f"improvement_{metric_name}_{metric_val:.4f}")
|
||||
if is_improvement(metric_val, best, direction):
|
||||
print(f" KEEP — improvement")
|
||||
log_result(experiment_dir, commit, metric_val, "keep",
|
||||
f"improved_{metric_name}_{metric_val:.4f}")
|
||||
return "keep"
|
||||
else:
|
||||
print(f" ❌ DISCARD — no improvement")
|
||||
run_cmd(f"git reset --hard {pre_commit}", cwd=path)
|
||||
curr_commit = get_current_commit(path)
|
||||
log_result(path, curr_commit, metric_val, "discard",
|
||||
f"no_improvement_{metric_val:.4f}_vs_{best_so_far:.4f}")
|
||||
print(f" DISCARD — no improvement")
|
||||
run_cmd(f"git reset --hard {pre_commit}", cwd=str(project_root))
|
||||
best_str = f"{best:.4f}" if best else "?"
|
||||
log_result(experiment_dir, commit, metric_val, "discard",
|
||||
f"no_improvement_{metric_val:.4f}_vs_{best_str}")
|
||||
return "discard"
|
||||
|
||||
|
||||
def print_summary(path):
|
||||
"""Print experiment summary."""
|
||||
tsv = Path(path) / "results.tsv"
|
||||
def print_summary(experiment_dir, config):
|
||||
"""Print session summary."""
|
||||
tsv = experiment_dir / "results.tsv"
|
||||
if not tsv.exists():
|
||||
return
|
||||
lines = tsv.read_text().splitlines()[1:] # skip header
|
||||
lines = tsv.read_text().splitlines()[1:]
|
||||
if not lines:
|
||||
return
|
||||
|
||||
keeps = [l for l in lines if "\tkeep\t" in l]
|
||||
discards = [l for l in lines if "\tdiscard\t" in l]
|
||||
crashes = [l for l in lines if "\tcrash\t" in l]
|
||||
metric_name = config.get("metric", "metric")
|
||||
direction = config.get("metric_direction", "lower")
|
||||
|
||||
print(f"\n{'='*50}")
|
||||
print(f" Session Summary")
|
||||
print(f"\n{'=' * 55}")
|
||||
print(f" autoresearch — Session Summary")
|
||||
print(f" Experiments: {len(lines)} total")
|
||||
print(f" ✅ Keep: {len(keeps)} | ❌ Discard: {len(discards)} | 💥 Crash: {len(crashes)}")
|
||||
print(f" Keep: {len(keeps)} | Discard: {len(discards)} | Crash: {len(crashes)}")
|
||||
|
||||
if keeps:
|
||||
try:
|
||||
first_metric = float(keeps[0].split("\t")[1])
|
||||
last_metric = float(keeps[-1].split("\t")[1])
|
||||
direction = "↓" if last_metric < first_metric else "↑"
|
||||
print(f" Best progress: {first_metric:.6f} → {last_metric:.6f} {direction}")
|
||||
valid = []
|
||||
for l in keeps:
|
||||
parts = l.split("\t")
|
||||
if parts[1] != "N/A":
|
||||
valid.append(float(parts[1]))
|
||||
if len(valid) >= 2:
|
||||
first, last = valid[0], valid[-1]
|
||||
best = min(valid) if direction == "lower" else max(valid)
|
||||
pct = ((first - best) / first * 100) if direction == "lower" else ((best - first) / first * 100)
|
||||
print(f" {metric_name}: {first:.6f} -> {best:.6f} ({pct:+.1f}%)")
|
||||
except (ValueError, IndexError):
|
||||
pass
|
||||
print(f"{'='*50}\n")
|
||||
print(f"{'=' * 55}\n")
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="autoresearch-agent runner")
|
||||
parser.add_argument("--experiment", help="Experiment path: domain/name (e.g. engineering/api-speed)")
|
||||
parser.add_argument("--resume", action="store_true", help="Resume last active experiment")
|
||||
parser.add_argument("--loop", action="store_true", help="Run forever")
|
||||
parser.add_argument("--single", action="store_true", help="Run one experiment")
|
||||
parser.add_argument("--dry-run", action="store_true", help="Dry run only")
|
||||
parser.add_argument("--dry-run", action="store_true", help="Show what would happen")
|
||||
parser.add_argument("--max-experiments", type=int, default=0, help="Max experiments (0 = unlimited)")
|
||||
parser.add_argument("--path", default=".", help="Project root")
|
||||
parser.add_argument("--max-experiments", type=int, default=0,
|
||||
help="Max experiments (0 = unlimited)")
|
||||
args = parser.parse_args()
|
||||
|
||||
path = Path(args.path).resolve()
|
||||
config = load_config(path)
|
||||
project_root = Path(args.path).resolve()
|
||||
root = find_autoresearch_root()
|
||||
|
||||
print(f"\n🔬 autoresearch-agent")
|
||||
print(f" Project: {path}")
|
||||
print(f" Target: {config.get('target', '?')}")
|
||||
print(f" Metric: {config.get('metric', '?')} ({config.get('metric_direction', '?')} is better)")
|
||||
print(f" Budget: {config.get('time_budget_minutes', '?')} min/experiment")
|
||||
print(f" Mode: {'loop' if args.loop else 'single'}")
|
||||
if root is None:
|
||||
print("No .autoresearch/ found. Run setup_experiment.py first.")
|
||||
sys.exit(1)
|
||||
|
||||
if args.single:
|
||||
exp_num = get_experiment_count(path) + 1
|
||||
run_single_experiment(path, config, exp_num, args.dry_run)
|
||||
# Resolve experiment
|
||||
experiment_path = args.experiment
|
||||
if args.resume:
|
||||
experiment_path = get_last_active(root)
|
||||
if not experiment_path:
|
||||
print("No experiments found to resume.")
|
||||
sys.exit(1)
|
||||
print(f"Resuming: {experiment_path}")
|
||||
|
||||
if not experiment_path:
|
||||
print("Specify --experiment domain/name or --resume")
|
||||
sys.exit(1)
|
||||
|
||||
experiment_dir = root / experiment_path
|
||||
if not experiment_dir.exists():
|
||||
print(f"Experiment not found: {experiment_dir}")
|
||||
print("Run: python scripts/setup_experiment.py --list")
|
||||
sys.exit(1)
|
||||
|
||||
config = load_config(experiment_dir)
|
||||
|
||||
domain, name = experiment_path.split("/", 1)
|
||||
print(f"\n autoresearch-agent")
|
||||
print(f" Experiment: {experiment_path}")
|
||||
print(f" Target: {config.get('target', '?')}")
|
||||
print(f" Metric: {config.get('metric', '?')} ({config.get('metric_direction', '?')} is better)")
|
||||
print(f" Budget: {config.get('time_budget_minutes', '?')} min/experiment")
|
||||
print(f" Mode: {'loop' if args.loop else 'single'}")
|
||||
|
||||
if args.single or args.dry_run:
|
||||
exp_num = get_experiment_count(experiment_dir) + 1
|
||||
run_single(project_root, experiment_dir, config, exp_num, args.dry_run)
|
||||
return
|
||||
|
||||
if not args.loop and not args.dry_run:
|
||||
if not args.loop:
|
||||
print("\nSpecify --loop (forever) or --single (one experiment)")
|
||||
sys.exit(1)
|
||||
|
||||
# Setup graceful shutdown
|
||||
# Graceful shutdown
|
||||
def handle_interrupt(sig, frame):
|
||||
print_summary(path)
|
||||
print("\n⏹ Stopped by user.")
|
||||
print_summary(experiment_dir, config)
|
||||
print("\nStopped by user.")
|
||||
sys.exit(0)
|
||||
|
||||
signal.signal(signal.SIGINT, handle_interrupt)
|
||||
signal.signal(signal.SIGTERM, handle_interrupt)
|
||||
|
||||
# Main loop
|
||||
consecutive_crashes = 0
|
||||
exp_num = get_experiment_count(path) + 1
|
||||
exp_num = get_experiment_count(experiment_dir) + 1
|
||||
|
||||
print(f"\nStarting loop. Ctrl+C to stop and print summary.\n")
|
||||
print(f"\nStarting loop. Ctrl+C to stop.\n")
|
||||
|
||||
while True:
|
||||
result = run_single_experiment(path, config, exp_num, args.dry_run)
|
||||
result = run_single(project_root, experiment_dir, config, exp_num, False)
|
||||
exp_num += 1
|
||||
|
||||
if result == "crash":
|
||||
@@ -289,21 +352,16 @@ def main():
|
||||
else:
|
||||
consecutive_crashes = 0
|
||||
|
||||
# Bail if 5 consecutive crashes
|
||||
if consecutive_crashes >= 5:
|
||||
print("\n⚠ 5 consecutive crashes. Pausing for investigation.")
|
||||
print(" Check run.log for the last error.")
|
||||
print("\n 5 consecutive crashes. Pausing.")
|
||||
print(" Check .autoresearch/{}/run.log".format(experiment_path))
|
||||
break
|
||||
|
||||
# Check max experiments
|
||||
if args.max_experiments > 0 and exp_num > args.max_experiments:
|
||||
print(f"\n✓ Reached max experiments ({args.max_experiments})")
|
||||
if 0 < args.max_experiments < exp_num:
|
||||
print(f"\n Reached max experiments ({args.max_experiments})")
|
||||
break
|
||||
|
||||
if args.single:
|
||||
break
|
||||
|
||||
print_summary(path)
|
||||
print_summary(experiment_dir, config)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
scripts/setup_experiment.py
@@ -1,65 +1,52 @@
 #!/usr/bin/env python3
 """
-autoresearch-agent: Setup Wizard
+autoresearch-agent: Setup Experiment
 
-Initializes a new research run:
-1. Validates the project structure
-2. Creates a git branch
-3. Runs the baseline experiment
-4. Initializes results.tsv
+Initialize a new experiment with domain, target, evaluator, and git branch.
+Creates the .autoresearch/{domain}/{name}/ directory structure.
 
 Usage:
-    python scripts/setup_experiment.py [--config experiment.yaml]
-    python scripts/setup_experiment.py --domain ml|prompt|code|skill
+    python scripts/setup_experiment.py --domain engineering --name api-speed \
+        --target src/api/search.py --eval "pytest bench.py" \
+        --metric p50_ms --direction lower
+
+    python scripts/setup_experiment.py --domain marketing --name medium-ctr \
+        --target content/titles.md --eval "python evaluate.py" \
+        --metric ctr_score --direction higher --evaluator llm_judge_content
+
+    python scripts/setup_experiment.py --list                  # List all experiments
+    python scripts/setup_experiment.py --list-evaluators       # List available evaluators
 """
 
 import argparse
 import os
+import shutil
 import subprocess
 import sys
 import time
 from datetime import datetime
 from pathlib import Path
DOMAINS = ["engineering", "marketing", "content", "prompts", "custom"]
|
||||
|
||||
DOMAINS = {
|
||||
"ml": {
|
||||
"target": "train.py",
|
||||
"evaluate_cmd": "uv run train.py",
|
||||
"metric": "val_bpb",
|
||||
"metric_direction": "lower",
|
||||
"time_budget_minutes": 5,
|
||||
"metric_grep": "^val_bpb:",
|
||||
},
|
||||
"prompt": {
|
||||
"target": "prompt.md",
|
||||
"evaluate_cmd": "python evaluate.py",
|
||||
"metric": "eval_score",
|
||||
"metric_direction": "higher",
|
||||
"time_budget_minutes": 2,
|
||||
"metric_grep": "^eval_score:",
|
||||
},
|
||||
"code": {
|
||||
"target": "src/module.py",
|
||||
"evaluate_cmd": "python benchmark.py",
|
||||
"metric": "p50_ms",
|
||||
"metric_direction": "lower",
|
||||
"time_budget_minutes": 10,
|
||||
"metric_grep": "^p50_ms:",
|
||||
},
|
||||
"skill": {
|
||||
"target": "SKILL.md",
|
||||
"evaluate_cmd": "python scripts/skill_evaluator.py",
|
||||
"metric": "pass_rate",
|
||||
"metric_direction": "higher",
|
||||
"time_budget_minutes": 5,
|
||||
"metric_grep": "^pass_rate:",
|
||||
},
|
||||
}
|
||||
EVALUATOR_DIR = Path(__file__).parent.parent / "evaluators"
|
||||
|
||||
DEFAULT_CONFIG = """# autoresearch global config
|
||||
default_time_budget_minutes: 5
|
||||
default_scope: project
|
||||
dashboard_format: markdown
|
||||
"""
|
||||
|
||||
GITIGNORE_CONTENT = """# autoresearch — experiment logs are local state
|
||||
**/results.tsv
|
||||
**/run.log
|
||||
**/run.*.log
|
||||
config.yaml
|
||||
"""
|
||||
|
||||
|
||||
def run_cmd(cmd, cwd=None, timeout=None):
|
||||
"""Run a shell command and return (returncode, stdout, stderr)."""
|
||||
"""Run shell command, return (returncode, stdout, stderr)."""
|
||||
result = subprocess.run(
|
||||
cmd, shell=True, capture_output=True, text=True,
|
||||
cwd=cwd, timeout=timeout
|
||||
@@ -67,188 +54,315 @@ def run_cmd(cmd, cwd=None, timeout=None):
|
||||
return result.returncode, result.stdout.strip(), result.stderr.strip()
|
||||
|
||||
|
||||
def check_git_repo(path):
|
||||
"""Verify we're in a git repo."""
|
||||
code, out, err = run_cmd("git rev-parse --is-inside-work-tree", cwd=path)
|
||||
if code != 0:
|
||||
print("✗ Not a git repository. Run: git init && git add . && git commit -m 'initial'")
|
||||
def get_autoresearch_root(scope, project_root=None):
|
||||
"""Get the .autoresearch root directory based on scope."""
|
||||
if scope == "user":
|
||||
return Path.home() / ".autoresearch"
|
||||
return Path(project_root or ".") / ".autoresearch"
|
||||
|
||||
|
||||
def init_root(root):
|
||||
"""Initialize .autoresearch root if it doesn't exist."""
|
||||
created = False
|
||||
if not root.exists():
|
||||
root.mkdir(parents=True)
|
||||
created = True
|
||||
print(f" Created {root}/")
|
||||
|
||||
config_file = root / "config.yaml"
|
||||
if not config_file.exists():
|
||||
config_file.write_text(DEFAULT_CONFIG)
|
||||
print(f" Created {config_file}")
|
||||
|
||||
gitignore = root / ".gitignore"
|
||||
if not gitignore.exists():
|
||||
gitignore.write_text(GITIGNORE_CONTENT)
|
||||
print(f" Created {gitignore}")
|
||||
|
||||
return created
|
||||
|
||||
|
||||
def create_program_md(experiment_dir, domain, name, target, metric, direction, constraints=""):
|
||||
"""Generate a program.md template for the experiment."""
|
||||
direction_word = "Minimize" if direction == "lower" else "Maximize"
|
||||
content = f"""# autoresearch — {name}
|
||||
|
||||
## Goal
|
||||
{direction_word} `{metric}` on `{target}`. {"Lower" if direction == "lower" else "Higher"} is better.
|
||||
|
||||
## What the Agent Can Change
|
||||
- Only `{target}` — this is the single file being optimized.
|
||||
- Everything inside that file is fair game unless constrained below.
|
||||
|
||||
## What the Agent Cannot Change
|
||||
- The evaluation script (`evaluate.py` or the eval command). It is read-only.
|
||||
- Dependencies — do not add new packages or imports that aren't already available.
|
||||
- Any other files in the project unless explicitly noted here.
|
||||
{f"- Additional constraints: {constraints}" if constraints else ""}
|
||||
|
||||
## Strategy
|
||||
1. First run: establish baseline. Do not change anything.
|
||||
2. Profile/analyze the current state — understand why the metric is what it is.
|
||||
3. Try the most obvious improvement first (low-hanging fruit).
|
||||
4. If that works, push further in the same direction.
|
||||
5. If stuck, try something orthogonal or radical.
|
||||
6. Read the git log of previous experiments. Don't repeat failed approaches.
|
||||
|
||||
## Simplicity Rule
|
||||
A small improvement that adds ugly complexity is NOT worth it.
|
||||
Equal performance with simpler code IS worth it.
|
||||
Removing code that gets same results is the best outcome.
|
||||
|
||||
## Stop When
|
||||
You don't stop. The human will interrupt you when they're satisfied.
|
||||
If no improvement in 20+ consecutive runs, change strategy drastically.
|
||||
"""
|
||||
(experiment_dir / "program.md").write_text(content)
|
||||
|
||||
|
||||
def create_config(experiment_dir, target, eval_cmd, metric, direction, time_budget):
|
||||
"""Write experiment config."""
|
||||
content = f"""target: {target}
|
||||
evaluate_cmd: {eval_cmd}
|
||||
metric: {metric}
|
||||
metric_direction: {direction}
|
||||
metric_grep: ^{metric}:
|
||||
time_budget_minutes: {time_budget}
|
||||
created: {datetime.now().strftime('%Y-%m-%d %H:%M')}
|
||||
"""
|
||||
(experiment_dir / "config.cfg").write_text(content)
|
||||
|
||||
|
||||
def init_results_tsv(experiment_dir):
|
||||
"""Create results.tsv with header."""
|
||||
tsv = experiment_dir / "results.tsv"
|
||||
if tsv.exists():
|
||||
print(f" results.tsv already exists ({tsv.stat().st_size} bytes)")
|
||||
return
|
||||
tsv.write_text("commit\tmetric\tstatus\tdescription\n")
|
||||
print(" Created results.tsv")
|
||||
|
||||
|
||||
def copy_evaluator(experiment_dir, evaluator_name):
|
||||
"""Copy a built-in evaluator to the experiment directory."""
|
||||
source = EVALUATOR_DIR / f"{evaluator_name}.py"
|
||||
if not source.exists():
|
||||
print(f" Warning: evaluator '{evaluator_name}' not found in {EVALUATOR_DIR}")
|
||||
print(f" Available: {', '.join(f.stem for f in EVALUATOR_DIR.glob('*.py'))}")
|
||||
return False
|
||||
print("✓ Git repository found")
|
||||
dest = experiment_dir / "evaluate.py"
|
||||
shutil.copy2(source, dest)
|
||||
print(f" Copied evaluator: {evaluator_name}.py -> evaluate.py")
|
||||
return True
|
||||
|
||||
|
||||
def check_program_md(path):
|
||||
"""Check program.md exists and has content."""
|
||||
pm = Path(path) / "program.md"
|
||||
if not pm.exists():
|
||||
print("⚠ program.md not found. Creating template...")
|
||||
return False
|
||||
content = pm.read_text()
|
||||
if len(content) < 100:
|
||||
print("⚠ program.md looks empty. Fill it out before running experiments.")
|
||||
return False
|
||||
print(f"✓ program.md found ({len(content)} chars)")
|
||||
return True
|
||||
|
||||
|
||||
def check_target_file(path, target):
|
||||
"""Check target file exists."""
|
||||
tf = Path(path) / target
|
||||
if not tf.exists():
|
||||
print(f"✗ Target file not found: {target}")
|
||||
return False
|
||||
print(f"✓ Target file found: {target}")
|
||||
return True
|
||||
|
||||
|
||||
def check_evaluate_script(path):
|
||||
"""Check evaluate.py exists."""
|
||||
ev = Path(path) / "evaluate.py"
|
||||
if not ev.exists():
|
||||
print("⚠ evaluate.py not found. You need a fixed evaluation function.")
|
||||
print(" Create evaluate.py that outputs: metric_name: <value>")
|
||||
return False
|
||||
print("✓ evaluate.py found")
|
||||
return True
|
||||
|
||||
|
||||
def create_branch(path, tag):
|
||||
def create_branch(path, domain, name):
|
||||
"""Create and checkout the experiment branch."""
|
||||
branch = f"autoresearch/{tag}"
|
||||
code, out, err = run_cmd(f"git checkout -b {branch}", cwd=path)
|
||||
branch = f"autoresearch/{domain}/{name}"
|
||||
code, _, err = run_cmd(f"git checkout -b {branch}", cwd=path)
|
||||
if code != 0:
|
||||
if "already exists" in err:
|
||||
print(f"✗ Branch '{branch}' already exists. Use a different tag.")
|
||||
else:
|
||||
print(f"✗ Failed to create branch: {err}")
|
||||
print(f" Branch '{branch}' already exists. Checking out...")
|
||||
run_cmd(f"git checkout {branch}", cwd=path)
|
||||
return branch
|
||||
print(f" Warning: could not create branch: {err}")
|
||||
return None
|
||||
print(f"✓ Created branch: {branch}")
|
||||
print(f" Created branch: {branch}")
|
||||
return branch
|
||||
|
||||
|
||||
def init_results_tsv(path):
|
||||
"""Create results.tsv with header."""
|
||||
tsv = Path(path) / "results.tsv"
|
||||
if tsv.exists():
|
||||
print(f"✓ results.tsv already exists ({tsv.stat().st_size} bytes)")
|
||||
def list_experiments(root):
|
||||
"""List all experiments across all domains."""
|
||||
if not root.exists():
|
||||
print("No experiments found. Run setup to create your first experiment.")
|
||||
return
|
||||
tsv.write_text("commit\tmetric\tstatus\tdescription\n")
|
||||
print("✓ Created results.tsv")
|
||||
|
||||
experiments = []
|
||||
for domain_dir in sorted(root.iterdir()):
|
||||
if not domain_dir.is_dir() or domain_dir.name.startswith("."):
|
||||
continue
|
||||
for exp_dir in sorted(domain_dir.iterdir()):
|
||||
if not exp_dir.is_dir():
|
||||
continue
|
||||
cfg_file = exp_dir / "config.cfg"
|
||||
if not cfg_file.exists():
|
||||
continue
|
||||
config = {}
|
||||
for line in cfg_file.read_text().splitlines():
|
||||
if ":" in line:
|
||||
k, v = line.split(":", 1)
|
||||
config[k.strip()] = v.strip()
|
||||
|
||||
# Count results
|
||||
tsv = exp_dir / "results.tsv"
|
||||
runs = 0
|
||||
if tsv.exists():
|
||||
runs = max(0, len(tsv.read_text().splitlines()) - 1)
|
||||
|
||||
experiments.append({
|
||||
"domain": domain_dir.name,
|
||||
"name": exp_dir.name,
|
||||
"target": config.get("target", "?"),
|
||||
"metric": config.get("metric", "?"),
|
||||
"runs": runs,
|
||||
})
|
||||
|
||||
if not experiments:
|
||||
print("No experiments found.")
|
||||
return
|
||||
|
||||
print(f"\n{'DOMAIN':<15} {'EXPERIMENT':<25} {'TARGET':<30} {'METRIC':<15} {'RUNS':>5}")
|
||||
print("-" * 95)
|
||||
for e in experiments:
|
||||
print(f"{e['domain']:<15} {e['name']:<25} {e['target']:<30} {e['metric']:<15} {e['runs']:>5}")
|
||||
print(f"\nTotal: {len(experiments)} experiments")
|
||||
|
||||
|
||||
def run_baseline(path, evaluate_cmd, metric_grep, time_budget_minutes):
|
||||
"""Run the baseline experiment."""
|
||||
print(f"\nRunning baseline experiment (~{time_budget_minutes} min)...")
|
||||
timeout = time_budget_minutes * 60 * 2.5 # 2.5x budget as hard limit
|
||||
def list_evaluators():
|
||||
"""List available built-in evaluators."""
|
||||
if not EVALUATOR_DIR.exists():
|
||||
print("No evaluators directory found.")
|
||||
return
|
||||
|
||||
t0 = time.time()
|
||||
code, out, err = run_cmd(
|
||||
f"{evaluate_cmd} > run.log 2>&1",
|
||||
cwd=path,
|
||||
timeout=timeout
|
||||
)
|
||||
elapsed = time.time() - t0
|
||||
|
||||
if code != 0:
|
||||
print(f"✗ Baseline run failed after {elapsed:.0f}s. Check run.log")
|
||||
return None
|
||||
|
||||
# Extract metric
|
||||
grep_code, grep_out, _ = run_cmd(
|
||||
f"grep '{metric_grep}' run.log | tail -1",
|
||||
cwd=path
|
||||
)
|
||||
if not grep_out:
|
||||
print("✗ Could not extract metric from run.log. Check metric_grep pattern.")
|
||||
return None
|
||||
|
||||
metric_value = grep_out.split(":")[-1].strip()
|
||||
print(f"✓ Baseline complete in {elapsed:.0f}s — metric: {metric_value}")
|
||||
return metric_value
|
||||
print(f"\nAvailable evaluators ({EVALUATOR_DIR}):\n")
|
||||
for f in sorted(EVALUATOR_DIR.glob("*.py")):
|
||||
# Read first docstring line
|
||||
desc = ""
|
||||
for line in f.read_text().splitlines():
|
||||
if line.strip().startswith('"""') or line.strip().startswith("'''"):
|
||||
continue
|
||||
if line.strip() and not line.startswith("#!"):
|
||||
desc = line.strip().strip('"').strip("'")
|
||||
break
|
||||
print(f" {f.stem:<25} {desc}")
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="autoresearch-agent setup")
|
||||
parser.add_argument("--domain", choices=list(DOMAINS.keys()), help="Experiment domain")
|
||||
parser.add_argument("--domain", choices=DOMAINS, help="Experiment domain")
|
||||
parser.add_argument("--name", help="Experiment name (e.g. api-speed, medium-ctr)")
|
||||
parser.add_argument("--target", help="Target file to optimize")
|
||||
parser.add_argument("--evaluate-cmd", help="Evaluation command")
|
||||
parser.add_argument("--metric", help="Metric name")
|
||||
parser.add_argument("--direction", choices=["lower", "higher"], default="lower")
|
||||
parser.add_argument("--budget", type=int, default=5, help="Time budget in minutes")
|
||||
parser.add_argument("--tag", help="Run tag (used in branch name)")
|
||||
parser.add_argument("--eval", dest="eval_cmd", help="Evaluation command")
|
||||
parser.add_argument("--metric", help="Metric name (must appear in eval output as 'name: value')")
|
||||
parser.add_argument("--direction", choices=["lower", "higher"], default="lower",
|
||||
help="Is lower or higher better?")
|
||||
parser.add_argument("--time-budget", type=int, default=5, help="Minutes per experiment (default: 5)")
|
||||
parser.add_argument("--evaluator", help="Built-in evaluator to copy (e.g. benchmark_speed)")
|
||||
parser.add_argument("--scope", choices=["project", "user"], default="project",
|
||||
help="Where to store experiments: project (./) or user (~/)")
|
||||
parser.add_argument("--constraints", default="", help="Additional constraints for program.md")
|
||||
parser.add_argument("--path", default=".", help="Project root path")
|
||||
parser.add_argument("--skip-baseline", action="store_true")
|
||||
parser.add_argument("--skip-baseline", action="store_true", help="Skip baseline run")
|
||||
parser.add_argument("--skip-branch", action="store_true", help="Don't create git branch")
|
||||
parser.add_argument("--list", action="store_true", help="List all experiments")
|
||||
parser.add_argument("--list-evaluators", action="store_true", help="List available evaluators")
|
||||
args = parser.parse_args()
|
||||
|
||||
path = Path(args.path).resolve()
|
||||
print(f"\n🔬 autoresearch-agent setup")
|
||||
print(f" Project: {path}")
|
||||
print(f" Time: {datetime.now().strftime('%Y-%m-%d %H:%M')}\n")
|
||||
project_root = Path(args.path).resolve()
|
||||
|
||||
# Get config from domain or args
|
||||
if args.domain:
|
||||
config = DOMAINS[args.domain].copy()
|
||||
# List mode
|
||||
if args.list:
|
||||
root = get_autoresearch_root("project", project_root)
|
||||
list_experiments(root)
|
||||
user_root = get_autoresearch_root("user")
|
||||
if user_root.exists() and user_root != root:
|
||||
print(f"\n--- User-level experiments ({user_root}) ---")
|
||||
list_experiments(user_root)
|
||||
return
|
||||
|
||||
if args.list_evaluators:
|
||||
list_evaluators()
|
||||
return
|
||||
|
||||
# Validate required args for setup
|
||||
if not all([args.domain, args.name, args.target, args.eval_cmd, args.metric]):
|
||||
parser.error("Required: --domain, --name, --target, --eval, --metric")
|
||||
|
||||
root = get_autoresearch_root(args.scope, project_root)
|
||||
|
||||
print(f"\n autoresearch-agent setup")
|
||||
print(f" Project: {project_root}")
|
||||
print(f" Scope: {args.scope}")
|
||||
print(f" Domain: {args.domain}")
|
||||
print(f" Experiment: {args.name}")
|
||||
print(f" Time: {datetime.now().strftime('%Y-%m-%d %H:%M')}\n")
|
||||
|
||||
# Check git
|
||||
code, _, _ = run_cmd("git rev-parse --is-inside-work-tree", cwd=str(project_root))
|
||||
if code != 0:
|
||||
print(" Error: not a git repository. Run: git init && git add . && git commit -m 'initial'")
|
||||
sys.exit(1)
|
||||
print(" Git repository found")
|
||||
|
||||
# Check target file
|
||||
target_path = project_root / args.target
|
||||
if not target_path.exists():
|
||||
print(f" Error: target file not found: {args.target}")
|
||||
sys.exit(1)
|
||||
print(f" Target file found: {args.target}")
|
||||
|
||||
# Init root
|
||||
init_root(root)
|
||||
|
||||
# Create experiment directory
|
||||
experiment_dir = root / args.domain / args.name
|
||||
if experiment_dir.exists():
|
||||
print(f" Warning: experiment '{args.domain}/{args.name}' already exists.")
|
||||
print(f" Use --name with a different name, or delete {experiment_dir}")
|
||||
sys.exit(1)
|
||||
experiment_dir.mkdir(parents=True)
|
||||
print(f" Created {experiment_dir}/")
|
||||
|
||||
# Create files
|
||||
create_program_md(experiment_dir, args.domain, args.name,
|
||||
args.target, args.metric, args.direction, args.constraints)
|
||||
print(" Created program.md")
|
||||
|
||||
create_config(experiment_dir, args.target, args.eval_cmd,
|
||||
args.metric, args.direction, args.time_budget)
|
||||
print(" Created config.cfg")
|
||||
|
||||
init_results_tsv(experiment_dir)
|
||||
|
||||
# Copy evaluator if specified
|
||||
if args.evaluator:
|
||||
copy_evaluator(experiment_dir, args.evaluator)
|
||||
|
||||
# Create git branch
|
||||
if not args.skip_branch:
|
||||
create_branch(str(project_root), args.domain, args.name)
|
||||
|
||||
# Test evaluation command
|
||||
print(f"\n Testing evaluation: {args.eval_cmd}")
|
||||
code, out, err = run_cmd(args.eval_cmd, cwd=str(project_root), timeout=60)
|
||||
if code != 0:
|
||||
print(f" Warning: eval command failed (exit {code})")
|
||||
if err:
|
||||
print(f" stderr: {err[:200]}")
|
||||
print(" Fix the eval command before running the experiment loop.")
|
||||
else:
|
||||
config = {
|
||||
"target": args.target or "target.py",
|
||||
"evaluate_cmd": args.evaluate_cmd or "python evaluate.py",
|
||||
"metric": args.metric or "score",
|
||||
"metric_direction": args.direction,
|
||||
"time_budget_minutes": args.budget,
|
||||
"metric_grep": f"^{args.metric or 'score'}:",
|
||||
}
|
||||
# Check metric is parseable
|
||||
full_output = out + "\n" + err
|
||||
metric_found = False
|
||||
for line in full_output.splitlines():
|
||||
if line.strip().startswith(f"{args.metric}:"):
|
||||
metric_found = True
|
||||
print(f" Eval works. Baseline: {line.strip()}")
|
||||
break
|
||||
if not metric_found:
|
||||
print(f" Warning: eval ran but '{args.metric}:' not found in output.")
|
||||
print(f" Make sure your eval command outputs: {args.metric}: <value>")
|
||||
|
||||
tag = args.tag or datetime.now().strftime("%b%d").lower()
|
||||
|
||||
# Validation checks
|
||||
checks = [
|
||||
check_git_repo(path),
|
||||
check_program_md(path),
|
||||
check_target_file(path, config["target"]),
|
||||
check_evaluate_script(path),
|
||||
]
|
||||
|
||||
if not all(checks):
|
||||
print("\n⚠ Fix the above issues before running experiments.")
|
||||
sys.exit(1)
|
||||
|
||||
# Create branch
|
||||
branch = create_branch(path, tag)
|
||||
if not branch:
|
||||
sys.exit(1)
|
||||
|
||||
# Init results TSV
|
||||
init_results_tsv(path)
|
||||
|
||||
# Save config for run_experiment.py
|
||||
config_content = "\n".join(f"{k}: {v}" for k, v in config.items())
|
||||
(path / ".autoresearch.cfg").write_text(config_content + "\n")
|
||||
print("✓ Saved .autoresearch.cfg")
|
||||
|
||||
# Run baseline
|
||||
if not args.skip_baseline:
|
||||
baseline = run_baseline(
|
||||
path,
|
||||
config["evaluate_cmd"],
|
||||
config["metric_grep"],
|
||||
config["time_budget_minutes"]
|
||||
)
|
||||
if baseline:
|
||||
# Log baseline to TSV
|
||||
code, commit, _ = run_cmd("git rev-parse --short HEAD", cwd=path)
|
||||
with open(path / "results.tsv", "a") as f:
|
||||
f.write(f"{commit}\t{baseline}\tkeep\tbaseline\n")
|
||||
print(f"✓ Baseline logged to results.tsv")
|
||||
|
||||
print(f"\n✅ Setup complete!")
|
||||
print(f" Branch: {branch}")
|
||||
print(f" Target: {config['target']}")
|
||||
print(f" Metric: {config['metric']} ({config['metric_direction']} is better)")
|
||||
print(f" Budget: {config['time_budget_minutes']} min/experiment")
|
||||
print(f"\nTo start the autonomous loop:")
|
||||
print(f" python scripts/run_experiment.py --loop")
|
||||
print(f"\nOr run a single experiment:")
|
||||
print(f" python scripts/run_experiment.py --single")
|
||||
# Summary
|
||||
print(f"\n Setup complete!")
|
||||
print(f" Experiment: {args.domain}/{args.name}")
|
||||
print(f" Target: {args.target}")
|
||||
print(f" Metric: {args.metric} ({args.direction} is better)")
|
||||
print(f" Budget: {args.time_budget} min/experiment")
|
||||
if not args.skip_branch:
|
||||
print(f" Branch: autoresearch/{args.domain}/{args.name}")
|
||||
print(f"\n To start:")
|
||||
print(f" python scripts/run_experiment.py --experiment {args.domain}/{args.name} --loop")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||