refactor: autoresearch-agent v2.0 — multi-experiment, multi-domain, real-world evaluators

Major rewrite based on deep study of Karpathy's autoresearch repo.

Architecture changes:
- Multi-experiment support: .autoresearch/{domain}/{name}/ structure
- Domain categories: engineering, marketing, content, prompts, custom
- Project-level (git-tracked, shareable) or user-level (~/.autoresearch/) scope
- User chooses scope during setup, not installation

New evaluators (8 ready-to-use):
- Free: benchmark_speed, benchmark_size, test_pass_rate, build_speed, memory_usage
- LLM judge (uses existing subscription): llm_judge_content, llm_judge_prompt, llm_judge_copy
- LLM judges call user's CLI tool (claude/codex/gemini) — no extra API keys needed

Script improvements:
- setup_experiment.py: --domain, --scope, --evaluator, --list, --list-evaluators
- run_experiment.py: --experiment domain/name, --resume, --loop, --single
- log_results.py: --dashboard, --domain, --format csv|markdown|terminal, --output

Results export:
- Terminal (default), CSV, and Markdown formats
- Per-experiment, per-domain, or cross-experiment dashboard view

SKILL.md rewritten:
- Clear activation triggers (when the skill should activate)
- Practical examples for each domain
- Evaluator documentation with cost transparency
- Simplified loop protocol matching Karpathy's original philosophy
2026-03-13 08:22:14 +01:00
parent c834d71a44
commit 12591282da
13 changed files with 1744 additions and 702 deletions
--- a/engineering/autoresearch-agent/scripts/log_results.py
+++ b/engineering/autoresearch-agent/scripts/log_results.py
@@ -1,125 +1,389 @@
 #!/usr/bin/env python3
 """
-autoresearch-agent: Results Logger
+autoresearch-agent: Results Viewer

-View and analyze experiment results from results.tsv.
+View experiment results in multiple formats: terminal, CSV, Markdown.
+Supports single experiment, domain, or cross-experiment dashboard.

 Usage:
-    python scripts/log_results.py --summary          # Print progress table
-    python scripts/log_results.py --best             # Show best result
-    python scripts/log_results.py --history          # Full experiment history
-    python scripts/log_results.py --record commit val status desc  # Add entry manually
+    python scripts/log_results.py --experiment engineering/api-speed
+    python scripts/log_results.py --domain engineering
+    python scripts/log_results.py --dashboard
+    python scripts/log_results.py --experiment engineering/api-speed --format csv --output results.csv
+    python scripts/log_results.py --experiment engineering/api-speed --format markdown --output results.md
+    python scripts/log_results.py --dashboard --format markdown --output dashboard.md
 """

 import argparse
+import csv
+import io
 import sys
 from pathlib import Path


-def load_results(path):
-    tsv = Path(path) / "results.tsv"
+def find_autoresearch_root():
+    """Locate the ``.autoresearch/`` directory.
+
+    The current working directory is checked first, then the user's home
+    directory.
+
+    Returns:
+        The first existing candidate as a ``Path``, or ``None`` when neither
+        location exists.
+    """
+    folder = ".autoresearch"
+    # Parents are callables so the home directory is only resolved when the
+    # project-level lookup misses.
+    for parent_of in (lambda: Path(".").resolve(), Path.home):
+        candidate = parent_of() / folder
+        if candidate.exists():
+            return candidate
+    return None
+
+
+def load_config(experiment_dir):
+    """Parse ``config.cfg`` from an experiment directory.
+
+    Each line is split at its first ``:`` into a key and a value, both
+    stripped of surrounding whitespace. Lines without a colon are ignored.
+
+    Returns:
+        A dict of the parsed pairs; empty when ``config.cfg`` does not exist.
+    """
+    path = experiment_dir / "config.cfg"
+    if not path.exists():
+        return {}
+    settings = {}
+    for raw in path.read_text().splitlines():
+        key, sep, value = raw.partition(":")
+        if sep:
+            settings[key.strip()] = value.strip()
+    return settings
+
+
+def load_results(experiment_dir):
+    """Load results.tsv into list of dicts.
+
+    The first line of the file is skipped as a header. Every other line is
+    split on tabs; lines with fewer than four fields are ignored.
+
+    Returns:
+        A list of dicts with keys ``commit``, ``metric``, ``status`` and
+        ``description``. ``metric`` is a float, or ``None`` when the field is
+        ``"N/A"`` or cannot be parsed as a float. An empty list is returned
+        when ``results.tsv`` does not exist.
+    """
+    tsv = experiment_dir / "results.tsv"
    if not tsv.exists():
        return []
-    lines = tsv.read_text().splitlines()[1:]  # skip header
    results = []
-    for line in lines:
+    # [1:] skips the header row.
+    for line in tsv.read_text().splitlines()[1:]:
        parts = line.split("\t")
        if len(parts) >= 4:
            try:
-                metric_val = float(parts[1]) if parts[1] != "N/A" else None
+                metric = float(parts[1]) if parts[1] != "N/A" else None
            except ValueError:
-                metric_val = None
+                # Unparseable metric text is recorded as "no metric".
+                metric = None
            results.append({
                "commit": parts[0],
-                "metric": metric_val,
+                "metric": metric,
                "status": parts[2],
-                "description": parts[3]
+                "description": parts[3],
            })
    return results


-def print_summary(results, metric_name="metric", direction="lower"):
-    if not results:
-        print("No experiments logged yet.")
-        return
-
+def compute_stats(results, direction):
+    """Compute summary statistics for one experiment's results.
+
+    Args:
+        results: List of result dicts; the ``status`` and ``metric`` keys are
+            read here.
+        direction: ``"lower"`` if smaller metric values are better; any other
+            value is treated as higher-is-better.
+
+    Returns:
+        A dict with the counts ``total``, ``keeps``, ``discards`` and
+        ``crashes``, plus ``baseline`` (metric of the first kept run that has
+        a metric), ``best`` (best kept metric) and ``pct_change`` (improvement
+        of best over baseline, in percent). The last three are ``None`` when
+        they cannot be computed.
+    """
    keeps = [r for r in results if r["status"] == "keep"]
    discards = [r for r in results if r["status"] == "discard"]
    crashes = [r for r in results if r["status"] == "crash"]

-    print(f"\n{'─'*60}")
-    print(f"  autoresearch-agent — Results Summary")
-    print(f"{'─'*60}")
-    print(f"  Total experiments: {len(results)}")
-    print(f"  ✅ Keep:    {len(keeps):3d} ({len(keeps)/max(len(results),1)*100:.0f}%)")
-    print(f"  ❌ Discard: {len(discards):3d} ({len(discards)/max(len(results),1)*100:.0f}%)")
-    print(f"  💥 Crash:   {len(crashes):3d} ({len(crashes)/max(len(results),1)*100:.0f}%)")
+    valid_keeps = [r for r in keeps if r["metric"] is not None]
+    baseline = valid_keeps[0]["metric"] if valid_keeps else None
+    if valid_keeps:
+        best = min(r["metric"] for r in valid_keeps) if direction == "lower" else max(r["metric"] for r in valid_keeps)
+    else:
+        best = None

-    if keeps:
-        valid = [r for r in keeps if r["metric"] is not None]
-        if valid:
-            baseline = valid[0]["metric"]
-            best = min(r["metric"] for r in valid) if direction == "lower" else max(r["metric"] for r in valid)
-            best_run = next(r for r in valid if r["metric"] == best)
-            improvement = ((baseline - best) / baseline * 100) if direction == "lower" else ((best - baseline) / baseline * 100)
+    pct_change = None
+    # Compare against None explicitly: a metric of 0.0 is a legitimate value
+    # (e.g. a best score of zero) and must not be mistaken for "missing".
+    # A zero baseline is still excluded because the ratio is undefined.
+    if baseline is not None and best is not None and baseline != 0:
+        if direction == "lower":
+            pct_change = (baseline - best) / baseline * 100
+        else:
+            pct_change = (best - baseline) / baseline * 100

-            print(f"\n  {metric_name}:")
-            print(f"    Baseline: {baseline:.6f}")
-            print(f"    Best:     {best:.6f}  (commit: {best_run['commit']})")
-            print(f"    Change:   {improvement:+.2f}%")
-
-    print(f"{'─'*60}\n")
+    return {
+        "total": len(results),
+        "keeps": len(keeps),
+        "discards": len(discards),
+        "crashes": len(crashes),
+        "baseline": baseline,
+        "best": best,
+        "pct_change": pct_change,
+    }


-def print_history(results):
+# --- Terminal Output ---
+
+def print_experiment(experiment_dir, experiment_path):
+    """Print single experiment results to terminal.
+
+    Args:
+        experiment_dir: Directory holding the experiment's ``config.cfg`` and
+            ``results.tsv``.
+        experiment_path: Label shown in the output (e.g. ``domain/name``).
+
+    Prints a short notice and returns when there are no results; otherwise
+    prints a summary header followed by one row per recorded run.
+    """
+    config = load_config(experiment_dir)
+    results = load_results(experiment_dir)
+    direction = config.get("metric_direction", "lower")
+    metric_name = config.get("metric", "metric")
+
    if not results:
-        print("No experiments logged yet.")
+        print(f"No results for {experiment_path}")
        return

-    print(f"\n{'COMMIT':8} {'METRIC':10} {'STATUS':8} DESCRIPTION")
-    print("─" * 60)
-    for r in results:
-        metric_str = f"{r['metric']:.6f}" if r['metric'] is not None else "crash   "
-        status_icon = {"keep": "✅", "discard": "❌", "crash": "💥"}.get(r["status"], "?")
-        print(f"{r['commit']:8} {metric_str:10} {status_icon} {r['description'][:40]}")
+    stats = compute_stats(results, direction)

+    print(f"\n{'─' * 65}")
+    print(f"  {experiment_path}")
+    print(f"  Target: {config.get('target', '?')} | Metric: {metric_name} ({direction})")
+    print(f"{'─' * 65}")
+    print(f"  Total: {stats['total']} | Keep: {stats['keeps']} | Discard: {stats['discards']} | Crash: {stats['crashes']}")
+
+    if stats["baseline"] is not None and stats["best"] is not None:
+        pct = f" ({stats['pct_change']:+.1f}%)" if stats["pct_change"] is not None else ""
+        print(f"  Baseline: {stats['baseline']:.6f} -> Best: {stats['best']:.6f}{pct}")
+
+    print(f"\n  {'COMMIT':<10} {'METRIC':>12} {'STATUS':<10} DESCRIPTION")
+    print(f"  {'─' * 60}")
+    for r in results:
+        m = f"{r['metric']:.6f}" if r["metric"] is not None else "N/A     "
+        icon = {"keep": "+", "discard": "-", "crash": "!"}.get(r["status"], "?")
+        # Icon (1) + space (1) + status (8) fills the 10-column STATUS header
+        # so the DESCRIPTION column lines up with its heading.
+        print(f"  {r['commit']:<10} {m:>12} {icon} {r['status']:<8} {r['description'][:35]}")
+    print()
+
+
+def print_dashboard(root):
+    """Print a cross-experiment summary table to stdout.
+
+    Walks ``root/<domain>/<experiment>/``, skipping entries that are not
+    directories, domain names starting with ``.``, and experiment directories
+    without a ``config.cfg``.
+
+    An experiment with no results is "idle"; otherwise its status comes from
+    the modification time of ``results.tsv``: "active" (under 1 hour old),
+    "paused" (under 24 hours old) or "done".
+
+    Returns:
+        The list of row dicts that were collected; empty when no experiments
+        were found (in which case only a notice is printed).
+    """
+    import time
+
+    rows = []
+    for domain in sorted(root.iterdir()):
+        visible_dir = domain.is_dir() and not domain.name.startswith(".")
+        if not visible_dir:
+            continue
+        for exp in sorted(domain.iterdir()):
+            if not (exp.is_dir() and (exp / "config.cfg").exists()):
+                continue
+            cfg = load_config(exp)
+            runs = load_results(exp)
+            summary = compute_stats(runs, cfg.get("metric_direction", "lower"))
+
+            # Classify by how recently results.tsv was touched.
+            state = "idle"
+            results_file = exp / "results.tsv"
+            if summary["total"] > 0 and results_file.exists():
+                hours_old = (time.time() - results_file.stat().st_mtime) / 3600
+                if hours_old < 1:
+                    state = "active"
+                elif hours_old < 24:
+                    state = "paused"
+                else:
+                    state = "done"
+
+            best_value = summary["best"]
+            change_value = summary["pct_change"]
+            rows.append({
+                "domain": domain.name,
+                "name": exp.name,
+                "runs": summary["total"],
+                "kept": summary["keeps"],
+                "best": "—" if best_value is None else f"{best_value:.4f}",
+                "change": "—" if change_value is None else f"{change_value:+.1f}%",
+                "status": state,
+                "metric": cfg.get("metric", "?"),
+            })
+
+    if not rows:
+        print("No experiments found.")
+        return rows
+
+    rule = "─" * 90
+    print(f"\n{rule}")
+    print("  autoresearch — Dashboard")
+    print(rule)
+    print(f"  {'DOMAIN':<15} {'EXPERIMENT':<20} {'RUNS':>5} {'KEPT':>5} {'BEST':>12} {'CHANGE':>10} {'STATUS':<8}")
+    print(f"  {'─' * 85}")
+    for row in rows:
+        print(f"  {row['domain']:<15} {row['name']:<20} {row['runs']:>5} {row['kept']:>5} {row['best']:>12} {row['change']:>10} {row['status']:<8}")
+    print()
+    return rows
+
+
+# --- CSV Export ---
+
+def export_experiment_csv(experiment_dir, experiment_path):
+    """Export single experiment as CSV string.
+
+    The output starts with ``#``-labelled metadata rows (experiment, target,
+    metric, baseline/best when available, totals), then an empty row, then a
+    ``Commit, Metric, Status, Description`` table with one row per run.
+
+    Args:
+        experiment_dir: Directory holding ``config.cfg`` and ``results.tsv``.
+        experiment_path: Label written into the ``# Experiment`` row.
+
+    Returns:
+        The CSV document as a string.
+    """
+    config = load_config(experiment_dir)
+    results = load_results(experiment_dir)
+    direction = config.get("metric_direction", "lower")
+    stats = compute_stats(results, direction)
+
+    buf = io.StringIO()
+    writer = csv.writer(buf)
+
+    # Header with metadata
+    writer.writerow(["# Experiment", experiment_path])
+    writer.writerow(["# Target", config.get("target", "")])
+    writer.writerow(["# Metric", f"{config.get('metric', '')} ({direction} is better)"])
+    if stats["baseline"] is not None:
+        writer.writerow(["# Baseline", f"{stats['baseline']:.6f}"])
+    if stats["best"] is not None:
+        # Explicit None check so a 0.0% change is still reported, matching
+        # the terminal view.
+        pct = f" ({stats['pct_change']:+.1f}%)" if stats["pct_change"] is not None else ""
+        writer.writerow(["# Best", f"{stats['best']:.6f}{pct}"])
+    writer.writerow(["# Total", stats["total"]])
+    writer.writerow(["# Keep/Discard/Crash", f"{stats['keeps']}/{stats['discards']}/{stats['crashes']}"])
+    writer.writerow([])
+
+    writer.writerow(["Commit", "Metric", "Status", "Description"])
+    for r in results:
+        m = f"{r['metric']:.6f}" if r["metric"] is not None else "N/A"
+        writer.writerow([r["commit"], m, r["status"], r["description"]])
+
+    return buf.getvalue()
+
+
+def export_dashboard_csv(root):
+    """Export dashboard as CSV string.
+
+    Walks ``root/<domain>/<experiment>/``, skipping entries that are not
+    directories, domain names starting with ``.``, and experiment directories
+    without a ``config.cfg``.
+
+    Returns:
+        CSV text with a header row and one row per experiment. The ``Best``
+        and ``Change`` cells are empty when the value is unavailable.
+    """
+    experiments = []
+    for domain_dir in sorted(root.iterdir()):
+        if not domain_dir.is_dir() or domain_dir.name.startswith("."):
+            continue
+        for exp_dir in sorted(domain_dir.iterdir()):
+            if not exp_dir.is_dir() or not (exp_dir / "config.cfg").exists():
+                continue
+            config = load_config(exp_dir)
+            results = load_results(exp_dir)
+            direction = config.get("metric_direction", "lower")
+            stats = compute_stats(results, direction)
+            # Test against None, not truthiness: a best metric of 0.0 or a
+            # 0.0% change is real data and must not be exported as blank.
+            best_str = f"{stats['best']:.6f}" if stats["best"] is not None else ""
+            pct_str = f"{stats['pct_change']:+.1f}%" if stats["pct_change"] is not None else ""
+            experiments.append([
+                domain_dir.name, exp_dir.name, config.get("metric", ""),
+                stats["total"], stats["keeps"], stats["discards"], stats["crashes"],
+                best_str, pct_str
+            ])
+
+    buf = io.StringIO()
+    writer = csv.writer(buf)
+    writer.writerow(["Domain", "Experiment", "Metric", "Runs", "Kept", "Discarded", "Crashed", "Best", "Change"])
+    writer.writerows(experiments)
+    return buf.getvalue()
+
+
+# --- Markdown Export ---
+
+def export_experiment_markdown(experiment_dir, experiment_path):
+    """Export single experiment as Markdown string.
+
+    The document has a title, target/metric/count lines, an optional
+    baseline-to-best progress line, and a table with one row per run.
+
+    Args:
+        experiment_dir: Directory holding ``config.cfg`` and ``results.tsv``.
+        experiment_path: Label used in the document title.
+
+    Returns:
+        The Markdown document as a string.
+    """
+    config = load_config(experiment_dir)
+    results = load_results(experiment_dir)
+    direction = config.get("metric_direction", "lower")
+    metric_name = config.get("metric", "metric")
+    stats = compute_stats(results, direction)
+
+    lines = []
+    lines.append(f"# Autoresearch: {experiment_path}\n")
+    lines.append(f"**Target:** `{config.get('target', '?')}`  ")
+    lines.append(f"**Metric:** `{metric_name}` ({direction} is better)  ")
+    lines.append(f"**Experiments:** {stats['total']} total — {stats['keeps']} kept, {stats['discards']} discarded, {stats['crashes']} crashed\n")
+
+    if stats["baseline"] is not None and stats["best"] is not None:
+        # Explicit None check so a 0.0% change is still reported, matching
+        # the terminal view.
+        pct = f" ({stats['pct_change']:+.1f}%)" if stats["pct_change"] is not None else ""
+        lines.append(f"**Progress:** `{stats['baseline']:.6f}` → `{stats['best']:.6f}`{pct}\n")
+
+    lines.append("| Commit | Metric | Status | Description |")
+    lines.append("|--------|--------|--------|-------------|")
+    for r in results:
+        m = f"`{r['metric']:.6f}`" if r["metric"] is not None else "N/A"
+        # A literal pipe in free text would be read as a cell separator and
+        # break the table row, so escape it.
+        desc = r["description"].replace("|", "\\|")
+        lines.append(f"| `{r['commit']}` | {m} | {r['status']} | {desc} |")
+    lines.append("")
+
+    return "\n".join(lines)
+
+
+def export_dashboard_markdown(root):
+    """Export dashboard as Markdown string.
+
+    Walks ``root/<domain>/<experiment>/``, skipping entries that are not
+    directories, domain names starting with ``.``, and experiment directories
+    without a ``config.cfg``.
+
+    An experiment with no results is "idle"; otherwise its status comes from
+    the modification time of ``results.tsv``: "active" (under 1 hour old),
+    "paused" (under 24 hours old) or "done".
+
+    Returns:
+        A Markdown document with a title and one table row per experiment.
+    """
+    import time
+
+    lines = []
+    lines.append("# Autoresearch Dashboard\n")
+    lines.append("| Domain | Experiment | Metric | Runs | Kept | Best | Change | Status |")
+    lines.append("|--------|-----------|--------|------|------|------|--------|--------|")
+
+    for domain_dir in sorted(root.iterdir()):
+        if not domain_dir.is_dir() or domain_dir.name.startswith("."):
+            continue
+        for exp_dir in sorted(domain_dir.iterdir()):
+            if not exp_dir.is_dir() or not (exp_dir / "config.cfg").exists():
+                continue
+            config = load_config(exp_dir)
+            results = load_results(exp_dir)
+            direction = config.get("metric_direction", "lower")
+            stats = compute_stats(results, direction)
+            # Test against None, not truthiness: a best metric of 0.0 or a
+            # 0.0% change is real data and must not be shown as missing.
+            best = f"`{stats['best']:.4f}`" if stats["best"] is not None else "—"
+            pct = f"{stats['pct_change']:+.1f}%" if stats["pct_change"] is not None else "—"
+
+            tsv = exp_dir / "results.tsv"
+            status = "idle"
+            if tsv.exists() and stats["total"] > 0:
+                age_h = (time.time() - tsv.stat().st_mtime) / 3600
+                status = "active" if age_h < 1 else "paused" if age_h < 24 else "done"
+
+            lines.append(f"| {domain_dir.name} | {exp_dir.name} | {config.get('metric', '?')} | {stats['total']} | {stats['keeps']} | {best} | {pct} | {status} |")
+
+    lines.append("")
+    return "\n".join(lines)
+
+
+# --- Main ---

 def main():
+    """Parse command-line options and print or export the requested view.
+
+    ``--experiment`` shows one experiment, ``--domain`` shows every experiment
+    in one domain, and ``--dashboard``/``--all`` (or no selector) shows the
+    cross-experiment dashboard. Terminal output is printed directly; CSV and
+    Markdown output is written to ``--output`` when given, else to stdout.
+    Exits with status 1 when no ``.autoresearch/`` directory, experiment or
+    domain is found.
+    """
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--summary", action="store_true")
-    parser.add_argument("--best", action="store_true")
-    parser.add_argument("--history", action="store_true")
-    parser.add_argument("--record", nargs=4, metavar=("COMMIT", "METRIC", "STATUS", "DESC"))
-    parser.add_argument("--path", default=".")
-    parser.add_argument("--metric", default="metric")
-    parser.add_argument("--direction", default="lower", choices=["lower", "higher"])
+    parser = argparse.ArgumentParser(description="autoresearch-agent results viewer")
+    parser.add_argument("--experiment", help="Show one experiment: domain/name")
+    parser.add_argument("--domain", help="Show all experiments in a domain")
+    parser.add_argument("--dashboard", action="store_true", help="Cross-experiment dashboard")
+    parser.add_argument("--format", choices=["terminal", "csv", "markdown"], default="terminal",
+                        help="Output format (default: terminal)")
+    parser.add_argument("--output", "-o", help="Write to file instead of stdout")
+    parser.add_argument("--all", action="store_true", help="Show all experiments (alias for --dashboard)")
    args = parser.parse_args()

-    path = Path(args.path).resolve()
+    root = find_autoresearch_root()
+    if root is None:
+        print("No .autoresearch/ found. Run setup_experiment.py first.")
+        sys.exit(1)

-    if args.record:
-        commit, metric, status, desc = args.record
-        tsv = path / "results.tsv"
-        if not tsv.exists():
-            tsv.write_text("commit\tmetric\tstatus\tdescription\n")
-        with open(tsv, "a") as f:
-            f.write(f"{commit}\t{metric}\t{status}\t{desc}\n")
-        print(f"✓ Logged: {commit} {metric} {status}")
-        return
+    output_text = None

-    results = load_results(path)
+    # Single experiment
+    if args.experiment:
+        experiment_dir = root / args.experiment
+        if not experiment_dir.exists():
+            print(f"Experiment not found: {args.experiment}")
+            sys.exit(1)

-    if args.history:
-        print_history(results)
-    elif args.best:
-        keeps = [r for r in results if r["status"] == "keep" and r["metric"]]
-        if not keeps:
-            print("No successful experiments yet.")
+        if args.format == "csv":
+            output_text = export_experiment_csv(experiment_dir, args.experiment)
+        elif args.format == "markdown":
+            output_text = export_experiment_markdown(experiment_dir, args.experiment)
+        else:
+            print_experiment(experiment_dir, args.experiment)
            return
-        best = min(keeps, key=lambda r: r["metric"]) if args.direction == "lower" else max(keeps, key=lambda r: r["metric"])
-        print(f"Best: {best['metric']:.6f} (commit: {best['commit']}) — {best['description']}")
+
+    # Domain
+    elif args.domain:
+        domain_dir = root / args.domain
+        if not domain_dir.exists():
+            print(f"Domain not found: {args.domain}")
+            sys.exit(1)
+        exp_dirs = [
+            d for d in sorted(domain_dir.iterdir())
+            if d.is_dir() and (d / "config.cfg").exists()
+        ]
+        if args.format == "terminal":
+            for exp_dir in exp_dirs:
+                print_experiment(exp_dir, f"{args.domain}/{exp_dir.name}")
+            return
+        if not exp_dirs:
+            print(f"No experiments found in domain: {args.domain}")
+            return
+        # The dashboard exporters take only the root and would emit every
+        # domain, so export each experiment of the requested domain instead,
+        # mirroring what the terminal format prints for --domain.
+        exporter = export_experiment_csv if args.format == "csv" else export_experiment_markdown
+        output_text = "\n".join(
+            exporter(exp_dir, f"{args.domain}/{exp_dir.name}") for exp_dir in exp_dirs
+        )
+
+    # Dashboard
+    elif args.dashboard or args.all:
+        if args.format == "csv":
+            output_text = export_dashboard_csv(root)
+        elif args.format == "markdown":
+            output_text = export_dashboard_markdown(root)
+        else:
+            print_dashboard(root)
+            return
+
    else:
-        print_summary(results, args.metric, args.direction)
+        # Default: dashboard
+        if args.format == "terminal":
+            print_dashboard(root)
+            return
+        output_text = export_dashboard_csv(root) if args.format == "csv" else export_dashboard_markdown(root)
+
+    # Write output
+    if output_text:
+        if args.output:
+            # The exports contain non-ASCII characters ("—", "→"); pin UTF-8 so
+            # writing does not depend on the platform's default encoding.
+            Path(args.output).write_text(output_text, encoding="utf-8")
+            print(f"Written to {args.output}")
+        else:
+            print(output_text)


 if __name__ == "__main__":