refactor: autoresearch-agent v2.0 — multi-experiment, multi-domain, real-world evaluators
Major rewrite based on deep study of Karpathy's autoresearch repo.
Architecture changes:
- Multi-experiment support: .autoresearch/{domain}/{name}/ structure
- Domain categories: engineering, marketing, content, prompts, custom
- Project-level (git-tracked, shareable) or user-level (~/.autoresearch/) scope
- User chooses scope during setup, not installation
New evaluators (8 ready-to-use):
- Free: benchmark_speed, benchmark_size, test_pass_rate, build_speed, memory_usage
- LLM judge (uses existing subscription): llm_judge_content, llm_judge_prompt, llm_judge_copy
- LLM judges call user's CLI tool (claude/codex/gemini) — no extra API keys needed
Script improvements:
- setup_experiment.py: --domain, --scope, --evaluator, --list, --list-evaluators
- run_experiment.py: --experiment domain/name, --resume, --loop, --single
- log_results.py: --dashboard, --domain, --format csv|markdown|terminal, --output
Results export:
- Terminal (default), CSV, and Markdown formats
- Per-experiment, per-domain, or cross-experiment dashboard view
SKILL.md rewritten:
- Clear activation triggers (when the skill should activate)
- Practical examples for each domain
- Evaluator documentation with cost transparency
- Simplified loop protocol matching Karpathy's original philosophy
This commit is contained in:
@@ -1,125 +1,389 @@
|
||||
#!/usr/bin/env python3
"""
autoresearch-agent: Results Viewer

View experiment results in multiple formats: terminal, CSV, Markdown.
Supports single experiment, domain, or cross-experiment dashboard.

Usage:
    python scripts/log_results.py --experiment engineering/api-speed
    python scripts/log_results.py --domain engineering
    python scripts/log_results.py --dashboard
    python scripts/log_results.py --experiment engineering/api-speed --format csv --output results.csv
    python scripts/log_results.py --experiment engineering/api-speed --format markdown --output results.md
    python scripts/log_results.py --dashboard --format markdown --output dashboard.md
"""
|
||||
|
||||
import argparse
import csv
import io
import sys
import time
from pathlib import Path
|
||||
|
||||
|
||||
def find_autoresearch_root():
    """Locate the .autoresearch/ data root.

    Prefers a project-level ``.autoresearch/`` under the current working
    directory (git-tracked, shareable); falls back to the user-level
    ``~/.autoresearch/``.

    Returns:
        Path to the root directory, or None if neither location exists.
    """
    project_root = Path(".").resolve() / ".autoresearch"
    if project_root.exists():
        return project_root
    user_root = Path.home() / ".autoresearch"
    if user_root.exists():
        return user_root
    return None
|
||||
|
||||
|
||||
def load_config(experiment_dir):
    """Parse config.cfg into a dict of stripped key/value strings.

    Lines without a ``:`` separator are ignored; only the first ``:`` on a
    line splits key from value, so values may themselves contain colons.
    Returns an empty dict when the file does not exist.
    """
    config = {}
    cfg_file = experiment_dir / "config.cfg"
    if not cfg_file.exists():
        return config
    for raw_line in cfg_file.read_text().splitlines():
        if ":" not in raw_line:
            continue
        key, value = raw_line.split(":", 1)
        config[key.strip()] = value.strip()
    return config
|
||||
|
||||
|
||||
def load_results(experiment_dir):
    """Load results.tsv into a list of dicts.

    Each row becomes ``{"commit", "metric", "status", "description"}``.
    The metric is parsed as float; ``"N/A"`` or unparseable values map to
    None. The header row and rows with fewer than 4 tab-separated fields
    are skipped. Returns an empty list when the file does not exist.
    """
    tsv = experiment_dir / "results.tsv"
    if not tsv.exists():
        return []
    results = []
    for line in tsv.read_text().splitlines()[1:]:  # skip header
        parts = line.split("\t")
        if len(parts) >= 4:
            try:
                metric = float(parts[1]) if parts[1] != "N/A" else None
            except ValueError:
                metric = None
            results.append({
                "commit": parts[0],
                "metric": metric,
                "status": parts[2],
                "description": parts[3],
            })
    return results
|
||||
|
||||
|
||||
def compute_stats(results, direction):
    """Compute summary statistics from parsed results rows.

    Args:
        results: list of dicts as produced by load_results().
        direction: "lower" or "higher" — which way the metric improves.

    Returns:
        dict with keys: total, keeps, discards, crashes, baseline (metric of
        the first kept run, or None), best (best kept metric, or None), and
        pct_change (improvement percentage, None when not computable).
    """
    keeps = [r for r in results if r["status"] == "keep"]
    discards = [r for r in results if r["status"] == "discard"]
    crashes = [r for r in results if r["status"] == "crash"]

    valid_keeps = [r for r in keeps if r["metric"] is not None]
    baseline = valid_keeps[0]["metric"] if valid_keeps else None
    if valid_keeps:
        best = (min(r["metric"] for r in valid_keeps) if direction == "lower"
                else max(r["metric"] for r in valid_keeps))
    else:
        best = None

    pct_change = None
    # Explicit None checks: a best metric of exactly 0.0 is a valid (perfect)
    # score and must still yield a percentage; only a zero baseline makes the
    # ratio undefined.
    if baseline is not None and best is not None and baseline != 0:
        if direction == "lower":
            pct_change = (baseline - best) / baseline * 100
        else:
            pct_change = (best - baseline) / baseline * 100

    return {
        "total": len(results),
        "keeps": len(keeps),
        "discards": len(discards),
        "crashes": len(crashes),
        "baseline": baseline,
        "best": best,
        "pct_change": pct_change,
    }
|
||||
|
||||
|
||||
def print_experiment(experiment_dir, experiment_path):
    """Print single experiment results to the terminal.

    Args:
        experiment_dir: Path to the experiment directory.
        experiment_path: "domain/name" label shown in the header.
    """
    config = load_config(experiment_dir)
    results = load_results(experiment_dir)
    direction = config.get("metric_direction", "lower")
    metric_name = config.get("metric", "metric")

    if not results:
        print(f"No results for {experiment_path}")
        return

    stats = compute_stats(results, direction)

    print(f"\n{'─' * 65}")
    print(f" {experiment_path}")
    print(f" Target: {config.get('target', '?')} | Metric: {metric_name} ({direction})")
    print(f"{'─' * 65}")
    print(f" Total: {stats['total']} | Keep: {stats['keeps']} | Discard: {stats['discards']} | Crash: {stats['crashes']}")

    if stats["baseline"] is not None and stats["best"] is not None:
        pct = f" ({stats['pct_change']:+.1f}%)" if stats["pct_change"] is not None else ""
        print(f" Baseline: {stats['baseline']:.6f} -> Best: {stats['best']:.6f}{pct}")

    print(f"\n {'COMMIT':<10} {'METRIC':>12} {'STATUS':<10} DESCRIPTION")
    print(f" {'─' * 60}")
    for r in results:
        m = f"{r['metric']:.6f}" if r["metric"] is not None else "N/A"
        icon = {"keep": "+", "discard": "-", "crash": "!"}.get(r["status"], "?")
        print(f" {r['commit']:<10} {m:>12} {icon} {r['status']:<7} {r['description'][:35]}")
    print()
|
||||
|
||||
|
||||
def print_dashboard(root):
    """Print a cross-experiment dashboard table to the terminal.

    Walks ``root/{domain}/{experiment}/`` directories that contain a
    config.cfg, summarizes each, prints a table, and returns the collected
    per-experiment rows (also returned when empty, for callers that reuse
    the data).
    """
    experiments = []
    for domain_dir in sorted(root.iterdir()):
        if not domain_dir.is_dir() or domain_dir.name.startswith("."):
            continue
        for exp_dir in sorted(domain_dir.iterdir()):
            if not exp_dir.is_dir() or not (exp_dir / "config.cfg").exists():
                continue
            config = load_config(exp_dir)
            results = load_results(exp_dir)
            direction = config.get("metric_direction", "lower")
            stats = compute_stats(results, direction)

            # Infer activity from results.tsv mtime: <1h active, <24h paused,
            # otherwise done; idle when nothing has been logged.
            status = "idle"
            if stats["total"] > 0:
                tsv = exp_dir / "results.tsv"
                if tsv.exists():
                    age_hours = (time.time() - tsv.stat().st_mtime) / 3600
                    status = "active" if age_hours < 1 else "paused" if age_hours < 24 else "done"

            best_str = f"{stats['best']:.4f}" if stats["best"] is not None else "—"
            pct_str = f"{stats['pct_change']:+.1f}%" if stats["pct_change"] is not None else "—"

            experiments.append({
                "domain": domain_dir.name,
                "name": exp_dir.name,
                "runs": stats["total"],
                "kept": stats["keeps"],
                "best": best_str,
                "change": pct_str,
                "status": status,
                "metric": config.get("metric", "?"),
            })

    if not experiments:
        print("No experiments found.")
        return experiments

    print(f"\n{'─' * 90}")
    print(f" autoresearch — Dashboard")
    print(f"{'─' * 90}")
    print(f" {'DOMAIN':<15} {'EXPERIMENT':<20} {'RUNS':>5} {'KEPT':>5} {'BEST':>12} {'CHANGE':>10} {'STATUS':<8}")
    print(f" {'─' * 85}")
    for e in experiments:
        print(f" {e['domain']:<15} {e['name']:<20} {e['runs']:>5} {e['kept']:>5} {e['best']:>12} {e['change']:>10} {e['status']:<8}")
    print()
    return experiments
|
||||
|
||||
|
||||
# --- CSV Export ---
|
||||
|
||||
def export_experiment_csv(experiment_dir, experiment_path):
    """Export a single experiment as a CSV string.

    Metadata rows are prefixed with ``#`` so the data table below stays
    machine-parseable; a blank row separates metadata from data.
    """
    config = load_config(experiment_dir)
    results = load_results(experiment_dir)
    direction = config.get("metric_direction", "lower")
    stats = compute_stats(results, direction)

    buf = io.StringIO()
    writer = csv.writer(buf)

    # Header with metadata
    writer.writerow(["# Experiment", experiment_path])
    writer.writerow(["# Target", config.get("target", "")])
    writer.writerow(["# Metric", f"{config.get('metric', '')} ({direction} is better)"])
    if stats["baseline"] is not None:
        writer.writerow(["# Baseline", f"{stats['baseline']:.6f}"])
    if stats["best"] is not None:
        # Explicit None check: a legitimate 0.0% change is still reported.
        pct = f" ({stats['pct_change']:+.1f}%)" if stats["pct_change"] is not None else ""
        writer.writerow(["# Best", f"{stats['best']:.6f}{pct}"])
    writer.writerow(["# Total", stats["total"]])
    writer.writerow(["# Keep/Discard/Crash", f"{stats['keeps']}/{stats['discards']}/{stats['crashes']}"])
    writer.writerow([])

    writer.writerow(["Commit", "Metric", "Status", "Description"])
    for r in results:
        m = f"{r['metric']:.6f}" if r["metric"] is not None else "N/A"
        writer.writerow([r["commit"], m, r["status"], r["description"]])

    return buf.getvalue()
|
||||
|
||||
|
||||
def export_dashboard_csv(root):
    """Export the cross-experiment dashboard as a CSV string."""
    rows = []
    for domain_dir in sorted(root.iterdir()):
        if not domain_dir.is_dir() or domain_dir.name.startswith("."):
            continue
        for exp_dir in sorted(domain_dir.iterdir()):
            if not exp_dir.is_dir() or not (exp_dir / "config.cfg").exists():
                continue
            config = load_config(exp_dir)
            results = load_results(exp_dir)
            direction = config.get("metric_direction", "lower")
            stats = compute_stats(results, direction)
            # is-not-None checks so a legitimate 0.0 best / 0.0% change is
            # exported rather than blanked out.
            best_str = f"{stats['best']:.6f}" if stats["best"] is not None else ""
            pct_str = f"{stats['pct_change']:+.1f}%" if stats["pct_change"] is not None else ""
            rows.append([
                domain_dir.name, exp_dir.name, config.get("metric", ""),
                stats["total"], stats["keeps"], stats["discards"], stats["crashes"],
                best_str, pct_str,
            ])

    buf = io.StringIO()
    writer = csv.writer(buf)
    writer.writerow(["Domain", "Experiment", "Metric", "Runs", "Kept", "Discarded", "Crashed", "Best", "Change"])
    writer.writerows(rows)
    return buf.getvalue()
|
||||
|
||||
|
||||
# --- Markdown Export ---
|
||||
|
||||
def export_experiment_markdown(experiment_dir, experiment_path):
    """Export a single experiment as a Markdown string."""
    config = load_config(experiment_dir)
    results = load_results(experiment_dir)
    direction = config.get("metric_direction", "lower")
    metric_name = config.get("metric", "metric")
    stats = compute_stats(results, direction)

    lines = []
    lines.append(f"# Autoresearch: {experiment_path}\n")
    lines.append(f"**Target:** `{config.get('target', '?')}` ")
    lines.append(f"**Metric:** `{metric_name}` ({direction} is better) ")
    lines.append(f"**Experiments:** {stats['total']} total — {stats['keeps']} kept, {stats['discards']} discarded, {stats['crashes']} crashed\n")

    if stats["baseline"] is not None and stats["best"] is not None:
        # Explicit None check: render a legitimate 0.0% change instead of
        # silently dropping it.
        pct = f" ({stats['pct_change']:+.1f}%)" if stats["pct_change"] is not None else ""
        lines.append(f"**Progress:** `{stats['baseline']:.6f}` → `{stats['best']:.6f}`{pct}\n")

    lines.append("| Commit | Metric | Status | Description |")
    lines.append("|--------|--------|--------|-------------|")
    for r in results:
        m = f"`{r['metric']:.6f}`" if r["metric"] is not None else "N/A"
        lines.append(f"| `{r['commit']}` | {m} | {r['status']} | {r['description']} |")
    lines.append("")

    return "\n".join(lines)
|
||||
|
||||
|
||||
def export_dashboard_markdown(root):
    """Export the cross-experiment dashboard as a Markdown table string."""
    lines = []
    lines.append("# Autoresearch Dashboard\n")
    lines.append("| Domain | Experiment | Metric | Runs | Kept | Best | Change | Status |")
    lines.append("|--------|-----------|--------|------|------|------|--------|--------|")

    for domain_dir in sorted(root.iterdir()):
        if not domain_dir.is_dir() or domain_dir.name.startswith("."):
            continue
        for exp_dir in sorted(domain_dir.iterdir()):
            if not exp_dir.is_dir() or not (exp_dir / "config.cfg").exists():
                continue
            config = load_config(exp_dir)
            results = load_results(exp_dir)
            direction = config.get("metric_direction", "lower")
            stats = compute_stats(results, direction)
            # is-not-None so a legitimate 0.0 best / 0.0% change renders
            # instead of showing the "—" placeholder.
            best = f"`{stats['best']:.4f}`" if stats["best"] is not None else "—"
            pct = f"{stats['pct_change']:+.1f}%" if stats["pct_change"] is not None else "—"

            # Same activity heuristic as print_dashboard: results.tsv mtime.
            tsv = exp_dir / "results.tsv"
            status = "idle"
            if tsv.exists() and stats["total"] > 0:
                age_h = (time.time() - tsv.stat().st_mtime) / 3600
                status = "active" if age_h < 1 else "paused" if age_h < 24 else "done"

            lines.append(f"| {domain_dir.name} | {exp_dir.name} | {config.get('metric', '?')} | {stats['total']} | {stats['keeps']} | {best} | {pct} | {status} |")

    lines.append("")
    return "\n".join(lines)
|
||||
|
||||
|
||||
# --- Main ---
|
||||
|
||||
def main():
    """CLI entry point: parse arguments, locate the data root, dispatch.

    Dispatch order: --experiment, then --domain, then --dashboard/--all,
    else the default dashboard view. Terminal output prints directly;
    csv/markdown formats build a string written to --output or stdout.
    """
    parser = argparse.ArgumentParser(description="autoresearch-agent results viewer")
    parser.add_argument("--experiment", help="Show one experiment: domain/name")
    parser.add_argument("--domain", help="Show all experiments in a domain")
    parser.add_argument("--dashboard", action="store_true", help="Cross-experiment dashboard")
    parser.add_argument("--format", choices=["terminal", "csv", "markdown"], default="terminal",
                        help="Output format (default: terminal)")
    parser.add_argument("--output", "-o", help="Write to file instead of stdout")
    parser.add_argument("--all", action="store_true", help="Show all experiments (alias for --dashboard)")
    args = parser.parse_args()

    root = find_autoresearch_root()
    if root is None:
        print("No .autoresearch/ found. Run setup_experiment.py first.")
        sys.exit(1)

    output_text = None

    # Single experiment
    if args.experiment:
        experiment_dir = root / args.experiment
        if not experiment_dir.exists():
            print(f"Experiment not found: {args.experiment}")
            sys.exit(1)
        if args.format == "csv":
            output_text = export_experiment_csv(experiment_dir, args.experiment)
        elif args.format == "markdown":
            output_text = export_experiment_markdown(experiment_dir, args.experiment)
        else:
            print_experiment(experiment_dir, args.experiment)
            return

    # Domain
    elif args.domain:
        domain_dir = root / args.domain
        if not domain_dir.exists():
            print(f"Domain not found: {args.domain}")
            sys.exit(1)
        for exp_dir in sorted(domain_dir.iterdir()):
            if exp_dir.is_dir() and (exp_dir / "config.cfg").exists():
                if args.format == "terminal":
                    print_experiment(exp_dir, f"{args.domain}/{exp_dir.name}")
        if args.format != "terminal":
            # NOTE(review): this exports the FULL dashboard, not one filtered
            # to the requested domain — confirm whether filtering is intended.
            output_text = export_dashboard_csv(root) if args.format == "csv" else export_dashboard_markdown(root)
        else:
            return

    # Dashboard
    elif args.dashboard or args.all:
        if args.format == "csv":
            output_text = export_dashboard_csv(root)
        elif args.format == "markdown":
            output_text = export_dashboard_markdown(root)
        else:
            print_dashboard(root)
            return

    # Default: dashboard
    else:
        if args.format == "terminal":
            print_dashboard(root)
            return
        output_text = export_dashboard_csv(root) if args.format == "csv" else export_dashboard_markdown(root)

    # Write output
    if output_text:
        if args.output:
            Path(args.output).write_text(output_text)
            print(f"Written to {args.output}")
        else:
            print(output_text)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    main()