Inspired by Karpathy's autoresearch. The agent modifies a target file, runs a fixed evaluation, keeps improvements (git commit), discards failures (git reset), and loops indefinitely — no human in the loop.

Includes:
- SKILL.md with setup wizard, 4 domain configs, experiment loop protocol
- 3 stdlib-only Python scripts (setup, run, log — 687 lines)
- Reference docs: experiment domains guide, program.md templates

Domains: ML training (val_bpb), prompt engineering (eval_score), code performance (p50_ms), agent skill optimization (pass_rate).

Cherry-picked from feat/autoresearch-agent and rebased onto dev.

Fixes: timeout inconsistency (2x→2.5x), results.tsv tracking clarity, zero-metric edge case, installation section aligned with multi-tool support.
127 lines
4.5 KiB
Python
127 lines
4.5 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
autoresearch-agent: Results Logger
|
|
|
|
View and analyze experiment results from results.tsv.
|
|
|
|
Usage:
|
|
python scripts/log_results.py --summary # Print progress table
|
|
python scripts/log_results.py --best # Show best result
|
|
python scripts/log_results.py --history # Full experiment history
|
|
python scripts/log_results.py --record commit val status desc # Add entry manually
|
|
"""
|
|
|
|
import argparse
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
|
|
def load_results(path):
    """Read ``results.tsv`` under *path* and return its rows as dicts.

    The first line of the file is treated as a header and skipped. Rows with
    fewer than four tab-separated fields are ignored; fields beyond the
    fourth are dropped.

    Args:
        path: Directory that contains ``results.tsv``.

    Returns:
        A list of dicts with the keys ``commit``, ``metric``, ``status`` and
        ``description``. ``metric`` is a float, or ``None`` when the field is
        ``"N/A"`` or cannot be parsed as a number. An empty list is returned
        when the file does not exist.
    """
    results_file = Path(path) / "results.tsv"
    if not results_file.exists():
        return []

    rows = []
    # Everything after the first line is data; the header is discarded.
    for raw_line in results_file.read_text().splitlines()[1:]:
        fields = raw_line.split("\t")
        if len(fields) < 4:
            continue

        commit, metric_text, status, description = fields[:4]
        if metric_text == "N/A":
            value = None
        else:
            try:
                value = float(metric_text)
            except ValueError:
                # Unparseable metrics are kept as rows but carry no number.
                value = None

        rows.append(
            {
                "commit": commit,
                "metric": value,
                "status": status,
                "description": description,
            }
        )
    return rows
|
|
|
|
|
|
def print_summary(results, metric_name="metric", direction="lower"):
    """Print keep/discard/crash counts and the baseline-vs-best metric change.

    Args:
        results: List of row dicts with the keys ``commit``, ``metric``,
            ``status`` and ``description``.
        metric_name: Label printed above the metric statistics.
        direction: ``"lower"`` when smaller metric values are better; any
            other value is treated as higher-is-better.

    The baseline is the first kept run that has a numeric metric, and the
    best run is the kept run with the smallest (or largest) metric. The
    change is reported as a percentage of the baseline's magnitude, positive
    when the best run improves on the baseline. When the baseline is zero no
    percentage is defined, so ``n/a`` is printed instead of dividing by zero.
    """
    if not results:
        print("No experiments logged yet.")
        return

    total = len(results)
    rule = "─" * 60

    keeps = [r for r in results if r["status"] == "keep"]
    discards = [r for r in results if r["status"] == "discard"]
    crashes = [r for r in results if r["status"] == "crash"]

    print(f"\n{rule}")
    print(" autoresearch-agent — Results Summary")
    print(rule)
    print(f" Total experiments: {total}")
    print(f" ✅ Keep: {len(keeps):3d} ({len(keeps) / total * 100:.0f}%)")
    print(f" ❌ Discard: {len(discards):3d} ({len(discards) / total * 100:.0f}%)")
    print(f" 💥 Crash: {len(crashes):3d} ({len(crashes) / total * 100:.0f}%)")

    valid = [r for r in keeps if r["metric"] is not None]
    if valid:
        baseline = valid[0]["metric"]
        # Select the record itself rather than its value: searching for a row
        # whose metric equals the best value fails when that value is NaN.
        pick = min if direction == "lower" else max
        best_run = pick(valid, key=lambda r: r["metric"])
        best = best_run["metric"]
        # Positive gain always means "moved in the desired direction".
        gain = (baseline - best) if direction == "lower" else (best - baseline)

        print(f"\n {metric_name}:")
        print(f" Baseline: {baseline:.6f}")
        print(f" Best: {best:.6f} (commit: {best_run['commit']})")
        if baseline == 0:
            # A relative change against a zero baseline is undefined.
            print(" Change: n/a (baseline is 0)")
        else:
            # abs() keeps the sign meaningful for negative baselines.
            print(f" Change: {gain / abs(baseline) * 100:+.2f}%")

    print(f"{rule}\n")
|
|
|
|
|
|
def print_history(results):
    """Print every logged experiment as one row of a plain-text table.

    Args:
        results: List of row dicts with the keys ``commit``, ``metric``,
            ``status`` and ``description``.

    Rows without a numeric metric show ``crash`` in the metric column, the
    status is rendered as an icon (``?`` for unknown statuses), and the
    description is cut to its first 40 characters.
    """
    if not results:
        print("No experiments logged yet.")
        return

    icons = {"keep": "✅", "discard": "❌", "crash": "💥"}

    print(f"\n{'COMMIT':8} {'METRIC':10} {'STATUS':8} DESCRIPTION")
    print("─" * 60)
    for row in results:
        value = row["metric"]
        if value is None:
            metric_cell = "crash "
        else:
            metric_cell = f"{value:.6f}"
        icon = icons.get(row["status"], "?")
        summary = row["description"][:40]
        print(f"{row['commit']:8} {metric_cell:10} {icon} {summary}")
|
|
|
|
|
|
def main():
    """Command-line entry point: record one result or report on logged ones.

    Flags:
        --record COMMIT METRIC STATUS DESC: append a row to ``results.tsv``
            (creating the file with a header row first if it is missing),
            then exit.
        --history: print the full experiment table.
        --best: print the best kept run according to ``--direction``.
        --summary (or no flag): print the summary table.
        --path: directory containing ``results.tsv`` (default ``.``).
        --metric: label used for the metric in the summary.
        --direction: ``lower`` or ``higher``, i.e. which metric values are
            considered better.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--summary", action="store_true")
    parser.add_argument("--best", action="store_true")
    parser.add_argument("--history", action="store_true")
    parser.add_argument("--record", nargs=4, metavar=("COMMIT", "METRIC", "STATUS", "DESC"))
    parser.add_argument("--path", default=".")
    parser.add_argument("--metric", default="metric")
    parser.add_argument("--direction", default="lower", choices=["lower", "higher"])
    args = parser.parse_args()

    path = Path(args.path).resolve()

    if args.record:
        # A tab or line break inside a field would split the row into extra
        # columns or rows, so those characters are replaced by spaces.
        unsafe = str.maketrans({"\t": " ", "\r": " ", "\n": " "})
        commit, metric, status, desc = (field.translate(unsafe) for field in args.record)

        tsv = path / "results.tsv"
        if not tsv.exists():
            tsv.write_text("commit\tmetric\tstatus\tdescription\n")
        with open(tsv, "a") as f:
            f.write(f"{commit}\t{metric}\t{status}\t{desc}\n")
        print(f"✓ Logged: {commit} {metric} {status}")
        return

    results = load_results(path)

    if args.history:
        print_history(results)
    elif args.best:
        # Compare against None explicitly: a metric of 0.0 is a valid result
        # and must not be filtered out by a truthiness test.
        keeps = [
            r for r in results
            if r["status"] == "keep" and r["metric"] is not None
        ]
        if not keeps:
            print("No successful experiments yet.")
            return
        pick = min if args.direction == "lower" else max
        best = pick(keeps, key=lambda r: r["metric"])
        print(f"Best: {best['metric']:.6f} (commit: {best['commit']}) — {best['description']}")
    else:
        print_summary(results, args.metric, args.direction)
|
|
|
|
|
|
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|