refactor: autoresearch-agent v2.0 — multi-experiment, multi-domain, real-world evaluators
Major rewrite based on deep study of Karpathy's autoresearch repo.
Architecture changes:
- Multi-experiment support: .autoresearch/{domain}/{name}/ structure
- Domain categories: engineering, marketing, content, prompts, custom
- Project-level (git-tracked, shareable) or user-level (~/.autoresearch/) scope
- User chooses scope during setup, not installation
New evaluators (8 ready-to-use):
- Free: benchmark_speed, benchmark_size, test_pass_rate, build_speed, memory_usage
- LLM judge (uses existing subscription): llm_judge_content, llm_judge_prompt, llm_judge_copy
- LLM judges call user's CLI tool (claude/codex/gemini) — no extra API keys needed
Script improvements:
- setup_experiment.py: --domain, --scope, --evaluator, --list, --list-evaluators
- run_experiment.py: --experiment domain/name, --resume, --loop, --single
- log_results.py: --dashboard, --domain, --format csv|markdown|terminal, --output
Results export:
- Terminal (default), CSV, and Markdown formats
- Per-experiment, per-domain, or cross-experiment dashboard view
SKILL.md rewritten:
- Clear activation triggers (when the skill should activate)
- Practical examples for each domain
- Evaluator documentation with cost transparency
- Simplified loop protocol matching Karpathy's original philosophy
This commit is contained in:
@@ -2,17 +2,15 @@
|
||||
"""
|
||||
autoresearch-agent: Experiment Runner
|
||||
|
||||
Executes the autonomous experiment loop:
|
||||
- Reads .autoresearch.cfg for project config
|
||||
- Runs the target evaluation
|
||||
- Keeps improvements (git commit) or discards failures (git reset)
|
||||
- Logs everything to results.tsv
|
||||
- Loops indefinitely until interrupted
|
||||
Executes the autonomous experiment loop for a specific experiment.
|
||||
Reads config from .autoresearch/{domain}/{name}/config.cfg.
|
||||
|
||||
Usage:
|
||||
python scripts/run_experiment.py --loop # Run forever
|
||||
python scripts/run_experiment.py --single # Run one experiment
|
||||
python scripts/run_experiment.py --dry-run # Show what would happen
|
||||
python scripts/run_experiment.py --experiment engineering/api-speed --loop
|
||||
python scripts/run_experiment.py --experiment engineering/api-speed --single
|
||||
python scripts/run_experiment.py --experiment marketing/medium-ctr --loop
|
||||
python scripts/run_experiment.py --resume --loop
|
||||
python scripts/run_experiment.py --experiment engineering/api-speed --dry-run
|
||||
"""
|
||||
|
||||
import argparse
|
||||
@@ -25,11 +23,22 @@ from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def load_config(path):
|
||||
"""Load .autoresearch.cfg"""
|
||||
cfg_file = Path(path) / ".autoresearch.cfg"
|
||||
def find_autoresearch_root(project_dir="."):
    """Locate the ``.autoresearch`` directory that holds experiment data.

    The project-level directory (``<project_dir>/.autoresearch``) takes
    precedence over the user-level one (``~/.autoresearch``).

    Args:
        project_dir: Directory searched for the project-level store.
            Defaults to the current working directory, which is what
            callers passing no argument get.

    Returns:
        The ``Path`` of the first candidate that is a directory, or ``None``
        when neither candidate is one.
    """
    # is_dir() rather than exists(): a stray regular file with this name must
    # not be returned as a root that callers then try to iterate.
    project_level = Path(project_dir).resolve() / ".autoresearch"
    if project_level.is_dir():
        return project_level

    user_level = Path.home() / ".autoresearch"
    if user_level.is_dir():
        return user_level

    return None
|
||||
|
||||
|
||||
def load_config(experiment_dir):
|
||||
"""Load config.cfg from experiment directory."""
|
||||
cfg_file = experiment_dir / "config.cfg"
|
||||
if not cfg_file.exists():
|
||||
print("✗ No .autoresearch.cfg found. Run setup_experiment.py first.")
|
||||
print(f" Error: no config.cfg in {experiment_dir}")
|
||||
sys.exit(1)
|
||||
config = {}
|
||||
for line in cfg_file.read_text().splitlines():
|
||||
@@ -49,239 +58,293 @@ def run_cmd(cmd, cwd=None, timeout=None):
|
||||
|
||||
|
||||
def get_current_commit(path):
    """Return the abbreviated hash of HEAD for the repository at *path*."""
    # Only the middle element of run_cmd's three-part result is needed here.
    result = run_cmd("git rev-parse --short HEAD", cwd=path)
    short_hash = result[1]
    return short_hash
|
||||
|
||||
|
||||
def get_best_metric(experiment_dir, direction):
    """Return the best metric among the kept rows of ``results.tsv``.

    Rows are tab-separated as ``commit, metric, status, description`` (the
    layout written by ``log_result``) and the first line of the file is a
    header.

    Args:
        experiment_dir: Directory containing ``results.tsv``.
        direction: ``"lower"`` picks the minimum; any other value picks the
            maximum.

    Returns:
        The best metric as a float, or ``None`` when the file is missing or
        holds no kept row with a usable number.
    """
    import math

    tsv = Path(experiment_dir) / "results.tsv"
    if not tsv.exists():
        return None

    metrics = []
    # [1:] skips the header row.
    for row in tsv.read_text(errors="replace").splitlines()[1:]:
        parts = row.split("\t")
        # Match on the status column itself so text elsewhere in the row
        # can never make a row look "kept".
        if len(parts) < 4 or parts[2] != "keep":
            continue
        try:
            value = float(parts[1])
        except ValueError:
            # "N/A" and other non-numeric placeholders are simply ignored.
            continue
        # A nan/inf entry would poison min()/max() and freeze the loop,
        # because no later result could ever compare as better.
        if math.isfinite(value):
            metrics.append(value)

    if not metrics:
        return None
    return min(metrics) if direction == "lower" else max(metrics)
|
||||
|
||||
|
||||
def run_evaluation(project_root, eval_cmd, time_budget_minutes, log_file):
    """Run the evaluation command under a hard time limit.

    Standard output and standard error of the command are redirected to
    *log_file*.

    Args:
        project_root: Directory the command is run from.
        eval_cmd: Command line to execute; passed through verbatim.
        time_budget_minutes: Nominal budget; the hard timeout handed to
            ``run_cmd`` is 2.5 times this value, in seconds.
        log_file: Path that receives the combined output.

    Returns:
        ``(exit_code, elapsed_seconds)``. ``exit_code`` is ``-1`` when the
        command hit the hard timeout.
    """
    import shlex

    hard_limit = time_budget_minutes * 60 * 2.5

    # Only the log path is quoted: it is a plain file name and must survive
    # spaces or shell metacharacters, whereas eval_cmd is intentionally shell
    # text. NOTE(review): assumes run_cmd executes through a POSIX shell.
    shell_line = f"{eval_cmd} > {shlex.quote(str(log_file))} 2>&1"

    t0 = time.time()
    try:
        code, _, _ = run_cmd(shell_line, cwd=str(project_root), timeout=hard_limit)
    except subprocess.TimeoutExpired:
        return -1, time.time() - t0
    return code, time.time() - t0
|
||||
|
||||
|
||||
def extract_metric(log_file, metric_grep):
    """Extract the most recent metric value from an evaluation log.

    The log is scanned from the last line backwards. The first line that
    starts with the metric prefix and whose text after the final ``:``
    parses as a float gives the result.

    Args:
        log_file: Path of the log to read.
        metric_grep: Line prefix identifying the metric. Leading ``^``
            characters are ignored, so a grep-style ``"^metric:"`` works.

    Returns:
        The metric as a float, or ``None`` when the log is missing or no
        matching line holds a number.
    """
    log_path = Path(log_file)
    if not log_path.exists():
        return None

    prefix = metric_grep.lstrip("^")
    # Evaluator output is arbitrary bytes; decode leniently so one invalid
    # byte cannot abort metric extraction.
    text = log_path.read_text(encoding="utf-8", errors="replace")
    for line in reversed(text.splitlines()):
        stripped = line.strip()
        if not stripped.startswith(prefix):
            continue
        try:
            return float(stripped.split(":")[-1].strip())
        except ValueError:
            # Matching prefix but no number: keep looking at earlier lines.
            continue
    return None
|
||||
|
||||
|
||||
def is_improvement(new_val, old_val, direction):
    """Decide whether *new_val* beats *old_val*.

    Args:
        new_val: Metric of the run just evaluated.
        old_val: Best metric so far, or ``None`` when there is none yet.
        direction: ``"lower"`` means smaller is better; any other value
            means larger is better.

    Returns:
        ``True`` when the new value is strictly better, or when there is no
        usable previous value to compare against. A NaN new value is never
        an improvement.
    """
    import math

    # NaN compares False against everything; accepting it as a baseline
    # would make every later run look like "no improvement".
    if math.isnan(new_val):
        return False
    if old_val is None or math.isnan(old_val):
        return True
    if direction == "lower":
        return new_val < old_val
    return new_val > old_val
|
||||
|
||||
|
||||
def log_result(experiment_dir, commit, metric_val, status, description):
    """Append one result row to ``results.tsv`` in *experiment_dir*.

    The row is ``commit<TAB>metric<TAB>status<TAB>description``.

    Args:
        experiment_dir: Directory containing ``results.tsv``.
        commit: Commit identifier for the run.
        metric_val: Metric value, written with six decimals; ``None`` is
            written as ``N/A``.
        status: Outcome label for the run.
        description: Free-form note for the row.
    """
    tsv = Path(experiment_dir) / "results.tsv"
    metric_str = f"{metric_val:.6f}" if metric_val is not None else "N/A"

    # A tab or line break inside a field would split the row and corrupt
    # every reader that does line.split("\t"); flatten them to spaces.
    flatten = {ord("\t"): " ", ord("\n"): " ", ord("\r"): " "}
    fields = [str(v).translate(flatten) for v in (commit, metric_str, status, description)]

    with open(tsv, "a", encoding="utf-8") as f:
        f.write("\t".join(fields) + "\n")
|
||||
|
||||
|
||||
def get_experiment_count(experiment_dir):
    """Count the experiments recorded in ``results.tsv``.

    Args:
        experiment_dir: Directory containing ``results.tsv``.

    Returns:
        Number of non-blank lines after the header line; ``0`` when the
        file is missing or holds at most the header.
    """
    tsv = Path(experiment_dir) / "results.tsv"
    if not tsv.exists():
        return 0
    lines = tsv.read_text(errors="replace").splitlines()
    # lines[0] is the header; blank lines are not experiments.
    recorded = [line for line in lines[1:] if line.strip()]
    return len(recorded)
|
||||
|
||||
|
||||
def get_last_active(root):
    """Find the experiment with the most recent activity under *root*.

    The layout is ``root/<domain>/<name>/``. A directory counts as an
    experiment only when it contains ``config.cfg``. Domain directories
    whose name starts with ``.`` are ignored.

    Activity is the newest modification time among ``config.cfg``,
    ``results.tsv`` and ``run.log``. The config file alone is not enough
    because running an experiment appends to the results and log files
    without touching the config.

    Args:
        root: ``Path`` of the ``.autoresearch`` directory.

    Returns:
        ``"<domain>/<name>"`` of the most recently active experiment, or
        ``None`` when no experiment exists.
    """
    activity_files = ("config.cfg", "results.tsv", "run.log")
    latest = None
    latest_time = None

    # sorted() makes the result deterministic when timestamps tie.
    for domain_dir in sorted(root.iterdir()):
        if not domain_dir.is_dir() or domain_dir.name.startswith("."):
            continue
        for exp_dir in sorted(domain_dir.iterdir()):
            if not exp_dir.is_dir() or not (exp_dir / "config.cfg").exists():
                continue
            stamps = [
                (exp_dir / name).stat().st_mtime
                for name in activity_files
                if (exp_dir / name).exists()
            ]
            newest = max(stamps)
            if latest_time is None or newest > latest_time:
                latest_time = newest
                latest = f"{domain_dir.name}/{exp_dir.name}"
    return latest
|
||||
|
||||
|
||||
def _read_log_tail(log_file, count=5):
    """Return the last *count* lines of *log_file*, or "" if unreadable."""
    try:
        text = Path(log_file).read_text(encoding="utf-8", errors="replace")
    except OSError:
        return ""
    return "\n".join(text.splitlines()[-count:]).strip()


def run_single(project_root, experiment_dir, config, exp_num, dry_run=False):
    """Run one experiment iteration and record its outcome.

    The evaluation command from *config* is executed in *project_root* with
    its output written to ``run.log`` inside *experiment_dir*. The extracted
    metric is compared with the best kept metric so far. A row is appended
    to ``results.tsv`` for every outcome except ``"dry_run"`` and
    ``"error"``.

    Args:
        project_root: Git working tree in which the evaluation runs.
        experiment_dir: ``Path`` holding this experiment's ``results.tsv``
            and ``run.log``.
        config: Mapping read from the experiment's config file. Keys used:
            ``metric_direction``, ``metric_grep``, ``evaluate_cmd``,
            ``time_budget_minutes``, ``metric``.
        exp_num: Sequence number shown in the progress output.
        dry_run: When true, print the plan and return without evaluating.

    Returns:
        One of ``"dry_run"``, ``"error"``, ``"crash"``, ``"keep"`` or
        ``"discard"``.
    """
    direction = config.get("metric_direction", "lower")
    metric_grep = config.get("metric_grep", "^metric:")
    eval_cmd = config.get("evaluate_cmd", "python evaluate.py")
    time_budget = int(config.get("time_budget_minutes", 5))
    metric_name = config.get("metric", "metric")
    log_file = str(experiment_dir / "run.log")
    repo = str(project_root)

    best = get_best_metric(experiment_dir, direction)
    ts = datetime.now().strftime("%H:%M:%S")

    print(f"\n[{ts}] Experiment #{exp_num}")
    print(f" Best {metric_name}: {best}")

    if dry_run:
        print(" [DRY RUN] Would run evaluation and check metric")
        return "dry_run"

    # Remember where HEAD is so a failed experiment can be rolled back.
    # NOTE(review): if the change under test was already committed before
    # this call, resetting to this hash does not undo that commit — confirm
    # the intended agent workflow.
    code, pre_commit, _ = run_cmd("git rev-parse HEAD", cwd=repo)
    if code != 0:
        print(" Error: can't get git state")
        return "error"

    print(f" Running: {eval_cmd} (budget: {time_budget}m)")
    ret_code, elapsed = run_evaluation(project_root, eval_cmd, time_budget, log_file)

    # Captured before any rollback so the log row names the evaluated commit.
    commit = get_current_commit(repo)

    # Timeout (run_evaluation reports it as -1).
    if ret_code == -1:
        print(f" TIMEOUT after {elapsed:.0f}s — discarding")
        run_cmd("git checkout -- .", cwd=repo)
        run_cmd(f"git reset --hard {pre_commit}", cwd=repo)
        log_result(experiment_dir, commit, None, "crash", f"timeout_{elapsed:.0f}s")
        return "crash"

    # Crash: the evaluator exited with a non-zero status.
    if ret_code != 0:
        # Read the tail in-process instead of shelling out to `tail` with an
        # unquoted path, which breaks on paths containing spaces.
        tail = _read_log_tail(log_file)
        print(f" CRASH (exit {ret_code}) after {elapsed:.0f}s")
        print(f" Last output: {tail[:200]}")
        run_cmd(f"git reset --hard {pre_commit}", cwd=repo)
        log_result(experiment_dir, commit, None, "crash", f"exit_{ret_code}")
        return "crash"

    # A clean exit without a parseable metric is treated like a crash.
    metric_val = extract_metric(log_file, metric_grep)
    if metric_val is None:
        print(f" Could not parse {metric_name} from run.log")
        run_cmd(f"git reset --hard {pre_commit}", cwd=repo)
        log_result(experiment_dir, commit, None, "crash", "metric_parse_failed")
        return "crash"

    delta = ""
    if best is not None:
        diff = metric_val - best
        delta = f" (delta {diff:+.4f})"

    print(f" {metric_name}: {metric_val:.6f}{delta} in {elapsed:.0f}s")

    # Keep or discard.
    if is_improvement(metric_val, best, direction):
        print(f" KEEP — improvement")
        log_result(experiment_dir, commit, metric_val, "keep",
                   f"improved_{metric_name}_{metric_val:.4f}")
        return "keep"

    print(f" DISCARD — no improvement")
    run_cmd(f"git reset --hard {pre_commit}", cwd=repo)
    # `is not None`, not truthiness: a best metric of exactly 0.0 is a real
    # value and must not be logged as "?".
    best_str = f"{best:.4f}" if best is not None else "?"
    log_result(experiment_dir, commit, metric_val, "discard",
               f"no_improvement_{metric_val:.4f}_vs_{best_str}")
    return "discard"
|
||||
|
||||
|
||||
def print_summary(experiment_dir, config):
    """Print a summary of the results recorded for an experiment.

    Reads ``results.tsv`` from *experiment_dir* (first line is a header),
    prints the total number of rows and how many were kept, discarded or
    crashed. When at least two kept rows carry a numeric metric, it also
    prints the progress from the first kept metric to the best one.

    Args:
        experiment_dir: ``Path`` containing ``results.tsv``.
        config: Mapping with optional ``metric`` (display name) and
            ``metric_direction`` (``"lower"`` means smaller is better).

    Returns:
        ``None``. Nothing is printed when the file is missing or has no
        data rows.
    """
    tsv = Path(experiment_dir) / "results.tsv"
    if not tsv.exists():
        return
    lines = tsv.read_text(errors="replace").splitlines()[1:]
    if not lines:
        return

    keeps = [l for l in lines if "\tkeep\t" in l]
    discards = [l for l in lines if "\tdiscard\t" in l]
    crashes = [l for l in lines if "\tcrash\t" in l]
    metric_name = config.get("metric", "metric")
    direction = config.get("metric_direction", "lower")

    print(f"\n{'=' * 55}")
    print(f" autoresearch — Session Summary")
    print(f" Experiments: {len(lines)} total")
    print(f" Keep: {len(keeps)} | Discard: {len(discards)} | Crash: {len(crashes)}")

    # Collect the numeric metrics of kept rows. A malformed row is skipped
    # on its own rather than suppressing the whole progress line.
    valid = []
    for row in keeps:
        parts = row.split("\t")
        try:
            if parts[1] != "N/A":
                valid.append(float(parts[1]))
        except (ValueError, IndexError):
            continue

    if len(valid) >= 2:
        first = valid[0]
        best = min(valid) if direction == "lower" else max(valid)
        if first == 0:
            # A relative change from a zero baseline is undefined; report
            # that instead of dividing by zero while printing the summary.
            pct_str = "n/a"
        else:
            gain = (first - best) if direction == "lower" else (best - first)
            # abs() keeps the sign meaning "improvement is positive" even
            # when the baseline metric itself is negative.
            pct_str = f"{gain / abs(first) * 100:+.1f}%"
        print(f" {metric_name}: {first:.6f} -> {best:.6f} ({pct_str})")

    print(f"{'=' * 55}\n")
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="autoresearch-agent runner")
|
||||
parser.add_argument("--experiment", help="Experiment path: domain/name (e.g. engineering/api-speed)")
|
||||
parser.add_argument("--resume", action="store_true", help="Resume last active experiment")
|
||||
parser.add_argument("--loop", action="store_true", help="Run forever")
|
||||
parser.add_argument("--single", action="store_true", help="Run one experiment")
|
||||
parser.add_argument("--dry-run", action="store_true", help="Dry run only")
|
||||
parser.add_argument("--dry-run", action="store_true", help="Show what would happen")
|
||||
parser.add_argument("--max-experiments", type=int, default=0, help="Max experiments (0 = unlimited)")
|
||||
parser.add_argument("--path", default=".", help="Project root")
|
||||
parser.add_argument("--max-experiments", type=int, default=0,
|
||||
help="Max experiments (0 = unlimited)")
|
||||
args = parser.parse_args()
|
||||
|
||||
path = Path(args.path).resolve()
|
||||
config = load_config(path)
|
||||
project_root = Path(args.path).resolve()
|
||||
root = find_autoresearch_root()
|
||||
|
||||
print(f"\n🔬 autoresearch-agent")
|
||||
print(f" Project: {path}")
|
||||
print(f" Target: {config.get('target', '?')}")
|
||||
print(f" Metric: {config.get('metric', '?')} ({config.get('metric_direction', '?')} is better)")
|
||||
print(f" Budget: {config.get('time_budget_minutes', '?')} min/experiment")
|
||||
print(f" Mode: {'loop' if args.loop else 'single'}")
|
||||
if root is None:
|
||||
print("No .autoresearch/ found. Run setup_experiment.py first.")
|
||||
sys.exit(1)
|
||||
|
||||
if args.single:
|
||||
exp_num = get_experiment_count(path) + 1
|
||||
run_single_experiment(path, config, exp_num, args.dry_run)
|
||||
# Resolve experiment
|
||||
experiment_path = args.experiment
|
||||
if args.resume:
|
||||
experiment_path = get_last_active(root)
|
||||
if not experiment_path:
|
||||
print("No experiments found to resume.")
|
||||
sys.exit(1)
|
||||
print(f"Resuming: {experiment_path}")
|
||||
|
||||
if not experiment_path:
|
||||
print("Specify --experiment domain/name or --resume")
|
||||
sys.exit(1)
|
||||
|
||||
experiment_dir = root / experiment_path
|
||||
if not experiment_dir.exists():
|
||||
print(f"Experiment not found: {experiment_dir}")
|
||||
print("Run: python scripts/setup_experiment.py --list")
|
||||
sys.exit(1)
|
||||
|
||||
config = load_config(experiment_dir)
|
||||
|
||||
domain, name = experiment_path.split("/", 1)
|
||||
print(f"\n autoresearch-agent")
|
||||
print(f" Experiment: {experiment_path}")
|
||||
print(f" Target: {config.get('target', '?')}")
|
||||
print(f" Metric: {config.get('metric', '?')} ({config.get('metric_direction', '?')} is better)")
|
||||
print(f" Budget: {config.get('time_budget_minutes', '?')} min/experiment")
|
||||
print(f" Mode: {'loop' if args.loop else 'single'}")
|
||||
|
||||
if args.single or args.dry_run:
|
||||
exp_num = get_experiment_count(experiment_dir) + 1
|
||||
run_single(project_root, experiment_dir, config, exp_num, args.dry_run)
|
||||
return
|
||||
|
||||
if not args.loop and not args.dry_run:
|
||||
if not args.loop:
|
||||
print("\nSpecify --loop (forever) or --single (one experiment)")
|
||||
sys.exit(1)
|
||||
|
||||
# Setup graceful shutdown
|
||||
# Graceful shutdown
|
||||
def handle_interrupt(sig, frame):
|
||||
print_summary(path)
|
||||
print("\n⏹ Stopped by user.")
|
||||
print_summary(experiment_dir, config)
|
||||
print("\nStopped by user.")
|
||||
sys.exit(0)
|
||||
|
||||
signal.signal(signal.SIGINT, handle_interrupt)
|
||||
signal.signal(signal.SIGTERM, handle_interrupt)
|
||||
|
||||
# Main loop
|
||||
consecutive_crashes = 0
|
||||
exp_num = get_experiment_count(path) + 1
|
||||
exp_num = get_experiment_count(experiment_dir) + 1
|
||||
|
||||
print(f"\nStarting loop. Ctrl+C to stop and print summary.\n")
|
||||
print(f"\nStarting loop. Ctrl+C to stop.\n")
|
||||
|
||||
while True:
|
||||
result = run_single_experiment(path, config, exp_num, args.dry_run)
|
||||
result = run_single(project_root, experiment_dir, config, exp_num, False)
|
||||
exp_num += 1
|
||||
|
||||
if result == "crash":
|
||||
@@ -289,21 +352,16 @@ def main():
|
||||
else:
|
||||
consecutive_crashes = 0
|
||||
|
||||
# Bail if 5 consecutive crashes
|
||||
if consecutive_crashes >= 5:
|
||||
print("\n⚠ 5 consecutive crashes. Pausing for investigation.")
|
||||
print(" Check run.log for the last error.")
|
||||
print("\n 5 consecutive crashes. Pausing.")
|
||||
print(" Check .autoresearch/{}/run.log".format(experiment_path))
|
||||
break
|
||||
|
||||
# Check max experiments
|
||||
if args.max_experiments > 0 and exp_num > args.max_experiments:
|
||||
print(f"\n✓ Reached max experiments ({args.max_experiments})")
|
||||
if 0 < args.max_experiments < exp_num:
|
||||
print(f"\n Reached max experiments ({args.max_experiments})")
|
||||
break
|
||||
|
||||
if args.single:
|
||||
break
|
||||
|
||||
print_summary(path)
|
||||
print_summary(experiment_dir, config)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
Reference in New Issue
Block a user