refactor: autoresearch-agent v2.0 — multi-experiment, multi-domain, real-world evaluators

Major rewrite based on deep study of Karpathy's autoresearch repo.

Architecture changes:
- Multi-experiment support: .autoresearch/{domain}/{name}/ structure
- Domain categories: engineering, marketing, content, prompts, custom
- Project-level (git-tracked, shareable) or user-level (~/.autoresearch/) scope
- User chooses scope during setup, not installation

New evaluators (8 ready-to-use):
- Free: benchmark_speed, benchmark_size, test_pass_rate, build_speed, memory_usage
- LLM judge (uses existing subscription): llm_judge_content, llm_judge_prompt, llm_judge_copy
- LLM judges call user's CLI tool (claude/codex/gemini) — no extra API keys needed

Script improvements:
- setup_experiment.py: --domain, --scope, --evaluator, --list, --list-evaluators
- run_experiment.py: --experiment domain/name, --resume, --loop, --single
- log_results.py: --dashboard, --domain, --format csv|markdown|terminal, --output

Results export:
- Terminal (default), CSV, and Markdown formats
- Per-experiment, per-domain, or cross-experiment dashboard view

SKILL.md rewritten:
- Clear activation triggers (when the skill should activate)
- Practical examples for each domain
- Evaluator documentation with cost transparency
- Simplified loop protocol matching Karpathy's original philosophy
This commit is contained in:
Leo
2026-03-13 08:22:14 +01:00
parent c834d71a44
commit 12591282da
13 changed files with 1744 additions and 702 deletions

View File

@@ -1,125 +1,389 @@
#!/usr/bin/env python3
"""
autoresearch-agent: Results Logger
autoresearch-agent: Results Viewer
View and analyze experiment results from results.tsv.
View experiment results in multiple formats: terminal, CSV, Markdown.
Supports single experiment, domain, or cross-experiment dashboard.
Usage:
python scripts/log_results.py --summary # Print progress table
python scripts/log_results.py --best # Show best result
python scripts/log_results.py --history # Full experiment history
python scripts/log_results.py --record commit val status desc # Add entry manually
python scripts/log_results.py --experiment engineering/api-speed
python scripts/log_results.py --domain engineering
python scripts/log_results.py --dashboard
python scripts/log_results.py --experiment engineering/api-speed --format csv --output results.csv
python scripts/log_results.py --experiment engineering/api-speed --format markdown --output results.md
python scripts/log_results.py --dashboard --format markdown --output dashboard.md
"""
import argparse
import csv
import io
import sys
from pathlib import Path
def load_results(path):
tsv = Path(path) / "results.tsv"
def find_autoresearch_root():
    """Locate the .autoresearch/ directory.

    The project-level directory (next to the current working directory)
    takes precedence over the user-level one in the home directory.
    Returns None when neither exists.
    """
    for candidate in (Path(".").resolve() / ".autoresearch",
                      Path.home() / ".autoresearch"):
        if candidate.exists():
            return candidate
    return None
def load_config(experiment_dir):
    """Load config.cfg from *experiment_dir* into a flat dict.

    Each line of the form ``key: value`` becomes an entry; whitespace
    around key and value is stripped. Lines starting with ``#`` are
    ignored as comments (previously a commented-out ``# key: value``
    line would be parsed into a bogus ``# key`` entry). Returns an
    empty dict when the file does not exist.
    """
    cfg_file = experiment_dir / "config.cfg"
    config = {}
    if cfg_file.exists():
        for line in cfg_file.read_text().splitlines():
            stripped = line.strip()
            if stripped.startswith("#"):
                continue  # allow comments in config.cfg
            if ":" in stripped:
                k, v = stripped.split(":", 1)
                config[k.strip()] = v.strip()
    return config
def load_results(experiment_dir):
    """Load results.tsv into a list of dicts.

    Each row becomes {"commit", "metric", "status", "description"}.
    A metric of "N/A" (or any unparsable float) becomes None. Rows with
    fewer than 4 tab-separated columns are skipped. Returns [] when
    results.tsv does not exist.

    (This span previously contained interleaved pre-/post-refactor diff
    lines; resolved to the post-refactor implementation.)
    """
    tsv = experiment_dir / "results.tsv"
    if not tsv.exists():
        return []
    results = []
    for line in tsv.read_text().splitlines()[1:]:  # skip header row
        parts = line.split("\t")
        if len(parts) >= 4:
            try:
                metric = float(parts[1]) if parts[1] != "N/A" else None
            except ValueError:
                metric = None
            results.append({
                "commit": parts[0],
                "metric": metric,
                "status": parts[2],
                "description": parts[3],
            })
    return results
def compute_stats(results, direction):
    """Compute summary statistics from a list of result dicts.

    Returns a dict with per-status counts, the baseline (first kept
    metric), the best kept metric for *direction* ("lower" or "higher"),
    and the percent change from baseline to best. ``pct_change`` is None
    when there is no baseline/best or the baseline is zero.

    (This span previously interleaved the removed print_summary with the
    new compute_stats; resolved to the post-refactor implementation.)
    """
    keeps = [r for r in results if r["status"] == "keep"]
    discards = [r for r in results if r["status"] == "discard"]
    crashes = [r for r in results if r["status"] == "crash"]
    valid_keeps = [r for r in keeps if r["metric"] is not None]
    baseline = valid_keeps[0]["metric"] if valid_keeps else None
    if valid_keeps:
        best = min(r["metric"] for r in valid_keeps) if direction == "lower" else max(r["metric"] for r in valid_keeps)
    else:
        best = None
    pct_change = None
    # Compare against None explicitly: a metric value of 0.0 is valid and
    # must not suppress the percent-change computation (only a zero
    # baseline does, to avoid division by zero).
    if baseline is not None and best is not None and baseline != 0:
        if direction == "lower":
            pct_change = (baseline - best) / baseline * 100
        else:
            pct_change = (best - baseline) / baseline * 100
    return {
        "total": len(results),
        "keeps": len(keeps),
        "discards": len(discards),
        "crashes": len(crashes),
        "baseline": baseline,
        "best": best,
        "pct_change": pct_change,
    }
# --- Terminal Output ---
def print_experiment(experiment_dir, experiment_path):
    """Print single experiment results to terminal.

    Loads config and results for the experiment, prints a header with
    target/metric info, summary counts, baseline->best progress, then a
    per-run table. Prints a "no results" message when results.tsv is
    empty or missing.

    (This span previously interleaved the removed print_history with the
    new print_experiment; resolved to the post-refactor implementation.)
    """
    config = load_config(experiment_dir)
    results = load_results(experiment_dir)
    direction = config.get("metric_direction", "lower")
    metric_name = config.get("metric", "metric")
    if not results:
        print(f"No results for {experiment_path}")
        return
    stats = compute_stats(results, direction)
    # NOTE(review): the separator glyph inside {'' * 65} appears to have
    # been lost in the page rendering (likely a box-drawing char) — confirm
    # against the original file.
    print(f"\n{'' * 65}")
    print(f" {experiment_path}")
    print(f" Target: {config.get('target', '?')} | Metric: {metric_name} ({direction})")
    print(f"{'' * 65}")
    print(f" Total: {stats['total']} | Keep: {stats['keeps']} | Discard: {stats['discards']} | Crash: {stats['crashes']}")
    if stats["baseline"] is not None and stats["best"] is not None:
        pct = f" ({stats['pct_change']:+.1f}%)" if stats["pct_change"] is not None else ""
        print(f" Baseline: {stats['baseline']:.6f} -> Best: {stats['best']:.6f}{pct}")
    print(f"\n {'COMMIT':<10} {'METRIC':>12} {'STATUS':<10} DESCRIPTION")
    print(f" {'' * 60}")
    for r in results:
        m = f"{r['metric']:.6f}" if r["metric"] is not None else "N/A "
        icon = {"keep": "+", "discard": "-", "crash": "!"}.get(r["status"], "?")
        print(f" {r['commit']:<10} {m:>12} {icon} {r['status']:<7} {r['description'][:35]}")
    print()
def print_dashboard(root):
    """Print cross-experiment dashboard; return the list of row dicts.

    Walks .autoresearch/{domain}/{experiment}/ under *root*, skipping dot
    directories and directories without a config.cfg.
    """
    import time  # hoisted out of the per-experiment loop (was imported per iteration)
    experiments = []
    for domain_dir in sorted(root.iterdir()):
        if not domain_dir.is_dir() or domain_dir.name.startswith("."):
            continue
        for exp_dir in sorted(domain_dir.iterdir()):
            if not exp_dir.is_dir() or not (exp_dir / "config.cfg").exists():
                continue
            config = load_config(exp_dir)
            results = load_results(exp_dir)
            direction = config.get("metric_direction", "lower")
            stats = compute_stats(results, direction)
            # Determine status from the age of results.tsv: modified within
            # the last hour -> active, within a day -> paused, else done.
            status = "idle"
            if stats["total"] > 0:
                tsv = exp_dir / "results.tsv"
                if tsv.exists():
                    age_hours = (time.time() - tsv.stat().st_mtime) / 3600
                    status = "active" if age_hours < 1 else "paused" if age_hours < 24 else "done"
            best_str = f"{stats['best']:.4f}" if stats["best"] is not None else ""
            pct_str = f"{stats['pct_change']:+.1f}%" if stats["pct_change"] is not None else ""
            experiments.append({
                "domain": domain_dir.name,
                "name": exp_dir.name,
                "runs": stats["total"],
                "kept": stats["keeps"],
                "best": best_str,
                "change": pct_str,
                "status": status,
                "metric": config.get("metric", "?"),
            })
    if not experiments:
        print("No experiments found.")
        return experiments
    # NOTE(review): the glyph inside {'' * 90} appears lost in rendering —
    # confirm the original separator character.
    print(f"\n{'' * 90}")
    print(f" autoresearch — Dashboard")
    print(f"{'' * 90}")
    print(f" {'DOMAIN':<15} {'EXPERIMENT':<20} {'RUNS':>5} {'KEPT':>5} {'BEST':>12} {'CHANGE':>10} {'STATUS':<8}")
    print(f" {'' * 85}")
    for e in experiments:
        print(f" {e['domain']:<15} {e['name']:<20} {e['runs']:>5} {e['kept']:>5} {e['best']:>12} {e['change']:>10} {e['status']:<8}")
    print()
    return experiments
# --- CSV Export ---
def export_experiment_csv(experiment_dir, experiment_path):
    """Export single experiment as a CSV string.

    Emits commented metadata rows (experiment, target, metric, baseline,
    best, totals), a blank row, then a header row and one row per run.
    """
    config = load_config(experiment_dir)
    results = load_results(experiment_dir)
    direction = config.get("metric_direction", "lower")
    stats = compute_stats(results, direction)
    buf = io.StringIO()
    writer = csv.writer(buf)
    # Header with metadata
    writer.writerow(["# Experiment", experiment_path])
    writer.writerow(["# Target", config.get("target", "")])
    writer.writerow(["# Metric", f"{config.get('metric', '')} ({direction} is better)"])
    if stats["baseline"] is not None:
        writer.writerow(["# Baseline", f"{stats['baseline']:.6f}"])
    if stats["best"] is not None:
        # Compare to None explicitly: a 0.0% change is still worth showing
        # (bug: the truthiness check dropped it).
        pct = f" ({stats['pct_change']:+.1f}%)" if stats["pct_change"] is not None else ""
        writer.writerow(["# Best", f"{stats['best']:.6f}{pct}"])
    writer.writerow(["# Total", stats["total"]])
    writer.writerow(["# Keep/Discard/Crash", f"{stats['keeps']}/{stats['discards']}/{stats['crashes']}"])
    writer.writerow([])
    writer.writerow(["Commit", "Metric", "Status", "Description"])
    for r in results:
        m = f"{r['metric']:.6f}" if r["metric"] is not None else "N/A"
        writer.writerow([r["commit"], m, r["status"], r["description"]])
    return buf.getvalue()
def export_dashboard_csv(root):
    """Export the cross-experiment dashboard as a CSV string.

    One row per experiment found under .autoresearch/{domain}/{name}/,
    preceded by a header row.
    """
    experiments = []
    for domain_dir in sorted(root.iterdir()):
        if not domain_dir.is_dir() or domain_dir.name.startswith("."):
            continue
        for exp_dir in sorted(domain_dir.iterdir()):
            if not exp_dir.is_dir() or not (exp_dir / "config.cfg").exists():
                continue
            config = load_config(exp_dir)
            results = load_results(exp_dir)
            direction = config.get("metric_direction", "lower")
            stats = compute_stats(results, direction)
            # Compare to None explicitly: a best metric of 0.0 (or a 0.0%
            # change) is valid data, not "missing" (bug: truthiness checks
            # rendered both as empty cells).
            best_str = f"{stats['best']:.6f}" if stats["best"] is not None else ""
            pct_str = f"{stats['pct_change']:+.1f}%" if stats["pct_change"] is not None else ""
            experiments.append([
                domain_dir.name, exp_dir.name, config.get("metric", ""),
                stats["total"], stats["keeps"], stats["discards"], stats["crashes"],
                best_str, pct_str
            ])
    buf = io.StringIO()
    writer = csv.writer(buf)
    writer.writerow(["Domain", "Experiment", "Metric", "Runs", "Kept", "Discarded", "Crashed", "Best", "Change"])
    for e in experiments:
        writer.writerow(e)
    return buf.getvalue()
# --- Markdown Export ---
def export_experiment_markdown(experiment_dir, experiment_path):
    """Export single experiment as a Markdown string.

    Produces a title, bold metadata lines, an optional progress line, and
    a table with one row per run.
    """
    config = load_config(experiment_dir)
    results = load_results(experiment_dir)
    direction = config.get("metric_direction", "lower")
    metric_name = config.get("metric", "metric")
    stats = compute_stats(results, direction)
    lines = []
    lines.append(f"# Autoresearch: {experiment_path}\n")
    lines.append(f"**Target:** `{config.get('target', '?')}` ")
    lines.append(f"**Metric:** `{metric_name}` ({direction} is better) ")
    lines.append(f"**Experiments:** {stats['total']} total — {stats['keeps']} kept, {stats['discards']} discarded, {stats['crashes']} crashed\n")
    if stats["baseline"] is not None and stats["best"] is not None:
        # Compare to None explicitly so a 0.0% change still renders
        # (bug: the truthiness check dropped it).
        pct = f" ({stats['pct_change']:+.1f}%)" if stats["pct_change"] is not None else ""
        lines.append(f"**Progress:** `{stats['baseline']:.6f}` → `{stats['best']:.6f}`{pct}\n")
    lines.append(f"| Commit | Metric | Status | Description |")
    lines.append(f"|--------|--------|--------|-------------|")
    for r in results:
        m = f"`{r['metric']:.6f}`" if r["metric"] is not None else "N/A"
        lines.append(f"| `{r['commit']}` | {m} | {r['status']} | {r['description']} |")
    lines.append("")
    return "\n".join(lines)
def export_dashboard_markdown(root):
    """Export the cross-experiment dashboard as a Markdown table string."""
    import time  # hoisted out of the per-experiment loop (was imported per iteration)
    lines = []
    lines.append("# Autoresearch Dashboard\n")
    lines.append("| Domain | Experiment | Metric | Runs | Kept | Best | Change | Status |")
    lines.append("|--------|-----------|--------|------|------|------|--------|--------|")
    for domain_dir in sorted(root.iterdir()):
        if not domain_dir.is_dir() or domain_dir.name.startswith("."):
            continue
        for exp_dir in sorted(domain_dir.iterdir()):
            if not exp_dir.is_dir() or not (exp_dir / "config.cfg").exists():
                continue
            config = load_config(exp_dir)
            results = load_results(exp_dir)
            direction = config.get("metric_direction", "lower")
            stats = compute_stats(results, direction)
            # Compare to None explicitly: 0.0 is a legitimate best metric and
            # 0.0% a legitimate change (bug: truthiness checks blanked both).
            best = f"`{stats['best']:.4f}`" if stats["best"] is not None else ""
            pct = f"{stats['pct_change']:+.1f}%" if stats["pct_change"] is not None else ""
            tsv = exp_dir / "results.tsv"
            status = "idle"
            if tsv.exists() and stats["total"] > 0:
                age_h = (time.time() - tsv.stat().st_mtime) / 3600
                status = "active" if age_h < 1 else "paused" if age_h < 24 else "done"
            lines.append(f"| {domain_dir.name} | {exp_dir.name} | {config.get('metric', '?')} | {stats['total']} | {stats['keeps']} | {best} | {pct} | {status} |")
    lines.append("")
    return "\n".join(lines)
# --- Main ---
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--summary", action="store_true")
parser.add_argument("--best", action="store_true")
parser.add_argument("--history", action="store_true")
parser.add_argument("--record", nargs=4, metavar=("COMMIT", "METRIC", "STATUS", "DESC"))
parser.add_argument("--path", default=".")
parser.add_argument("--metric", default="metric")
parser.add_argument("--direction", default="lower", choices=["lower", "higher"])
parser = argparse.ArgumentParser(description="autoresearch-agent results viewer")
parser.add_argument("--experiment", help="Show one experiment: domain/name")
parser.add_argument("--domain", help="Show all experiments in a domain")
parser.add_argument("--dashboard", action="store_true", help="Cross-experiment dashboard")
parser.add_argument("--format", choices=["terminal", "csv", "markdown"], default="terminal",
help="Output format (default: terminal)")
parser.add_argument("--output", "-o", help="Write to file instead of stdout")
parser.add_argument("--all", action="store_true", help="Show all experiments (alias for --dashboard)")
args = parser.parse_args()
path = Path(args.path).resolve()
root = find_autoresearch_root()
if root is None:
print("No .autoresearch/ found. Run setup_experiment.py first.")
sys.exit(1)
if args.record:
commit, metric, status, desc = args.record
tsv = path / "results.tsv"
if not tsv.exists():
tsv.write_text("commit\tmetric\tstatus\tdescription\n")
with open(tsv, "a") as f:
f.write(f"{commit}\t{metric}\t{status}\t{desc}\n")
print(f"✓ Logged: {commit} {metric} {status}")
return
output_text = None
results = load_results(path)
# Single experiment
if args.experiment:
experiment_dir = root / args.experiment
if not experiment_dir.exists():
print(f"Experiment not found: {args.experiment}")
sys.exit(1)
if args.history:
print_history(results)
elif args.best:
keeps = [r for r in results if r["status"] == "keep" and r["metric"]]
if not keeps:
print("No successful experiments yet.")
if args.format == "csv":
output_text = export_experiment_csv(experiment_dir, args.experiment)
elif args.format == "markdown":
output_text = export_experiment_markdown(experiment_dir, args.experiment)
else:
print_experiment(experiment_dir, args.experiment)
return
best = min(keeps, key=lambda r: r["metric"]) if args.direction == "lower" else max(keeps, key=lambda r: r["metric"])
print(f"Best: {best['metric']:.6f} (commit: {best['commit']}) — {best['description']}")
# Domain
elif args.domain:
domain_dir = root / args.domain
if not domain_dir.exists():
print(f"Domain not found: {args.domain}")
sys.exit(1)
for exp_dir in sorted(domain_dir.iterdir()):
if exp_dir.is_dir() and (exp_dir / "config.cfg").exists():
if args.format == "terminal":
print_experiment(exp_dir, f"{args.domain}/{exp_dir.name}")
# For CSV/MD, fall through to dashboard with domain filter
if args.format != "terminal":
# Use dashboard export filtered to domain
output_text = export_dashboard_csv(root) if args.format == "csv" else export_dashboard_markdown(root)
else:
return
# Dashboard
elif args.dashboard or args.all:
if args.format == "csv":
output_text = export_dashboard_csv(root)
elif args.format == "markdown":
output_text = export_dashboard_markdown(root)
else:
print_dashboard(root)
return
else:
print_summary(results, args.metric, args.direction)
# Default: dashboard
if args.format == "terminal":
print_dashboard(root)
return
output_text = export_dashboard_csv(root) if args.format == "csv" else export_dashboard_markdown(root)
# Write output
if output_text:
if args.output:
Path(args.output).write_text(output_text)
print(f"Written to {args.output}")
else:
print(output_text)
if __name__ == "__main__":

View File

@@ -2,17 +2,15 @@
"""
autoresearch-agent: Experiment Runner
Executes the autonomous experiment loop:
- Reads .autoresearch.cfg for project config
- Runs the target evaluation
- Keeps improvements (git commit) or discards failures (git reset)
- Logs everything to results.tsv
- Loops indefinitely until interrupted
Executes the autonomous experiment loop for a specific experiment.
Reads config from .autoresearch/{domain}/{name}/config.cfg.
Usage:
python scripts/run_experiment.py --loop # Run forever
python scripts/run_experiment.py --single # Run one experiment
python scripts/run_experiment.py --dry-run # Show what would happen
python scripts/run_experiment.py --experiment engineering/api-speed --loop
python scripts/run_experiment.py --experiment engineering/api-speed --single
python scripts/run_experiment.py --experiment marketing/medium-ctr --loop
python scripts/run_experiment.py --resume --loop
python scripts/run_experiment.py --experiment engineering/api-speed --dry-run
"""
import argparse
@@ -25,11 +23,22 @@ from datetime import datetime
from pathlib import Path
def load_config(path):
"""Load .autoresearch.cfg"""
cfg_file = Path(path) / ".autoresearch.cfg"
def find_autoresearch_root():
    """Find .autoresearch/ in project or user home."""
    # Project-level (shareable, git-tracked) location wins over the
    # user-level one in the home directory.
    project_root = Path(".").resolve() / ".autoresearch"
    if project_root.exists():
        return project_root
    user_root = Path.home() / ".autoresearch"
    if user_root.exists():
        return user_root
    # Neither exists: callers treat None as "not set up yet".
    return None
def load_config(experiment_dir):
"""Load config.cfg from experiment directory."""
cfg_file = experiment_dir / "config.cfg"
if not cfg_file.exists():
print("✗ No .autoresearch.cfg found. Run setup_experiment.py first.")
print(f" Error: no config.cfg in {experiment_dir}")
sys.exit(1)
config = {}
for line in cfg_file.read_text().splitlines():
@@ -49,239 +58,293 @@ def run_cmd(cmd, cwd=None, timeout=None):
def get_current_commit(path):
    """Return the abbreviated hash of the repository's current HEAD."""
    result = run_cmd("git rev-parse --short HEAD", cwd=path)
    return result[1]
def get_best_metric(experiment_dir, direction):
    """Read the best kept metric from results.tsv.

    Scans rows whose status column is "keep" (header skipped), ignores
    "N/A"/unparsable values, and returns the minimum for direction
    "lower" or the maximum otherwise. Returns None when there is no
    usable kept metric or the file is missing.

    (This span previously interleaved the removed get_current_metric
    with the new get_best_metric; resolved to the post-refactor code.)
    """
    tsv = experiment_dir / "results.tsv"
    if not tsv.exists():
        return None
    lines = [l for l in tsv.read_text().splitlines()[1:] if "\tkeep\t" in l]
    if not lines:
        return None
    metrics = []
    for line in lines:
        parts = line.split("\t")
        try:
            if parts[1] != "N/A":
                metrics.append(float(parts[1]))
        except (ValueError, IndexError):
            continue
    if not metrics:
        return None
    return min(metrics) if direction == "lower" else max(metrics)
def run_evaluation(project_root, eval_cmd, time_budget_minutes, log_file):
    """Run the evaluation command with a time limit; output goes to log_file.

    Returns (exit_code, elapsed_seconds). An exit code of -1 signals a
    timeout. The hard kill limit is 2.5x the configured soft budget.

    (This span previously interleaved old and new versions of the
    function; resolved to the post-refactor implementation.)
    """
    hard_limit = time_budget_minutes * 60 * 2.5
    t0 = time.time()
    try:
        code, _, _ = run_cmd(
            f"{eval_cmd} > {log_file} 2>&1",
            cwd=str(project_root),
            timeout=hard_limit
        )
        elapsed = time.time() - t0
        return code, elapsed
    except subprocess.TimeoutExpired:
        elapsed = time.time() - t0
        return -1, elapsed
def extract_metric(log_file, metric_grep):
    """Extract the most recent metric value from a log file.

    Scans the log from the last line backwards for a line starting with
    *metric_grep* (a leading '^' anchor from the grep-style config value
    is stripped) and parses the float after the final ':'. Returns None
    when the file is missing or no matching line parses.

    (This span previously interleaved old and new versions; resolved to
    the post-refactor implementation, with the anchor-stripping hoisted
    out of the loop since it is loop-invariant.)
    """
    log_path = Path(log_file)
    if not log_path.exists():
        return None
    prefix = metric_grep.lstrip("^")
    for line in reversed(log_path.read_text().splitlines()):
        stripped = line.strip()
        if stripped.startswith(prefix):
            try:
                return float(stripped.split(":")[-1].strip())
            except ValueError:
                continue  # malformed match; keep scanning earlier lines
    return None
def is_improvement(new_val, old_val, direction):
    """Check whether the new result beats the previous best.

    A missing previous best (None) always counts as an improvement so
    the first run is kept. Ties are NOT improvements in either
    direction ("lower" means smaller is better).

    (This span previously interleaved old and new versions; resolved to
    the post-refactor implementation.)
    """
    if old_val is None:
        return True
    if direction == "lower":
        return new_val < old_val
    return new_val > old_val
def log_result(experiment_dir, commit, metric_val, status, description):
    """Append one result row to results.tsv.

    The metric is written with 6 decimal places, or "N/A" when None.
    The file is created (without a header) if it does not exist yet.

    (This span previously interleaved old and new versions; resolved to
    the post-refactor implementation.)
    """
    tsv = experiment_dir / "results.tsv"
    metric_str = f"{metric_val:.6f}" if metric_val is not None else "N/A"
    with open(tsv, "a") as f:
        f.write(f"{commit}\t{metric_str}\t{status}\t{description}\n")
def get_experiment_count(experiment_dir):
    """Count experiments run so far (rows in results.tsv minus header).

    Returns 0 when the file is missing or contains only the header.

    (This span previously interleaved old and new versions; resolved to
    the post-refactor implementation.)
    """
    tsv = experiment_dir / "results.tsv"
    if not tsv.exists():
        return 0
    return max(0, len(tsv.read_text().splitlines()) - 1)
def run_single_experiment(path, config, exp_num, dry_run=False):
def get_last_active(root):
    """Return 'domain/name' of the most recently modified experiment, or None.

    Recency is judged by the mtime of each experiment's config.cfg;
    dot-directories and directories without a config.cfg are skipped.
    """
    best_path, best_mtime = None, 0
    for domain in root.iterdir():
        if not domain.is_dir() or domain.name.startswith("."):
            continue
        for exp in domain.iterdir():
            if not exp.is_dir():
                continue
            cfg = exp / "config.cfg"
            if not cfg.exists():
                continue
            mtime = cfg.stat().st_mtime
            if mtime > best_mtime:
                best_mtime = mtime
                best_path = f"{domain.name}/{exp.name}"
    return best_path
def run_single(project_root, experiment_dir, config, exp_num, dry_run=False):
    """Run one experiment iteration.

    Evaluates the target, compares the extracted metric against the best
    kept so far, and either keeps the change or rolls back to the
    pre-experiment commit. Returns one of: "dry_run", "error", "crash",
    "keep", "discard".

    (This span previously interleaved the removed run_single_experiment
    with the new run_single; resolved to the post-refactor code.)
    """
    direction = config.get("metric_direction", "lower")
    metric_grep = config.get("metric_grep", "^metric:")
    eval_cmd = config.get("evaluate_cmd", "python evaluate.py")
    time_budget = int(config.get("time_budget_minutes", 5))
    metric_name = config.get("metric", "metric")
    log_file = str(experiment_dir / "run.log")
    best = get_best_metric(experiment_dir, direction)
    ts = datetime.now().strftime("%H:%M:%S")
    print(f"\n[{ts}] Experiment #{exp_num}")
    print(f" Best {metric_name}: {best}")
    if dry_run:
        print(" [DRY RUN] Would run evaluation and check metric")
        return "dry_run"
    # Save state for rollback
    code, pre_commit, _ = run_cmd("git rev-parse HEAD", cwd=str(project_root))
    if code != 0:
        print(" Error: can't get git state")
        return "error"
    print(f" Running: {eval_cmd} (budget: {time_budget}m)")
    ret_code, elapsed = run_evaluation(project_root, eval_cmd, time_budget, log_file)
    commit = get_current_commit(str(project_root))
    # Timeout
    if ret_code == -1:
        print(f" TIMEOUT after {elapsed:.0f}s — discarding")
        run_cmd("git checkout -- .", cwd=str(project_root))
        run_cmd(f"git reset --hard {pre_commit}", cwd=str(project_root))
        log_result(experiment_dir, commit, None, "crash", f"timeout_{elapsed:.0f}s")
        return "crash"
    # Crash
    if ret_code != 0:
        _, tail, _ = run_cmd(f"tail -5 {log_file}", cwd=str(project_root))
        print(f" CRASH (exit {ret_code}) after {elapsed:.0f}s")
        print(f" Last output: {tail[:200]}")
        run_cmd(f"git reset --hard {pre_commit}", cwd=str(project_root))
        log_result(experiment_dir, commit, None, "crash", f"exit_{ret_code}")
        return "crash"
    # Extract metric
    metric_val = extract_metric(log_file, metric_grep)
    if metric_val is None:
        print(f" Could not parse {metric_name} from run.log")
        run_cmd(f"git reset --hard {pre_commit}", cwd=str(project_root))
        log_result(experiment_dir, commit, None, "crash", "metric_parse_failed")
        return "crash"
    delta = ""
    if best is not None:
        diff = metric_val - best
        delta = f" (delta {diff:+.4f})"
    print(f" {metric_name}: {metric_val:.6f}{delta} in {elapsed:.0f}s")
    # Keep or discard
    if is_improvement(metric_val, best, direction):
        print(f" KEEP — improvement")
        log_result(experiment_dir, commit, metric_val, "keep",
                   f"improved_{metric_name}_{metric_val:.4f}")
        return "keep"
    else:
        print(f" DISCARD — no improvement")
        run_cmd(f"git reset --hard {pre_commit}", cwd=str(project_root))
        # Compare to None explicitly: a previous best of 0.0 is a real
        # value, not "unknown" (bug: truthiness check printed "?").
        best_str = f"{best:.4f}" if best is not None else "?"
        log_result(experiment_dir, commit, metric_val, "discard",
                   f"no_improvement_{metric_val:.4f}_vs_{best_str}")
        return "discard"
def print_summary(experiment_dir, config):
    """Print a session summary for one experiment.

    Shows total/keep/discard/crash counts and, when at least two kept
    metrics exist, the first-to-best progress as a percentage. Silently
    returns when results.tsv is missing or empty.

    (This span previously interleaved old and new versions; resolved to
    the post-refactor implementation. The unused `last` local from the
    rendered diff was dropped.)
    """
    tsv = experiment_dir / "results.tsv"
    if not tsv.exists():
        return
    lines = tsv.read_text().splitlines()[1:]  # skip header row
    if not lines:
        return
    keeps = [l for l in lines if "\tkeep\t" in l]
    discards = [l for l in lines if "\tdiscard\t" in l]
    crashes = [l for l in lines if "\tcrash\t" in l]
    metric_name = config.get("metric", "metric")
    direction = config.get("metric_direction", "lower")
    print(f"\n{'=' * 55}")
    print(f" autoresearch — Session Summary")
    print(f" Experiments: {len(lines)} total")
    print(f" Keep: {len(keeps)} | Discard: {len(discards)} | Crash: {len(crashes)}")
    if keeps:
        try:
            valid = []
            for l in keeps:
                parts = l.split("\t")
                if parts[1] != "N/A":
                    valid.append(float(parts[1]))
            if len(valid) >= 2:
                first = valid[0]
                best = min(valid) if direction == "lower" else max(valid)
                # Positive percentage == improvement for either direction.
                pct = ((first - best) / first * 100) if direction == "lower" else ((best - first) / first * 100)
                print(f" {metric_name}: {first:.6f} -> {best:.6f} ({pct:+.1f}%)")
        except (ValueError, IndexError):
            pass  # malformed rows: skip the progress line, keep the counts
    print(f"{'=' * 55}\n")
def main():
parser = argparse.ArgumentParser(description="autoresearch-agent runner")
parser.add_argument("--experiment", help="Experiment path: domain/name (e.g. engineering/api-speed)")
parser.add_argument("--resume", action="store_true", help="Resume last active experiment")
parser.add_argument("--loop", action="store_true", help="Run forever")
parser.add_argument("--single", action="store_true", help="Run one experiment")
parser.add_argument("--dry-run", action="store_true", help="Dry run only")
parser.add_argument("--dry-run", action="store_true", help="Show what would happen")
parser.add_argument("--max-experiments", type=int, default=0, help="Max experiments (0 = unlimited)")
parser.add_argument("--path", default=".", help="Project root")
parser.add_argument("--max-experiments", type=int, default=0,
help="Max experiments (0 = unlimited)")
args = parser.parse_args()
path = Path(args.path).resolve()
config = load_config(path)
project_root = Path(args.path).resolve()
root = find_autoresearch_root()
print(f"\n🔬 autoresearch-agent")
print(f" Project: {path}")
print(f" Target: {config.get('target', '?')}")
print(f" Metric: {config.get('metric', '?')} ({config.get('metric_direction', '?')} is better)")
print(f" Budget: {config.get('time_budget_minutes', '?')} min/experiment")
print(f" Mode: {'loop' if args.loop else 'single'}")
if root is None:
print("No .autoresearch/ found. Run setup_experiment.py first.")
sys.exit(1)
if args.single:
exp_num = get_experiment_count(path) + 1
run_single_experiment(path, config, exp_num, args.dry_run)
# Resolve experiment
experiment_path = args.experiment
if args.resume:
experiment_path = get_last_active(root)
if not experiment_path:
print("No experiments found to resume.")
sys.exit(1)
print(f"Resuming: {experiment_path}")
if not experiment_path:
print("Specify --experiment domain/name or --resume")
sys.exit(1)
experiment_dir = root / experiment_path
if not experiment_dir.exists():
print(f"Experiment not found: {experiment_dir}")
print("Run: python scripts/setup_experiment.py --list")
sys.exit(1)
config = load_config(experiment_dir)
domain, name = experiment_path.split("/", 1)
print(f"\n autoresearch-agent")
print(f" Experiment: {experiment_path}")
print(f" Target: {config.get('target', '?')}")
print(f" Metric: {config.get('metric', '?')} ({config.get('metric_direction', '?')} is better)")
print(f" Budget: {config.get('time_budget_minutes', '?')} min/experiment")
print(f" Mode: {'loop' if args.loop else 'single'}")
if args.single or args.dry_run:
exp_num = get_experiment_count(experiment_dir) + 1
run_single(project_root, experiment_dir, config, exp_num, args.dry_run)
return
if not args.loop and not args.dry_run:
if not args.loop:
print("\nSpecify --loop (forever) or --single (one experiment)")
sys.exit(1)
# Setup graceful shutdown
# Graceful shutdown
def handle_interrupt(sig, frame):
print_summary(path)
print("\nStopped by user.")
print_summary(experiment_dir, config)
print("\nStopped by user.")
sys.exit(0)
signal.signal(signal.SIGINT, handle_interrupt)
signal.signal(signal.SIGTERM, handle_interrupt)
# Main loop
consecutive_crashes = 0
exp_num = get_experiment_count(path) + 1
exp_num = get_experiment_count(experiment_dir) + 1
print(f"\nStarting loop. Ctrl+C to stop and print summary.\n")
print(f"\nStarting loop. Ctrl+C to stop.\n")
while True:
result = run_single_experiment(path, config, exp_num, args.dry_run)
result = run_single(project_root, experiment_dir, config, exp_num, False)
exp_num += 1
if result == "crash":
@@ -289,21 +352,16 @@ def main():
else:
consecutive_crashes = 0
# Bail if 5 consecutive crashes
if consecutive_crashes >= 5:
print("\n 5 consecutive crashes. Pausing for investigation.")
print(" Check run.log for the last error.")
print("\n 5 consecutive crashes. Pausing.")
print(" Check .autoresearch/{}/run.log".format(experiment_path))
break
# Check max experiments
if args.max_experiments > 0 and exp_num > args.max_experiments:
print(f"\n✓ Reached max experiments ({args.max_experiments})")
if 0 < args.max_experiments < exp_num:
print(f"\n Reached max experiments ({args.max_experiments})")
break
if args.single:
break
print_summary(path)
print_summary(experiment_dir, config)
if __name__ == "__main__":

View File

@@ -1,65 +1,52 @@
#!/usr/bin/env python3
"""
autoresearch-agent: Setup Wizard
autoresearch-agent: Setup Experiment
Initializes a new research run:
1. Validates the project structure
2. Creates a git branch
3. Runs the baseline experiment
4. Initializes results.tsv
Initialize a new experiment with domain, target, evaluator, and git branch.
Creates the .autoresearch/{domain}/{name}/ directory structure.
Usage:
python scripts/setup_experiment.py [--config experiment.yaml]
python scripts/setup_experiment.py --domain ml|prompt|code|skill
python scripts/setup_experiment.py --domain engineering --name api-speed \
--target src/api/search.py --eval "pytest bench.py" \
--metric p50_ms --direction lower
python scripts/setup_experiment.py --domain marketing --name medium-ctr \
--target content/titles.md --eval "python evaluate.py" \
--metric ctr_score --direction higher --evaluator llm_judge_content
python scripts/setup_experiment.py --list # List all experiments
python scripts/setup_experiment.py --list-evaluators # List available evaluators
"""
import argparse
import os
import shutil
import subprocess
import sys
import time
from datetime import datetime
from pathlib import Path
DOMAINS = ["engineering", "marketing", "content", "prompts", "custom"]
# Built-in experiment presets, keyed by domain. Each preset supplies the
# defaults for a new experiment: the target file to optimize, the command
# that evaluates it, the metric name, the metric direction ("lower" or
# "higher" is better), the per-run time budget in minutes, and the regex
# used to grep the metric line out of the evaluation output.
DOMAINS = {
    # ML training: drive down val_bpb reported by a train.py run.
    "ml": {
        "target": "train.py",
        "evaluate_cmd": "uv run train.py",
        "metric": "val_bpb",
        "metric_direction": "lower",
        "time_budget_minutes": 5,
        "metric_grep": "^val_bpb:",
    },
    # Prompt engineering: raise eval_score for a prompt file.
    "prompt": {
        "target": "prompt.md",
        "evaluate_cmd": "python evaluate.py",
        "metric": "eval_score",
        "metric_direction": "higher",
        "time_budget_minutes": 2,
        "metric_grep": "^eval_score:",
    },
    # Code optimization: lower the p50_ms value printed by benchmark.py.
    "code": {
        "target": "src/module.py",
        "evaluate_cmd": "python benchmark.py",
        "metric": "p50_ms",
        "metric_direction": "lower",
        "time_budget_minutes": 10,
        "metric_grep": "^p50_ms:",
    },
    # Skill tuning: raise pass_rate reported by the skill evaluator.
    "skill": {
        "target": "SKILL.md",
        "evaluate_cmd": "python scripts/skill_evaluator.py",
        "metric": "pass_rate",
        "metric_direction": "higher",
        "time_budget_minutes": 5,
        "metric_grep": "^pass_rate:",
    },
}
# Directory holding the ready-made evaluator scripts shipped with the agent
# (../evaluators relative to this script's directory).
EVALUATOR_DIR = Path(__file__).parent.parent / "evaluators"

# Default contents for <root>/config.yaml — global defaults for all runs.
DEFAULT_CONFIG = """# autoresearch global config
default_time_budget_minutes: 5
default_scope: project
dashboard_format: markdown
"""

# Default contents for <root>/.gitignore — keeps per-run local state
# (result tables, logs, local config) out of version control.
GITIGNORE_CONTENT = """# autoresearch — experiment logs are local state
**/results.tsv
**/run.log
**/run.*.log
config.yaml
"""
def run_cmd(cmd, cwd=None, timeout=None):
    """Run a shell command and return (returncode, stdout, stderr).

    stdout/stderr are whitespace-stripped. `timeout` is in seconds and is
    passed straight through to subprocess.run (raises TimeoutExpired).
    NOTE: shell=True — only pass trusted, internally-constructed commands.
    """
    # Original body contained a duplicated docstring and an embedded diff
    # hunk header; reconstructed as a single clean implementation.
    result = subprocess.run(
        cmd, shell=True, capture_output=True, text=True,
        cwd=cwd, timeout=timeout,
    )
    return result.returncode, result.stdout.strip(), result.stderr.strip()
def check_git_repo(path):
"""Verify we're in a git repo."""
code, out, err = run_cmd("git rev-parse --is-inside-work-tree", cwd=path)
if code != 0:
print("✗ Not a git repository. Run: git init && git add . && git commit -m 'initial'")
def get_autoresearch_root(scope, project_root=None):
    """Return the .autoresearch root directory for the given scope.

    "user" scope maps to ~/.autoresearch; any other scope maps to
    <project_root>/.autoresearch, defaulting to the current directory.
    """
    base = Path.home() if scope == "user" else Path(project_root or ".")
    return base / ".autoresearch"
def init_root(root):
    """Ensure the .autoresearch root exists with its config and .gitignore.

    Returns True when the root directory itself was newly created.
    """
    newly_created = not root.exists()
    if newly_created:
        root.mkdir(parents=True)
        print(f" Created {root}/")
    # Seed default support files only when they are missing, so existing
    # user edits are never overwritten.
    for filename, body in (("config.yaml", DEFAULT_CONFIG),
                           (".gitignore", GITIGNORE_CONTENT)):
        support_file = root / filename
        if not support_file.exists():
            support_file.write_text(body)
            print(f" Created {support_file}")
    return newly_created
def create_program_md(experiment_dir, domain, name, target, metric, direction, constraints=""):
    """Generate a program.md template for the experiment.

    Writes <experiment_dir>/program.md describing the optimization goal,
    what the agent may and may not change, and the experiment-loop strategy.

    Args:
        experiment_dir: Directory of this experiment (e.g. .autoresearch/<domain>/<name>).
        domain: Experiment domain. NOTE(review): not used in the template body;
            kept for signature symmetry with the other create_* helpers.
        name: Experiment name, used in the document title.
        target: The single file the agent is allowed to optimize.
        metric: Metric name reported by the evaluator.
        direction: "lower" or "higher" — which direction improves the metric.
        constraints: Optional extra constraint text appended to the
            "What the Agent Cannot Change" section.
    """
    # The heading verb tracks the metric direction.
    direction_word = "Minimize" if direction == "lower" else "Maximize"
    content = f"""# autoresearch — {name}
## Goal
{direction_word} `{metric}` on `{target}`. {"Lower" if direction == "lower" else "Higher"} is better.
## What the Agent Can Change
- Only `{target}` — this is the single file being optimized.
- Everything inside that file is fair game unless constrained below.
## What the Agent Cannot Change
- The evaluation script (`evaluate.py` or the eval command). It is read-only.
- Dependencies — do not add new packages or imports that aren't already available.
- Any other files in the project unless explicitly noted here.
{f"- Additional constraints: {constraints}" if constraints else ""}
## Strategy
1. First run: establish baseline. Do not change anything.
2. Profile/analyze the current state — understand why the metric is what it is.
3. Try the most obvious improvement first (low-hanging fruit).
4. If that works, push further in the same direction.
5. If stuck, try something orthogonal or radical.
6. Read the git log of previous experiments. Don't repeat failed approaches.
## Simplicity Rule
A small improvement that adds ugly complexity is NOT worth it.
Equal performance with simpler code IS worth it.
Removing code that gets same results is the best outcome.
## Stop When
You don't stop. The human will interrupt you when they're satisfied.
If no improvement in 20+ consecutive runs, change strategy drastically.
"""
    (experiment_dir / "program.md").write_text(content)
def create_config(experiment_dir, target, eval_cmd, metric, direction, time_budget):
    """Write the experiment's config.cfg as simple "key: value" lines."""
    entries = [
        f"target: {target}",
        f"evaluate_cmd: {eval_cmd}",
        f"metric: {metric}",
        f"metric_direction: {direction}",
        f"metric_grep: ^{metric}:",
        f"time_budget_minutes: {time_budget}",
        f"created: {datetime.now().strftime('%Y-%m-%d %H:%M')}",
    ]
    (experiment_dir / "config.cfg").write_text("\n".join(entries) + "\n")
def init_results_tsv(experiment_dir):
    """Create results.tsv with its header row, unless one already exists."""
    tsv_path = experiment_dir / "results.tsv"
    if tsv_path.exists():
        # Never clobber an existing log of past runs.
        print(f" results.tsv already exists ({tsv_path.stat().st_size} bytes)")
    else:
        tsv_path.write_text("commit\tmetric\tstatus\tdescription\n")
        print(" Created results.tsv")
def copy_evaluator(experiment_dir, evaluator_name):
    """Copy a built-in evaluator into the experiment directory as evaluate.py.

    Returns True on success, False when the named evaluator does not exist
    (in which case the available evaluator names are listed).
    """
    source = EVALUATOR_DIR / f"{evaluator_name}.py"
    if not source.exists():
        print(f" Warning: evaluator '{evaluator_name}' not found in {EVALUATOR_DIR}")
        print(f" Available: {', '.join(f.stem for f in EVALUATOR_DIR.glob('*.py'))}")
        return False
    # Removed a stray leftover line from a botched merge that printed an
    # unrelated "Git repository found" message mid-function.
    dest = experiment_dir / "evaluate.py"
    shutil.copy2(source, dest)
    print(f" Copied evaluator: {evaluator_name}.py -> evaluate.py")
    return True
def check_program_md(path):
    """Verify program.md exists and is non-trivially filled out (>= 100 chars)."""
    program_file = Path(path) / "program.md"
    if program_file.exists():
        text = program_file.read_text()
        if len(text) >= 100:
            print(f"✓ program.md found ({len(text)} chars)")
            return True
        print("⚠ program.md looks empty. Fill it out before running experiments.")
        return False
    print("⚠ program.md not found. Creating template...")
    return False
def check_target_file(path, target):
    """Verify the optimization target file exists under the project path."""
    found = (Path(path) / target).exists()
    if found:
        print(f"✓ Target file found: {target}")
    else:
        print(f"✗ Target file not found: {target}")
    return found
def check_evaluate_script(path):
    """Verify evaluate.py (the fixed evaluation function) is present."""
    script = Path(path) / "evaluate.py"
    if script.exists():
        print("✓ evaluate.py found")
        return True
    print("⚠ evaluate.py not found. You need a fixed evaluation function.")
    print(" Create evaluate.py that outputs: metric_name: <value>")
    return False
def create_branch(path, tag):
def create_branch(path, domain, name):
"""Create and checkout the experiment branch."""
branch = f"autoresearch/{tag}"
code, out, err = run_cmd(f"git checkout -b {branch}", cwd=path)
branch = f"autoresearch/{domain}/{name}"
code, _, err = run_cmd(f"git checkout -b {branch}", cwd=path)
if code != 0:
if "already exists" in err:
print(f" Branch '{branch}' already exists. Use a different tag.")
else:
print(f"✗ Failed to create branch: {err}")
print(f" Branch '{branch}' already exists. Checking out...")
run_cmd(f"git checkout {branch}", cwd=path)
return branch
print(f" Warning: could not create branch: {err}")
return None
print(f" Created branch: {branch}")
print(f" Created branch: {branch}")
return branch
def init_results_tsv(path):
"""Create results.tsv with header."""
tsv = Path(path) / "results.tsv"
if tsv.exists():
print(f"✓ results.tsv already exists ({tsv.stat().st_size} bytes)")
def list_experiments(root):
    """Print a table of all experiments under `root`, across all domains.

    Walks <root>/<domain>/<experiment>/, reading each experiment's
    config.cfg ("key: value" lines) and counting logged runs in its
    results.tsv (line count minus the header). Hidden directories and
    directories without a config.cfg are skipped.
    """
    # Removed two stray leftover lines from a botched merge (an old
    # init_results_tsv body fragment) that wrote a results.tsv header here.
    if not root.exists():
        print("No experiments found. Run setup to create your first experiment.")
        return
    experiments = []
    for domain_dir in sorted(root.iterdir()):
        if not domain_dir.is_dir() or domain_dir.name.startswith("."):
            continue
        for exp_dir in sorted(domain_dir.iterdir()):
            if not exp_dir.is_dir():
                continue
            cfg_file = exp_dir / "config.cfg"
            if not cfg_file.exists():
                continue
            config = {}
            for line in cfg_file.read_text().splitlines():
                if ":" in line:
                    k, v = line.split(":", 1)
                    config[k.strip()] = v.strip()
            # Count results: one row per run, excluding the header line.
            tsv = exp_dir / "results.tsv"
            runs = 0
            if tsv.exists():
                runs = max(0, len(tsv.read_text().splitlines()) - 1)
            experiments.append({
                "domain": domain_dir.name,
                "name": exp_dir.name,
                "target": config.get("target", "?"),
                "metric": config.get("metric", "?"),
                "runs": runs,
            })
    if not experiments:
        print("No experiments found.")
        return
    print(f"\n{'DOMAIN':<15} {'EXPERIMENT':<25} {'TARGET':<30} {'METRIC':<15} {'RUNS':>5}")
    print("-" * 95)
    for e in experiments:
        print(f"{e['domain']:<15} {e['name']:<25} {e['target']:<30} {e['metric']:<15} {e['runs']:>5}")
    print(f"\nTotal: {len(experiments)} experiments")
def run_baseline(path, evaluate_cmd, metric_grep, time_budget_minutes):
"""Run the baseline experiment."""
print(f"\nRunning baseline experiment (~{time_budget_minutes} min)...")
timeout = time_budget_minutes * 60 * 2.5 # 2.5x budget as hard limit
def list_evaluators():
"""List available built-in evaluators."""
if not EVALUATOR_DIR.exists():
print("No evaluators directory found.")
return
t0 = time.time()
code, out, err = run_cmd(
f"{evaluate_cmd} > run.log 2>&1",
cwd=path,
timeout=timeout
)
elapsed = time.time() - t0
if code != 0:
print(f"✗ Baseline run failed after {elapsed:.0f}s. Check run.log")
return None
# Extract metric
grep_code, grep_out, _ = run_cmd(
f"grep '{metric_grep}' run.log | tail -1",
cwd=path
)
if not grep_out:
print("✗ Could not extract metric from run.log. Check metric_grep pattern.")
return None
metric_value = grep_out.split(":")[-1].strip()
print(f"✓ Baseline complete in {elapsed:.0f}s — metric: {metric_value}")
return metric_value
print(f"\nAvailable evaluators ({EVALUATOR_DIR}):\n")
for f in sorted(EVALUATOR_DIR.glob("*.py")):
# Read first docstring line
desc = ""
for line in f.read_text().splitlines():
if line.strip().startswith('"""') or line.strip().startswith("'''"):
continue
if line.strip() and not line.startswith("#!"):
desc = line.strip().strip('"').strip("'")
break
print(f" {f.stem:<25} {desc}")
def main():
parser = argparse.ArgumentParser(description="autoresearch-agent setup")
parser.add_argument("--domain", choices=list(DOMAINS.keys()), help="Experiment domain")
parser.add_argument("--domain", choices=DOMAINS, help="Experiment domain")
parser.add_argument("--name", help="Experiment name (e.g. api-speed, medium-ctr)")
parser.add_argument("--target", help="Target file to optimize")
parser.add_argument("--evaluate-cmd", help="Evaluation command")
parser.add_argument("--metric", help="Metric name")
parser.add_argument("--direction", choices=["lower", "higher"], default="lower")
parser.add_argument("--budget", type=int, default=5, help="Time budget in minutes")
parser.add_argument("--tag", help="Run tag (used in branch name)")
parser.add_argument("--eval", dest="eval_cmd", help="Evaluation command")
parser.add_argument("--metric", help="Metric name (must appear in eval output as 'name: value')")
parser.add_argument("--direction", choices=["lower", "higher"], default="lower",
help="Is lower or higher better?")
parser.add_argument("--time-budget", type=int, default=5, help="Minutes per experiment (default: 5)")
parser.add_argument("--evaluator", help="Built-in evaluator to copy (e.g. benchmark_speed)")
parser.add_argument("--scope", choices=["project", "user"], default="project",
help="Where to store experiments: project (./) or user (~/)")
parser.add_argument("--constraints", default="", help="Additional constraints for program.md")
parser.add_argument("--path", default=".", help="Project root path")
parser.add_argument("--skip-baseline", action="store_true")
parser.add_argument("--skip-baseline", action="store_true", help="Skip baseline run")
parser.add_argument("--skip-branch", action="store_true", help="Don't create git branch")
parser.add_argument("--list", action="store_true", help="List all experiments")
parser.add_argument("--list-evaluators", action="store_true", help="List available evaluators")
args = parser.parse_args()
path = Path(args.path).resolve()
print(f"\n🔬 autoresearch-agent setup")
print(f" Project: {path}")
print(f" Time: {datetime.now().strftime('%Y-%m-%d %H:%M')}\n")
project_root = Path(args.path).resolve()
# Get config from domain or args
if args.domain:
config = DOMAINS[args.domain].copy()
# List mode
if args.list:
root = get_autoresearch_root("project", project_root)
list_experiments(root)
user_root = get_autoresearch_root("user")
if user_root.exists() and user_root != root:
print(f"\n--- User-level experiments ({user_root}) ---")
list_experiments(user_root)
return
if args.list_evaluators:
list_evaluators()
return
# Validate required args for setup
if not all([args.domain, args.name, args.target, args.eval_cmd, args.metric]):
parser.error("Required: --domain, --name, --target, --eval, --metric")
root = get_autoresearch_root(args.scope, project_root)
print(f"\n autoresearch-agent setup")
print(f" Project: {project_root}")
print(f" Scope: {args.scope}")
print(f" Domain: {args.domain}")
print(f" Experiment: {args.name}")
print(f" Time: {datetime.now().strftime('%Y-%m-%d %H:%M')}\n")
# Check git
code, _, _ = run_cmd("git rev-parse --is-inside-work-tree", cwd=str(project_root))
if code != 0:
print(" Error: not a git repository. Run: git init && git add . && git commit -m 'initial'")
sys.exit(1)
print(" Git repository found")
# Check target file
target_path = project_root / args.target
if not target_path.exists():
print(f" Error: target file not found: {args.target}")
sys.exit(1)
print(f" Target file found: {args.target}")
# Init root
init_root(root)
# Create experiment directory
experiment_dir = root / args.domain / args.name
if experiment_dir.exists():
print(f" Warning: experiment '{args.domain}/{args.name}' already exists.")
print(f" Use --name with a different name, or delete {experiment_dir}")
sys.exit(1)
experiment_dir.mkdir(parents=True)
print(f" Created {experiment_dir}/")
# Create files
create_program_md(experiment_dir, args.domain, args.name,
args.target, args.metric, args.direction, args.constraints)
print(" Created program.md")
create_config(experiment_dir, args.target, args.eval_cmd,
args.metric, args.direction, args.time_budget)
print(" Created config.cfg")
init_results_tsv(experiment_dir)
# Copy evaluator if specified
if args.evaluator:
copy_evaluator(experiment_dir, args.evaluator)
# Create git branch
if not args.skip_branch:
create_branch(str(project_root), args.domain, args.name)
# Test evaluation command
print(f"\n Testing evaluation: {args.eval_cmd}")
code, out, err = run_cmd(args.eval_cmd, cwd=str(project_root), timeout=60)
if code != 0:
print(f" Warning: eval command failed (exit {code})")
if err:
print(f" stderr: {err[:200]}")
print(" Fix the eval command before running the experiment loop.")
else:
config = {
"target": args.target or "target.py",
"evaluate_cmd": args.evaluate_cmd or "python evaluate.py",
"metric": args.metric or "score",
"metric_direction": args.direction,
"time_budget_minutes": args.budget,
"metric_grep": f"^{args.metric or 'score'}:",
}
# Check metric is parseable
full_output = out + "\n" + err
metric_found = False
for line in full_output.splitlines():
if line.strip().startswith(f"{args.metric}:"):
metric_found = True
print(f" Eval works. Baseline: {line.strip()}")
break
if not metric_found:
print(f" Warning: eval ran but '{args.metric}:' not found in output.")
print(f" Make sure your eval command outputs: {args.metric}: <value>")
tag = args.tag or datetime.now().strftime("%b%d").lower()
# Validation checks
checks = [
check_git_repo(path),
check_program_md(path),
check_target_file(path, config["target"]),
check_evaluate_script(path),
]
if not all(checks):
print("\n⚠ Fix the above issues before running experiments.")
sys.exit(1)
# Create branch
branch = create_branch(path, tag)
if not branch:
sys.exit(1)
# Init results TSV
init_results_tsv(path)
# Save config for run_experiment.py
config_content = "\n".join(f"{k}: {v}" for k, v in config.items())
(path / ".autoresearch.cfg").write_text(config_content + "\n")
print("✓ Saved .autoresearch.cfg")
# Run baseline
if not args.skip_baseline:
baseline = run_baseline(
path,
config["evaluate_cmd"],
config["metric_grep"],
config["time_budget_minutes"]
)
if baseline:
# Log baseline to TSV
code, commit, _ = run_cmd("git rev-parse --short HEAD", cwd=path)
with open(path / "results.tsv", "a") as f:
f.write(f"{commit}\t{baseline}\tkeep\tbaseline\n")
print(f"✓ Baseline logged to results.tsv")
print(f"\n✅ Setup complete!")
print(f" Branch: {branch}")
print(f" Target: {config['target']}")
print(f" Metric: {config['metric']} ({config['metric_direction']} is better)")
print(f" Budget: {config['time_budget_minutes']} min/experiment")
print(f"\nTo start the autonomous loop:")
print(f" python scripts/run_experiment.py --loop")
print(f"\nOr run a single experiment:")
print(f" python scripts/run_experiment.py --single")
# Summary
print(f"\n Setup complete!")
print(f" Experiment: {args.domain}/{args.name}")
print(f" Target: {args.target}")
print(f" Metric: {args.metric} ({args.direction} is better)")
print(f" Budget: {args.time_budget} min/experiment")
if not args.skip_branch:
print(f" Branch: autoresearch/{args.domain}/{args.name}")
print(f"\n To start:")
print(f" python scripts/run_experiment.py --experiment {args.domain}/{args.name} --loop")
if __name__ == "__main__":