refactor: autoresearch-agent v2.0 — multi-experiment, multi-domain, real-world evaluators

Major rewrite based on a deep study of Karpathy's autoresearch repo.

Architecture changes:
- Multi-experiment support: .autoresearch/{domain}/{name}/ structure (layout sketched below)
- Domain categories: engineering, marketing, content, prompts, custom
- Project-level (git-tracked, shareable) or user-level (~/.autoresearch/) scope
- User chooses scope during setup, not during installation
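
For reference, a project-level setup with two experiments (names borrowed from the usage examples further down) ends up with roughly this layout:

    .autoresearch/
        config.yaml                global defaults (scope, budget, dashboard format)
        .gitignore                 keeps results.tsv and run logs out of git
        engineering/
            api-speed/
                program.md         goal, constraints, strategy for the agent
                config.cfg         target, eval command, metric, time budget
                results.tsv        one row per run
        marketing/
            medium-ctr/
                program.md
                config.cfg
                evaluate.py        copied built-in evaluator (--evaluator)
                results.tsv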

New evaluators (8 ready-to-use):
- Free: benchmark_speed, benchmark_size, test_pass_rate, build_speed, memory_usage
- LLM judge (uses existing subscription): llm_judge_content, llm_judge_prompt, llm_judge_copy
- LLM judges call the user's CLI tool (claude/codex/gemini) — no extra API keys needed
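
A rough sketch of how such a judge can work, shelling out to whichever of those CLIs is installed (the prompt, flag, metric name, and scoring scheme are placeholders, not the shipped evaluators):

    #!/usr/bin/env python3
    """Hypothetical LLM-judge evaluator: asks an installed CLI tool to score a file."""
    import shutil
    import subprocess
    import sys
    from pathlib import Path

    def judge(target: str) -> float:
        text = Path(target).read_text()
        prompt = ("Rate the following marketing copy from 0 to 100 for clarity and appeal. "
                  "Reply with the number only.\n\n" + text)
        # Use whichever coding-agent CLI the user already has on PATH.
        cli = next((c for c in ("claude", "codex", "gemini") if shutil.which(c)), None)
        if cli is None:
            sys.exit("no supported CLI tool found on PATH")
        # NOTE: '-p' is illustrative; each tool has its own non-interactive invocation.
        proc = subprocess.run([cli, "-p", prompt], capture_output=True, text=True, timeout=120)
        return float(proc.stdout.strip().split()[0])

    if __name__ == "__main__":
        score = judge(sys.argv[1] if len(sys.argv) > 1 else "content/titles.md")
        print(f"ctr_score: {score}")  # the loop greps for this 'metric: value' line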

Script improvements:
- setup_experiment.py: --domain, --scope, --evaluator, --list, --list-evaluators
- run_experiment.py: --experiment domain/name, --resume, --loop, --single
- log_results.py: --dashboard, --domain, --format csv|markdown|terminal, --output
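
Typical invocations combining the flags above (paths and output names illustrative):

    python scripts/setup_experiment.py --domain engineering --name api-speed \
        --target src/api/search.py --eval "pytest bench.py" --metric p50_ms --direction lower
    python scripts/run_experiment.py --experiment engineering/api-speed --loop
    python scripts/log_results.py --dashboard --format markdown --output dashboard.md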

Results export:
- Terminal (default), CSV, and Markdown formats
- Per-experiment, per-domain, or cross-experiment dashboard view
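
For context, results.tsv holds one tab-separated row per run with the columns commit / metric / status / description; a hypothetical log (hashes, values, and statuses invented) might read:

    commit   metric  status  description
    a1b2c3d  41.2    keep    baseline
    e4f5a6b  38.7    keep    cache tokenizer across calls
    9c8d7e6  40.1    reject  batch requests (regression)

The CSV and Markdown exports reshape this same table; the dashboard view aggregates it across experiments and domains.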

SKILL.md rewritten:
- Clear activation triggers (when the skill should apply)
- Practical examples for each domain
- Evaluator documentation with cost transparency
- Simplified loop protocol matching Karpathy's original philosophy
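
A minimal sketch of one loop iteration under that protocol, assuming the documented "metric: value" output contract and the commit/metric/status/description columns of results.tsv (the structure and the keep/reject statuses are illustrative, not the shipped run_experiment.py):

    import re
    import subprocess
    from pathlib import Path

    def run_once(eval_cmd: str, metric: str, direction: str, results: Path) -> float:
        """Run the fixed evaluator once and log the outcome (illustrative only)."""
        # 1. Run the evaluator and parse "metric: value" from its output.
        proc = subprocess.run(eval_cmd, shell=True, capture_output=True, text=True)
        match = re.search(rf"^{metric}:\s*([-\d.]+)", proc.stdout + "\n" + proc.stderr, re.M)
        if not match:
            raise RuntimeError(f"eval output did not contain '{metric}: <value>'")
        value = float(match.group(1))
        # 2. Compare against the best previously kept run in results.tsv.
        rows = [r.split("\t") for r in results.read_text().splitlines()[1:]]
        kept = [float(r[1]) for r in rows if len(r) > 2 and r[2] == "keep"]
        better = not kept or (value < min(kept) if direction == "lower" else value > max(kept))
        # 3. Log the run; the agent commits improvements and reverts regressions.
        commit = subprocess.run("git rev-parse --short HEAD", shell=True,
                                capture_output=True, text=True).stdout.strip()
        with results.open("a") as fh:
            fh.write(f"{commit}\t{value}\t{'keep' if better else 'reject'}\texperiment run\n")
        return value
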
commit 12591282da (parent c834d71a44)
Author: Leo
Date:   2026-03-13 08:22:14 +01:00

13 changed files with 1744 additions and 702 deletions

scripts/setup_experiment.py

@@ -1,65 +1,52 @@
#!/usr/bin/env python3
"""
autoresearch-agent: Setup Wizard
autoresearch-agent: Setup Experiment
Initializes a new research run:
1. Validates the project structure
2. Creates a git branch
3. Runs the baseline experiment
4. Initializes results.tsv
Initialize a new experiment with domain, target, evaluator, and git branch.
Creates the .autoresearch/{domain}/{name}/ directory structure.
Usage:
python scripts/setup_experiment.py [--config experiment.yaml]
python scripts/setup_experiment.py --domain ml|prompt|code|skill
python scripts/setup_experiment.py --domain engineering --name api-speed \
--target src/api/search.py --eval "pytest bench.py" \
--metric p50_ms --direction lower
python scripts/setup_experiment.py --domain marketing --name medium-ctr \
--target content/titles.md --eval "python evaluate.py" \
--metric ctr_score --direction higher --evaluator llm_judge_content
python scripts/setup_experiment.py --list # List all experiments
python scripts/setup_experiment.py --list-evaluators # List available evaluators
"""
import argparse
import os
import shutil
import subprocess
import sys
import time
from datetime import datetime
from pathlib import Path
DOMAINS = ["engineering", "marketing", "content", "prompts", "custom"]
DOMAINS = {
"ml": {
"target": "train.py",
"evaluate_cmd": "uv run train.py",
"metric": "val_bpb",
"metric_direction": "lower",
"time_budget_minutes": 5,
"metric_grep": "^val_bpb:",
},
"prompt": {
"target": "prompt.md",
"evaluate_cmd": "python evaluate.py",
"metric": "eval_score",
"metric_direction": "higher",
"time_budget_minutes": 2,
"metric_grep": "^eval_score:",
},
"code": {
"target": "src/module.py",
"evaluate_cmd": "python benchmark.py",
"metric": "p50_ms",
"metric_direction": "lower",
"time_budget_minutes": 10,
"metric_grep": "^p50_ms:",
},
"skill": {
"target": "SKILL.md",
"evaluate_cmd": "python scripts/skill_evaluator.py",
"metric": "pass_rate",
"metric_direction": "higher",
"time_budget_minutes": 5,
"metric_grep": "^pass_rate:",
},
}
EVALUATOR_DIR = Path(__file__).parent.parent / "evaluators"
DEFAULT_CONFIG = """# autoresearch global config
default_time_budget_minutes: 5
default_scope: project
dashboard_format: markdown
"""
GITIGNORE_CONTENT = """# autoresearch — experiment logs are local state
**/results.tsv
**/run.log
**/run.*.log
config.yaml
"""
def run_cmd(cmd, cwd=None, timeout=None):
"""Run a shell command and return (returncode, stdout, stderr)."""
"""Run shell command, return (returncode, stdout, stderr)."""
result = subprocess.run(
cmd, shell=True, capture_output=True, text=True,
cwd=cwd, timeout=timeout
@@ -67,188 +54,315 @@ def run_cmd(cmd, cwd=None, timeout=None):
return result.returncode, result.stdout.strip(), result.stderr.strip()
def check_git_repo(path):
"""Verify we're in a git repo."""
code, out, err = run_cmd("git rev-parse --is-inside-work-tree", cwd=path)
if code != 0:
print("✗ Not a git repository. Run: git init && git add . && git commit -m 'initial'")
def get_autoresearch_root(scope, project_root=None):
"""Get the .autoresearch root directory based on scope."""
if scope == "user":
return Path.home() / ".autoresearch"
return Path(project_root or ".") / ".autoresearch"
def init_root(root):
"""Initialize .autoresearch root if it doesn't exist."""
created = False
if not root.exists():
root.mkdir(parents=True)
created = True
print(f" Created {root}/")
config_file = root / "config.yaml"
if not config_file.exists():
config_file.write_text(DEFAULT_CONFIG)
print(f" Created {config_file}")
gitignore = root / ".gitignore"
if not gitignore.exists():
gitignore.write_text(GITIGNORE_CONTENT)
print(f" Created {gitignore}")
return created
def create_program_md(experiment_dir, domain, name, target, metric, direction, constraints=""):
"""Generate a program.md template for the experiment."""
direction_word = "Minimize" if direction == "lower" else "Maximize"
content = f"""# autoresearch — {name}
## Goal
{direction_word} `{metric}` on `{target}`. {"Lower" if direction == "lower" else "Higher"} is better.
## What the Agent Can Change
- Only `{target}` — this is the single file being optimized.
- Everything inside that file is fair game unless constrained below.
## What the Agent Cannot Change
- The evaluation script (`evaluate.py` or the eval command). It is read-only.
- Dependencies — do not add new packages or imports that aren't already available.
- Any other files in the project unless explicitly noted here.
{f"- Additional constraints: {constraints}" if constraints else ""}
## Strategy
1. First run: establish baseline. Do not change anything.
2. Profile/analyze the current state — understand why the metric is what it is.
3. Try the most obvious improvement first (low-hanging fruit).
4. If that works, push further in the same direction.
5. If stuck, try something orthogonal or radical.
6. Read the git log of previous experiments. Don't repeat failed approaches.
## Simplicity Rule
A small improvement that adds ugly complexity is NOT worth it.
Equal performance with simpler code IS worth it.
Removing code that gets the same results is the best outcome.
## Stop When
You don't stop. The human will interrupt you when they're satisfied.
If no improvement in 20+ consecutive runs, change strategy drastically.
"""
(experiment_dir / "program.md").write_text(content)
def create_config(experiment_dir, target, eval_cmd, metric, direction, time_budget):
"""Write experiment config."""
content = f"""target: {target}
evaluate_cmd: {eval_cmd}
metric: {metric}
metric_direction: {direction}
metric_grep: ^{metric}:
time_budget_minutes: {time_budget}
created: {datetime.now().strftime('%Y-%m-%d %H:%M')}
"""
(experiment_dir / "config.cfg").write_text(content)
def init_results_tsv(experiment_dir):
"""Create results.tsv with header."""
tsv = experiment_dir / "results.tsv"
if tsv.exists():
print(f" results.tsv already exists ({tsv.stat().st_size} bytes)")
return
tsv.write_text("commit\tmetric\tstatus\tdescription\n")
print(" Created results.tsv")
def copy_evaluator(experiment_dir, evaluator_name):
"""Copy a built-in evaluator to the experiment directory."""
source = EVALUATOR_DIR / f"{evaluator_name}.py"
if not source.exists():
print(f" Warning: evaluator '{evaluator_name}' not found in {EVALUATOR_DIR}")
print(f" Available: {', '.join(f.stem for f in EVALUATOR_DIR.glob('*.py'))}")
return False
print("✓ Git repository found")
dest = experiment_dir / "evaluate.py"
shutil.copy2(source, dest)
print(f" Copied evaluator: {evaluator_name}.py -> evaluate.py")
return True
def check_program_md(path):
"""Check program.md exists and has content."""
pm = Path(path) / "program.md"
if not pm.exists():
print("⚠ program.md not found. Creating template...")
return False
content = pm.read_text()
if len(content) < 100:
print("⚠ program.md looks empty. Fill it out before running experiments.")
return False
print(f"✓ program.md found ({len(content)} chars)")
return True
def check_target_file(path, target):
"""Check target file exists."""
tf = Path(path) / target
if not tf.exists():
print(f"✗ Target file not found: {target}")
return False
print(f"✓ Target file found: {target}")
return True
def check_evaluate_script(path):
"""Check evaluate.py exists."""
ev = Path(path) / "evaluate.py"
if not ev.exists():
print("⚠ evaluate.py not found. You need a fixed evaluation function.")
print(" Create evaluate.py that outputs: metric_name: <value>")
return False
print("✓ evaluate.py found")
return True
def create_branch(path, tag):
def create_branch(path, domain, name):
"""Create and checkout the experiment branch."""
branch = f"autoresearch/{tag}"
code, out, err = run_cmd(f"git checkout -b {branch}", cwd=path)
branch = f"autoresearch/{domain}/{name}"
code, _, err = run_cmd(f"git checkout -b {branch}", cwd=path)
if code != 0:
if "already exists" in err:
print(f" Branch '{branch}' already exists. Use a different tag.")
else:
print(f"✗ Failed to create branch: {err}")
print(f" Branch '{branch}' already exists. Checking out...")
run_cmd(f"git checkout {branch}", cwd=path)
return branch
print(f" Warning: could not create branch: {err}")
return None
print(f" Created branch: {branch}")
print(f" Created branch: {branch}")
return branch
def init_results_tsv(path):
"""Create results.tsv with header."""
tsv = Path(path) / "results.tsv"
if tsv.exists():
print(f"✓ results.tsv already exists ({tsv.stat().st_size} bytes)")
def list_experiments(root):
"""List all experiments across all domains."""
if not root.exists():
print("No experiments found. Run setup to create your first experiment.")
return
tsv.write_text("commit\tmetric\tstatus\tdescription\n")
print("✓ Created results.tsv")
experiments = []
for domain_dir in sorted(root.iterdir()):
if not domain_dir.is_dir() or domain_dir.name.startswith("."):
continue
for exp_dir in sorted(domain_dir.iterdir()):
if not exp_dir.is_dir():
continue
cfg_file = exp_dir / "config.cfg"
if not cfg_file.exists():
continue
config = {}
for line in cfg_file.read_text().splitlines():
if ":" in line:
k, v = line.split(":", 1)
config[k.strip()] = v.strip()
# Count results
tsv = exp_dir / "results.tsv"
runs = 0
if tsv.exists():
runs = max(0, len(tsv.read_text().splitlines()) - 1)
experiments.append({
"domain": domain_dir.name,
"name": exp_dir.name,
"target": config.get("target", "?"),
"metric": config.get("metric", "?"),
"runs": runs,
})
if not experiments:
print("No experiments found.")
return
print(f"\n{'DOMAIN':<15} {'EXPERIMENT':<25} {'TARGET':<30} {'METRIC':<15} {'RUNS':>5}")
print("-" * 95)
for e in experiments:
print(f"{e['domain']:<15} {e['name']:<25} {e['target']:<30} {e['metric']:<15} {e['runs']:>5}")
print(f"\nTotal: {len(experiments)} experiments")
def run_baseline(path, evaluate_cmd, metric_grep, time_budget_minutes):
"""Run the baseline experiment."""
print(f"\nRunning baseline experiment (~{time_budget_minutes} min)...")
timeout = time_budget_minutes * 60 * 2.5 # 2.5x budget as hard limit
def list_evaluators():
"""List available built-in evaluators."""
if not EVALUATOR_DIR.exists():
print("No evaluators directory found.")
return
t0 = time.time()
code, out, err = run_cmd(
f"{evaluate_cmd} > run.log 2>&1",
cwd=path,
timeout=timeout
)
elapsed = time.time() - t0
if code != 0:
print(f"✗ Baseline run failed after {elapsed:.0f}s. Check run.log")
return None
# Extract metric
grep_code, grep_out, _ = run_cmd(
f"grep '{metric_grep}' run.log | tail -1",
cwd=path
)
if not grep_out:
print("✗ Could not extract metric from run.log. Check metric_grep pattern.")
return None
metric_value = grep_out.split(":")[-1].strip()
print(f"✓ Baseline complete in {elapsed:.0f}s — metric: {metric_value}")
return metric_value
print(f"\nAvailable evaluators ({EVALUATOR_DIR}):\n")
for f in sorted(EVALUATOR_DIR.glob("*.py")):
# Read first docstring line
desc = ""
for line in f.read_text().splitlines():
if line.strip().startswith('"""') or line.strip().startswith("'''"):
continue
if line.strip() and not line.startswith("#!"):
desc = line.strip().strip('"').strip("'")
break
print(f" {f.stem:<25} {desc}")
def main():
parser = argparse.ArgumentParser(description="autoresearch-agent setup")
parser.add_argument("--domain", choices=list(DOMAINS.keys()), help="Experiment domain")
parser.add_argument("--domain", choices=DOMAINS, help="Experiment domain")
parser.add_argument("--name", help="Experiment name (e.g. api-speed, medium-ctr)")
parser.add_argument("--target", help="Target file to optimize")
parser.add_argument("--evaluate-cmd", help="Evaluation command")
parser.add_argument("--metric", help="Metric name")
parser.add_argument("--direction", choices=["lower", "higher"], default="lower")
parser.add_argument("--budget", type=int, default=5, help="Time budget in minutes")
parser.add_argument("--tag", help="Run tag (used in branch name)")
parser.add_argument("--eval", dest="eval_cmd", help="Evaluation command")
parser.add_argument("--metric", help="Metric name (must appear in eval output as 'name: value')")
parser.add_argument("--direction", choices=["lower", "higher"], default="lower",
help="Is lower or higher better?")
parser.add_argument("--time-budget", type=int, default=5, help="Minutes per experiment (default: 5)")
parser.add_argument("--evaluator", help="Built-in evaluator to copy (e.g. benchmark_speed)")
parser.add_argument("--scope", choices=["project", "user"], default="project",
help="Where to store experiments: project (./) or user (~/)")
parser.add_argument("--constraints", default="", help="Additional constraints for program.md")
parser.add_argument("--path", default=".", help="Project root path")
parser.add_argument("--skip-baseline", action="store_true")
parser.add_argument("--skip-baseline", action="store_true", help="Skip baseline run")
parser.add_argument("--skip-branch", action="store_true", help="Don't create git branch")
parser.add_argument("--list", action="store_true", help="List all experiments")
parser.add_argument("--list-evaluators", action="store_true", help="List available evaluators")
args = parser.parse_args()
path = Path(args.path).resolve()
print(f"\n🔬 autoresearch-agent setup")
print(f" Project: {path}")
print(f" Time: {datetime.now().strftime('%Y-%m-%d %H:%M')}\n")
project_root = Path(args.path).resolve()
# Get config from domain or args
if args.domain:
config = DOMAINS[args.domain].copy()
# List mode
if args.list:
root = get_autoresearch_root("project", project_root)
list_experiments(root)
user_root = get_autoresearch_root("user")
if user_root.exists() and user_root != root:
print(f"\n--- User-level experiments ({user_root}) ---")
list_experiments(user_root)
return
if args.list_evaluators:
list_evaluators()
return
# Validate required args for setup
if not all([args.domain, args.name, args.target, args.eval_cmd, args.metric]):
parser.error("Required: --domain, --name, --target, --eval, --metric")
root = get_autoresearch_root(args.scope, project_root)
print(f"\n autoresearch-agent setup")
print(f" Project: {project_root}")
print(f" Scope: {args.scope}")
print(f" Domain: {args.domain}")
print(f" Experiment: {args.name}")
print(f" Time: {datetime.now().strftime('%Y-%m-%d %H:%M')}\n")
# Check git
code, _, _ = run_cmd("git rev-parse --is-inside-work-tree", cwd=str(project_root))
if code != 0:
print(" Error: not a git repository. Run: git init && git add . && git commit -m 'initial'")
sys.exit(1)
print(" Git repository found")
# Check target file
target_path = project_root / args.target
if not target_path.exists():
print(f" Error: target file not found: {args.target}")
sys.exit(1)
print(f" Target file found: {args.target}")
# Init root
init_root(root)
# Create experiment directory
experiment_dir = root / args.domain / args.name
if experiment_dir.exists():
print(f" Warning: experiment '{args.domain}/{args.name}' already exists.")
print(f" Use --name with a different name, or delete {experiment_dir}")
sys.exit(1)
experiment_dir.mkdir(parents=True)
print(f" Created {experiment_dir}/")
# Create files
create_program_md(experiment_dir, args.domain, args.name,
args.target, args.metric, args.direction, args.constraints)
print(" Created program.md")
create_config(experiment_dir, args.target, args.eval_cmd,
args.metric, args.direction, args.time_budget)
print(" Created config.cfg")
init_results_tsv(experiment_dir)
# Copy evaluator if specified
if args.evaluator:
copy_evaluator(experiment_dir, args.evaluator)
# Create git branch
if not args.skip_branch:
create_branch(str(project_root), args.domain, args.name)
# Test evaluation command
print(f"\n Testing evaluation: {args.eval_cmd}")
code, out, err = run_cmd(args.eval_cmd, cwd=str(project_root), timeout=60)
if code != 0:
print(f" Warning: eval command failed (exit {code})")
if err:
print(f" stderr: {err[:200]}")
print(" Fix the eval command before running the experiment loop.")
else:
config = {
"target": args.target or "target.py",
"evaluate_cmd": args.evaluate_cmd or "python evaluate.py",
"metric": args.metric or "score",
"metric_direction": args.direction,
"time_budget_minutes": args.budget,
"metric_grep": f"^{args.metric or 'score'}:",
}
# Check metric is parseable
full_output = out + "\n" + err
metric_found = False
for line in full_output.splitlines():
if line.strip().startswith(f"{args.metric}:"):
metric_found = True
print(f" Eval works. Baseline: {line.strip()}")
break
if not metric_found:
print(f" Warning: eval ran but '{args.metric}:' not found in output.")
print(f" Make sure your eval command outputs: {args.metric}: <value>")
tag = args.tag or datetime.now().strftime("%b%d").lower()
# Validation checks
checks = [
check_git_repo(path),
check_program_md(path),
check_target_file(path, config["target"]),
check_evaluate_script(path),
]
if not all(checks):
print("\n⚠ Fix the above issues before running experiments.")
sys.exit(1)
# Create branch
branch = create_branch(path, tag)
if not branch:
sys.exit(1)
# Init results TSV
init_results_tsv(path)
# Save config for run_experiment.py
config_content = "\n".join(f"{k}: {v}" for k, v in config.items())
(path / ".autoresearch.cfg").write_text(config_content + "\n")
print("✓ Saved .autoresearch.cfg")
# Run baseline
if not args.skip_baseline:
baseline = run_baseline(
path,
config["evaluate_cmd"],
config["metric_grep"],
config["time_budget_minutes"]
)
if baseline:
# Log baseline to TSV
code, commit, _ = run_cmd("git rev-parse --short HEAD", cwd=path)
with open(path / "results.tsv", "a") as f:
f.write(f"{commit}\t{baseline}\tkeep\tbaseline\n")
print(f"✓ Baseline logged to results.tsv")
print(f"\n✅ Setup complete!")
print(f" Branch: {branch}")
print(f" Target: {config['target']}")
print(f" Metric: {config['metric']} ({config['metric_direction']} is better)")
print(f" Budget: {config['time_budget_minutes']} min/experiment")
print(f"\nTo start the autonomous loop:")
print(f" python scripts/run_experiment.py --loop")
print(f"\nOr run a single experiment:")
print(f" python scripts/run_experiment.py --single")
# Summary
print(f"\n Setup complete!")
print(f" Experiment: {args.domain}/{args.name}")
print(f" Target: {args.target}")
print(f" Metric: {args.metric} ({args.direction} is better)")
print(f" Budget: {args.time_budget} min/experiment")
if not args.skip_branch:
print(f" Branch: autoresearch/{args.domain}/{args.name}")
print(f"\n To start:")
print(f" python scripts/run_experiment.py --experiment {args.domain}/{args.name} --loop")
if __name__ == "__main__":