refactor: autoresearch-agent v2.0 — multi-experiment, multi-domain, real-world evaluators
Major rewrite based on deep study of Karpathy's autoresearch repo.
Architecture changes:
- Multi-experiment support: .autoresearch/{domain}/{name}/ structure
- Domain categories: engineering, marketing, content, prompts, custom
- Project-level (git-tracked, shareable) or user-level (~/.autoresearch/) scope
- The user chooses the scope when setting up an experiment, not at installation time
New evaluators (8 ready-to-use):
- Free: benchmark_speed, benchmark_size, test_pass_rate, build_speed, memory_usage
- LLM judge (uses existing subscription): llm_judge_content, llm_judge_prompt, llm_judge_copy
- LLM judges call user's CLI tool (claude/codex/gemini) — no extra API keys needed
Script improvements:
- setup_experiment.py: --domain, --scope, --evaluator, --list, --list-evaluators
- run_experiment.py: --experiment domain/name, --resume, --loop, --single
- log_results.py: --dashboard, --domain, --format csv|markdown|terminal, --output
Results export:
- Terminal (default), CSV, and Markdown formats
- Per-experiment, per-domain, or cross-experiment dashboard view
SKILL.md rewritten:
- Clear activation triggers (when the skill should activate)
- Practical examples for each domain
- Evaluator documentation with cost transparency
- Simplified loop protocol matching Karpathy's original philosophy
This commit is contained in:
@@ -1,65 +1,52 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
autoresearch-agent: Setup Wizard
|
||||
autoresearch-agent: Setup Experiment
|
||||
|
||||
Initializes a new research run:
|
||||
1. Validates the project structure
|
||||
2. Creates a git branch
|
||||
3. Runs the baseline experiment
|
||||
4. Initializes results.tsv
|
||||
Initialize a new experiment with domain, target, evaluator, and git branch.
|
||||
Creates the .autoresearch/{domain}/{name}/ directory structure.
|
||||
|
||||
Usage:
|
||||
python scripts/setup_experiment.py [--config experiment.yaml]
|
||||
python scripts/setup_experiment.py --domain ml|prompt|code|skill
|
||||
python scripts/setup_experiment.py --domain engineering --name api-speed \
|
||||
--target src/api/search.py --eval "pytest bench.py" \
|
||||
--metric p50_ms --direction lower
|
||||
|
||||
python scripts/setup_experiment.py --domain marketing --name medium-ctr \
|
||||
--target content/titles.md --eval "python evaluate.py" \
|
||||
--metric ctr_score --direction higher --evaluator llm_judge_content
|
||||
|
||||
python scripts/setup_experiment.py --list # List all experiments
|
||||
python scripts/setup_experiment.py --list-evaluators # List available evaluators
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
DOMAINS = ["engineering", "marketing", "content", "prompts", "custom"]
|
||||
|
||||
DOMAINS = {
|
||||
"ml": {
|
||||
"target": "train.py",
|
||||
"evaluate_cmd": "uv run train.py",
|
||||
"metric": "val_bpb",
|
||||
"metric_direction": "lower",
|
||||
"time_budget_minutes": 5,
|
||||
"metric_grep": "^val_bpb:",
|
||||
},
|
||||
"prompt": {
|
||||
"target": "prompt.md",
|
||||
"evaluate_cmd": "python evaluate.py",
|
||||
"metric": "eval_score",
|
||||
"metric_direction": "higher",
|
||||
"time_budget_minutes": 2,
|
||||
"metric_grep": "^eval_score:",
|
||||
},
|
||||
"code": {
|
||||
"target": "src/module.py",
|
||||
"evaluate_cmd": "python benchmark.py",
|
||||
"metric": "p50_ms",
|
||||
"metric_direction": "lower",
|
||||
"time_budget_minutes": 10,
|
||||
"metric_grep": "^p50_ms:",
|
||||
},
|
||||
"skill": {
|
||||
"target": "SKILL.md",
|
||||
"evaluate_cmd": "python scripts/skill_evaluator.py",
|
||||
"metric": "pass_rate",
|
||||
"metric_direction": "higher",
|
||||
"time_budget_minutes": 5,
|
||||
"metric_grep": "^pass_rate:",
|
||||
},
|
||||
}
|
||||
EVALUATOR_DIR = Path(__file__).parent.parent / "evaluators"
|
||||
|
||||
DEFAULT_CONFIG = """# autoresearch global config
|
||||
default_time_budget_minutes: 5
|
||||
default_scope: project
|
||||
dashboard_format: markdown
|
||||
"""
|
||||
|
||||
GITIGNORE_CONTENT = """# autoresearch — experiment logs are local state
|
||||
**/results.tsv
|
||||
**/run.log
|
||||
**/run.*.log
|
||||
config.yaml
|
||||
"""
|
||||
|
||||
|
||||
def run_cmd(cmd, cwd=None, timeout=None):
|
||||
"""Run a shell command and return (returncode, stdout, stderr)."""
|
||||
"""Run shell command, return (returncode, stdout, stderr)."""
|
||||
result = subprocess.run(
|
||||
cmd, shell=True, capture_output=True, text=True,
|
||||
cwd=cwd, timeout=timeout
|
||||
@@ -67,188 +54,315 @@ def run_cmd(cmd, cwd=None, timeout=None):
|
||||
return result.returncode, result.stdout.strip(), result.stderr.strip()
|
||||
|
||||
|
||||
def check_git_repo(path):
|
||||
"""Verify we're in a git repo."""
|
||||
code, out, err = run_cmd("git rev-parse --is-inside-work-tree", cwd=path)
|
||||
if code != 0:
|
||||
print("✗ Not a git repository. Run: git init && git add . && git commit -m 'initial'")
|
||||
def get_autoresearch_root(scope, project_root=None):
    """Resolve the ``.autoresearch`` directory for the requested scope.

    Args:
        scope: ``"user"`` selects the directory under the user's home; any
            other value selects the project-level directory.
        project_root: Base directory for project scope. Falls back to the
            current directory (``"."``) when omitted or empty.

    Returns:
        A ``pathlib.Path`` pointing at the ``.autoresearch`` directory.
    """
    base = Path.home() if scope == "user" else Path(project_root or ".")
    return base / ".autoresearch"
|
||||
|
||||
|
||||
def init_root(root):
    """Ensure the ``.autoresearch`` root directory and its bootstrap files exist.

    Creates ``root`` (including missing parents), then writes ``config.yaml``
    and ``.gitignore`` from the module-level templates when either file is
    absent. Existing files are never overwritten.

    Args:
        root: ``pathlib.Path`` of the ``.autoresearch`` directory.

    Returns:
        True if this call created ``root``, False if it already existed.
    """
    # EAFP instead of exists()-then-mkdir(): avoids a crash when another
    # process creates the directory between the check and the mkdir call.
    try:
        root.mkdir(parents=True)
    except FileExistsError:
        created = False
    else:
        created = True
        print(f" Created {root}/")

    config_file = root / "config.yaml"
    if not config_file.exists():
        config_file.write_text(DEFAULT_CONFIG, encoding="utf-8")
        print(f" Created {config_file}")

    gitignore = root / ".gitignore"
    if not gitignore.exists():
        # The template contains non-ASCII punctuation; pin UTF-8 so the file
        # does not depend on (or fail under) the platform's locale encoding.
        gitignore.write_text(GITIGNORE_CONTENT, encoding="utf-8")
        print(f" Created {gitignore}")

    return created
|
||||
|
||||
|
||||
def create_program_md(experiment_dir, domain, name, target, metric, direction, constraints=""):
    """Generate the ``program.md`` instruction template for an experiment.

    Args:
        experiment_dir: ``pathlib.Path`` of the experiment directory; the file
            is written to ``experiment_dir / "program.md"``.
        domain: Experiment domain. Accepted for interface compatibility; it is
            not used in the generated text.
        name: Experiment name, shown in the document title.
        target: Path of the single file being optimized.
        metric: Name of the metric to optimize.
        direction: ``"lower"`` to minimize the metric; any other value is
            treated as "higher is better".
        constraints: Optional free-text constraints appended as an extra
            bullet under "What the Agent Cannot Change".
    """
    minimizing = direction == "lower"
    direction_word = "Minimize" if minimizing else "Maximize"
    better = "Lower" if minimizing else "Higher"
    # Rendered as its own bullet; stays an empty line when no constraints are given.
    constraint_line = f"- Additional constraints: {constraints}" if constraints else ""

    content = f"""# autoresearch — {name}

## Goal
{direction_word} `{metric}` on `{target}`. {better} is better.

## What the Agent Can Change
- Only `{target}` — this is the single file being optimized.
- Everything inside that file is fair game unless constrained below.

## What the Agent Cannot Change
- The evaluation script (`evaluate.py` or the eval command). It is read-only.
- Dependencies — do not add new packages or imports that aren't already available.
- Any other files in the project unless explicitly noted here.
{constraint_line}

## Strategy
1. First run: establish baseline. Do not change anything.
2. Profile/analyze the current state — understand why the metric is what it is.
3. Try the most obvious improvement first (low-hanging fruit).
4. If that works, push further in the same direction.
5. If stuck, try something orthogonal or radical.
6. Read the git log of previous experiments. Don't repeat failed approaches.

## Simplicity Rule
A small improvement that adds ugly complexity is NOT worth it.
Equal performance with simpler code IS worth it.
Removing code that gets same results is the best outcome.

## Stop When
You don't stop. The human will interrupt you when they're satisfied.
If no improvement in 20+ consecutive runs, change strategy drastically.
"""
    # The template always contains em dashes, so the locale default encoding
    # is unsafe: it raises UnicodeEncodeError under an ASCII locale and gives
    # machine-dependent bytes elsewhere. Always write UTF-8.
    (experiment_dir / "program.md").write_text(content, encoding="utf-8")
|
||||
|
||||
|
||||
def create_config(experiment_dir, target, eval_cmd, metric, direction, time_budget):
    """Write the experiment's ``config.cfg`` as ``key: value`` lines.

    Args:
        experiment_dir: ``pathlib.Path`` of the experiment directory.
        target: Path of the file being optimized.
        eval_cmd: Shell command that runs the evaluation.
        metric: Metric name; also used to build the ``metric_grep`` pattern
            ``^<metric>:``.
        direction: Stored verbatim as ``metric_direction``.
        time_budget: Stored verbatim as ``time_budget_minutes``.
    """
    stamp = datetime.now().strftime('%Y-%m-%d %H:%M')
    entries = [
        ("target", target),
        ("evaluate_cmd", eval_cmd),
        ("metric", metric),
        ("metric_direction", direction),
        ("metric_grep", f"^{metric}:"),
        ("time_budget_minutes", time_budget),
        ("created", stamp),
    ]
    # One "key: value" line per entry, each terminated by a newline.
    rendered = "".join(f"{key}: {value}\n" for key, value in entries)
    destination = experiment_dir / "config.cfg"
    destination.write_text(rendered)
|
||||
|
||||
|
||||
def init_results_tsv(experiment_dir):
    """Create ``results.tsv`` with its header row unless it already exists.

    Args:
        experiment_dir: ``pathlib.Path`` of the experiment directory.

    An existing file is left untouched; only its size is reported.
    """
    results_file = experiment_dir / "results.tsv"
    if not results_file.exists():
        header = "\t".join(["commit", "metric", "status", "description"]) + "\n"
        results_file.write_text(header)
        print(" Created results.tsv")
        return
    size = results_file.stat().st_size
    print(f" results.tsv already exists ({size} bytes)")
|
||||
|
||||
|
||||
def copy_evaluator(experiment_dir, evaluator_name):
    """Copy a built-in evaluator into the experiment directory as ``evaluate.py``.

    Args:
        experiment_dir: ``pathlib.Path`` of the experiment directory.
        evaluator_name: Evaluator file stem; ``<evaluator_name>.py`` is looked
            up in ``EVALUATOR_DIR``.

    Returns:
        True if the evaluator was copied, False if no such evaluator exists
        (a warning listing the available evaluators is printed).
    """
    source = EVALUATOR_DIR / f"{evaluator_name}.py"
    if not source.exists():
        # Sort so the hint is stable; glob() order is filesystem-dependent.
        available = sorted(f.stem for f in EVALUATOR_DIR.glob("*.py"))
        print(f" Warning: evaluator '{evaluator_name}' not found in {EVALUATOR_DIR}")
        print(f" Available: {', '.join(available)}")
        return False

    # The stray "Git repository found" message that sat here belonged to the
    # git check, not to evaluator copying, and has been dropped.
    dest = experiment_dir / "evaluate.py"
    shutil.copy2(source, dest)
    print(f" Copied evaluator: {evaluator_name}.py -> evaluate.py")
    return True
|
||||
|
||||
|
||||
def check_program_md(path):
    """Report whether ``program.md`` under ``path`` exists and is filled in.

    Args:
        path: Directory expected to contain ``program.md``.

    Returns:
        True when the file exists and holds at least 100 characters; False
        when it is missing or shorter than that. A status line is printed in
        every case.
    """
    program_file = Path(path) / "program.md"
    if program_file.exists():
        text = program_file.read_text()
        size = len(text)
        if size >= 100:
            print(f"✓ program.md found ({size} chars)")
            return True
        print("⚠ program.md looks empty. Fill it out before running experiments.")
        return False
    print("⚠ program.md not found. Creating template...")
    return False
|
||||
|
||||
|
||||
def check_target_file(path, target):
    """Report whether the optimization target exists under ``path``.

    Args:
        path: Base directory.
        target: Target file path, relative to ``path``.

    Returns:
        True if ``path / target`` exists, otherwise False. A status line is
        printed in both cases.
    """
    candidate = Path(path) / target
    if candidate.exists():
        print(f"✓ Target file found: {target}")
        return True
    print(f"✗ Target file not found: {target}")
    return False
|
||||
|
||||
|
||||
def check_evaluate_script(path):
    """Report whether ``evaluate.py`` exists under ``path``.

    Args:
        path: Directory expected to contain ``evaluate.py``.

    Returns:
        True if the script exists; False otherwise, after printing a hint
        about the expected ``metric_name: <value>`` output.
    """
    script = Path(path) / "evaluate.py"
    if script.exists():
        print("✓ evaluate.py found")
        return True
    print("⚠ evaluate.py not found. You need a fixed evaluation function.")
    print(" Create evaluate.py that outputs: metric_name: <value>")
    return False
|
||||
|
||||
|
||||
def create_branch(path, tag):
|
||||
def create_branch(path, domain, name):
|
||||
"""Create and checkout the experiment branch."""
|
||||
branch = f"autoresearch/{tag}"
|
||||
code, out, err = run_cmd(f"git checkout -b {branch}", cwd=path)
|
||||
branch = f"autoresearch/{domain}/{name}"
|
||||
code, _, err = run_cmd(f"git checkout -b {branch}", cwd=path)
|
||||
if code != 0:
|
||||
if "already exists" in err:
|
||||
print(f"✗ Branch '{branch}' already exists. Use a different tag.")
|
||||
else:
|
||||
print(f"✗ Failed to create branch: {err}")
|
||||
print(f" Branch '{branch}' already exists. Checking out...")
|
||||
run_cmd(f"git checkout {branch}", cwd=path)
|
||||
return branch
|
||||
print(f" Warning: could not create branch: {err}")
|
||||
return None
|
||||
print(f"✓ Created branch: {branch}")
|
||||
print(f" Created branch: {branch}")
|
||||
return branch
|
||||
|
||||
|
||||
def init_results_tsv(path):
|
||||
"""Create results.tsv with header."""
|
||||
tsv = Path(path) / "results.tsv"
|
||||
if tsv.exists():
|
||||
print(f"✓ results.tsv already exists ({tsv.stat().st_size} bytes)")
|
||||
def list_experiments(root):
|
||||
"""List all experiments across all domains."""
|
||||
if not root.exists():
|
||||
print("No experiments found. Run setup to create your first experiment.")
|
||||
return
|
||||
tsv.write_text("commit\tmetric\tstatus\tdescription\n")
|
||||
print("✓ Created results.tsv")
|
||||
|
||||
experiments = []
|
||||
for domain_dir in sorted(root.iterdir()):
|
||||
if not domain_dir.is_dir() or domain_dir.name.startswith("."):
|
||||
continue
|
||||
for exp_dir in sorted(domain_dir.iterdir()):
|
||||
if not exp_dir.is_dir():
|
||||
continue
|
||||
cfg_file = exp_dir / "config.cfg"
|
||||
if not cfg_file.exists():
|
||||
continue
|
||||
config = {}
|
||||
for line in cfg_file.read_text().splitlines():
|
||||
if ":" in line:
|
||||
k, v = line.split(":", 1)
|
||||
config[k.strip()] = v.strip()
|
||||
|
||||
# Count results
|
||||
tsv = exp_dir / "results.tsv"
|
||||
runs = 0
|
||||
if tsv.exists():
|
||||
runs = max(0, len(tsv.read_text().splitlines()) - 1)
|
||||
|
||||
experiments.append({
|
||||
"domain": domain_dir.name,
|
||||
"name": exp_dir.name,
|
||||
"target": config.get("target", "?"),
|
||||
"metric": config.get("metric", "?"),
|
||||
"runs": runs,
|
||||
})
|
||||
|
||||
if not experiments:
|
||||
print("No experiments found.")
|
||||
return
|
||||
|
||||
print(f"\n{'DOMAIN':<15} {'EXPERIMENT':<25} {'TARGET':<30} {'METRIC':<15} {'RUNS':>5}")
|
||||
print("-" * 95)
|
||||
for e in experiments:
|
||||
print(f"{e['domain']:<15} {e['name']:<25} {e['target']:<30} {e['metric']:<15} {e['runs']:>5}")
|
||||
print(f"\nTotal: {len(experiments)} experiments")
|
||||
|
||||
|
||||
def run_baseline(path, evaluate_cmd, metric_grep, time_budget_minutes):
|
||||
"""Run the baseline experiment."""
|
||||
print(f"\nRunning baseline experiment (~{time_budget_minutes} min)...")
|
||||
timeout = time_budget_minutes * 60 * 2.5 # 2.5x budget as hard limit
|
||||
def list_evaluators():
|
||||
"""List available built-in evaluators."""
|
||||
if not EVALUATOR_DIR.exists():
|
||||
print("No evaluators directory found.")
|
||||
return
|
||||
|
||||
t0 = time.time()
|
||||
code, out, err = run_cmd(
|
||||
f"{evaluate_cmd} > run.log 2>&1",
|
||||
cwd=path,
|
||||
timeout=timeout
|
||||
)
|
||||
elapsed = time.time() - t0
|
||||
|
||||
if code != 0:
|
||||
print(f"✗ Baseline run failed after {elapsed:.0f}s. Check run.log")
|
||||
return None
|
||||
|
||||
# Extract metric
|
||||
grep_code, grep_out, _ = run_cmd(
|
||||
f"grep '{metric_grep}' run.log | tail -1",
|
||||
cwd=path
|
||||
)
|
||||
if not grep_out:
|
||||
print("✗ Could not extract metric from run.log. Check metric_grep pattern.")
|
||||
return None
|
||||
|
||||
metric_value = grep_out.split(":")[-1].strip()
|
||||
print(f"✓ Baseline complete in {elapsed:.0f}s — metric: {metric_value}")
|
||||
return metric_value
|
||||
print(f"\nAvailable evaluators ({EVALUATOR_DIR}):\n")
|
||||
for f in sorted(EVALUATOR_DIR.glob("*.py")):
|
||||
# Read first docstring line
|
||||
desc = ""
|
||||
for line in f.read_text().splitlines():
|
||||
if line.strip().startswith('"""') or line.strip().startswith("'''"):
|
||||
continue
|
||||
if line.strip() and not line.startswith("#!"):
|
||||
desc = line.strip().strip('"').strip("'")
|
||||
break
|
||||
print(f" {f.stem:<25} {desc}")
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="autoresearch-agent setup")
|
||||
parser.add_argument("--domain", choices=list(DOMAINS.keys()), help="Experiment domain")
|
||||
parser.add_argument("--domain", choices=DOMAINS, help="Experiment domain")
|
||||
parser.add_argument("--name", help="Experiment name (e.g. api-speed, medium-ctr)")
|
||||
parser.add_argument("--target", help="Target file to optimize")
|
||||
parser.add_argument("--evaluate-cmd", help="Evaluation command")
|
||||
parser.add_argument("--metric", help="Metric name")
|
||||
parser.add_argument("--direction", choices=["lower", "higher"], default="lower")
|
||||
parser.add_argument("--budget", type=int, default=5, help="Time budget in minutes")
|
||||
parser.add_argument("--tag", help="Run tag (used in branch name)")
|
||||
parser.add_argument("--eval", dest="eval_cmd", help="Evaluation command")
|
||||
parser.add_argument("--metric", help="Metric name (must appear in eval output as 'name: value')")
|
||||
parser.add_argument("--direction", choices=["lower", "higher"], default="lower",
|
||||
help="Is lower or higher better?")
|
||||
parser.add_argument("--time-budget", type=int, default=5, help="Minutes per experiment (default: 5)")
|
||||
parser.add_argument("--evaluator", help="Built-in evaluator to copy (e.g. benchmark_speed)")
|
||||
parser.add_argument("--scope", choices=["project", "user"], default="project",
|
||||
help="Where to store experiments: project (./) or user (~/)")
|
||||
parser.add_argument("--constraints", default="", help="Additional constraints for program.md")
|
||||
parser.add_argument("--path", default=".", help="Project root path")
|
||||
parser.add_argument("--skip-baseline", action="store_true")
|
||||
parser.add_argument("--skip-baseline", action="store_true", help="Skip baseline run")
|
||||
parser.add_argument("--skip-branch", action="store_true", help="Don't create git branch")
|
||||
parser.add_argument("--list", action="store_true", help="List all experiments")
|
||||
parser.add_argument("--list-evaluators", action="store_true", help="List available evaluators")
|
||||
args = parser.parse_args()
|
||||
|
||||
path = Path(args.path).resolve()
|
||||
print(f"\n🔬 autoresearch-agent setup")
|
||||
print(f" Project: {path}")
|
||||
print(f" Time: {datetime.now().strftime('%Y-%m-%d %H:%M')}\n")
|
||||
project_root = Path(args.path).resolve()
|
||||
|
||||
# Get config from domain or args
|
||||
if args.domain:
|
||||
config = DOMAINS[args.domain].copy()
|
||||
# List mode
|
||||
if args.list:
|
||||
root = get_autoresearch_root("project", project_root)
|
||||
list_experiments(root)
|
||||
user_root = get_autoresearch_root("user")
|
||||
if user_root.exists() and user_root != root:
|
||||
print(f"\n--- User-level experiments ({user_root}) ---")
|
||||
list_experiments(user_root)
|
||||
return
|
||||
|
||||
if args.list_evaluators:
|
||||
list_evaluators()
|
||||
return
|
||||
|
||||
# Validate required args for setup
|
||||
if not all([args.domain, args.name, args.target, args.eval_cmd, args.metric]):
|
||||
parser.error("Required: --domain, --name, --target, --eval, --metric")
|
||||
|
||||
root = get_autoresearch_root(args.scope, project_root)
|
||||
|
||||
print(f"\n autoresearch-agent setup")
|
||||
print(f" Project: {project_root}")
|
||||
print(f" Scope: {args.scope}")
|
||||
print(f" Domain: {args.domain}")
|
||||
print(f" Experiment: {args.name}")
|
||||
print(f" Time: {datetime.now().strftime('%Y-%m-%d %H:%M')}\n")
|
||||
|
||||
# Check git
|
||||
code, _, _ = run_cmd("git rev-parse --is-inside-work-tree", cwd=str(project_root))
|
||||
if code != 0:
|
||||
print(" Error: not a git repository. Run: git init && git add . && git commit -m 'initial'")
|
||||
sys.exit(1)
|
||||
print(" Git repository found")
|
||||
|
||||
# Check target file
|
||||
target_path = project_root / args.target
|
||||
if not target_path.exists():
|
||||
print(f" Error: target file not found: {args.target}")
|
||||
sys.exit(1)
|
||||
print(f" Target file found: {args.target}")
|
||||
|
||||
# Init root
|
||||
init_root(root)
|
||||
|
||||
# Create experiment directory
|
||||
experiment_dir = root / args.domain / args.name
|
||||
if experiment_dir.exists():
|
||||
print(f" Warning: experiment '{args.domain}/{args.name}' already exists.")
|
||||
print(f" Use --name with a different name, or delete {experiment_dir}")
|
||||
sys.exit(1)
|
||||
experiment_dir.mkdir(parents=True)
|
||||
print(f" Created {experiment_dir}/")
|
||||
|
||||
# Create files
|
||||
create_program_md(experiment_dir, args.domain, args.name,
|
||||
args.target, args.metric, args.direction, args.constraints)
|
||||
print(" Created program.md")
|
||||
|
||||
create_config(experiment_dir, args.target, args.eval_cmd,
|
||||
args.metric, args.direction, args.time_budget)
|
||||
print(" Created config.cfg")
|
||||
|
||||
init_results_tsv(experiment_dir)
|
||||
|
||||
# Copy evaluator if specified
|
||||
if args.evaluator:
|
||||
copy_evaluator(experiment_dir, args.evaluator)
|
||||
|
||||
# Create git branch
|
||||
if not args.skip_branch:
|
||||
create_branch(str(project_root), args.domain, args.name)
|
||||
|
||||
# Test evaluation command
|
||||
print(f"\n Testing evaluation: {args.eval_cmd}")
|
||||
code, out, err = run_cmd(args.eval_cmd, cwd=str(project_root), timeout=60)
|
||||
if code != 0:
|
||||
print(f" Warning: eval command failed (exit {code})")
|
||||
if err:
|
||||
print(f" stderr: {err[:200]}")
|
||||
print(" Fix the eval command before running the experiment loop.")
|
||||
else:
|
||||
config = {
|
||||
"target": args.target or "target.py",
|
||||
"evaluate_cmd": args.evaluate_cmd or "python evaluate.py",
|
||||
"metric": args.metric or "score",
|
||||
"metric_direction": args.direction,
|
||||
"time_budget_minutes": args.budget,
|
||||
"metric_grep": f"^{args.metric or 'score'}:",
|
||||
}
|
||||
# Check metric is parseable
|
||||
full_output = out + "\n" + err
|
||||
metric_found = False
|
||||
for line in full_output.splitlines():
|
||||
if line.strip().startswith(f"{args.metric}:"):
|
||||
metric_found = True
|
||||
print(f" Eval works. Baseline: {line.strip()}")
|
||||
break
|
||||
if not metric_found:
|
||||
print(f" Warning: eval ran but '{args.metric}:' not found in output.")
|
||||
print(f" Make sure your eval command outputs: {args.metric}: <value>")
|
||||
|
||||
tag = args.tag or datetime.now().strftime("%b%d").lower()
|
||||
|
||||
# Validation checks
|
||||
checks = [
|
||||
check_git_repo(path),
|
||||
check_program_md(path),
|
||||
check_target_file(path, config["target"]),
|
||||
check_evaluate_script(path),
|
||||
]
|
||||
|
||||
if not all(checks):
|
||||
print("\n⚠ Fix the above issues before running experiments.")
|
||||
sys.exit(1)
|
||||
|
||||
# Create branch
|
||||
branch = create_branch(path, tag)
|
||||
if not branch:
|
||||
sys.exit(1)
|
||||
|
||||
# Init results TSV
|
||||
init_results_tsv(path)
|
||||
|
||||
# Save config for run_experiment.py
|
||||
config_content = "\n".join(f"{k}: {v}" for k, v in config.items())
|
||||
(path / ".autoresearch.cfg").write_text(config_content + "\n")
|
||||
print("✓ Saved .autoresearch.cfg")
|
||||
|
||||
# Run baseline
|
||||
if not args.skip_baseline:
|
||||
baseline = run_baseline(
|
||||
path,
|
||||
config["evaluate_cmd"],
|
||||
config["metric_grep"],
|
||||
config["time_budget_minutes"]
|
||||
)
|
||||
if baseline:
|
||||
# Log baseline to TSV
|
||||
code, commit, _ = run_cmd("git rev-parse --short HEAD", cwd=path)
|
||||
with open(path / "results.tsv", "a") as f:
|
||||
f.write(f"{commit}\t{baseline}\tkeep\tbaseline\n")
|
||||
print(f"✓ Baseline logged to results.tsv")
|
||||
|
||||
print(f"\n✅ Setup complete!")
|
||||
print(f" Branch: {branch}")
|
||||
print(f" Target: {config['target']}")
|
||||
print(f" Metric: {config['metric']} ({config['metric_direction']} is better)")
|
||||
print(f" Budget: {config['time_budget_minutes']} min/experiment")
|
||||
print(f"\nTo start the autonomous loop:")
|
||||
print(f" python scripts/run_experiment.py --loop")
|
||||
print(f"\nOr run a single experiment:")
|
||||
print(f" python scripts/run_experiment.py --single")
|
||||
# Summary
|
||||
print(f"\n Setup complete!")
|
||||
print(f" Experiment: {args.domain}/{args.name}")
|
||||
print(f" Target: {args.target}")
|
||||
print(f" Metric: {args.metric} ({args.direction} is better)")
|
||||
print(f" Budget: {args.time_budget} min/experiment")
|
||||
if not args.skip_branch:
|
||||
print(f" Branch: autoresearch/{args.domain}/{args.name}")
|
||||
print(f"\n To start:")
|
||||
print(f" python scripts/run_experiment.py --experiment {args.domain}/{args.name} --loop")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
Reference in New Issue
Block a user