feat: add autoresearch-agent — autonomous experiment loop for ML, prompt, code & skill optimization
Inspired by Karpathy's autoresearch. The agent modifies a target file, runs a fixed evaluation, keeps improvements (git commit), discards failures (git reset), and loops indefinitely — no human in the loop.

Includes:
- SKILL.md with setup wizard, 4 domain configs, experiment loop protocol
- 3 stdlib-only Python scripts (setup, run, log — 687 lines)
- Reference docs: experiment domains guide, program.md templates

Domains: ML training (val_bpb), prompt engineering (eval_score), code performance (p50_ms), agent skill optimization (pass_rate).

Cherry-picked from feat/autoresearch-agent and rebased onto dev.

Fixes: timeout inconsistency (2x→2.5x), results.tsv tracking clarity, zero-metric edge case, installation section aligned with multi-tool support.
This commit is contained in:
126
engineering/autoresearch-agent/scripts/log_results.py
Normal file
126
engineering/autoresearch-agent/scripts/log_results.py
Normal file
@@ -0,0 +1,126 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
autoresearch-agent: Results Logger
|
||||
|
||||
View and analyze experiment results from results.tsv.
|
||||
|
||||
Usage:
|
||||
python scripts/log_results.py --summary # Print progress table
|
||||
python scripts/log_results.py --best # Show best result
|
||||
python scripts/log_results.py --history # Full experiment history
|
||||
python scripts/log_results.py --record commit val status desc # Add entry manually
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def load_results(path):
    """Parse ``results.tsv`` under *path* into a list of result dicts.

    The first line of the file is treated as a header and skipped. Rows with
    fewer than four tab-separated fields are ignored; extra fields beyond the
    fourth are dropped.

    Returns:
        A list of dicts with keys ``commit``, ``metric``, ``status`` and
        ``description``. ``metric`` is a float, or ``None`` when the column
        is ``"N/A"`` or not parseable as a number. An empty list is returned
        when the file does not exist.
    """
    tsv_file = Path(path) / "results.tsv"
    if not tsv_file.exists():
        return []

    def _parse_metric(raw):
        # "N/A" marks runs that produced no metric (crashes, timeouts).
        if raw == "N/A":
            return None
        try:
            return float(raw)
        except ValueError:
            return None

    rows = []
    for raw_line in tsv_file.read_text().splitlines()[1:]:
        fields = raw_line.split("\t")
        if len(fields) < 4:
            continue
        commit, metric_raw, status, description = fields[:4]
        rows.append({
            "commit": commit,
            "metric": _parse_metric(metric_raw),
            "status": status,
            "description": description,
        })
    return rows
|
||||
|
||||
|
||||
def print_summary(results, metric_name="metric", direction="lower"):
    """Print keep/discard/crash counts and the baseline-to-best metric change.

    Args:
        results: List of result dicts as produced by ``load_results`` (keys
            ``commit``, ``metric``, ``status``, ``description``).
        metric_name: Label printed above the metric section.
        direction: ``"lower"`` if smaller metric values are better, anything
            else is treated as "higher is better".

    The baseline is the first ``keep`` row that has a numeric metric; the best
    value is the min (or max) over all such rows.
    """
    if not results:
        print("No experiments logged yet.")
        return

    keeps = [r for r in results if r["status"] == "keep"]
    discards = [r for r in results if r["status"] == "discard"]
    crashes = [r for r in results if r["status"] == "crash"]

    print(f"\n{'─'*60}")
    print(f" autoresearch-agent — Results Summary")
    print(f"{'─'*60}")
    print(f" Total experiments: {len(results)}")
    print(f" ✅ Keep: {len(keeps):3d} ({len(keeps)/max(len(results),1)*100:.0f}%)")
    print(f" ❌ Discard: {len(discards):3d} ({len(discards)/max(len(results),1)*100:.0f}%)")
    print(f" 💥 Crash: {len(crashes):3d} ({len(crashes)/max(len(results),1)*100:.0f}%)")

    valid = [r for r in keeps if r["metric"] is not None]
    if valid:
        baseline = valid[0]["metric"]
        pick = min if direction == "lower" else max
        # min()/max() return the first extreme element, i.e. the earliest
        # run that reached the best value.
        best_run = pick(valid, key=lambda r: r["metric"])
        best = best_run["metric"]
        gain = (baseline - best) if direction == "lower" else (best - baseline)

        print(f"\n {metric_name}:")
        print(f" Baseline: {baseline:.6f}")
        print(f" Best: {best:.6f} (commit: {best_run['commit']})")
        if baseline == 0:
            # A relative change is undefined against a zero baseline; the
            # previous code raised ZeroDivisionError here.
            print(f" Change: n/a (baseline is 0)")
        else:
            # abs() keeps the sign meaningful for negative baselines.
            improvement = gain / abs(baseline) * 100
            print(f" Change: {improvement:+.2f}%")

    print(f"{'─'*60}\n")
|
||||
|
||||
|
||||
def print_history(results):
    """Print one line per experiment: commit, metric, status icon, description.

    Rows without a numeric metric show ``crash`` in the metric column, and
    descriptions are truncated to 40 characters. Unknown statuses are shown
    as ``?``.
    """
    if not results:
        print("No experiments logged yet.")
        return

    icons = {"keep": "✅", "discard": "❌", "crash": "💥"}

    print(f"\n{'COMMIT':8} {'METRIC':10} {'STATUS':8} DESCRIPTION")
    print("─" * 60)
    for entry in results:
        value = entry["metric"]
        if value is None:
            metric_str = "crash "
        else:
            metric_str = f"{value:.6f}"
        status_icon = icons.get(entry["status"], "?")
        short_desc = entry["description"][:40]
        print(f"{entry['commit']:8} {metric_str:10} {status_icon} {short_desc}")
|
||||
|
||||
|
||||
def main():
    """Command-line entry point for viewing or appending to ``results.tsv``.

    Modes (first match wins): ``--record`` appends one row and exits,
    ``--history`` prints every row, ``--best`` prints the best kept run, and
    the default prints the summary table.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--summary", action="store_true")
    parser.add_argument("--best", action="store_true")
    parser.add_argument("--history", action="store_true")
    parser.add_argument("--record", nargs=4, metavar=("COMMIT", "METRIC", "STATUS", "DESC"))
    parser.add_argument("--path", default=".")
    parser.add_argument("--metric", default="metric")
    parser.add_argument("--direction", default="lower", choices=["lower", "higher"])
    args = parser.parse_args()

    path = Path(args.path).resolve()

    if args.record:
        # Tabs and newlines inside a field would split or break the TSV row,
        # so they are flattened to spaces before writing.
        commit, metric, status, desc = (
            field.replace("\t", " ").replace("\r", " ").replace("\n", " ")
            for field in args.record
        )
        tsv = path / "results.tsv"
        if not tsv.exists():
            tsv.write_text("commit\tmetric\tstatus\tdescription\n")
        with open(tsv, "a") as f:
            f.write(f"{commit}\t{metric}\t{status}\t{desc}\n")
        print(f"✓ Logged: {commit} {metric} {status}")
        return

    results = load_results(path)

    if args.history:
        print_history(results)
    elif args.best:
        # Compare against None explicitly: a metric of exactly 0.0 is a valid
        # result and must not be filtered out by truthiness.
        keeps = [
            r for r in results
            if r["status"] == "keep" and r["metric"] is not None
        ]
        if not keeps:
            print("No successful experiments yet.")
            return
        pick = min if args.direction == "lower" else max
        best = pick(keeps, key=lambda r: r["metric"])
        print(f"Best: {best['metric']:.6f} (commit: {best['commit']}) — {best['description']}")
    else:
        print_summary(results, args.metric, args.direction)


if __name__ == "__main__":
    main()
|
||||
310
engineering/autoresearch-agent/scripts/run_experiment.py
Normal file
310
engineering/autoresearch-agent/scripts/run_experiment.py
Normal file
@@ -0,0 +1,310 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
autoresearch-agent: Experiment Runner
|
||||
|
||||
Executes the autonomous experiment loop:
|
||||
- Reads .autoresearch.cfg for project config
|
||||
- Runs the target evaluation
|
||||
- Keeps improvements (git commit) or discards failures (git reset)
|
||||
- Logs everything to results.tsv
|
||||
- Loops indefinitely until interrupted
|
||||
|
||||
Usage:
|
||||
python scripts/run_experiment.py --loop # Run forever
|
||||
python scripts/run_experiment.py --single # Run one experiment
|
||||
python scripts/run_experiment.py --dry-run # Show what would happen
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import signal
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def load_config(path):
    """Read ``.autoresearch.cfg`` under *path* into a dict of strings.

    Each line is split on its first ``:`` into a key and a value, both
    stripped of surrounding whitespace; lines without a ``:`` are ignored.
    Prints an error and exits with status 1 when the file is missing.
    """
    cfg_file = Path(path) / ".autoresearch.cfg"
    if not cfg_file.exists():
        print("✗ No .autoresearch.cfg found. Run setup_experiment.py first.")
        sys.exit(1)

    pairs = (
        entry.split(":", 1)
        for entry in cfg_file.read_text().splitlines()
        if ":" in entry
    )
    return {key.strip(): value.strip() for key, value in pairs}
|
||||
|
||||
|
||||
def run_cmd(cmd, cwd=None, timeout=None):
    """Run *cmd* through the shell and return ``(returncode, stdout, stderr)``.

    Both output streams are captured as text and stripped of surrounding
    whitespace. ``subprocess.TimeoutExpired`` propagates to the caller when
    *timeout* (seconds) elapses.
    """
    completed = subprocess.run(
        cmd,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        cwd=cwd,
        timeout=timeout,
    )
    stdout_text = completed.stdout.strip()
    stderr_text = completed.stderr.strip()
    return completed.returncode, stdout_text, stderr_text
|
||||
|
||||
|
||||
def get_current_commit(path):
    """Return the abbreviated hash of HEAD for the repository at *path*.

    The git exit status is not checked; whatever ``git rev-parse`` printed on
    stdout (possibly an empty string) is returned.
    """
    outcome = run_cmd("git rev-parse --short HEAD", cwd=path)
    return outcome[1]
|
||||
|
||||
|
||||
def get_current_metric(path, metric_grep):
    """Return the metric of the most recent ``keep`` row in ``results.tsv``.

    Rows are matched by the substring ``<TAB>keep<TAB>``. Returns ``None`` when
    the file is missing, has no kept row, or the metric column of the last
    kept row is not a number. *metric_grep* is accepted but not used here.
    """
    tsv = Path(path) / "results.tsv"
    if not tsv.exists():
        return None

    latest_keep = None
    for row in tsv.read_text().splitlines():
        if "\tkeep\t" in row:
            latest_keep = row
    if latest_keep is None:
        return None

    fields = latest_keep.split("\t")
    try:
        return float(fields[1])
    except (ValueError, IndexError):
        return None
|
||||
|
||||
|
||||
def run_evaluation(path, evaluate_cmd, time_budget_minutes):
    """Run the evaluation command with a hard wall-clock limit.

    The command's stdout and stderr are redirected to ``run.log`` inside
    *path*. The hard limit is 2.5x the time budget.

    Args:
        path: Working directory for the command.
        evaluate_cmd: Shell command line to execute.
        time_budget_minutes: Nominal budget in minutes.

    Returns:
        ``(returncode, elapsed_seconds)``; ``returncode`` is ``-1`` when the
        hard limit was hit.
    """
    hard_limit = time_budget_minutes * 60 * 2.5  # 2.5x as hard timeout
    posix = os.name == "posix"
    t0 = time.time()

    # With shell=True a plain timeout only kills the shell, which can leave
    # the real evaluation process running. On POSIX the command therefore
    # gets its own session/process group so the whole tree can be killed.
    proc = subprocess.Popen(
        f"{evaluate_cmd} > run.log 2>&1",
        shell=True,
        cwd=path,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        start_new_session=posix,
    )
    try:
        code = proc.wait(timeout=hard_limit)
    except subprocess.TimeoutExpired:
        code = -1  # -1 = timeout
    finally:
        # Also reached when a signal handler raises (e.g. Ctrl+C): the child
        # is in its own group and would otherwise be orphaned.
        if proc.poll() is None:
            if posix:
                try:
                    os.killpg(proc.pid, signal.SIGKILL)
                except OSError:
                    proc.kill()
            else:
                proc.kill()
            proc.wait()

    return code, time.time() - t0
|
||||
|
||||
|
||||
def extract_metric(path, metric_grep):
    """Extract the metric value from ``run.log`` in *path*.

    The last line of ``run.log`` matching the grep pattern *metric_grep* is
    taken, and the text after its final ``:`` is parsed as a float.

    Returns:
        The metric as a float, or ``None`` when no line matches or the value
        is not a number.
    """
    import shlex

    # shlex.quote keeps patterns containing quotes or shell metacharacters
    # intact; "-e" stops a pattern starting with "-" being read as an option.
    _, out, _ = run_cmd(
        f"grep -e {shlex.quote(metric_grep)} run.log | tail -1",
        cwd=path
    )
    if not out:
        return None
    try:
        return float(out.split(":")[-1].strip())
    except ValueError:
        return None
|
||||
|
||||
|
||||
def is_improvement(new_val, old_val, direction):
    """Return True when *new_val* strictly beats *old_val*.

    With no previous value (``old_val is None``) any result counts as an
    improvement. ``direction == "lower"`` means smaller is better; any other
    direction means larger is better. Ties are not improvements.
    """
    if old_val is None:
        return True
    return new_val < old_val if direction == "lower" else new_val > old_val
|
||||
|
||||
|
||||
def log_result(path, commit, metric_val, status, description):
    """Append one tab-separated row to ``results.tsv`` under *path*.

    The metric is written with six decimal places, or as ``N/A`` when
    *metric_val* is ``None``. No header is written by this function.
    """
    formatted = "N/A" if metric_val is None else f"{metric_val:.6f}"
    with open(Path(path) / "results.tsv", "a") as handle:
        print(commit, formatted, status, description, sep="\t", file=handle)
|
||||
|
||||
|
||||
def get_experiment_count(path):
    """Return the number of rows in ``results.tsv``, not counting the header.

    Returns 0 when the file does not exist or holds at most one line.
    """
    tsv = Path(path) / "results.tsv"
    if not tsv.exists():
        return 0
    line_total = len(tsv.read_text().splitlines())
    # The first line is the header, so it is not an experiment.
    return line_total - 1 if line_total > 1 else 0
|
||||
|
||||
|
||||
def _rollback_and_log(path, pre_commit, metric_val, status, description):
    """Hard-reset the repo to *pre_commit*, then append the outcome to results.tsv."""
    # NOTE(review): a hard reset reverts every tracked file; this presumably
    # relies on results.tsv and run.log being untracked/ignored — confirm.
    code, _, err = run_cmd(f"git reset --hard {pre_commit}", cwd=path)
    if code != 0:
        # Previously ignored; a failed reset means the discarded change is
        # still in the work tree, so make that visible.
        print(f" ⚠ git reset --hard failed: {err}")
    # Log after the reset so the row records the commit we ended up on.
    log_result(path, get_current_commit(path), metric_val, status, description)


def run_single_experiment(path, config, exp_num, dry_run=False):
    """Run one experiment iteration and record its outcome.

    Args:
        path: Project root (a git repository).
        config: Dict of string settings as loaded from ``.autoresearch.cfg``.
        exp_num: Experiment number, used only for display.
        dry_run: When True, print what would happen and return immediately.

    Returns:
        One of ``"dry_run"``, ``"error"`` (git state unavailable),
        ``"crash"`` (timeout, non-zero exit, or unparseable metric),
        ``"keep"`` or ``"discard"``.
    """
    direction = config.get("metric_direction", "lower")
    metric_grep = config.get("metric_grep", "^metric:")
    evaluate_cmd = config.get("evaluate_cmd", "python evaluate.py")
    time_budget = int(config.get("time_budget_minutes", 5))
    metric_name = config.get("metric", "metric")

    best_so_far = get_current_metric(path, metric_grep)
    ts = datetime.now().strftime("%H:%M:%S")

    print(f"\n[{ts}] Experiment #{exp_num}")
    print(f" Best {metric_name} so far: {best_so_far}")

    if dry_run:
        print(" [DRY RUN] Would run evaluation and check metric")
        return "dry_run"

    # Save pre-experiment state for rollback.
    # NOTE(review): this is HEAD at the time of this call. If the agent has
    # already committed its change before evaluation, resetting to this
    # commit discards only uncommitted changes, not that commit — confirm
    # against the intended loop protocol.
    code, pre_commit, _ = run_cmd("git rev-parse HEAD", cwd=path)
    if code != 0:
        print(" ✗ Can't get git state. Is this a git repo with commits?")
        return "error"

    # Run evaluation
    print(f" Running: {evaluate_cmd} (budget: {time_budget} min)")
    ret_code, elapsed = run_evaluation(path, evaluate_cmd, time_budget)

    # Handle timeout (run_evaluation reports it as -1)
    if ret_code == -1:
        print(f" ✗ TIMEOUT after {elapsed:.0f}s — discarding")
        # "git reset --hard" already reverts uncommitted changes, so the
        # separate "git checkout -- ." that used to precede it was redundant.
        _rollback_and_log(path, pre_commit, None, "crash",
                          f"timeout after {elapsed:.0f}s")
        return "crash"

    # Handle non-zero exit
    if ret_code != 0:
        _, tail, _ = run_cmd("tail -n 5 run.log", cwd=path)
        print(f" ✗ CRASH (exit {ret_code}) after {elapsed:.0f}s")
        print(f" Last output: {tail[:200]}")
        _rollback_and_log(path, pre_commit, None, "crash", f"exit_code_{ret_code}")
        return "crash"

    # Extract metric
    metric_val = extract_metric(path, metric_grep)
    if metric_val is None:
        print(f" ✗ Could not parse metric from run.log")
        _rollback_and_log(path, pre_commit, None, "crash", "metric_parse_failed")
        return "crash"

    delta = ""
    if best_so_far is not None:
        diff = metric_val - best_so_far
        delta = f" (Δ{diff:+.4f})"

    print(f" {metric_name}: {metric_val:.6f}{delta} in {elapsed:.0f}s")

    # Keep or discard
    if is_improvement(metric_val, best_so_far, direction):
        print(f" ✅ KEEP — improvement confirmed")
        log_result(path, get_current_commit(path), metric_val, "keep",
                   f"improvement_{metric_name}_{metric_val:.4f}")
        return "keep"

    # best_so_far cannot be None here: is_improvement() returns True for a
    # first run, so the formatting below is safe.
    print(f" ❌ DISCARD — no improvement")
    _rollback_and_log(path, pre_commit, metric_val, "discard",
                      f"no_improvement_{metric_val:.4f}_vs_{best_so_far:.4f}")
    return "discard"
|
||||
|
||||
|
||||
def print_summary(path):
    """Print a short session summary read from ``results.tsv`` in *path*.

    Prints nothing when the file is missing or has no rows after the header.
    Rows are classified by the substrings ``<TAB>keep<TAB>``,
    ``<TAB>discard<TAB>`` and ``<TAB>crash<TAB>``. When kept rows exist, the
    metric of the first and last kept rows is shown with an arrow for the
    direction of change.
    """
    tsv = Path(path) / "results.tsv"
    if not tsv.exists():
        return
    rows = tsv.read_text().splitlines()[1:]  # skip header
    if not rows:
        return

    by_status = {
        name: [row for row in rows if f"\t{name}\t" in row]
        for name in ("keep", "discard", "crash")
    }
    keeps = by_status["keep"]
    discards = by_status["discard"]
    crashes = by_status["crash"]

    print(f"\n{'='*50}")
    print(f" Session Summary")
    print(f" Experiments: {len(rows)} total")
    print(f" ✅ Keep: {len(keeps)} | ❌ Discard: {len(discards)} | 💥 Crash: {len(crashes)}")

    if keeps:
        try:
            first_metric = float(keeps[0].split("\t")[1])
            last_metric = float(keeps[-1].split("\t")[1])
        except (ValueError, IndexError):
            # Unparseable metric column: skip the progress line.
            pass
        else:
            if last_metric < first_metric:
                direction = "↓"
            else:
                direction = "↑"
            print(f" Best progress: {first_metric:.6f} → {last_metric:.6f} {direction}")

    print(f"{'='*50}\n")
|
||||
|
||||
|
||||
def main():
    """Command-line entry point for the experiment runner.

    ``--single`` and ``--dry-run`` run exactly one iteration and return.
    ``--loop`` repeats until interrupted, until five consecutive crashes, or
    until the experiment number exceeds ``--max-experiments``. Without any of
    these flags the program prints a hint and exits with status 1.
    """
    parser = argparse.ArgumentParser(description="autoresearch-agent runner")
    parser.add_argument("--loop", action="store_true", help="Run forever")
    parser.add_argument("--single", action="store_true", help="Run one experiment")
    parser.add_argument("--dry-run", action="store_true", help="Dry run only")
    parser.add_argument("--path", default=".", help="Project root")
    parser.add_argument("--max-experiments", type=int, default=0,
                        help="Max experiments (0 = unlimited)")
    args = parser.parse_args()

    path = Path(args.path).resolve()
    config = load_config(path)

    print(f"\n🔬 autoresearch-agent")
    print(f" Project: {path}")
    print(f" Target: {config.get('target', '?')}")
    print(f" Metric: {config.get('metric', '?')} ({config.get('metric_direction', '?')} is better)")
    print(f" Budget: {config.get('time_budget_minutes', '?')} min/experiment")
    print(f" Mode: {'loop' if args.loop else 'single'}")

    # A dry run never changes state, so repeating it is pointless; it used
    # to fall into the endless loop below and print forever.
    if args.single or args.dry_run:
        exp_num = get_experiment_count(path) + 1
        run_single_experiment(path, config, exp_num, args.dry_run)
        return

    if not args.loop:
        print("\nSpecify --loop (forever) or --single (one experiment)")
        sys.exit(1)

    # Setup graceful shutdown
    def handle_interrupt(sig, frame):
        print_summary(path)
        print("\n⏹ Stopped by user.")
        sys.exit(0)

    signal.signal(signal.SIGINT, handle_interrupt)
    signal.signal(signal.SIGTERM, handle_interrupt)

    # Main loop
    consecutive_crashes = 0
    # Numbering continues from the rows already in results.tsv, so
    # --max-experiments is compared against that running number.
    exp_num = get_experiment_count(path) + 1

    print(f"\nStarting loop. Ctrl+C to stop and print summary.\n")

    while True:
        result = run_single_experiment(path, config, exp_num, args.dry_run)
        exp_num += 1

        # "error" means the git state could not be read. Nothing was run or
        # logged, so retrying would spin in a tight loop forever.
        if result == "error":
            break

        if result == "crash":
            consecutive_crashes += 1
        else:
            consecutive_crashes = 0

        # Bail if 5 consecutive crashes
        if consecutive_crashes >= 5:
            print("\n⚠ 5 consecutive crashes. Pausing for investigation.")
            print(" Check run.log for the last error.")
            break

        # Check max experiments
        if args.max_experiments > 0 and exp_num > args.max_experiments:
            print(f"\n✓ Reached max experiments ({args.max_experiments})")
            break

    print_summary(path)


if __name__ == "__main__":
    main()
|
||||
255
engineering/autoresearch-agent/scripts/setup_experiment.py
Normal file
255
engineering/autoresearch-agent/scripts/setup_experiment.py
Normal file
@@ -0,0 +1,255 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
autoresearch-agent: Setup Wizard
|
||||
|
||||
Initializes a new research run:
|
||||
1. Validates the project structure
|
||||
2. Creates a git branch
|
||||
3. Runs the baseline experiment
|
||||
4. Initializes results.tsv
|
||||
|
||||
Usage:
|
||||
    python scripts/setup_experiment.py --target FILE --evaluate-cmd CMD --metric NAME
|
||||
python scripts/setup_experiment.py --domain ml|prompt|code|skill
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# Built-in presets selectable with --domain. Each preset holds the settings
# that main() writes to .autoresearch.cfg:
#   target              - file the agent is expected to modify
#   evaluate_cmd        - shell command that runs the evaluation
#   metric              - display name of the metric
#   metric_direction    - "lower" or "higher" (which direction is better)
#   time_budget_minutes - nominal per-experiment budget
#   metric_grep         - grep pattern locating the metric line in run.log
DOMAINS = {
    "ml": {
        "target": "train.py",
        "evaluate_cmd": "uv run train.py",
        "metric": "val_bpb",
        "metric_direction": "lower",
        "time_budget_minutes": 5,
        "metric_grep": "^val_bpb:",
    },
    "prompt": {
        "target": "prompt.md",
        "evaluate_cmd": "python evaluate.py",
        "metric": "eval_score",
        "metric_direction": "higher",
        "time_budget_minutes": 2,
        "metric_grep": "^eval_score:",
    },
    "code": {
        "target": "src/module.py",
        "evaluate_cmd": "python benchmark.py",
        "metric": "p50_ms",
        "metric_direction": "lower",
        "time_budget_minutes": 10,
        "metric_grep": "^p50_ms:",
    },
    "skill": {
        "target": "SKILL.md",
        "evaluate_cmd": "python scripts/skill_evaluator.py",
        "metric": "pass_rate",
        "metric_direction": "higher",
        "time_budget_minutes": 5,
        "metric_grep": "^pass_rate:",
    },
}
|
||||
|
||||
|
||||
def run_cmd(cmd, cwd=None, timeout=None):
    """Execute *cmd* via the shell; return ``(returncode, stdout, stderr)``.

    stdout and stderr are captured as text with surrounding whitespace
    removed. If *timeout* (seconds) expires, ``subprocess.TimeoutExpired``
    is raised to the caller.
    """
    proc = subprocess.run(
        cmd,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        cwd=cwd,
        timeout=timeout,
    )
    captured_out = proc.stdout.strip()
    captured_err = proc.stderr.strip()
    return proc.returncode, captured_out, captured_err
|
||||
|
||||
|
||||
def check_git_repo(path):
    """Report whether *path* is inside a git work tree.

    Prints a status line either way and returns True only when
    ``git rev-parse --is-inside-work-tree`` exits with status 0.
    """
    status, _, _ = run_cmd("git rev-parse --is-inside-work-tree", cwd=path)
    inside_repo = status == 0
    if inside_repo:
        print("✓ Git repository found")
    else:
        print("✗ Not a git repository. Run: git init && git add . && git commit -m 'initial'")
    return inside_repo
|
||||
|
||||
|
||||
def check_program_md(path):
    """Check that ``program.md`` exists under *path* and has content.

    Returns:
        True when the file exists and holds at least 100 characters;
        False (after printing a warning) otherwise. Nothing is created or
        modified by this check.
    """
    pm = Path(path) / "program.md"
    if not pm.exists():
        # The old message said "Creating template..." although no file is
        # written anywhere; tell the user what they actually need to do.
        print("⚠ program.md not found. Create it before running experiments.")
        return False
    content = pm.read_text()
    if len(content) < 100:
        print("⚠ program.md looks empty. Fill it out before running experiments.")
        return False
    print(f"✓ program.md found ({len(content)} chars)")
    return True
|
||||
|
||||
|
||||
def check_target_file(path, target):
    """Report whether *target* (relative to *path*) exists.

    Prints a status line and returns the result as a bool.
    """
    found = (Path(path) / target).exists()
    if found:
        print(f"✓ Target file found: {target}")
    else:
        print(f"✗ Target file not found: {target}")
    return found
|
||||
|
||||
|
||||
def check_evaluate_script(path):
    """Report whether ``evaluate.py`` exists directly under *path*.

    Prints a status line (with a hint when the file is missing) and returns
    the result as a bool.
    """
    present = (Path(path) / "evaluate.py").exists()
    if present:
        print("✓ evaluate.py found")
        return True
    print("⚠ evaluate.py not found. You need a fixed evaluation function.")
    print(" Create evaluate.py that outputs: metric_name: <value>")
    return False
|
||||
|
||||
|
||||
def create_branch(path, tag):
    """Create and check out the branch ``autoresearch/<tag>`` in *path*.

    Returns:
        The branch name on success, or ``None`` after printing the reason
        when ``git checkout -b`` fails.
    """
    branch = f"autoresearch/{tag}"
    status, _, err = run_cmd(f"git checkout -b {branch}", cwd=path)
    if status == 0:
        print(f"✓ Created branch: {branch}")
        return branch

    # Distinguish the common "name already taken" case from other failures.
    if "already exists" in err:
        print(f"✗ Branch '{branch}' already exists. Use a different tag.")
    else:
        print(f"✗ Failed to create branch: {err}")
    return None
|
||||
|
||||
|
||||
def init_results_tsv(path):
    """Create ``results.tsv`` with its header row unless it already exists.

    An existing file is left untouched; its size is reported instead.
    """
    tsv = Path(path) / "results.tsv"
    if not tsv.exists():
        header = "\t".join(["commit", "metric", "status", "description"])
        tsv.write_text(header + "\n")
        print("✓ Created results.tsv")
        return
    print(f"✓ results.tsv already exists ({tsv.stat().st_size} bytes)")
|
||||
|
||||
|
||||
def run_baseline(path, evaluate_cmd, metric_grep, time_budget_minutes):
    """Run the baseline experiment and extract its metric from ``run.log``.

    Args:
        path: Working directory for the evaluation.
        evaluate_cmd: Shell command to run; its output goes to ``run.log``.
        metric_grep: grep pattern that locates the metric line.
        time_budget_minutes: Nominal budget; the hard limit is 2.5x this.

    Returns:
        The metric as a string (the text after the last ``:`` on the last
        matching line), or ``None`` when the run fails, times out, or does
        not produce a numeric metric.
    """
    import shlex

    print(f"\nRunning baseline experiment (~{time_budget_minutes} min)...")
    timeout = time_budget_minutes * 60 * 2.5  # 2.5x budget as hard limit

    t0 = time.time()
    try:
        code, _, _ = run_cmd(
            f"{evaluate_cmd} > run.log 2>&1",
            cwd=path,
            timeout=timeout
        )
    except subprocess.TimeoutExpired:
        # Previously this escaped as a traceback in the middle of setup.
        elapsed = time.time() - t0
        print(f"✗ Baseline run timed out after {elapsed:.0f}s. Check run.log")
        return None
    elapsed = time.time() - t0

    if code != 0:
        print(f"✗ Baseline run failed after {elapsed:.0f}s. Check run.log")
        return None

    # Extract metric. shlex.quote keeps patterns with quotes or shell
    # metacharacters intact; "-e" protects patterns that start with "-".
    _, grep_out, _ = run_cmd(
        f"grep -e {shlex.quote(metric_grep)} run.log | tail -1",
        cwd=path
    )
    if not grep_out:
        print("✗ Could not extract metric from run.log. Check metric_grep pattern.")
        return None

    metric_value = grep_out.split(":")[-1].strip()
    try:
        float(metric_value)
    except ValueError:
        # A non-numeric baseline would be logged as a "keep" row whose
        # metric can never be parsed back as a number.
        print(f"✗ Metric value is not a number: {metric_value!r}. Check metric_grep pattern.")
        return None

    print(f"✓ Baseline complete in {elapsed:.0f}s — metric: {metric_value}")
    return metric_value
|
||||
|
||||
|
||||
def main():
    """Command-line entry point for the setup wizard.

    Builds the experiment config (from a ``--domain`` preset, with any
    explicitly given options applied on top, or purely from options),
    validates the project, creates the ``autoresearch/<tag>`` branch,
    initializes ``results.tsv``, writes ``.autoresearch.cfg`` and, unless
    ``--skip-baseline`` is given, runs and logs the baseline.
    Exits with status 1 when validation or branch creation fails.
    """
    parser = argparse.ArgumentParser(description="autoresearch-agent setup")
    parser.add_argument("--domain", choices=list(DOMAINS.keys()), help="Experiment domain")
    parser.add_argument("--target", help="Target file to optimize")
    parser.add_argument("--evaluate-cmd", help="Evaluation command")
    parser.add_argument("--metric", help="Metric name")
    # Defaults are None so an explicit value can be told apart from "not
    # given"; the effective defaults ("lower", 5) are applied below.
    parser.add_argument("--direction", choices=["lower", "higher"], default=None)
    parser.add_argument("--budget", type=int, default=None, help="Time budget in minutes")
    parser.add_argument("--tag", help="Run tag (used in branch name)")
    parser.add_argument("--path", default=".", help="Project root path")
    parser.add_argument("--skip-baseline", action="store_true")
    args = parser.parse_args()

    path = Path(args.path).resolve()
    print(f"\n🔬 autoresearch-agent setup")
    print(f" Project: {path}")
    print(f" Time: {datetime.now().strftime('%Y-%m-%d %H:%M')}\n")

    # Get config from domain or args
    if args.domain:
        config = DOMAINS[args.domain].copy()
        # Explicit options refine the preset instead of being ignored.
        if args.target:
            config["target"] = args.target
        if args.evaluate_cmd:
            config["evaluate_cmd"] = args.evaluate_cmd
        if args.metric:
            config["metric"] = args.metric
            config["metric_grep"] = f"^{args.metric}:"
        if args.direction:
            config["metric_direction"] = args.direction
        if args.budget is not None:
            config["time_budget_minutes"] = args.budget
    else:
        config = {
            "target": args.target or "target.py",
            "evaluate_cmd": args.evaluate_cmd or "python evaluate.py",
            "metric": args.metric or "score",
            "metric_direction": args.direction or "lower",
            "time_budget_minutes": 5 if args.budget is None else args.budget,
            "metric_grep": f"^{args.metric or 'score'}:",
        }

    tag = args.tag or datetime.now().strftime("%b%d").lower()

    # Validation checks (all are evaluated so every problem is reported)
    checks = [
        check_git_repo(path),
        check_program_md(path),
        check_target_file(path, config["target"]),
    ]
    # Only require evaluate.py when the evaluation command actually uses it;
    # several presets run a different script.
    if "evaluate.py" in config["evaluate_cmd"]:
        checks.append(check_evaluate_script(path))

    if not all(checks):
        print("\n⚠ Fix the above issues before running experiments.")
        sys.exit(1)

    # Create branch
    branch = create_branch(path, tag)
    if not branch:
        sys.exit(1)

    # Init results TSV
    init_results_tsv(path)

    # Save config for run_experiment.py
    config_content = "\n".join(f"{k}: {v}" for k, v in config.items())
    (path / ".autoresearch.cfg").write_text(config_content + "\n")
    print("✓ Saved .autoresearch.cfg")

    # Run baseline
    if not args.skip_baseline:
        baseline = run_baseline(
            path,
            config["evaluate_cmd"],
            config["metric_grep"],
            config["time_budget_minutes"]
        )
        if baseline:
            # Log baseline to TSV
            code, commit, _ = run_cmd("git rev-parse --short HEAD", cwd=path)
            with open(path / "results.tsv", "a") as f:
                f.write(f"{commit}\t{baseline}\tkeep\tbaseline\n")
            print(f"✓ Baseline logged to results.tsv")
        else:
            # Make it visible that no baseline row exists.
            print("⚠ Baseline was not recorded; results.tsv has no baseline entry.")

    print(f"\n✅ Setup complete!")
    print(f" Branch: {branch}")
    print(f" Target: {config['target']}")
    print(f" Metric: {config['metric']} ({config['metric_direction']} is better)")
    print(f" Budget: {config['time_budget_minutes']} min/experiment")
    print(f"\nTo start the autonomous loop:")
    print(f" python scripts/run_experiment.py --loop")
    print(f"\nOr run a single experiment:")
    print(f" python scripts/run_experiment.py --single")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user