#!/usr/bin/env python3
"""
autoresearch-agent: Setup Experiment

Initialize a new experiment with domain, target, evaluator, and git branch.
Creates the .autoresearch/{domain}/{name}/ directory structure.

Usage:
    python scripts/setup_experiment.py --domain engineering --name api-speed \
        --target src/api/search.py --eval "pytest bench.py" \
        --metric p50_ms --direction lower

    python scripts/setup_experiment.py --domain marketing --name medium-ctr \
        --target content/titles.md --eval "python evaluate.py" \
        --metric ctr_score --direction higher --evaluator llm_judge_content

    python scripts/setup_experiment.py --list                # List all experiments
    python scripts/setup_experiment.py --list-evaluators     # List available evaluators
"""

import argparse
import os
import shutil
import subprocess
import sys
import time
from datetime import datetime
from pathlib import Path

DOMAINS = ["engineering", "marketing", "content", "prompts", "custom"]

EVALUATOR_DIR = Path(__file__).parent.parent / "evaluators"

# Upper bound (seconds) for the one-off eval smoke test run during setup.
EVAL_TEST_TIMEOUT_SECONDS = 60

DEFAULT_CONFIG = """# autoresearch global config
default_time_budget_minutes: 5
default_scope: project
dashboard_format: markdown
"""

GITIGNORE_CONTENT = """# autoresearch — experiment logs are local state
**/results.tsv
**/run.log
**/run.*.log
config.yaml
"""


def run_cmd(cmd, cwd=None, timeout=None):
    """Run a command and return ``(returncode, stdout, stderr)``.

    Args:
        cmd: Either a shell command string (run through the shell, as the
            user-supplied eval command requires) or an argv list (run
            directly, with no shell interpretation of its elements).
        cwd: Working directory for the child process.
        timeout: Seconds to wait before ``subprocess.TimeoutExpired`` is raised.

    Returns:
        Tuple of exit code, stripped stdout and stripped stderr.

    Raises:
        subprocess.TimeoutExpired: If the command exceeds ``timeout``.
    """
    use_shell = isinstance(cmd, str)
    try:
        result = subprocess.run(
            cmd,
            shell=use_shell,
            capture_output=True,
            text=True,
            cwd=cwd,
            timeout=timeout,
        )
    except FileNotFoundError as exc:
        if use_shell:
            raise
        # Mirror the shell's "command not found" exit status so callers that
        # only inspect the return code behave the same for argv lists.
        return 127, "", str(exc)
    return result.returncode, result.stdout.strip(), result.stderr.strip()


def get_autoresearch_root(scope, project_root=None):
    """Return the ``.autoresearch`` directory for the given scope.

    ``scope == "user"`` maps to the home directory; any other value maps to
    ``project_root`` (or the current directory when it is not given).
    """
    if scope == "user":
        return Path.home() / ".autoresearch"
    return Path(project_root or ".") / ".autoresearch"


def init_root(root):
    """Create the ``.autoresearch`` root plus its default files if missing.

    Existing ``config.yaml`` / ``.gitignore`` files are left untouched.

    Returns:
        True when the root directory itself was created by this call.
    """
    created = False
    if not root.exists():
        root.mkdir(parents=True)
        created = True
        print(f" Created {root}/")

    config_file = root / "config.yaml"
    if not config_file.exists():
        config_file.write_text(DEFAULT_CONFIG, encoding="utf-8")
        print(f" Created {config_file}")

    gitignore = root / ".gitignore"
    if not gitignore.exists():
        # Explicit UTF-8: the template contains a non-ASCII dash.
        gitignore.write_text(GITIGNORE_CONTENT, encoding="utf-8")
        print(f" Created {gitignore}")

    return created


def create_program_md(experiment_dir, domain, name, target, metric, direction, constraints=""):
    """Write the ``program.md`` instructions template for an experiment.

    ``domain`` is accepted for interface compatibility; the template does not
    currently use it. ``constraints`` adds one extra bullet when non-empty.
    """
    direction_word = "Minimize" if direction == "lower" else "Maximize"
    better_word = "Lower" if direction == "lower" else "Higher"
    extra_constraints = f"- Additional constraints: {constraints}" if constraints else ""

    content = f"""# autoresearch — {name}

## Goal
{direction_word} `{metric}` on `{target}`. {better_word} is better.

## What the Agent Can Change
- Only `{target}` — this is the single file being optimized.
- Everything inside that file is fair game unless constrained below.

## What the Agent Cannot Change
- The evaluation script (`evaluate.py` or the eval command). It is read-only.
- Dependencies — do not add new packages or imports that aren't already available.
- Any other files in the project unless explicitly noted here.
{extra_constraints}

## Strategy
1. First run: establish baseline. Do not change anything.
2. Profile/analyze the current state — understand why the metric is what it is.
3. Try the most obvious improvement first (low-hanging fruit).
4. If that works, push further in the same direction.
5. If stuck, try something orthogonal or radical.
6. Read the git log of previous experiments. Don't repeat failed approaches.

## Simplicity Rule
A small improvement that adds ugly complexity is NOT worth it.
Equal performance with simpler code IS worth it.
Removing code that gets same results is the best outcome.

## Stop When
You don't stop. The human will interrupt you when they're satisfied.
If no improvement in 20+ consecutive runs, change strategy drastically.
"""
    (experiment_dir / "program.md").write_text(content, encoding="utf-8")


def create_config(experiment_dir, target, eval_cmd, metric, direction, time_budget):
    """Write ``config.cfg`` as one ``key: value`` pair per line.

    The line-per-key layout is what ``list_experiments`` parses back.
    """
    content = f"""target: {target}
evaluate_cmd: {eval_cmd}
metric: {metric}
metric_direction: {direction}
metric_grep: ^{metric}:
time_budget_minutes: {time_budget}
created: {datetime.now().strftime('%Y-%m-%d %H:%M')}
"""
    (experiment_dir / "config.cfg").write_text(content, encoding="utf-8")


def init_results_tsv(experiment_dir):
    """Create ``results.tsv`` with its header row unless it already exists."""
    tsv = experiment_dir / "results.tsv"
    if tsv.exists():
        print(f" results.tsv already exists ({tsv.stat().st_size} bytes)")
        return
    tsv.write_text("commit\tmetric\tstatus\tdescription\n", encoding="utf-8")
    print(" Created results.tsv")


def copy_evaluator(experiment_dir, evaluator_name):
    """Copy a built-in evaluator into the experiment as ``evaluate.py``.

    Returns:
        True on success, False (after printing the available names) when the
        requested evaluator does not exist in ``EVALUATOR_DIR``.
    """
    source = EVALUATOR_DIR / f"{evaluator_name}.py"
    if not source.exists():
        print(f" Warning: evaluator '{evaluator_name}' not found in {EVALUATOR_DIR}")
        print(f" Available: {', '.join(f.stem for f in EVALUATOR_DIR.glob('*.py'))}")
        return False

    dest = experiment_dir / "evaluate.py"
    shutil.copy2(source, dest)
    print(f" Copied evaluator: {evaluator_name}.py -> evaluate.py")
    return True


def create_branch(path, domain, name):
    """Create (or switch to) the ``autoresearch/{domain}/{name}`` git branch.

    Git is invoked with an argv list so the branch name is never interpreted
    by a shell.

    Returns:
        The branch name when it is checked out afterwards, otherwise None.
    """
    branch = f"autoresearch/{domain}/{name}"
    code, _, err = run_cmd(["git", "checkout", "-b", branch], cwd=path)
    if code == 0:
        print(f" Created branch: {branch}")
        return branch

    if "already exists" in err:
        print(f" Branch '{branch}' already exists. Checking out...")
        code, _, err = run_cmd(["git", "checkout", branch], cwd=path)
        if code == 0:
            return branch
        # e.g. local changes that would be overwritten by the checkout
        print(f" Warning: could not check out branch: {err}")
        return None

    print(f" Warning: could not create branch: {err}")
    return None


def _read_experiment_config(cfg_file):
    """Parse a ``key: value`` per-line config file into a dict of strings."""
    config = {}
    for line in cfg_file.read_text(encoding="utf-8").splitlines():
        if ":" in line:
            key, value = line.split(":", 1)
            config[key.strip()] = value.strip()
    return config


def _count_runs(tsv):
    """Return the number of non-blank data rows in ``results.tsv`` (header excluded)."""
    if not tsv.exists():
        return 0
    rows = [row for row in tsv.read_text(encoding="utf-8").splitlines() if row.strip()]
    return max(0, len(rows) - 1)


def list_experiments(root):
    """Print a table of every experiment found under ``root``.

    An experiment is any ``root/<domain>/<name>/`` directory that contains a
    ``config.cfg``; hidden domain directories are skipped.
    """
    if not root.exists():
        print("No experiments found. Run setup to create your first experiment.")
        return

    experiments = []
    for domain_dir in sorted(root.iterdir()):
        if not domain_dir.is_dir() or domain_dir.name.startswith("."):
            continue
        for exp_dir in sorted(domain_dir.iterdir()):
            if not exp_dir.is_dir():
                continue
            cfg_file = exp_dir / "config.cfg"
            if not cfg_file.exists():
                continue
            config = _read_experiment_config(cfg_file)
            experiments.append({
                "domain": domain_dir.name,
                "name": exp_dir.name,
                "target": config.get("target", "?"),
                "metric": config.get("metric", "?"),
                "runs": _count_runs(exp_dir / "results.tsv"),
            })

    if not experiments:
        print("No experiments found.")
        return

    print(f"\n{'DOMAIN':<15} {'EXPERIMENT':<25} {'TARGET':<30} {'METRIC':<15} {'RUNS':>5}")
    print("-" * 95)
    for e in experiments:
        print(f"{e['domain']:<15} {e['name']:<25} {e['target']:<30} {e['metric']:<15} {e['runs']:>5}")
    print(f"\nTotal: {len(experiments)} experiments")


def _evaluator_description(path):
    """Return the first meaningful text line of an evaluator file.

    Comment lines (including the shebang) are skipped, and docstring quotes
    are stripped so that both ``\"\"\"Summary.\"\"\"`` and a summary on the
    line after an opening ``\"\"\"`` are picked up.
    """
    for raw in path.read_text(encoding="utf-8", errors="replace").splitlines():
        text = raw.strip()
        if not text or text.startswith("#"):
            continue
        text = text.strip('"').strip("'").strip()
        if text:
            return text
    return ""


def list_evaluators(evaluator_dir=None):
    """Print the available built-in evaluators with a one-line description.

    Args:
        evaluator_dir: Directory to scan; defaults to ``EVALUATOR_DIR``.
    """
    directory = EVALUATOR_DIR if evaluator_dir is None else Path(evaluator_dir)
    if not directory.exists():
        print("No evaluators directory found.")
        return

    print(f"\nAvailable evaluators ({directory}):\n")
    for f in sorted(directory.glob("*.py")):
        print(f" {f.stem:<25} {_evaluator_description(f)}")


def _test_eval_command(eval_cmd, metric, cwd):
    """Run the eval command once and report whether ``metric:`` is emitted.

    Never raises for a failing or hanging command: setup has already written
    its files at this point, so problems are reported as warnings.
    """
    print(f"\n Testing evaluation: {eval_cmd}")
    try:
        code, out, err = run_cmd(eval_cmd, cwd=cwd, timeout=EVAL_TEST_TIMEOUT_SECONDS)
    except subprocess.TimeoutExpired:
        print(f" Warning: eval command timed out after {EVAL_TEST_TIMEOUT_SECONDS}s; baseline not verified.")
        return

    if code != 0:
        print(f" Warning: eval command failed (exit {code})")
        if err:
            print(f" stderr: {err[:200]}")
        print(" Fix the eval command before running the experiment loop.")
        return

    # The metric may be printed on either stream, so search both.
    prefix = f"{metric}:"
    for line in f"{out}\n{err}".splitlines():
        if line.strip().startswith(prefix):
            print(f" Eval works. Baseline: {line.strip()}")
            return

    print(f" Warning: eval ran but '{metric}:' not found in output.")
    print(f" Make sure your eval command outputs: {metric}: <value>")


def main():
    """Parse CLI arguments and either list state or set up a new experiment."""
    parser = argparse.ArgumentParser(description="autoresearch-agent setup")
    parser.add_argument("--domain", choices=DOMAINS, help="Experiment domain")
    parser.add_argument("--name", help="Experiment name (e.g. api-speed, medium-ctr)")
    parser.add_argument("--target", help="Target file to optimize")
    parser.add_argument("--eval", dest="eval_cmd", help="Evaluation command")
    parser.add_argument("--metric", help="Metric name (must appear in eval output as 'name: value')")
    parser.add_argument("--direction", choices=["lower", "higher"], default="lower",
                        help="Is lower or higher better?")
    parser.add_argument("--time-budget", type=int, default=5,
                        help="Minutes per experiment (default: 5)")
    parser.add_argument("--evaluator", help="Built-in evaluator to copy (e.g. benchmark_speed)")
    parser.add_argument("--scope", choices=["project", "user"], default="project",
                        help="Where to store experiments: project (./) or user (~/)")
    parser.add_argument("--constraints", default="", help="Additional constraints for program.md")
    parser.add_argument("--path", default=".", help="Project root path")
    parser.add_argument("--skip-baseline", action="store_true", help="Skip baseline run")
    parser.add_argument("--skip-branch", action="store_true", help="Don't create git branch")
    parser.add_argument("--list", action="store_true", help="List all experiments")
    parser.add_argument("--list-evaluators", action="store_true", help="List available evaluators")
    args = parser.parse_args()

    project_root = Path(args.path).resolve()

    # List mode
    if args.list:
        root = get_autoresearch_root("project", project_root)
        list_experiments(root)
        user_root = get_autoresearch_root("user")
        if user_root.exists() and user_root != root:
            print(f"\n--- User-level experiments ({user_root}) ---")
            list_experiments(user_root)
        return

    if args.list_evaluators:
        list_evaluators()
        return

    # Validate required args for setup
    if not all([args.domain, args.name, args.target, args.eval_cmd, args.metric]):
        parser.error("Required: --domain, --name, --target, --eval, --metric")

    # The name becomes a single directory level; a nested or relative name
    # would create a layout that list_experiments cannot discover.
    if Path(args.name).name != args.name or args.name in (".", ".."):
        parser.error("--name must be a single path component (no '/' or '..')")

    root = get_autoresearch_root(args.scope, project_root)

    print("\n autoresearch-agent setup")
    print(f" Project: {project_root}")
    print(f" Scope: {args.scope}")
    print(f" Domain: {args.domain}")
    print(f" Experiment: {args.name}")
    print(f" Time: {datetime.now().strftime('%Y-%m-%d %H:%M')}\n")

    # Check git
    code, _, _ = run_cmd(["git", "rev-parse", "--is-inside-work-tree"], cwd=str(project_root))
    if code != 0:
        print(" Error: not a git repository. Run: git init && git add . && git commit -m 'initial'")
        sys.exit(1)
    print(" Git repository found")

    # Check target file
    target_path = project_root / args.target
    if not target_path.exists():
        print(f" Error: target file not found: {args.target}")
        sys.exit(1)
    print(f" Target file found: {args.target}")

    # Init root
    init_root(root)

    # Create experiment directory
    experiment_dir = root / args.domain / args.name
    if experiment_dir.exists():
        print(f" Warning: experiment '{args.domain}/{args.name}' already exists.")
        print(f" Use --name with a different name, or delete {experiment_dir}")
        sys.exit(1)
    experiment_dir.mkdir(parents=True)
    print(f" Created {experiment_dir}/")

    # Create files
    create_program_md(experiment_dir, args.domain, args.name,
                      args.target, args.metric, args.direction, args.constraints)
    print(" Created program.md")

    create_config(experiment_dir, args.target, args.eval_cmd,
                  args.metric, args.direction, args.time_budget)
    print(" Created config.cfg")

    init_results_tsv(experiment_dir)

    # Copy evaluator if specified
    if args.evaluator:
        copy_evaluator(experiment_dir, args.evaluator)

    # Create git branch; remember the outcome so the summary stays truthful.
    branch = None
    if not args.skip_branch:
        branch = create_branch(str(project_root), args.domain, args.name)

    # Test evaluation command (baseline) unless the user opted out.
    if args.skip_baseline:
        print("\n Skipping baseline evaluation (--skip-baseline)")
    else:
        _test_eval_command(args.eval_cmd, args.metric, str(project_root))

    # Summary
    print("\n Setup complete!")
    print(f" Experiment: {args.domain}/{args.name}")
    print(f" Target: {args.target}")
    print(f" Metric: {args.metric} ({args.direction} is better)")
    print(f" Budget: {args.time_budget} min/experiment")
    if branch is not None:
        print(f" Branch: {branch}")
    print("\n To start:")
    print(f" python scripts/run_experiment.py --experiment {args.domain}/{args.name} --loop")


if __name__ == "__main__":
    main()