refactor: autoresearch-agent v2.0 — multi-experiment, multi-domain, real-world evaluators

Major rewrite based on deep study of Karpathy's autoresearch repo.

Architecture changes:
- Multi-experiment support: .autoresearch/{domain}/{name}/ structure
- Domain categories: engineering, marketing, content, prompts, custom
- Project-level (git-tracked, shareable) or user-level (~/.autoresearch/) scope
- User chooses scope during setup, not installation

New evaluators (8 ready-to-use):
- Free: benchmark_speed, benchmark_size, test_pass_rate, build_speed, memory_usage
- LLM judge (uses existing subscription): llm_judge_content, llm_judge_prompt, llm_judge_copy
- LLM judges call user's CLI tool (claude/codex/gemini) — no extra API keys needed

Script improvements:
- setup_experiment.py: --domain, --scope, --evaluator, --list, --list-evaluators
- run_experiment.py: --experiment domain/name, --resume, --loop, --single
- log_results.py: --dashboard, --domain, --format csv|markdown|terminal, --output

Results export:
- Terminal (default), CSV, and Markdown formats
- Per-experiment, per-domain, or cross-experiment dashboard view

SKILL.md rewritten:
- Clear activation triggers (when the skill should activate)
- Practical examples for each domain
- Evaluator documentation with cost transparency
- Simplified loop protocol matching Karpathy's original philosophy
This commit is contained in:
Leo
2026-03-13 08:22:14 +01:00
parent c834d71a44
commit 12591282da
13 changed files with 1744 additions and 702 deletions

View File

@@ -0,0 +1,56 @@
#!/usr/bin/env python3
"""Measure file, bundle, or Docker image size.

Configure exactly one measurement target below; the script prints
size_bytes / size_kb / size_mb on stdout and exits non-zero on failure.
DO NOT MODIFY after experiment starts — this is the fixed evaluator."""
import os
import subprocess
import sys

# --- CONFIGURE ONE OF THESE ---
# Option 1: File size
TARGET_FILE = "dist/main.js"
# Option 2: Directory size (uncomment to use)
# TARGET_DIR = "dist/"
# Option 3: Docker image (uncomment to use)
# DOCKER_IMAGE = "myapp:latest"
# DOCKER_BUILD_CMD = "docker build -t myapp:latest ."
# Option 4: Build first, then measure (uncomment to use)
# BUILD_CMD = "npm run build"
# --- END CONFIG ---

# Build if needed.  At module level dir() mirrors globals(), so a single
# globals() membership test suffices (the original `dir() or globals()`
# double-check was redundant).
if "BUILD_CMD" in globals():
    result = subprocess.run(BUILD_CMD, shell=True, capture_output=True)
    if result.returncode != 0:
        print(f"Build failed: {result.stderr.decode()[:200]}", file=sys.stderr)
        sys.exit(1)

# Measure
if "DOCKER_IMAGE" in globals():
    if "DOCKER_BUILD_CMD" in globals():
        subprocess.run(DOCKER_BUILD_CMD, shell=True, capture_output=True)
    result = subprocess.run(
        f"docker image inspect {DOCKER_IMAGE} --format '{{{{.Size}}}}'",
        shell=True, capture_output=True, text=True
    )
    # `docker image inspect` fails (or prints nothing) for a missing image;
    # the original crashed with ValueError on int("") instead of reporting it.
    if result.returncode != 0 or not result.stdout.strip():
        print(f"Could not inspect image {DOCKER_IMAGE}: {result.stderr[:200]}",
              file=sys.stderr)
        sys.exit(1)
    size_bytes = int(result.stdout.strip())
elif "TARGET_DIR" in globals():
    # Recursive directory size; skip entries that vanish or are broken
    # symlinks rather than aborting the whole measurement.
    size_bytes = sum(
        os.path.getsize(os.path.join(dp, f))
        for dp, _, fns in os.walk(TARGET_DIR) for f in fns
        if os.path.exists(os.path.join(dp, f))
    )
elif os.path.exists(TARGET_FILE):
    size_bytes = os.path.getsize(TARGET_FILE)
else:
    print(f"Target not found: {TARGET_FILE}", file=sys.stderr)
    sys.exit(1)

size_kb = size_bytes / 1024
size_mb = size_bytes / (1024 * 1024)
print(f"size_bytes: {size_bytes}")
print(f"size_kb: {size_kb:.1f}")
print(f"size_mb: {size_mb:.2f}")

View File

@@ -0,0 +1,40 @@
#!/usr/bin/env python3
"""Measure execution speed of a target command.

Runs COMMAND repeatedly (after WARMUP discarded runs) and reports the
median (p50) and p95 wall time in milliseconds.
DO NOT MODIFY after experiment starts — this is the fixed evaluator."""
import math
import statistics
import subprocess
import sys
import time

# --- CONFIGURE THESE ---
COMMAND = "python src/module.py"  # Command to benchmark
RUNS = 5                          # Number of runs
WARMUP = 1                        # Warmup runs (not counted)
# --- END CONFIG ---

times = []
try:
    # Warmup runs prime caches/JIT and are not timed.
    for _ in range(WARMUP):
        subprocess.run(COMMAND, shell=True, capture_output=True, timeout=120)
    # Benchmark
    for i in range(RUNS):
        t0 = time.perf_counter()
        result = subprocess.run(COMMAND, shell=True, capture_output=True, timeout=120)
        elapsed = (time.perf_counter() - t0) * 1000  # ms
        if result.returncode != 0:
            print(f"Run {i+1} failed (exit {result.returncode})", file=sys.stderr)
            print(f"stderr: {result.stderr.decode()[:200]}", file=sys.stderr)
            sys.exit(1)
        times.append(elapsed)
except subprocess.TimeoutExpired:
    # The original let this propagate as a raw traceback; fail cleanly.
    print(f"Command timed out after 120s: {COMMAND}", file=sys.stderr)
    sys.exit(1)

p50 = statistics.median(times)
# Nearest-rank p95: rank ceil(0.95 * n), 1-based.  The original index
# int(n * 0.95) was off by one for larger RUNS (e.g. n=100 picked the
# 96th value).  With fewer than 5 samples, p95 degenerates to the max.
if len(times) >= 5:
    p95 = sorted(times)[math.ceil(len(times) * 0.95) - 1]
else:
    p95 = max(times)
print(f"p50_ms: {p50:.2f}")
print(f"p95_ms: {p95:.2f}")
print(f"runs: {RUNS}")

View File

@@ -0,0 +1,39 @@
#!/usr/bin/env python3
"""Measure build/compile time.

Runs BUILD_CMD several times (optionally cleaning first) and reports
the median and mean build duration in seconds.
DO NOT MODIFY after experiment starts — this is the fixed evaluator."""
import statistics  # hoisted: the original imported this mid-file (PEP 8)
import subprocess
import sys
import time

# --- CONFIGURE THESE ---
BUILD_CMD = "npm run build"  # or: docker build -t test .
CLEAN_CMD = ""               # optional: npm run clean (run before each build)
RUNS = 3                     # Number of builds to average
# --- END CONFIG ---

times = []
try:
    for i in range(RUNS):
        # Clean if configured
        if CLEAN_CMD:
            subprocess.run(CLEAN_CMD, shell=True, capture_output=True, timeout=60)
        t0 = time.perf_counter()
        result = subprocess.run(BUILD_CMD, shell=True, capture_output=True, timeout=600)
        elapsed = time.perf_counter() - t0
        if result.returncode != 0:
            print(f"Build {i+1} failed (exit {result.returncode})", file=sys.stderr)
            print(f"stderr: {result.stderr.decode()[:200]}", file=sys.stderr)
            sys.exit(1)
        times.append(elapsed)
except subprocess.TimeoutExpired:
    # The original let a hung clean/build surface as a raw traceback.
    print("Build timed out", file=sys.stderr)
    sys.exit(1)

# Median is the headline metric (robust to one slow outlier build);
# the mean is reported alongside for reference.
avg = statistics.mean(times)
median = statistics.median(times)
print(f"build_seconds: {median:.2f}")
print(f"build_avg: {avg:.2f}")
print(f"runs: {RUNS}")

View File

@@ -0,0 +1,72 @@
#!/usr/bin/env python3
"""LLM judge for content quality (headlines, titles, descriptions).
Uses the user's existing CLI tool (claude, codex, gemini) for evaluation.
DO NOT MODIFY after experiment starts — this is the fixed evaluator."""
import subprocess
import sys
from pathlib import Path

# --- CONFIGURE THESE ---
TARGET_FILE = "content/titles.md"  # File being optimized
CLI_TOOL = "claude"                # or: codex, gemini
# --- END CONFIG ---

# The judge prompt is FIXED — the agent cannot change how it's evaluated
JUDGE_PROMPT = """You are a content quality evaluator. Score the following content strictly.
Criteria (each scored 1-10):
1. CURIOSITY GAP — Does this make you want to click? Is there an information gap
that can only be resolved by reading? Generic titles score 1-3. Specific,
intriguing titles score 7-10.
2. SPECIFICITY — Are there concrete numbers, tools, or details? "How I improved
performance" = 2. "How I reduced API latency from 800ms to 185ms" = 9.
3. EMOTIONAL PULL — Does it trigger curiosity, surprise, fear of missing out,
or recognition? Flat titles score 1-3. Emotionally charged score 7-10.
4. SCROLL-STOP POWER — Would this stop someone scrolling through a feed or
search results? Would they pause on this headline? Rate honestly.
5. SEO KEYWORD PRESENCE — Are searchable, high-intent terms present naturally?
Keyword-stuffed = 3. Natural integration of search terms = 8-10.
Output EXACTLY this format (nothing else):
curiosity: <score>
specificity: <score>
emotional: <score>
scroll_stop: <score>
seo: <score>
ctr_score: <average of all 5 scores>
Be harsh. Most content is mediocre (4-6 range). Only exceptional content scores 8+."""

# Read the content under evaluation; fail cleanly if it's missing
# (the original surfaced a raw FileNotFoundError traceback).
try:
    content = Path(TARGET_FILE).read_text()
except OSError as e:
    print(f"Cannot read {TARGET_FILE}: {e}", file=sys.stderr)
    sys.exit(1)

full_prompt = f"{JUDGE_PROMPT}\n\n---\n\nContent to evaluate:\n\n{content}"

# Call the user's CLI tool.  A missing binary or a hung call previously
# crashed with FileNotFoundError / TimeoutExpired tracebacks.
try:
    result = subprocess.run(
        [CLI_TOOL, "-p", full_prompt],
        capture_output=True, text=True, timeout=120
    )
except (subprocess.TimeoutExpired, FileNotFoundError) as e:
    print(f"LLM judge failed to run: {e}", file=sys.stderr)
    sys.exit(1)
if result.returncode != 0:
    print(f"LLM judge failed: {result.stderr[:200]}", file=sys.stderr)
    sys.exit(1)

# Parse output — echo every recognized score line (ctr_score included);
# a single startswith-tuple replaces the original redundant if/elif.
output = result.stdout
SCORE_PREFIXES = ("curiosity:", "specificity:", "emotional:",
                  "scroll_stop:", "seo:", "ctr_score:")
for line in output.splitlines():
    line = line.strip()
    if line.startswith(SCORE_PREFIXES):
        print(line)

# Verify ctr_score was found
if "ctr_score:" not in output:
    print("Could not parse ctr_score from LLM output", file=sys.stderr)
    print(f"Raw output: {output[:500]}", file=sys.stderr)
    sys.exit(1)

View File

@@ -0,0 +1,88 @@
#!/usr/bin/env python3
"""LLM judge for marketing copy (social posts, ads, emails).
Uses the user's existing CLI tool for evaluation.
DO NOT MODIFY after experiment starts — this is the fixed evaluator."""
import subprocess
import sys
from pathlib import Path

# --- CONFIGURE THESE ---
TARGET_FILE = "posts.md"  # Copy being optimized
CLI_TOOL = "claude"       # or: codex, gemini
PLATFORM = "twitter"      # twitter, linkedin, instagram, email, ad
# --- END CONFIG ---

# Platform-specific rubrics; each scores 5 criteria from 1-10.
JUDGE_PROMPTS = {
    "twitter": """Score this Twitter/X post strictly:
1. HOOK (1-10) — Does the first line stop the scroll?
2. VALUE (1-10) — Does it provide insight, entertainment, or utility?
3. ENGAGEMENT (1-10) — Would people reply, retweet, or like?
4. BREVITY (1-10) — Is every word earning its place? No filler?
5. CTA (1-10) — Is there a clear next action (even implicit)?""",
    "linkedin": """Score this LinkedIn post strictly:
1. HOOK (1-10) — Does the first line make you click "see more"?
2. STORYTELLING (1-10) — Is there a narrative arc or just statements?
3. CREDIBILITY (1-10) — Does it demonstrate expertise without bragging?
4. ENGAGEMENT (1-10) — Would professionals comment or share?
5. CTA (1-10) — Does it invite discussion or action?""",
    "instagram": """Score this Instagram caption strictly:
1. HOOK (1-10) — Does the first line grab attention?
2. RELATABILITY (1-10) — Does the audience see themselves in this?
3. VISUAL MATCH (1-10) — Does the copy complement visual content?
4. HASHTAG STRATEGY (1-10) — Are hashtags relevant and not spammy?
5. CTA (1-10) — Does it encourage saves, shares, or comments?""",
    "email": """Score this email subject + preview strictly:
1. OPEN INCENTIVE (1-10) — Would you open this in a crowded inbox?
2. SPECIFICITY (1-10) — Is it concrete or vague?
3. URGENCY (1-10) — Is there a reason to open now vs later?
4. PERSONALIZATION (1-10) — Does it feel written for someone, not everyone?
5. PREVIEW SYNC (1-10) — Does the preview text complement the subject?""",
    "ad": """Score this ad copy strictly:
1. ATTENTION (1-10) — Does it stop someone scrolling past ads?
2. DESIRE (1-10) — Does it create want for the product/service?
3. PROOF (1-10) — Is there credibility (numbers, social proof)?
4. ACTION (1-10) — Is the CTA clear and compelling?
5. OBJECTION HANDLING (1-10) — Does it preempt "why not"?""",
}

# Unknown platforms silently fall back to the twitter rubric.
platform_prompt = JUDGE_PROMPTS.get(PLATFORM, JUDGE_PROMPTS["twitter"])
JUDGE_PROMPT = f"""{platform_prompt}
Output EXACTLY this format:
criterion_1: <score>
criterion_2: <score>
criterion_3: <score>
criterion_4: <score>
criterion_5: <score>
engagement_score: <average of all 5>
Be harsh. Most copy is mediocre (4-6). Only exceptional copy scores 8+."""

# Read the copy under evaluation; fail cleanly if it's missing
# (the original surfaced a raw FileNotFoundError traceback).
try:
    content = Path(TARGET_FILE).read_text()
except OSError as e:
    print(f"Cannot read {TARGET_FILE}: {e}", file=sys.stderr)
    sys.exit(1)

full_prompt = f"{JUDGE_PROMPT}\n\n---\n\nCopy to evaluate:\n\n{content}"

# A missing CLI binary or a hung call previously crashed with
# FileNotFoundError / TimeoutExpired tracebacks.
try:
    result = subprocess.run(
        [CLI_TOOL, "-p", full_prompt],
        capture_output=True, text=True, timeout=120
    )
except (subprocess.TimeoutExpired, FileNotFoundError) as e:
    print(f"LLM judge failed to run: {e}", file=sys.stderr)
    sys.exit(1)
if result.returncode != 0:
    print(f"LLM judge failed: {result.stderr[:200]}", file=sys.stderr)
    sys.exit(1)

# Echo every recognized score line from the judge's reply.
output = result.stdout
for line in output.splitlines():
    line = line.strip()
    if line.startswith(("engagement_score:", "criterion_")):
        print(line)
if "engagement_score:" not in output:
    print("Could not parse engagement_score from LLM output", file=sys.stderr)
    print(f"Raw: {output[:500]}", file=sys.stderr)
    sys.exit(1)

View File

@@ -0,0 +1,99 @@
#!/usr/bin/env python3
"""LLM judge for prompt/instruction quality.

For each test case: generate output with the prompt under test, then ask
the same CLI tool to score the output against the reference.  Per-case
failures score 0 instead of aborting the run.
Uses the user's existing CLI tool for evaluation.
DO NOT MODIFY after experiment starts — this is the fixed evaluator."""
import json
import subprocess
import sys
from pathlib import Path

# --- CONFIGURE THESE ---
TARGET_FILE = "prompt.md"             # Prompt being optimized
TEST_CASES_FILE = "tests/cases.json"  # Test cases: [{"input": "...", "expected": "..."}]
CLI_TOOL = "claude"                   # or: codex, gemini
# --- END CONFIG ---

JUDGE_PROMPT_TEMPLATE = """You are evaluating a system prompt's effectiveness.
SYSTEM PROMPT BEING TESTED:
{prompt}
TEST INPUT:
{input}
EXPECTED OUTPUT (reference):
{expected}
ACTUAL OUTPUT:
{actual}
Score the actual output on these criteria (each 1-10):
1. ACCURACY — Does it match the expected output's intent and facts?
2. COMPLETENESS — Does it cover all required elements?
3. CLARITY — Is it well-structured and easy to understand?
4. INSTRUCTION_FOLLOWING — Does it follow the system prompt's guidelines?
Output EXACTLY: quality_score: <average of all 4>
Nothing else."""


def _run_cli(prompt_text):
    """Run the CLI tool once; return the CompletedProcess, or None when the
    binary is missing or the call times out (treated as a per-case failure —
    the original crashed with an uncaught traceback instead)."""
    try:
        return subprocess.run(
            [CLI_TOOL, "-p", prompt_text],
            capture_output=True, text=True, timeout=60
        )
    except (subprocess.TimeoutExpired, FileNotFoundError):
        return None


# Fail cleanly on missing/invalid config files (the original surfaced raw
# FileNotFoundError / JSONDecodeError tracebacks).
try:
    prompt = Path(TARGET_FILE).read_text()
    test_cases = json.loads(Path(TEST_CASES_FILE).read_text())
except OSError as e:
    print(f"Cannot read input file: {e}", file=sys.stderr)
    sys.exit(1)
except json.JSONDecodeError as e:
    print(f"Invalid JSON in {TEST_CASES_FILE}: {e}", file=sys.stderr)
    sys.exit(1)

scores = []
for i, case in enumerate(test_cases):
    # 1) Generate output using the prompt under test.
    gen_result = _run_cli(f"{prompt}\n\n{case['input']}")
    if gen_result is None or gen_result.returncode != 0:
        print(f"Generation failed for case {i+1}", file=sys.stderr)
        scores.append(0)
        continue
    actual = gen_result.stdout.strip()
    # 2) Judge the generated output against the reference.
    judge_prompt = JUDGE_PROMPT_TEMPLATE.format(
        prompt=prompt[:500],  # truncated to keep the judge prompt compact
        input=case["input"],
        expected=case.get("expected", "N/A"),
        actual=actual[:500]
    )
    judge_result = _run_cli(judge_prompt)
    if judge_result is None or judge_result.returncode != 0:
        scores.append(0)
        continue
    # 3) Parse the single quality_score line; unparseable output scores 0.
    for line in judge_result.stdout.splitlines():
        if "quality_score:" in line:
            try:
                scores.append(float(line.split(":")[-1].strip()))
            except ValueError:
                scores.append(0)
            break
    else:
        scores.append(0)
    print(f" Case {i+1}/{len(test_cases)}: {scores[-1]:.1f}", file=sys.stderr)

if not scores:
    print("No test cases evaluated", file=sys.stderr)
    sys.exit(1)

avg = sum(scores) / len(scores)
quality = avg * 10  # Scale 1-10 per-case averages to a 0-100 headline metric
print(f"quality_score: {quality:.2f}")
print(f"cases_tested: {len(scores)}")
print(f"avg_per_case: {avg:.2f}")

View File

@@ -0,0 +1,52 @@
#!/usr/bin/env python3
"""Measure peak memory usage of a command.

Wraps COMMAND with /usr/bin/time and parses the peak RSS from its report
(written to stderr).  Linux reports kilobytes; macOS reports bytes.
DO NOT MODIFY after experiment starts — this is the fixed evaluator."""
import os
import platform
import subprocess
import sys

# --- CONFIGURE THESE ---
COMMAND = "python src/module.py"  # Command to measure
# --- END CONFIG ---

system = platform.system()
if system == "Linux":
    # GNU time's -v output includes "Maximum resident set size (kbytes)".
    try:
        result = subprocess.run(
            f"/usr/bin/time -v {COMMAND}",
            shell=True, capture_output=True, text=True, timeout=300
        )
    except subprocess.TimeoutExpired:
        # The original let this propagate as a raw traceback.
        print(f"Command timed out after 300s: {COMMAND}", file=sys.stderr)
        sys.exit(1)
    output = result.stderr  # /usr/bin/time writes its report to stderr
    for line in output.splitlines():
        if "Maximum resident set size" in line:
            try:
                kb = int(line.split(":")[-1].strip())
            except ValueError:
                break  # malformed line — fall through to the error exit
            mb = kb / 1024
            print(f"peak_mb: {mb:.1f}")
            print(f"peak_kb: {kb}")
            sys.exit(0)
    print("Could not parse memory from /usr/bin/time output", file=sys.stderr)
    sys.exit(1)
elif system == "Darwin":
    # macOS: use /usr/bin/time -l
    try:
        result = subprocess.run(
            f"/usr/bin/time -l {COMMAND}",
            shell=True, capture_output=True, text=True, timeout=300
        )
    except subprocess.TimeoutExpired:
        print(f"Command timed out after 300s: {COMMAND}", file=sys.stderr)
        sys.exit(1)
    output = result.stderr
    for line in output.splitlines():
        if "maximum resident set size" in line.lower():
            # macOS reports the value (in bytes) before the label
            try:
                val = int(line.strip().split()[0])
            except (ValueError, IndexError):
                break  # malformed line — fall through to the error exit
            mb = val / (1024 * 1024)
            print(f"peak_mb: {mb:.1f}")
            sys.exit(0)
    print("Could not parse memory from time output", file=sys.stderr)
    sys.exit(1)
else:
    print(f"Unsupported platform: {system}. Use Linux or macOS.", file=sys.stderr)
    sys.exit(1)

View File

@@ -0,0 +1,55 @@
#!/usr/bin/env python3
"""Measure test suite pass rate.

Runs TEST_CMD and parses pytest's short summary (falling back to the
unittest "Ran X tests" format), printing pass_rate / passed / failed / total.
DO NOT MODIFY after experiment starts — this is the fixed evaluator."""
import re
import subprocess
import sys

# --- CONFIGURE THESE ---
TEST_CMD = "pytest tests/ --tb=no -q"  # Test command
# --- END CONFIG ---

try:
    result = subprocess.run(TEST_CMD, shell=True, capture_output=True, text=True, timeout=300)
except subprocess.TimeoutExpired:
    # The original surfaced a raw TimeoutExpired traceback here.
    print("Test suite timed out after 300s", file=sys.stderr)
    sys.exit(1)
output = result.stdout + "\n" + result.stderr


def _count(pattern):
    """Return the first integer captured by `pattern` in the output, else 0."""
    m = re.search(pattern, output)
    return int(m.group(1)) if m else 0


# pytest short format: "5 passed, 2 failed in 1.23s"
# ("(\d+) error" also matches the plural "errors")
passed = _count(r"(\d+) passed")
failed = _count(r"(\d+) failed")
errors = _count(r"(\d+) error")
total = passed + failed + errors
if total == 0:
    # Try unittest format: "Ran X tests"
    total = _count(r"Ran (\d+) test")
    if total:
        if result.returncode == 0:
            passed = total
        else:
            # Count failures from unittest's summary line
            failed = _count(r"FAILED \(failures=(\d+)")
            passed = total - failed
if total == 0:
    print("Could not parse test results", file=sys.stderr)
    print(f"Output: {output[:500]}", file=sys.stderr)
    sys.exit(1)
rate = passed / total
print(f"pass_rate: {rate:.4f}")
print(f"passed: {passed}")
print(f"failed: {failed}")
print(f"total: {total}")