refactor: autoresearch-agent v2.0 — multi-experiment, multi-domain, real-world evaluators

Major rewrite based on deep study of Karpathy's autoresearch repo.

Architecture changes:
- Multi-experiment support: .autoresearch/{domain}/{name}/ structure
- Domain categories: engineering, marketing, content, prompts, custom
- Project-level (git-tracked, shareable) or user-level (~/.autoresearch/) scope
- User chooses scope during setup, not installation

New evaluators (8 ready-to-use):
- Free: benchmark_speed, benchmark_size, test_pass_rate, build_speed, memory_usage
- LLM judge (uses existing subscription): llm_judge_content, llm_judge_prompt, llm_judge_copy
- LLM judges call user's CLI tool (claude/codex/gemini) — no extra API keys needed

Script improvements:
- setup_experiment.py: --domain, --scope, --evaluator, --list, --list-evaluators
- run_experiment.py: --experiment domain/name, --resume, --loop, --single
- log_results.py: --dashboard, --domain, --format csv|markdown|terminal, --output

Results export:
- Terminal (default), CSV, and Markdown formats
- Per-experiment, per-domain, or cross-experiment dashboard view

SKILL.md rewritten:
- Clear activation triggers (when the skill should activate)
- Practical examples for each domain
- Evaluator documentation with cost transparency
- Simplified loop protocol matching Karpathy's original philosophy
This commit is contained in:
Leo
2026-03-13 08:22:14 +01:00
parent c834d71a44
commit 12591282da
13 changed files with 1744 additions and 702 deletions

View File

@@ -0,0 +1,56 @@
#!/usr/bin/env python3
"""Measure file, bundle, or Docker image size.

Configure exactly one measurement target below; the script prints
size_bytes / size_kb / size_mb on stdout and exits non-zero on failure.
DO NOT MODIFY after experiment starts — this is the fixed evaluator."""
import os
import subprocess
import sys

# --- CONFIGURE ONE OF THESE ---
# Option 1: File size
TARGET_FILE = "dist/main.js"
# Option 2: Directory size (uncomment to use)
# TARGET_DIR = "dist/"
# Option 3: Docker image (uncomment to use)
# DOCKER_IMAGE = "myapp:latest"
# DOCKER_BUILD_CMD = "docker build -t myapp:latest ."
# Option 4: Build first, then measure (uncomment to use)
# BUILD_CMD = "npm run build"
# --- END CONFIG ---

# Build if needed.  At module level dir() mirrors globals(), so a single
# globals() membership test suffices (the original `dir() or globals()`
# double-check was redundant).
if "BUILD_CMD" in globals():
    result = subprocess.run(BUILD_CMD, shell=True, capture_output=True)
    if result.returncode != 0:
        print(f"Build failed: {result.stderr.decode()[:200]}", file=sys.stderr)
        sys.exit(1)

# Measure
if "DOCKER_IMAGE" in globals():
    if "DOCKER_BUILD_CMD" in globals():
        subprocess.run(DOCKER_BUILD_CMD, shell=True, capture_output=True)
    result = subprocess.run(
        f"docker image inspect {DOCKER_IMAGE} --format '{{{{.Size}}}}'",
        shell=True, capture_output=True, text=True
    )
    # `docker image inspect` fails (or prints nothing) for a missing image;
    # the original crashed with ValueError on int("") instead of reporting it.
    if result.returncode != 0 or not result.stdout.strip():
        print(f"Could not inspect image {DOCKER_IMAGE}: {result.stderr[:200]}",
              file=sys.stderr)
        sys.exit(1)
    size_bytes = int(result.stdout.strip())
elif "TARGET_DIR" in globals():
    # Recursive directory size; skip entries that vanish or are broken
    # symlinks rather than aborting the whole measurement.
    size_bytes = sum(
        os.path.getsize(os.path.join(dp, f))
        for dp, _, fns in os.walk(TARGET_DIR) for f in fns
        if os.path.exists(os.path.join(dp, f))
    )
elif os.path.exists(TARGET_FILE):
    size_bytes = os.path.getsize(TARGET_FILE)
else:
    print(f"Target not found: {TARGET_FILE}", file=sys.stderr)
    sys.exit(1)

size_kb = size_bytes / 1024
size_mb = size_bytes / (1024 * 1024)
print(f"size_bytes: {size_bytes}")
print(f"size_kb: {size_kb:.1f}")
print(f"size_mb: {size_mb:.2f}")

View File

@@ -0,0 +1,40 @@
#!/usr/bin/env python3
"""Measure execution speed of a target command.

Runs COMMAND repeatedly (after WARMUP discarded runs) and reports the
median (p50) and p95 wall time in milliseconds.
DO NOT MODIFY after experiment starts — this is the fixed evaluator."""
import math
import statistics
import subprocess
import sys
import time

# --- CONFIGURE THESE ---
COMMAND = "python src/module.py"  # Command to benchmark
RUNS = 5                          # Number of runs
WARMUP = 1                        # Warmup runs (not counted)
# --- END CONFIG ---

times = []
try:
    # Warmup runs prime caches/JIT and are not timed.
    for _ in range(WARMUP):
        subprocess.run(COMMAND, shell=True, capture_output=True, timeout=120)
    # Benchmark
    for i in range(RUNS):
        t0 = time.perf_counter()
        result = subprocess.run(COMMAND, shell=True, capture_output=True, timeout=120)
        elapsed = (time.perf_counter() - t0) * 1000  # ms
        if result.returncode != 0:
            print(f"Run {i+1} failed (exit {result.returncode})", file=sys.stderr)
            print(f"stderr: {result.stderr.decode()[:200]}", file=sys.stderr)
            sys.exit(1)
        times.append(elapsed)
except subprocess.TimeoutExpired:
    # The original let this propagate as a raw traceback; fail cleanly.
    print(f"Command timed out after 120s: {COMMAND}", file=sys.stderr)
    sys.exit(1)

p50 = statistics.median(times)
# Nearest-rank p95: rank ceil(0.95 * n), 1-based.  The original index
# int(n * 0.95) was off by one for larger RUNS (e.g. n=100 picked the
# 96th value).  With fewer than 5 samples, p95 degenerates to the max.
if len(times) >= 5:
    p95 = sorted(times)[math.ceil(len(times) * 0.95) - 1]
else:
    p95 = max(times)
print(f"p50_ms: {p50:.2f}")
print(f"p95_ms: {p95:.2f}")
print(f"runs: {RUNS}")

View File

@@ -0,0 +1,39 @@
#!/usr/bin/env python3
"""Measure build/compile time.

Runs BUILD_CMD several times (optionally cleaning first) and reports
the median and mean build duration in seconds.
DO NOT MODIFY after experiment starts — this is the fixed evaluator."""
import statistics  # hoisted: the original imported this mid-file (PEP 8)
import subprocess
import sys
import time

# --- CONFIGURE THESE ---
BUILD_CMD = "npm run build"  # or: docker build -t test .
CLEAN_CMD = ""               # optional: npm run clean (run before each build)
RUNS = 3                     # Number of builds to average
# --- END CONFIG ---

times = []
try:
    for i in range(RUNS):
        # Clean if configured
        if CLEAN_CMD:
            subprocess.run(CLEAN_CMD, shell=True, capture_output=True, timeout=60)
        t0 = time.perf_counter()
        result = subprocess.run(BUILD_CMD, shell=True, capture_output=True, timeout=600)
        elapsed = time.perf_counter() - t0
        if result.returncode != 0:
            print(f"Build {i+1} failed (exit {result.returncode})", file=sys.stderr)
            print(f"stderr: {result.stderr.decode()[:200]}", file=sys.stderr)
            sys.exit(1)
        times.append(elapsed)
except subprocess.TimeoutExpired:
    # The original let a hung clean/build surface as a raw traceback.
    print("Build timed out", file=sys.stderr)
    sys.exit(1)

# Median is the headline metric (robust to one slow outlier build);
# the mean is reported alongside for reference.
avg = statistics.mean(times)
median = statistics.median(times)
print(f"build_seconds: {median:.2f}")
print(f"build_avg: {avg:.2f}")
print(f"runs: {RUNS}")

View File

@@ -0,0 +1,72 @@
#!/usr/bin/env python3
"""LLM judge for content quality (headlines, titles, descriptions).
Uses the user's existing CLI tool (claude, codex, gemini) for evaluation.
DO NOT MODIFY after experiment starts — this is the fixed evaluator."""
import subprocess
import sys
from pathlib import Path

# --- CONFIGURE THESE ---
TARGET_FILE = "content/titles.md"  # File being optimized
CLI_TOOL = "claude"                # or: codex, gemini
# --- END CONFIG ---

# The judge prompt is FIXED — the agent cannot change how it's evaluated
JUDGE_PROMPT = """You are a content quality evaluator. Score the following content strictly.
Criteria (each scored 1-10):
1. CURIOSITY GAP — Does this make you want to click? Is there an information gap
that can only be resolved by reading? Generic titles score 1-3. Specific,
intriguing titles score 7-10.
2. SPECIFICITY — Are there concrete numbers, tools, or details? "How I improved
performance" = 2. "How I reduced API latency from 800ms to 185ms" = 9.
3. EMOTIONAL PULL — Does it trigger curiosity, surprise, fear of missing out,
or recognition? Flat titles score 1-3. Emotionally charged score 7-10.
4. SCROLL-STOP POWER — Would this stop someone scrolling through a feed or
search results? Would they pause on this headline? Rate honestly.
5. SEO KEYWORD PRESENCE — Are searchable, high-intent terms present naturally?
Keyword-stuffed = 3. Natural integration of search terms = 8-10.
Output EXACTLY this format (nothing else):
curiosity: <score>
specificity: <score>
emotional: <score>
scroll_stop: <score>
seo: <score>
ctr_score: <average of all 5 scores>
Be harsh. Most content is mediocre (4-6 range). Only exceptional content scores 8+."""

# Read the content under evaluation; fail cleanly if it's missing
# (the original surfaced a raw FileNotFoundError traceback).
try:
    content = Path(TARGET_FILE).read_text()
except OSError as e:
    print(f"Cannot read {TARGET_FILE}: {e}", file=sys.stderr)
    sys.exit(1)

full_prompt = f"{JUDGE_PROMPT}\n\n---\n\nContent to evaluate:\n\n{content}"

# Call the user's CLI tool.  A missing binary or a hung call previously
# crashed with FileNotFoundError / TimeoutExpired tracebacks.
try:
    result = subprocess.run(
        [CLI_TOOL, "-p", full_prompt],
        capture_output=True, text=True, timeout=120
    )
except (subprocess.TimeoutExpired, FileNotFoundError) as e:
    print(f"LLM judge failed to run: {e}", file=sys.stderr)
    sys.exit(1)
if result.returncode != 0:
    print(f"LLM judge failed: {result.stderr[:200]}", file=sys.stderr)
    sys.exit(1)

# Parse output — echo every recognized score line (ctr_score included);
# a single startswith-tuple replaces the original redundant if/elif.
output = result.stdout
SCORE_PREFIXES = ("curiosity:", "specificity:", "emotional:",
                  "scroll_stop:", "seo:", "ctr_score:")
for line in output.splitlines():
    line = line.strip()
    if line.startswith(SCORE_PREFIXES):
        print(line)

# Verify ctr_score was found
if "ctr_score:" not in output:
    print("Could not parse ctr_score from LLM output", file=sys.stderr)
    print(f"Raw output: {output[:500]}", file=sys.stderr)
    sys.exit(1)

View File

@@ -0,0 +1,88 @@
#!/usr/bin/env python3
"""LLM judge for marketing copy (social posts, ads, emails).
Uses the user's existing CLI tool for evaluation.
DO NOT MODIFY after experiment starts — this is the fixed evaluator."""
import subprocess
import sys
from pathlib import Path

# --- CONFIGURE THESE ---
TARGET_FILE = "posts.md"  # Copy being optimized
CLI_TOOL = "claude"       # or: codex, gemini
PLATFORM = "twitter"      # twitter, linkedin, instagram, email, ad
# --- END CONFIG ---

# Platform-specific rubrics; each scores 5 criteria from 1-10.
JUDGE_PROMPTS = {
    "twitter": """Score this Twitter/X post strictly:
1. HOOK (1-10) — Does the first line stop the scroll?
2. VALUE (1-10) — Does it provide insight, entertainment, or utility?
3. ENGAGEMENT (1-10) — Would people reply, retweet, or like?
4. BREVITY (1-10) — Is every word earning its place? No filler?
5. CTA (1-10) — Is there a clear next action (even implicit)?""",
    "linkedin": """Score this LinkedIn post strictly:
1. HOOK (1-10) — Does the first line make you click "see more"?
2. STORYTELLING (1-10) — Is there a narrative arc or just statements?
3. CREDIBILITY (1-10) — Does it demonstrate expertise without bragging?
4. ENGAGEMENT (1-10) — Would professionals comment or share?
5. CTA (1-10) — Does it invite discussion or action?""",
    "instagram": """Score this Instagram caption strictly:
1. HOOK (1-10) — Does the first line grab attention?
2. RELATABILITY (1-10) — Does the audience see themselves in this?
3. VISUAL MATCH (1-10) — Does the copy complement visual content?
4. HASHTAG STRATEGY (1-10) — Are hashtags relevant and not spammy?
5. CTA (1-10) — Does it encourage saves, shares, or comments?""",
    "email": """Score this email subject + preview strictly:
1. OPEN INCENTIVE (1-10) — Would you open this in a crowded inbox?
2. SPECIFICITY (1-10) — Is it concrete or vague?
3. URGENCY (1-10) — Is there a reason to open now vs later?
4. PERSONALIZATION (1-10) — Does it feel written for someone, not everyone?
5. PREVIEW SYNC (1-10) — Does the preview text complement the subject?""",
    "ad": """Score this ad copy strictly:
1. ATTENTION (1-10) — Does it stop someone scrolling past ads?
2. DESIRE (1-10) — Does it create want for the product/service?
3. PROOF (1-10) — Is there credibility (numbers, social proof)?
4. ACTION (1-10) — Is the CTA clear and compelling?
5. OBJECTION HANDLING (1-10) — Does it preempt "why not"?""",
}

# Unknown platforms silently fall back to the twitter rubric.
platform_prompt = JUDGE_PROMPTS.get(PLATFORM, JUDGE_PROMPTS["twitter"])
JUDGE_PROMPT = f"""{platform_prompt}
Output EXACTLY this format:
criterion_1: <score>
criterion_2: <score>
criterion_3: <score>
criterion_4: <score>
criterion_5: <score>
engagement_score: <average of all 5>
Be harsh. Most copy is mediocre (4-6). Only exceptional copy scores 8+."""

# Read the copy under evaluation; fail cleanly if it's missing
# (the original surfaced a raw FileNotFoundError traceback).
try:
    content = Path(TARGET_FILE).read_text()
except OSError as e:
    print(f"Cannot read {TARGET_FILE}: {e}", file=sys.stderr)
    sys.exit(1)

full_prompt = f"{JUDGE_PROMPT}\n\n---\n\nCopy to evaluate:\n\n{content}"

# A missing CLI binary or a hung call previously crashed with
# FileNotFoundError / TimeoutExpired tracebacks.
try:
    result = subprocess.run(
        [CLI_TOOL, "-p", full_prompt],
        capture_output=True, text=True, timeout=120
    )
except (subprocess.TimeoutExpired, FileNotFoundError) as e:
    print(f"LLM judge failed to run: {e}", file=sys.stderr)
    sys.exit(1)
if result.returncode != 0:
    print(f"LLM judge failed: {result.stderr[:200]}", file=sys.stderr)
    sys.exit(1)

# Echo every recognized score line from the judge's reply.
output = result.stdout
for line in output.splitlines():
    line = line.strip()
    if line.startswith(("engagement_score:", "criterion_")):
        print(line)
if "engagement_score:" not in output:
    print("Could not parse engagement_score from LLM output", file=sys.stderr)
    print(f"Raw: {output[:500]}", file=sys.stderr)
    sys.exit(1)

View File

@@ -0,0 +1,99 @@
#!/usr/bin/env python3
"""LLM judge for prompt/instruction quality.

For each test case: generate output with the prompt under test, then ask
the same CLI tool to score the output against the reference.  Per-case
failures score 0 instead of aborting the run.
Uses the user's existing CLI tool for evaluation.
DO NOT MODIFY after experiment starts — this is the fixed evaluator."""
import json
import subprocess
import sys
from pathlib import Path

# --- CONFIGURE THESE ---
TARGET_FILE = "prompt.md"             # Prompt being optimized
TEST_CASES_FILE = "tests/cases.json"  # Test cases: [{"input": "...", "expected": "..."}]
CLI_TOOL = "claude"                   # or: codex, gemini
# --- END CONFIG ---

JUDGE_PROMPT_TEMPLATE = """You are evaluating a system prompt's effectiveness.
SYSTEM PROMPT BEING TESTED:
{prompt}
TEST INPUT:
{input}
EXPECTED OUTPUT (reference):
{expected}
ACTUAL OUTPUT:
{actual}
Score the actual output on these criteria (each 1-10):
1. ACCURACY — Does it match the expected output's intent and facts?
2. COMPLETENESS — Does it cover all required elements?
3. CLARITY — Is it well-structured and easy to understand?
4. INSTRUCTION_FOLLOWING — Does it follow the system prompt's guidelines?
Output EXACTLY: quality_score: <average of all 4>
Nothing else."""


def _run_cli(prompt_text):
    """Run the CLI tool once; return the CompletedProcess, or None when the
    binary is missing or the call times out (treated as a per-case failure —
    the original crashed with an uncaught traceback instead)."""
    try:
        return subprocess.run(
            [CLI_TOOL, "-p", prompt_text],
            capture_output=True, text=True, timeout=60
        )
    except (subprocess.TimeoutExpired, FileNotFoundError):
        return None


# Fail cleanly on missing/invalid config files (the original surfaced raw
# FileNotFoundError / JSONDecodeError tracebacks).
try:
    prompt = Path(TARGET_FILE).read_text()
    test_cases = json.loads(Path(TEST_CASES_FILE).read_text())
except OSError as e:
    print(f"Cannot read input file: {e}", file=sys.stderr)
    sys.exit(1)
except json.JSONDecodeError as e:
    print(f"Invalid JSON in {TEST_CASES_FILE}: {e}", file=sys.stderr)
    sys.exit(1)

scores = []
for i, case in enumerate(test_cases):
    # 1) Generate output using the prompt under test.
    gen_result = _run_cli(f"{prompt}\n\n{case['input']}")
    if gen_result is None or gen_result.returncode != 0:
        print(f"Generation failed for case {i+1}", file=sys.stderr)
        scores.append(0)
        continue
    actual = gen_result.stdout.strip()
    # 2) Judge the generated output against the reference.
    judge_prompt = JUDGE_PROMPT_TEMPLATE.format(
        prompt=prompt[:500],  # truncated to keep the judge prompt compact
        input=case["input"],
        expected=case.get("expected", "N/A"),
        actual=actual[:500]
    )
    judge_result = _run_cli(judge_prompt)
    if judge_result is None or judge_result.returncode != 0:
        scores.append(0)
        continue
    # 3) Parse the single quality_score line; unparseable output scores 0.
    for line in judge_result.stdout.splitlines():
        if "quality_score:" in line:
            try:
                scores.append(float(line.split(":")[-1].strip()))
            except ValueError:
                scores.append(0)
            break
    else:
        scores.append(0)
    print(f" Case {i+1}/{len(test_cases)}: {scores[-1]:.1f}", file=sys.stderr)

if not scores:
    print("No test cases evaluated", file=sys.stderr)
    sys.exit(1)

avg = sum(scores) / len(scores)
quality = avg * 10  # Scale 1-10 per-case averages to a 0-100 headline metric
print(f"quality_score: {quality:.2f}")
print(f"cases_tested: {len(scores)}")
print(f"avg_per_case: {avg:.2f}")

View File

@@ -0,0 +1,52 @@
#!/usr/bin/env python3
"""Measure peak memory usage of a command.

Wraps COMMAND with /usr/bin/time and parses the peak RSS from its report
(written to stderr).  Linux reports kilobytes; macOS reports bytes.
DO NOT MODIFY after experiment starts — this is the fixed evaluator."""
import os
import platform
import subprocess
import sys

# --- CONFIGURE THESE ---
COMMAND = "python src/module.py"  # Command to measure
# --- END CONFIG ---

system = platform.system()
if system == "Linux":
    # GNU time's -v output includes "Maximum resident set size (kbytes)".
    try:
        result = subprocess.run(
            f"/usr/bin/time -v {COMMAND}",
            shell=True, capture_output=True, text=True, timeout=300
        )
    except subprocess.TimeoutExpired:
        # The original let this propagate as a raw traceback.
        print(f"Command timed out after 300s: {COMMAND}", file=sys.stderr)
        sys.exit(1)
    output = result.stderr  # /usr/bin/time writes its report to stderr
    for line in output.splitlines():
        if "Maximum resident set size" in line:
            try:
                kb = int(line.split(":")[-1].strip())
            except ValueError:
                break  # malformed line — fall through to the error exit
            mb = kb / 1024
            print(f"peak_mb: {mb:.1f}")
            print(f"peak_kb: {kb}")
            sys.exit(0)
    print("Could not parse memory from /usr/bin/time output", file=sys.stderr)
    sys.exit(1)
elif system == "Darwin":
    # macOS: use /usr/bin/time -l
    try:
        result = subprocess.run(
            f"/usr/bin/time -l {COMMAND}",
            shell=True, capture_output=True, text=True, timeout=300
        )
    except subprocess.TimeoutExpired:
        print(f"Command timed out after 300s: {COMMAND}", file=sys.stderr)
        sys.exit(1)
    output = result.stderr
    for line in output.splitlines():
        if "maximum resident set size" in line.lower():
            # macOS reports the value (in bytes) before the label
            try:
                val = int(line.strip().split()[0])
            except (ValueError, IndexError):
                break  # malformed line — fall through to the error exit
            mb = val / (1024 * 1024)
            print(f"peak_mb: {mb:.1f}")
            sys.exit(0)
    print("Could not parse memory from time output", file=sys.stderr)
    sys.exit(1)
else:
    print(f"Unsupported platform: {system}. Use Linux or macOS.", file=sys.stderr)
    sys.exit(1)

View File

@@ -0,0 +1,55 @@
#!/usr/bin/env python3
"""Measure test suite pass rate.

Runs TEST_CMD and parses pytest's short summary (falling back to the
unittest "Ran X tests" format), printing pass_rate / passed / failed / total.
DO NOT MODIFY after experiment starts — this is the fixed evaluator."""
import re
import subprocess
import sys

# --- CONFIGURE THESE ---
TEST_CMD = "pytest tests/ --tb=no -q"  # Test command
# --- END CONFIG ---

try:
    result = subprocess.run(TEST_CMD, shell=True, capture_output=True, text=True, timeout=300)
except subprocess.TimeoutExpired:
    # The original surfaced a raw TimeoutExpired traceback here.
    print("Test suite timed out after 300s", file=sys.stderr)
    sys.exit(1)
output = result.stdout + "\n" + result.stderr


def _count(pattern):
    """Return the first integer captured by `pattern` in the output, else 0."""
    m = re.search(pattern, output)
    return int(m.group(1)) if m else 0


# pytest short format: "5 passed, 2 failed in 1.23s"
# ("(\d+) error" also matches the plural "errors")
passed = _count(r"(\d+) passed")
failed = _count(r"(\d+) failed")
errors = _count(r"(\d+) error")
total = passed + failed + errors
if total == 0:
    # Try unittest format: "Ran X tests"
    total = _count(r"Ran (\d+) test")
    if total:
        if result.returncode == 0:
            passed = total
        else:
            # Count failures from unittest's summary line
            failed = _count(r"FAILED \(failures=(\d+)")
            passed = total - failed
if total == 0:
    print("Could not parse test results", file=sys.stderr)
    print(f"Output: {output[:500]}", file=sys.stderr)
    sys.exit(1)
rate = passed / total
print(f"pass_rate: {rate:.4f}")
print(f"passed: {passed}")
print(f"failed: {failed}")
print(f"total: {total}")