**Bug fixes (run_experiment.py):**
- Fix broken revert logic: was saving HEAD as pre_commit (no-op revert), now uses git reset --hard HEAD~1 for correct rollback
- Remove broken --loop mode (agent IS the loop, script handles one iteration)
- Fix shell injection: all git commands use subprocess list form
- Replace shell tail with Python file read

**Bug fixes (other scripts):**
- setup_experiment.py: fix shell injection in git branch creation, remove dead --skip-baseline flag, fix evaluator docstring parsing
- log_results.py: fix 6 falsy-zero bugs (baseline=0 treated as None), add domain_filter to CSV/markdown export, move import time to top
- evaluators: add FileNotFoundError handling, fix output format mismatch in llm_judge_copy, add peak_kb on macOS, add ValueError handling

**Plugin packaging (NEW):**
- plugin.json, settings.json, CLAUDE.md for plugin registry
- 5 slash commands: /ar:setup, /ar:run, /ar:loop, /ar:status, /ar:resume
- /ar:loop supports user-selected intervals (10m, 1h, daily, weekly, monthly)
- experiment-runner agent for autonomous loop iterations
- Registered in marketplace.json as plugin #20

**SKILL.md rewrite:**
- Replace ambiguous "Loop Protocol" with clear "Agent Protocol"
- Add results.tsv format spec, strategy escalation, self-improvement
- Replace "NEVER STOP" with resumable stopping logic

**Docs & sync:**
- Codex (157 skills), Gemini (229 items), convert.sh all pick up the skill
- 6 new MkDocs pages, mkdocs.yml nav updated
- Counts updated: 17 agents, 22 slash commands

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
114 lines
4.4 KiB
Python
114 lines
4.4 KiB
Python
#!/usr/bin/env python3
"""LLM judge for marketing copy (social posts, ads, emails).

Uses the user's existing CLI tool for evaluation.
DO NOT MODIFY after experiment starts — this is the fixed evaluator.

Behaviour when run as a script:
  * Reads TARGET_FILE, sends it with a platform-specific rubric to CLI_TOOL
    (invoked as ``CLI_TOOL -p <prompt>``), and parses the reply.
  * On success prints ``criterion_N: <score>`` lines and one
    ``engagement_score: <score>`` line to stdout and exits 0.
  * On any failure (missing/unreadable/empty target file, CLI tool missing,
    timeout, non-zero exit, unparseable reply) prints a diagnostic to stderr
    and exits 1.
"""

import re
import subprocess
import sys
from pathlib import Path

# --- CONFIGURE THESE ---
TARGET_FILE = "posts.md"  # Copy being optimized
CLI_TOOL = "claude"  # or: codex, gemini
PLATFORM = "twitter"  # twitter, linkedin, instagram, email, ad
# --- END CONFIG ---

# Upper bound on how long the judge CLI may run before we give up.
JUDGE_TIMEOUT_SECONDS = 120

JUDGE_PROMPTS = {
    "twitter": """Score this Twitter/X post strictly:
1. HOOK (1-10) — Does the first line stop the scroll?
2. VALUE (1-10) — Does it provide insight, entertainment, or utility?
3. ENGAGEMENT (1-10) — Would people reply, retweet, or like?
4. BREVITY (1-10) — Is every word earning its place? No filler?
5. CTA (1-10) — Is there a clear next action (even implicit)?""",

    "linkedin": """Score this LinkedIn post strictly:
1. HOOK (1-10) — Does the first line make you click "see more"?
2. STORYTELLING (1-10) — Is there a narrative arc or just statements?
3. CREDIBILITY (1-10) — Does it demonstrate expertise without bragging?
4. ENGAGEMENT (1-10) — Would professionals comment or share?
5. CTA (1-10) — Does it invite discussion or action?""",

    "instagram": """Score this Instagram caption strictly:
1. HOOK (1-10) — Does the first line grab attention?
2. RELATABILITY (1-10) — Does the audience see themselves in this?
3. VISUAL MATCH (1-10) — Does the copy complement visual content?
4. HASHTAG STRATEGY (1-10) — Are hashtags relevant and not spammy?
5. CTA (1-10) — Does it encourage saves, shares, or comments?""",

    "email": """Score this email subject + preview strictly:
1. OPEN INCENTIVE (1-10) — Would you open this in a crowded inbox?
2. SPECIFICITY (1-10) — Is it concrete or vague?
3. URGENCY (1-10) — Is there a reason to open now vs later?
4. PERSONALIZATION (1-10) — Does it feel written for someone, not everyone?
5. PREVIEW SYNC (1-10) — Does the preview text complement the subject?""",

    "ad": """Score this ad copy strictly:
1. ATTENTION (1-10) — Does it stop someone scrolling past ads?
2. DESIRE (1-10) — Does it create want for the product/service?
3. PROOF (1-10) — Is there credibility (numbers, social proof)?
4. ACTION (1-10) — Is the CTA clear and compelling?
5. OBJECTION HANDLING (1-10) — Does it preempt "why not"?""",
}

# An unknown PLATFORM value falls back to the Twitter rubric.
platform_prompt = JUDGE_PROMPTS.get(PLATFORM, JUDGE_PROMPTS["twitter"])

JUDGE_PROMPT = f"""{platform_prompt}

IMPORTANT: You MUST use criterion_1 through criterion_5 as labels, NOT the criterion names.
Do NOT output "hook: 7" — output "criterion_1: 7".

Output EXACTLY this format:
criterion_1: <score>
criterion_2: <score>
criterion_3: <score>
criterion_4: <score>
criterion_5: <score>
engagement_score: <average of all 5>

Be harsh. Most copy is mediocre (4-6). Only exceptional copy scores 8+."""

# A well-formed "criterion_N: <number>" line; used only to derive a missing average.
_CRITERION_LINE = re.compile(r"^criterion_\d+:\s*(\d+(?:\.\d+)?)\s*$")
# Loose "label: <number>" line used when the judge ignored the requested labels.
_FALLBACK_LINE = re.compile(r'^(\w[\w\s]*?):\s*(\d+(?:\.\d+)?)\s*$')


def read_copy(path):
    """Return the text of *path*, or None after printing a diagnostic to stderr.

    The file is decoded as UTF-8 so the result does not depend on the
    platform's locale encoding (marketing copy routinely contains emoji).
    """
    try:
        return Path(path).read_text(encoding="utf-8")
    except FileNotFoundError:
        print(f"Target file not found: {path}", file=sys.stderr)
    except (OSError, UnicodeDecodeError) as exc:
        print(f"Could not read target file {path}: {exc}", file=sys.stderr)
    return None


def run_judge(prompt):
    """Send *prompt* to CLI_TOOL and return its stdout, or None on failure.

    Failures (tool not installed, timeout, non-zero exit status) are reported
    on stderr instead of surfacing as an unhandled traceback.
    """
    try:
        result = subprocess.run(
            [CLI_TOOL, "-p", prompt],
            capture_output=True, text=True, timeout=JUDGE_TIMEOUT_SECONDS,
        )
    except FileNotFoundError:
        print(f"CLI tool not found: {CLI_TOOL}", file=sys.stderr)
        return None
    except subprocess.TimeoutExpired:
        print(
            f"LLM judge timed out after {JUDGE_TIMEOUT_SECONDS}s",
            file=sys.stderr,
        )
        return None

    if result.returncode != 0:
        print(f"LLM judge failed: {result.stderr[:200]}", file=sys.stderr)
        return None
    return result.stdout


def parse_scores(output):
    """Extract the score lines to report from the judge's raw *output*.

    Returns a list of stripped lines. Three cases:

    * The judge used the requested labels: every line starting with
      ``criterion_`` or ``engagement_score:`` is returned unchanged. If no
      ``engagement_score:`` line is present, one is appended with the mean of
      the numeric ``criterion_N`` lines (one decimal place).
    * No such lines exist: any ``label: <number>`` line (other than a label
      that lower-cases to ``engagement_score``) is renumbered to
      ``criterion_1..N`` and an averaged ``engagement_score`` is appended.
    * Nothing parseable: an empty list.

    The result may lack an ``engagement_score:`` line only when the labelled
    criterion lines carry no plain numeric value; the caller treats that as a
    failure.
    """
    stripped = [raw.strip() for raw in output.splitlines()]

    labelled = [
        line for line in stripped
        if line.startswith(("engagement_score:", "criterion_"))
    ]
    if labelled:
        has_total = any(line.startswith("engagement_score:") for line in labelled)
        if not has_total:
            # The judge skipped the average; derive it so the metric is never
            # silently missing from an otherwise successful run.
            values = []
            for line in labelled:
                hit = _CRITERION_LINE.match(line)
                if hit:
                    values.append(float(hit.group(1)))
            if values:
                labelled.append(
                    f"engagement_score: {sum(values) / len(values):.1f}"
                )
        return labelled

    # Fallback: if no criterion_ lines found, try parsing any "word: digit" lines
    reported = []
    values = []
    for line in stripped:
        hit = _FALLBACK_LINE.match(line)
        if hit and hit.group(1).lower() not in ("engagement_score",):
            values.append(float(hit.group(2)))
            reported.append(f"criterion_{len(values)}: {hit.group(2)}")
    if values:
        reported.append(f"engagement_score: {sum(values) / len(values):.1f}")
    return reported


def main():
    """Run one evaluation; return the process exit status (0 ok, 1 failure)."""
    content = read_copy(TARGET_FILE)
    if content is None:
        return 1
    if not content.strip():
        # Nothing to judge; avoid spending an LLM call on an empty prompt.
        print(f"Target file is empty: {TARGET_FILE}", file=sys.stderr)
        return 1

    full_prompt = f"{JUDGE_PROMPT}\n\n---\n\nCopy to evaluate:\n\n{content}"

    output = run_judge(full_prompt)
    if output is None:
        return 1

    score_lines = parse_scores(output)
    if not any(line.startswith("engagement_score:") for line in score_lines):
        print("Could not parse engagement_score from LLM output", file=sys.stderr)
        print(f"Raw: {output[:500]}", file=sys.stderr)
        return 1

    for line in score_lines:
        print(line)
    return 0


if __name__ == "__main__":
    sys.exit(main())