Files
claude-skills-reference/eval/scripts/generate-eval-config.py
Leo 75fa9de2bb feat: add promptfoo eval pipeline for skill quality testing
- Add eval/ directory with 10 pilot skill eval configs
- Add GitHub Action (skill-eval.yml) for automated eval on PR
- Add generate-eval-config.py script for bootstrapping new evals
- Add reusable assertion helpers (skill-quality.js)
- Add eval README with setup and usage docs

Skills covered: copywriting, cto-advisor, seo-audit, content-strategy,
aws-solution-architect, agile-product-owner, senior-frontend,
senior-security, mcp-server-builder, launch-strategy

CI integration:
- Triggers on PR to dev when SKILL.md files change
- Detects which skills changed and runs only those evals
- Posts results as PR comments (non-blocking)
- Uploads full results as artifacts

No existing files modified.
2026-03-12 05:39:24 +01:00

154 lines
5.1 KiB
Python
Executable File

#!/usr/bin/env python3
"""Generate a promptfoo eval config for any skill.
Usage:
python eval/scripts/generate-eval-config.py marketing-skill/copywriting
python eval/scripts/generate-eval-config.py c-level-advisor/cto-advisor --force
"""
import os
import re
import sys
import textwrap
def parse_frontmatter(skill_path):
    """Extract ``name`` and ``description`` from SKILL.md YAML frontmatter.

    The frontmatter is the block between the ``---`` delimiter at the very
    start of the file and the next ``---`` line. Only top-level ``name:`` and
    ``description:`` keys are read; this is a small line-based reader, not a
    full YAML parser.

    Multi-line descriptions are supported: a block scalar (``description: >``
    or ``description: |``, optionally with ``-``/``+`` chomping) or a value
    that continues on indented lines is folded into a single space-separated
    string. Line breaks of literal (``|``) scalars are therefore not preserved.

    Args:
        skill_path: Path to the SKILL.md file to read (UTF-8).

    Returns:
        A ``(name, description)`` tuple. Either element is ``None`` when the
        key is absent; both are ``None`` when no frontmatter is found.
    """
    with open(skill_path, "r", encoding="utf-8") as f:
        content = f.read()

    # Match YAML frontmatter between --- delimiters at the top of the file.
    match = re.match(r"^---\s*\n(.*?)\n---", content, re.DOTALL)
    if not match:
        return None, None

    # A bare indicator after "description:" means the value lives entirely on
    # the following indented lines.
    block_indicators = {">", "|", ">-", "|-", ">+", "|+"}

    lines = match.group(1).split("\n")
    name = None
    description = None
    for index, line in enumerate(lines):
        if line.startswith("name:"):
            name = line.split(":", 1)[1].strip().strip("'\"")
        elif line.startswith("description:"):
            inline = line.split(":", 1)[1].strip()
            parts = [inline] if inline and inline not in block_indicators else []
            # Indented (or blank) lines directly below belong to this value;
            # the first non-indented line starts the next top-level key.
            for follow in lines[index + 1:]:
                stripped = follow.strip()
                if stripped and not follow[0].isspace():
                    break
                if stripped:
                    parts.append(stripped)
            description = " ".join(parts).strip("'\"")
    return name, description
def _yaml_double_quoted(text):
    """Return *text* wrapped as a YAML double-quoted scalar.

    Backslashes and double quotes are escaped so the value cannot terminate
    the scalar early; any other text passes through unchanged.
    """
    escaped = text.replace("\\", "\\\\").replace('"', '\\"')
    return f'"{escaped}"'


def _build_test_prompts(name, description):
    """Return the task prompts for a skill: two generic ones plus a third
    chosen by keyword (substring) match against the skill description."""
    topic = name.replace("-", " ")
    desc_lower = (description or "").lower()

    prompts = [
        f"I need help with {topic}. Give me a comprehensive approach for a mid-stage B2B SaaS startup.",
        f"Act as an expert in {topic} and review my current approach. I'm a solo founder building a developer tool.",
    ]

    # Checked in order; the first keyword group found in the description wins.
    domain_prompts = (
        (
            ("marketing", "content", "seo", "copy"),
            "Create a 90-day plan with specific deliverables, metrics, and milestones.",
        ),
        (
            ("engineer", "architect", "code", "technical"),
            "Design a technical solution with architecture diagram, tech stack recommendations, and implementation plan.",
        ),
        (
            ("advisor", "executive", "strategic", "leader"),
            "Help me prepare a board presentation on this topic with key metrics and strategic recommendations.",
        ),
    )
    for keywords, domain_prompt in domain_prompts:
        if any(word in desc_lower for word in keywords):
            prompts.append(domain_prompt)
            break
    else:
        prompts.append(
            f"What are the top 5 mistakes people make with {topic} and how to avoid them?"
        )
    return prompts


def _render_config(name, skill_dir, rel_path, prompts):
    """Return the full promptfoo YAML text for one skill.

    Every line is written with its final indentation, so the layout does not
    depend on how this source file is indented or on the interpolated values.
    """
    topic = name.replace("-", " ")
    description_value = _yaml_double_quoted(f"Evaluate {name} skill")
    expertise_rubric = _yaml_double_quoted(
        f"Response demonstrates specific expertise in {topic}, not generic advice"
    )

    lines = [
        f"# Eval: {name}",
        f"# Source: {skill_dir}/SKILL.md",
        f"# Run: npx promptfoo@latest eval -c eval/skills/{name}.yaml",
        "# Auto-generated — customize test prompts and assertions for better coverage",
        f"description: {description_value}",
        "prompts:",
        "  - |",
        "    You are an expert AI assistant. You have the following skill loaded:",
        "    ---BEGIN SKILL---",
        # Literal double braces: these are promptfoo template variables.
        "    {{skill_content}}",
        "    ---END SKILL---",
        "    Now complete this task: {{task}}",
        "providers:",
        "  - id: anthropic:messages:claude-sonnet-4-6",
        "    config:",
        "      max_tokens: 4096",
        "      temperature: 0.7",
        "tests:",
    ]
    for prompt in prompts:
        lines.extend(
            [
                "  - vars:",
                f"      skill_content: file://{rel_path}",
                f"      task: {_yaml_double_quoted(prompt)}",
                "    assert:",
                "      - type: llm-rubric",
                f"        value: {expertise_rubric}",
                "      - type: llm-rubric",
                '        value: "Response is actionable with concrete steps or deliverables"',
                "      - type: javascript",
                '        value: "output.length > 300"',
            ]
        )
    return "\n".join(lines) + "\n"


def generate_config(skill_dir, force=False):
    """Generate a promptfoo eval YAML config for the given skill directory.

    Reads ``<skill_dir>/SKILL.md``, derives three test prompts from the skill
    name and description, and writes ``eval/skills/<name>.yaml``. Both the
    output path and the ``file://`` reference to SKILL.md are relative to the
    current working directory's ``eval/skills`` folder.

    Args:
        skill_dir: Directory containing the skill's SKILL.md.
        force: Overwrite an existing eval config when True.

    Raises:
        SystemExit: With status 1 when SKILL.md is missing or its frontmatter
            has no name; with status 0 when the config already exists and
            ``force`` is False.
    """
    # Resolve SKILL.md path
    skill_md = os.path.join(skill_dir, "SKILL.md")
    if not os.path.exists(skill_md):
        print(f"Error: {skill_md} not found", file=sys.stderr)
        sys.exit(1)

    name, description = parse_frontmatter(skill_md)
    if not name:
        print(f"Error: Could not parse frontmatter from {skill_md}", file=sys.stderr)
        sys.exit(1)

    # Output path
    output_path = os.path.join("eval", "skills", f"{name}.yaml")
    if os.path.exists(output_path) and not force:
        print(f"Eval config already exists: {output_path}")
        print("Use --force to overwrite.")
        sys.exit(0)

    # Relative path from eval/skills/ to the skill. It is embedded in a
    # file:// reference, so normalize Windows separators to forward slashes.
    rel_path = os.path.relpath(skill_md, os.path.join("eval", "skills"))
    rel_path = rel_path.replace(os.sep, "/")

    prompts = _build_test_prompts(name, description)
    config = _render_config(name, skill_dir, rel_path, prompts)

    # Write
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(config)

    print(f"✅ Generated: {output_path}")
    print(f" Skill: {name}")
    print(f" Tests: {len(prompts)}")
    print(" Edit the file to customize prompts and assertions.")
if __name__ == "__main__":
    # CLI entry point: <skill-directory> [--force].
    # --force may appear before or after the directory, so the directory is
    # the first argument that is not the flag rather than always argv[1].
    cli_args = sys.argv[1:]
    force = "--force" in cli_args
    positional = [arg for arg in cli_args if arg != "--force"]
    if not positional:
        print("Usage: python eval/scripts/generate-eval-config.py <skill-directory>")
        print(" python eval/scripts/generate-eval-config.py marketing-skill/copywriting --force")
        sys.exit(1)
    # Drop trailing slashes so the path echoed into the generated config
    # does not end up with a doubled separator.
    skill_dir = positional[0].rstrip("/")
    generate_config(skill_dir, force)