feat: add promptfoo eval pipeline for skill quality testing
- Add eval/ directory with 10 pilot skill eval configs - Add GitHub Action (skill-eval.yml) for automated eval on PR - Add generate-eval-config.py script for bootstrapping new evals - Add reusable assertion helpers (skill-quality.js) - Add eval README with setup and usage docs Skills covered: copywriting, cto-advisor, seo-audit, content-strategy, aws-solution-architect, agile-product-owner, senior-frontend, senior-security, mcp-server-builder, launch-strategy CI integration: - Triggers on PR to dev when SKILL.md files change - Detects which skills changed and runs only those evals - Posts results as PR comments (non-blocking) - Uploads full results as artifacts No existing files modified.
This commit is contained in:
153
eval/scripts/generate-eval-config.py
Executable file
153
eval/scripts/generate-eval-config.py
Executable file
@@ -0,0 +1,153 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Generate a promptfoo eval config for any skill.
|
||||
|
||||
Usage:
|
||||
python eval/scripts/generate-eval-config.py marketing-skill/copywriting
|
||||
python eval/scripts/generate-eval-config.py c-level-advisor/cto-advisor --force
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import textwrap
|
||||
|
||||
|
||||
def parse_frontmatter(skill_path):
|
||||
"""Extract name and description from SKILL.md YAML frontmatter."""
|
||||
with open(skill_path, "r", encoding="utf-8") as f:
|
||||
content = f.read()
|
||||
|
||||
# Match YAML frontmatter between --- delimiters
|
||||
match = re.match(r"^---\s*\n(.*?)\n---", content, re.DOTALL)
|
||||
if not match:
|
||||
return None, None
|
||||
|
||||
frontmatter = match.group(1)
|
||||
name = None
|
||||
description = None
|
||||
|
||||
for line in frontmatter.split("\n"):
|
||||
if line.startswith("name:"):
|
||||
name = line.split(":", 1)[1].strip().strip("'\"")
|
||||
elif line.startswith("description:"):
|
||||
# Handle multi-line descriptions
|
||||
desc = line.split(":", 1)[1].strip().strip("'\"")
|
||||
description = desc
|
||||
|
||||
return name, description
|
||||
|
||||
|
||||
def generate_config(skill_dir, force=False):
|
||||
"""Generate a promptfoo eval YAML config for the given skill directory."""
|
||||
# Resolve SKILL.md path
|
||||
skill_md = os.path.join(skill_dir, "SKILL.md")
|
||||
if not os.path.exists(skill_md):
|
||||
print(f"Error: {skill_md} not found", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
name, description = parse_frontmatter(skill_md)
|
||||
if not name:
|
||||
print(f"Error: Could not parse frontmatter from {skill_md}", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
# Output path
|
||||
output_path = os.path.join("eval", "skills", f"{name}.yaml")
|
||||
if os.path.exists(output_path) and not force:
|
||||
print(f"Eval config already exists: {output_path}")
|
||||
print("Use --force to overwrite.")
|
||||
sys.exit(0)
|
||||
|
||||
# Calculate relative path from eval/skills/ to the skill
|
||||
rel_path = os.path.relpath(skill_md, os.path.join("eval", "skills"))
|
||||
|
||||
# Generate test prompts based on description
|
||||
desc_lower = (description or "").lower()
|
||||
|
||||
# Default test prompts
|
||||
prompts = [
|
||||
f"I need help with {name.replace('-', ' ')}. Give me a comprehensive approach for a mid-stage B2B SaaS startup.",
|
||||
f"Act as an expert in {name.replace('-', ' ')} and review my current approach. I'm a solo founder building a developer tool.",
|
||||
]
|
||||
|
||||
# Add domain-specific third prompt
|
||||
if any(w in desc_lower for w in ["marketing", "content", "seo", "copy"]):
|
||||
prompts.append(
|
||||
"Create a 90-day plan with specific deliverables, metrics, and milestones."
|
||||
)
|
||||
elif any(w in desc_lower for w in ["engineer", "architect", "code", "technical"]):
|
||||
prompts.append(
|
||||
"Design a technical solution with architecture diagram, tech stack recommendations, and implementation plan."
|
||||
)
|
||||
elif any(w in desc_lower for w in ["advisor", "executive", "strategic", "leader"]):
|
||||
prompts.append(
|
||||
"Help me prepare a board presentation on this topic with key metrics and strategic recommendations."
|
||||
)
|
||||
else:
|
||||
prompts.append(
|
||||
f"What are the top 5 mistakes people make with {name.replace('-', ' ')} and how to avoid them?"
|
||||
)
|
||||
|
||||
# Build YAML
|
||||
config = textwrap.dedent(f"""\
|
||||
# Eval: {name}
|
||||
# Source: {skill_dir}/SKILL.md
|
||||
# Run: npx promptfoo@latest eval -c eval/skills/{name}.yaml
|
||||
# Auto-generated — customize test prompts and assertions for better coverage
|
||||
|
||||
description: "Evaluate {name} skill"
|
||||
|
||||
prompts:
|
||||
- |
|
||||
You are an expert AI assistant. You have the following skill loaded:
|
||||
|
||||
---BEGIN SKILL---
|
||||
{{{{skill_content}}}}
|
||||
---END SKILL---
|
||||
|
||||
Now complete this task: {{{{task}}}}
|
||||
|
||||
providers:
|
||||
- id: anthropic:messages:claude-sonnet-4-6
|
||||
config:
|
||||
max_tokens: 4096
|
||||
temperature: 0.7
|
||||
|
||||
tests:
|
||||
""")
|
||||
|
||||
for i, prompt in enumerate(prompts):
|
||||
test_block = textwrap.dedent(f"""\
|
||||
- vars:
|
||||
skill_content: file://{rel_path}
|
||||
task: "{prompt}"
|
||||
assert:
|
||||
- type: llm-rubric
|
||||
value: "Response demonstrates specific expertise in {name.replace('-', ' ')}, not generic advice"
|
||||
- type: llm-rubric
|
||||
value: "Response is actionable with concrete steps or deliverables"
|
||||
- type: javascript
|
||||
value: "output.length > 300"
|
||||
""")
|
||||
config += test_block
|
||||
|
||||
# Write
|
||||
os.makedirs(os.path.dirname(output_path), exist_ok=True)
|
||||
with open(output_path, "w", encoding="utf-8") as f:
|
||||
f.write(config)
|
||||
|
||||
print(f"✅ Generated: {output_path}")
|
||||
print(f" Skill: {name}")
|
||||
print(f" Tests: {len(prompts)}")
|
||||
print(f" Edit the file to customize prompts and assertions.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
if len(sys.argv) < 2:
|
||||
print("Usage: python eval/scripts/generate-eval-config.py <skill-directory>")
|
||||
print(" python eval/scripts/generate-eval-config.py marketing-skill/copywriting --force")
|
||||
sys.exit(1)
|
||||
|
||||
skill_dir = sys.argv[1].rstrip("/")
|
||||
force = "--force" in sys.argv
|
||||
|
||||
generate_config(skill_dir, force)
|
||||
Reference in New Issue
Block a user