feat: add promptfoo eval pipeline for skill quality testing

- Add eval/ directory with 10 pilot skill eval configs
- Add GitHub Action (skill-eval.yml) for automated eval on PR
- Add generate-eval-config.py script for bootstrapping new evals
- Add reusable assertion helpers (skill-quality.js)
- Add eval README with setup and usage docs

Skills covered: copywriting, cto-advisor, seo-audit, content-strategy, aws-solution-architect, agile-product-owner, senior-frontend, senior-security, mcp-server-builder, launch-strategy

CI integration:
- Triggers on PR to dev when SKILL.md files change
- Detects which skills changed and runs only those evals
- Posts results as PR comments (non-blocking)
- Uploads full results as artifacts

No existing files modified.
2026-03-12 05:39:24 +01:00
parent 713e2deb82
commit 75fa9de2bb
15 changed files with 1055 additions and 0 deletions
--- a/eval/scripts/generate-eval-config.py
+++ b/eval/scripts/generate-eval-config.py
@@ -0,0 +1,153 @@
+#!/usr/bin/env python3
+"""Generate a promptfoo eval config for any skill.
+
+Usage:
+    python eval/scripts/generate-eval-config.py marketing-skill/copywriting
+    python eval/scripts/generate-eval-config.py c-level-advisor/cto-advisor --force
+"""
+
+import os
+import re
+import sys
+import textwrap
+
+
+def parse_frontmatter(skill_path):
+    """Extract name and description from SKILL.md YAML frontmatter.
+
+    Only top-level (unindented) ``key: value`` lines are considered. A value
+    may span several lines, either as a block scalar (``description: >`` or
+    ``description: |``) or as indented continuation lines; the pieces are
+    joined with single spaces, so line breaks inside a value are not kept.
+    Surrounding single or double quotes are stripped from the result.
+
+    Args:
+        skill_path: Path to the SKILL.md file to read (decoded as UTF-8).
+
+    Returns:
+        A ``(name, description)`` tuple. Both are ``None`` when the file does
+        not start with a ``---`` delimited block; either one is ``None`` when
+        its key is absent from the frontmatter.
+    """
+    with open(skill_path, "r", encoding="utf-8") as f:
+        content = f.read()
+
+    # Match YAML frontmatter between --- delimiters
+    match = re.match(r"^---\s*\n(.*?)\n---", content, re.DOTALL)
+    if not match:
+        return None, None
+
+    fields = {}
+    current = None  # top-level key that indented lines currently extend
+    for line in match.group(1).split("\n"):
+        text = line.strip()
+        if not text:
+            continue
+        if line[0] in " \t":
+            # Indented line: a continuation of the previous key's value (or a
+            # nested mapping, which just accumulates under its parent key and
+            # is never mistaken for a top-level name/description).
+            if current is not None:
+                fields[current].append(text)
+            continue
+        key, sep, value = line.partition(":")
+        if not sep:
+            # Not a "key: value" line (e.g. a list item); nothing to extend.
+            current = None
+            continue
+        current = key.strip()
+        value = value.strip()
+        # A bare block-scalar indicator (">", "|", ">-", "|2", ...) carries no
+        # text of its own; the value is on the indented lines that follow.
+        fields[current] = [] if re.fullmatch(r"[|>][0-9+-]*", value) else [value]
+
+    def scalar(key):
+        parts = fields.get(key)
+        if parts is None:
+            return None
+        return " ".join(part for part in parts if part).strip("'\"")
+
+    return scalar("name"), scalar("description")
+
+
+def generate_config(skill_dir, force=False):
+    """Generate a promptfoo eval YAML config for the given skill directory.
+
+    Reads ``<skill_dir>/SKILL.md``, builds three test prompts from the skill's
+    name and description, and writes ``eval/skills/<name>.yaml`` relative to
+    the current working directory (creating ``eval/skills`` if needed).
+
+    Args:
+        skill_dir: Directory that contains the skill's SKILL.md.
+        force: Overwrite an existing config instead of leaving it untouched.
+
+    Exits the process with status 1 when SKILL.md is missing or yields no
+    ``name``, and with status 0 when the config already exists and ``force``
+    is false.
+    """
+    import json  # function-scope: json is not among the file's top-level imports
+
+    def yaml_quote(value):
+        # A JSON string literal is also a valid YAML double-quoted scalar, so
+        # this escapes any quote or backslash that raw interpolation between
+        # literal "..." would let corrupt the generated file.
+        return json.dumps(value, ensure_ascii=False)
+
+    # Resolve SKILL.md path
+    skill_md = os.path.join(skill_dir, "SKILL.md")
+    if not os.path.exists(skill_md):
+        print(f"Error: {skill_md} not found", file=sys.stderr)
+        sys.exit(1)
+
+    name, description = parse_frontmatter(skill_md)
+    if not name:
+        print(f"Error: Could not parse frontmatter from {skill_md}", file=sys.stderr)
+        sys.exit(1)
+
+    # Output path
+    output_path = os.path.join("eval", "skills", f"{name}.yaml")
+    if os.path.exists(output_path) and not force:
+        print(f"Eval config already exists: {output_path}")
+        print("Use --force to overwrite.")
+        sys.exit(0)
+
+    # Relative path from eval/skills/ to the skill. Always written with
+    # forward slashes so the file:// reference is the same on every platform.
+    rel_path = os.path.relpath(skill_md, os.path.join("eval", "skills"))
+    rel_path = rel_path.replace(os.sep, "/")
+
+    display_name = name.replace("-", " ")
+    desc_lower = (description or "").lower()
+
+    # Default test prompts
+    prompts = [
+        f"I need help with {display_name}. Give me a comprehensive approach for a mid-stage B2B SaaS startup.",
+        f"Act as an expert in {display_name} and review my current approach. I'm a solo founder building a developer tool.",
+    ]
+
+    # Domain-specific third prompt: the first keyword group found in the
+    # description wins, so the order of this table is significant.
+    domain_prompts = (
+        (
+            ("marketing", "content", "seo", "copy"),
+            "Create a 90-day plan with specific deliverables, metrics, and milestones.",
+        ),
+        (
+            ("engineer", "architect", "code", "technical"),
+            "Design a technical solution with architecture diagram, tech stack recommendations, and implementation plan.",
+        ),
+        (
+            ("advisor", "executive", "strategic", "leader"),
+            "Help me prepare a board presentation on this topic with key metrics and strategic recommendations.",
+        ),
+    )
+    for keywords, domain_prompt in domain_prompts:
+        if any(word in desc_lower for word in keywords):
+            prompts.append(domain_prompt)
+            break
+    else:
+        prompts.append(
+            f"What are the top 5 mistakes people make with {display_name} and how to avoid them?"
+        )
+
+    eval_description = yaml_quote(f"Evaluate {name} skill")
+    expertise_rubric = yaml_quote(
+        f"Response demonstrates specific expertise in {display_name}, not generic advice"
+    )
+
+    # Build YAML. Quadruple braces survive the f-string as the literal
+    # {{skill_content}} / {{task}} placeholders in the generated prompt.
+    config = textwrap.dedent(f"""\
+    # Eval: {name}
+    # Source: {skill_dir}/SKILL.md
+    # Run: npx promptfoo@latest eval -c eval/skills/{name}.yaml
+    # Auto-generated — customize test prompts and assertions for better coverage
+
+    description: {eval_description}
+
+    prompts:
+      - |
+        You are an expert AI assistant. You have the following skill loaded:
+
+        ---BEGIN SKILL---
+        {{{{skill_content}}}}
+        ---END SKILL---
+
+        Now complete this task: {{{{task}}}}
+
+    providers:
+      - id: anthropic:messages:claude-sonnet-4-6
+        config:
+          max_tokens: 4096
+          temperature: 0.7
+
+    tests:
+    """)
+
+    # One test entry per prompt; the list items sit at column 0 under "tests:".
+    for prompt in prompts:
+        config += textwrap.dedent(f"""\
+        - vars:
+            skill_content: file://{rel_path}
+            task: {yaml_quote(prompt)}
+          assert:
+            - type: llm-rubric
+              value: {expertise_rubric}
+            - type: llm-rubric
+              value: "Response is actionable with concrete steps or deliverables"
+            - type: javascript
+              value: "output.length > 300"
+        """)
+
+    # Write
+    os.makedirs(os.path.dirname(output_path), exist_ok=True)
+    with open(output_path, "w", encoding="utf-8") as f:
+        f.write(config)
+
+    print(f"✅ Generated: {output_path}")
+    print(f"   Skill: {name}")
+    print(f"   Tests: {len(prompts)}")
+    print("   Edit the file to customize prompts and assertions.")
+
+
+if __name__ == "__main__":
+    # Separate flags from positional arguments so that "--force" may appear
+    # before or after the skill directory without being mistaken for it.
+    cli_args = sys.argv[1:]
+    positional = [arg for arg in cli_args if not arg.startswith("--")]
+
+    if not positional:
+        print("Usage: python eval/scripts/generate-eval-config.py <skill-directory>")
+        print("       python eval/scripts/generate-eval-config.py marketing-skill/copywriting --force")
+        sys.exit(1)
+
+    # A trailing slash is dropped so the directory reads cleanly in the
+    # generated "# Source: <skill_dir>/SKILL.md" comment.
+    skill_dir = positional[0].rstrip("/")
+    force = "--force" in cli_args
+
+    generate_config(skill_dir, force)