feat: add promptfoo eval pipeline for skill quality testing

- Add eval/ directory with 10 pilot skill eval configs - Add GitHub Action (skill-eval.yml) for automated eval on PR - Add generate-eval-config.py script for bootstrapping new evals - Add reusable assertion helpers (skill-quality.js) - Add eval README with setup and usage docs Skills covered: copywriting, cto-advisor, seo-audit, content-strategy, aws-solution-architect, agile-product-owner, senior-frontend, senior-security, mcp-server-builder, launch-strategy CI integration: - Triggers on PR to dev when SKILL.md files change - Detects which skills changed and runs only those evals - Posts results as PR comments (non-blocking) - Uploads full results as artifacts No existing files modified.
2026-03-12 05:39:24 +01:00
parent 713e2deb82
commit 75fa9de2bb
15 changed files with 1055 additions and 0 deletions
--- a/.github/workflows/skill-eval.yml
+++ b/.github/workflows/skill-eval.yml
@@ -0,0 +1,235 @@
+---
+# Runs promptfoo evals for skills whose SKILL.md changed in a pull request,
+# or for a named skill when the workflow is dispatched manually.
+name: Skill Quality Eval (promptfoo)
+
+# Quoted so that generic YAML 1.1 parsers do not read the key as boolean true.
+'on':
+  pull_request:
+    types: [opened, synchronize, reopened]
+    # NOTE(review): there is no `branches:` filter, so this fires for PRs
+    # against any base branch — confirm whether it should be limited to `dev`.
+    paths:
+      - '**/SKILL.md'
+  workflow_dispatch:
+    inputs:
+      skill:
+        description: 'Specific skill eval config to run (e.g. copywriting)'
+        required: false
+
+# One active run per PR (or per run id when there is no PR); a newer run in
+# the same group cancels the one in progress.
+concurrency:
+  group: skill-eval-${{ github.event.pull_request.number || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  detect-changes:
+    name: Detect changed skills
+    runs-on: ubuntu-latest
+    outputs:
+      skills: ${{ steps.find-evals.outputs.skills }}
+      has_evals: ${{ steps.find-evals.outputs.has_evals }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          # Full history so the PR base branch is available for the diff below.
+          fetch-depth: 0
+
+      # Outputs:
+      #   skills    - JSON array of skill names that have an eval config
+      #   has_evals - 'true' when that array is non-empty, otherwise 'false'
+      - name: Find eval configs for changed skills
+        id: find-evals
+        env:
+          # Passed through the environment instead of being interpolated into
+          # the script, so their contents are never parsed as shell code.
+          EVENT_NAME: ${{ github.event_name }}
+          INPUT_SKILL: ${{ github.event.inputs.skill }}
+          BASE_REF: ${{ github.base_ref }}
+        run: |
+          set -euo pipefail
+
+          # Skill names become file paths and matrix values that later steps
+          # expand, so only plain directory-style names are accepted.
+          valid_name() { [[ "$1" =~ ^[A-Za-z0-9][A-Za-z0-9._-]*$ ]]; }
+
+          # Read newline-separated skill names on stdin and write both outputs.
+          # Names are de-duplicated while keeping first-seen order.
+          emit() {
+            local json
+            json=$(python3 -c 'import json, sys; print(json.dumps(list(dict.fromkeys(l.strip() for l in sys.stdin if l.strip()))))')
+            echo "skills=$json" >> "$GITHUB_OUTPUT"
+            if [[ "$json" == "[]" ]]; then
+              echo "has_evals=false" >> "$GITHUB_OUTPUT"
+            else
+              echo "has_evals=true" >> "$GITHUB_OUTPUT"
+            fi
+          }
+
+          if [[ "$EVENT_NAME" == "workflow_dispatch" ]]; then
+            if [[ -n "$INPUT_SKILL" ]]; then
+              if valid_name "$INPUT_SKILL" && [[ -f "eval/skills/${INPUT_SKILL}.yaml" ]]; then
+                printf '%s\n' "$INPUT_SKILL" | emit
+              else
+                echo "No eval config found for: ${INPUT_SKILL}"
+                printf '' | emit
+              fi
+            else
+              # There is no PR base to diff against on a manual run, so an
+              # empty input means "run every available eval config".
+              echo "No skill given; running all eval configs."
+              ALL=""
+              for cfg in eval/skills/*.yaml; do
+                name=$(basename "$cfg" .yaml)
+                if [[ -f "$cfg" ]] && valid_name "$name"; then
+                  ALL+="${name}"$'\n'
+                fi
+              done
+              printf '%s' "$ALL" | emit
+            fi
+            exit 0
+          fi
+
+          # Get changed SKILL.md files in this PR
+          DIFF=$(git diff --name-only "origin/${BASE_REF}...HEAD" -- '**/SKILL.md')
+          # grep exits 1 when every line is filtered out; that is a normal
+          # "nothing to do" outcome here, not a failure.
+          CHANGED=$(grep -vF -e '.gemini/' -e '.codex/' -e 'sample' <<< "$DIFF" || true)
+
+          if [[ -z "$CHANGED" ]]; then
+            echo "No SKILL.md files changed."
+            printf '' | emit
+            exit 0
+          fi
+
+          echo "Changed SKILL.md files:"
+          echo "$CHANGED"
+
+          # Map changed skills to eval configs
+          NAMES=""
+          while IFS= read -r skill_path; do
+            # Extract skill name from path (e.g. marketing-skill/copywriting/SKILL.md -> copywriting)
+            skill_name=$(basename "$(dirname "$skill_path")")
+            eval_config="eval/skills/${skill_name}.yaml"
+
+            if ! valid_name "$skill_name"; then
+              echo "  ⏭️  $skill_path → unsupported skill name (skipping)"
+            elif [[ -f "$eval_config" ]]; then
+              NAMES+="${skill_name}"$'\n'
+              echo "  ✅ $skill_name → $eval_config"
+            else
+              echo "  ⏭️  $skill_name → no eval config (skipping)"
+            fi
+          done <<< "$CHANGED"
+
+          printf '%s' "$NAMES" | emit
+
+  # One matrix job per skill reported by detect-changes; skipped entirely when
+  # no changed skill has an eval config.
+  eval:
+    name: "Eval: ${{ matrix.skill }}"
+    needs: detect-changes
+    if: needs.detect-changes.outputs.has_evals == 'true'
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      # Needed by the "Comment on PR" step to create/update the PR comment.
+      pull-requests: write
+    timeout-minutes: 15
+    strategy:
+      # Let every skill's eval finish even if another skill's job fails.
+      fail-fast: false
+      matrix:
+        skill: ${{ fromJson(needs.detect-changes.outputs.skills) }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Set up Node.js
+        uses: actions/setup-node@v4
+        with:
+          node-version: 20
+
+      # Runs the eval for this matrix entry and records promptfoo's exit code
+      # in the `exit_code` step output. Console output is mirrored to
+      # /tmp/eval-output.log and results are written as JSON under /tmp.
+      - name: Run promptfoo eval
+        id: eval
+        continue-on-error: true
+        env:
+          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+          # Read from the environment instead of being spliced into the script.
+          SKILL: ${{ matrix.skill }}
+        run: |
+          npx promptfoo@latest eval \
+            -c "eval/skills/${SKILL}.yaml" \
+            --no-cache \
+            --output "/tmp/${SKILL}-results.json" \
+            --output-format json \
+            2>&1 | tee /tmp/eval-output.log
+          # `$?` here would be tee's status (always 0); PIPESTATUS[0] is the
+          # status of promptfoo itself and must be read right after the pipe.
+          code=${PIPESTATUS[0]}
+
+          echo "exit_code=${code}" >> "$GITHUB_OUTPUT"
+
+      # Summarises the promptfoo JSON results:
+      #   - writes /tmp/eval-summary.md for the PR comment step
+      #   - sets the `summary` and `pass_rate` step outputs
+      - name: Parse results
+        id: parse
+        if: always()
+        env:
+          SKILL: ${{ matrix.skill }}
+          RESULTS_FILE: "/tmp/${{ matrix.skill }}-results.json"
+        run: |
+          if [[ ! -f "$RESULTS_FILE" ]]; then
+            echo "summary=⚠️ No results file generated" >> "$GITHUB_OUTPUT"
+            exit 0
+          fi
+
+          python3 << 'PYEOF'
+          import json, os
+
+          skill = os.environ["SKILL"]
+
+          with open(os.environ["RESULTS_FILE"]) as f:
+              data = json.load(f)
+
+          # promptfoo's JSON output nests the per-test list one level down
+          # (results.results); a flat top-level list is still accepted.
+          results = data.get("results", data.get("evalResults", []))
+          if isinstance(results, dict):
+              results = results.get("results", [])
+
+          total = len(results)
+          passed = sum(1 for r in results if r.get("success", False))
+          details = []
+
+          for r in results:
+              # gradingResult may be missing or null, e.g. when a test errored.
+              grading = r.get("gradingResult") or {}
+              for a in grading.get("componentResults") or []:
+                  status = "✅" if a.get("pass", False) else "❌"
+                  reason = a.get("reason") or (a.get("assertion") or {}).get("value", "")
+                  # Collapse whitespace so each assertion stays on one line.
+                  reason = " ".join(str(reason).split())[:100]
+                  details.append(f"  {status} {reason}")
+
+          rate = (passed / total * 100) if total > 0 else 0
+          icon = "✅" if rate >= 80 else "⚠️" if rate >= 50 else "❌"
+
+          summary = f"{icon} **{skill}**: {passed}/{total} tests passed ({rate:.0f}%)"
+
+          # Write to file for comment step
+          with open("/tmp/eval-summary.md", "w") as f:
+              f.write(f"### {summary}\n\n")
+              if details:
+                  f.write("<details><summary>Assertion details</summary>\n\n")
+                  f.write("\n".join(details))
+                  f.write("\n\n</details>\n")
+
+          # Output for workflow
+          with open(os.environ["GITHUB_OUTPUT"], "a") as f:
+              f.write(f"summary={summary}\n")
+              f.write(f"pass_rate={rate:.0f}\n")
+          PYEOF
+
+      # Posts the eval summary as a PR comment, updating the previous comment
+      # for the same skill instead of adding a new one on every push.
+      - name: Comment on PR
+        if: github.event_name == 'pull_request' && always()
+        uses: actions/github-script@v7
+        env:
+          # Read via process.env instead of being spliced into the script text.
+          SKILL: ${{ matrix.skill }}
+        with:
+          script: |
+            const fs = require('fs');
+            const skill = process.env.SKILL;
+            let body = `### 🧪 Skill Eval: \`${skill}\`\n\n`;
+
+            try {
+              const summary = fs.readFileSync('/tmp/eval-summary.md', 'utf8');
+              body += summary;
+            } catch {
+              body += '⚠️ Eval did not produce results. Check the workflow logs.\n';
+            }
+
+            body += `\n\n---\n*Powered by [promptfoo](https://promptfoo.dev) · [eval config](eval/skills/${skill}.yaml)*`;
+
+            // Find existing comment to update. All pages are fetched: a single
+            // listComments call only returns the first page, so on a busy PR
+            // the earlier comment would be missed and a duplicate posted.
+            const comments = await github.paginate(github.rest.issues.listComments, {
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              issue_number: context.issue.number,
+              per_page: 100,
+            });
+
+            const marker = `Skill Eval: \`${skill}\``;
+            const existing = comments.find(c => c.body && c.body.includes(marker));
+
+            if (existing) {
+              await github.rest.issues.updateComment({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                comment_id: existing.id,
+                body,
+              });
+            } else {
+              await github.rest.issues.createComment({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                issue_number: context.issue.number,
+                body,
+              });
+            }
+
+      # Keeps the raw promptfoo JSON as a build artifact, one per skill.
+      - name: Upload results
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: eval-results-${{ matrix.skill }}
+          path: /tmp/${{ matrix.skill }}-results.json
+          retention-days: 30
+          # The eval may have failed before writing a results file.
+          if-no-files-found: ignore
--- a/eval/README.md
+++ b/eval/README.md
@@ -0,0 +1,142 @@
+# Skill Evaluation Pipeline
+
+Automated quality evaluation for skills using [promptfoo](https://promptfoo.dev).
+
+## Quick Start
+
+```bash
+# Run a single skill eval
+npx promptfoo@latest eval -c eval/skills/copywriting.yaml
+
+# View results in browser
+npx promptfoo@latest view
+
+# Run all pilot skill evals
+for config in eval/skills/*.yaml; do
+  npx promptfoo@latest eval -c "$config" --no-cache
+done
+```
+
+## Requirements
+
+- Node.js 18+
+- `ANTHROPIC_API_KEY` environment variable set
+- No additional dependencies (promptfoo runs via npx)
+
+## How It Works
+
+Each skill has an eval config in `eval/skills/<skill-name>.yaml` that:
+
+1. Loads the skill's `SKILL.md` content as context
+2. Sends realistic task prompts to an LLM with the skill loaded
+3. Evaluates outputs against quality assertions (LLM rubrics + programmatic checks)
+4. Reports pass/fail per assertion
+
+### CI/CD Integration
+
+The GitHub Action (`.github/workflows/skill-eval.yml`) runs automatically when both of these hold:
+- A PR to `dev` changes any `SKILL.md` file
+- The changed skill has an eval config in `eval/skills/`
+
+Results are posted as PR comments.
+
+Currently **non-blocking** — evals are informational, not gates.
+
+## Adding Evals for a New Skill
+
+### Option 1: Auto-generate
+
+```bash
+python eval/scripts/generate-eval-config.py marketing-skill/my-new-skill
+```
+
+This creates a boilerplate config with default prompts and assertions. **Always customize** the generated config with domain-specific test cases.
+
+### Option 2: Manual
+
+Copy an existing config and modify:
+
+```bash
+cp eval/skills/copywriting.yaml eval/skills/my-skill.yaml
+```
+
+### Eval Config Structure
+
+```yaml
+description: "What this eval tests"
+
+prompts:
+  - |
+    You are an expert AI assistant with this skill:
+    ---BEGIN SKILL---
+    {{skill_content}}
+    ---END SKILL---
+    Task: {{task}}
+
+providers:
+  - id: anthropic:messages:claude-sonnet-4-6
+    config:
+      max_tokens: 4096
+
+tests:
+  - vars:
+      skill_content: file://../../path/to/SKILL.md
+      task: "A realistic user request"
+    assert:
+      - type: llm-rubric
+        value: "What good output looks like"
+      - type: javascript
+        value: "output.length > 200"
+```
+
+### Assertion Types
+
+| Type | Use For | Example |
+|------|---------|---------|
+| `llm-rubric` | Qualitative checks (expertise, relevance) | `"Response includes actionable next steps"` |
+| `contains` | Required terms | `"React"` |
+| `javascript` | Programmatic checks | `"output.length > 500"` |
+| `similar` | Semantic similarity | Compare against reference output |
+
+## Reading Results
+
+```bash
+# Terminal output (after eval)
+npx promptfoo@latest eval -c eval/skills/copywriting.yaml
+
+# Web UI (interactive)
+npx promptfoo@latest view
+
+# JSON output (for scripting)
+npx promptfoo@latest eval -c eval/skills/copywriting.yaml --output results.json
+```
+
+## File Structure
+
+```
+eval/
+├── promptfooconfig.yaml      # Master config (reference)
+├── skills/                   # Per-skill eval configs
+│   ├── copywriting.yaml      # ← 10 pilot skills
+│   ├── cto-advisor.yaml
+│   └── ...
+├── assertions/
+│   └── skill-quality.js      # Reusable assertion helpers
+├── scripts/
+│   └── generate-eval-config.py  # Config generator
+└── README.md                 # This file
+```
+
+## Running Locally vs CI
+
+| | Local | CI |
+|---|---|---|
+| **Command** | `npx promptfoo@latest eval -c eval/skills/X.yaml` | Automatic on PR |
+| **Results** | Terminal + web viewer | PR comment + artifact |
+| **Caching** | Enabled (faster iteration) | Disabled (`--no-cache`) |
+| **Cost** | Your API key | Repo secret `ANTHROPIC_API_KEY` |
+
+## Cost Estimate
+
+Each skill eval runs 2-3 test cases × ~4K tokens output = ~12K tokens per skill.  
+At Sonnet pricing (~$3/M input, $15/M output): **~$0.05-0.10 per skill eval**.  
+Full 10-skill pilot batch: **~$0.50-1.00 per run**.
--- a/eval/assertions/skill-quality.js
+++ b/eval/assertions/skill-quality.js
@@ -0,0 +1,54 @@
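+// Reusable assertion helpers for skill quality evaluation.
+// This module exports several functions, so a promptfoo config must name the
+// one it wants, e.g. from a config in eval/skills/:
+//   type: javascript
+//   value: file://../assertions/skill-quality.js:hasDomainDepth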
+
+/**
+ * Check that output demonstrates domain expertise (not generic advice).
+ * Looks for specific terminology, frameworks, or tools mentioned.
+ */
+function hasDomainDepth(output, minTerms = 3) {
+  // Count domain-specific patterns: frameworks, tools, methodologies, metrics
+  const patterns = [
+    /\b(RICE|MoSCoW|OKR|KPI|DORA|SLA|SLO|SLI)\b/gi,
+    /\b(React|Next\.js|Tailwind|TypeScript|PostgreSQL|Redis|Lambda|S3)\b/gi,
+    /\b(SEO|CRO|CTR|LTV|CAC|MRR|ARR|NPS|CSAT)\b/gi,
+    /\b(OWASP|CVE|GDPR|SOC\s?2|ISO\s?27001|PCI)\b/gi,
+    /\b(sprint|backlog|retrospective|standup|velocity)\b/gi,
+  ];
+
+  let termCount = 0;
+  for (const pattern of patterns) {
+    const matches = output.match(pattern);
+    if (matches) termCount += new Set(matches.map(m => m.toLowerCase())).size;
+  }
+
+  return {
+    pass: termCount >= minTerms,
+    score: Math.min(1, termCount / (minTerms * 2)),
+    reason: `Found ${termCount} domain-specific terms (minimum: ${minTerms})`,
+  };
+}
+
+/**
+ * Check that output is actionable (contains concrete next steps, not just analysis).
+ */
+function isActionable(output) {
+  const actionPatterns = [
+    /\b(step \d|first|second|third|next|then|finally)\b/gi,
+    /\b(implement|create|build|configure|set up|install|deploy|run)\b/gi,
+    /\b(action item|todo|checklist|recommendation)\b/gi,
+    /```[\s\S]*?```/g, // code blocks indicate concrete output
+  ];
+
+  let score = 0;
+  for (const pattern of actionPatterns) {
+    if (pattern.test(output)) score += 0.25;
+  }
+
+  return {
+    pass: score >= 0.5,
+    score: Math.min(1, score),
+    reason: `Actionability score: ${score}/1.0`,
+  };
+}
+
+module.exports = { hasDomainDepth, isActionable };
--- a/eval/promptfooconfig.yaml
+++ b/eval/promptfooconfig.yaml
@@ -0,0 +1,32 @@
+# Promptfoo Master Config — claude-skills
+# Reference template for the shared prompt, provider and default assertions.
+# `tests` is empty below, so running this file on its own evaluates nothing.
+# Run a single skill: npx promptfoo@latest eval -c eval/skills/copywriting.yaml
+
+description: "claude-skills quality evaluation — pilot batch"
+
+# {{skill_content}} and {{task}} are filled in from each test's `vars`.
+prompts:
+  - |
+    You are an expert AI assistant. You have the following skill loaded that guides your behavior:
+
+    ---BEGIN SKILL---
+    {{skill_content}}
+    ---END SKILL---
+
+    Now complete this task:
+    {{task}}
+
+providers:
+  - id: anthropic:messages:claude-sonnet-4-6
+    config:
+      max_tokens: 4096
+      temperature: 0.7
+
+# Baseline assertions: a minimum output length plus a generic expertise rubric.
+defaultTest:
+  assert:
+    - type: javascript
+      value: "output.length > 200"
+    - type: llm-rubric
+      value: "The response demonstrates domain expertise relevant to the task, not generic advice"
+
+# No test suites are imported here; per-skill tests live in eval/skills/*.yaml
+tests: []
--- a/eval/scripts/generate-eval-config.py
+++ b/eval/scripts/generate-eval-config.py
@@ -0,0 +1,153 @@
+#!/usr/bin/env python3
+"""Generate a promptfoo eval config for any skill.
+
+Usage:
+    python eval/scripts/generate-eval-config.py marketing-skill/copywriting
+    python eval/scripts/generate-eval-config.py c-level-advisor/cto-advisor --force
+"""
+
+import os
+import re
+import sys
+import textwrap
+
+
+def parse_frontmatter(skill_path):
+    """Extract name and description from SKILL.md YAML frontmatter."""
+    with open(skill_path, "r", encoding="utf-8") as f:
+        content = f.read()
+
+    # Match YAML frontmatter between --- delimiters
+    match = re.match(r"^---\s*\n(.*?)\n---", content, re.DOTALL)
+    if not match:
+        return None, None
+
+    frontmatter = match.group(1)
+    name = None
+    description = None
+
+    for line in frontmatter.split("\n"):
+        if line.startswith("name:"):
+            name = line.split(":", 1)[1].strip().strip("'\"")
+        elif line.startswith("description:"):
+            # Handle multi-line descriptions
+            desc = line.split(":", 1)[1].strip().strip("'\"")
+            description = desc
+
+    return name, description
+
+
+def generate_config(skill_dir, force=False):
+    """Generate a promptfoo eval YAML config for the given skill directory."""
+    # Resolve SKILL.md path
+    skill_md = os.path.join(skill_dir, "SKILL.md")
+    if not os.path.exists(skill_md):
+        print(f"Error: {skill_md} not found", file=sys.stderr)
+        sys.exit(1)
+
+    name, description = parse_frontmatter(skill_md)
+    if not name:
+        print(f"Error: Could not parse frontmatter from {skill_md}", file=sys.stderr)
+        sys.exit(1)
+
+    # Output path
+    output_path = os.path.join("eval", "skills", f"{name}.yaml")
+    if os.path.exists(output_path) and not force:
+        print(f"Eval config already exists: {output_path}")
+        print("Use --force to overwrite.")
+        sys.exit(0)
+
+    # Calculate relative path from eval/skills/ to the skill
+    rel_path = os.path.relpath(skill_md, os.path.join("eval", "skills"))
+
+    # Generate test prompts based on description
+    desc_lower = (description or "").lower()
+
+    # Default test prompts
+    prompts = [
+        f"I need help with {name.replace('-', ' ')}. Give me a comprehensive approach for a mid-stage B2B SaaS startup.",
+        f"Act as an expert in {name.replace('-', ' ')} and review my current approach. I'm a solo founder building a developer tool.",
+    ]
+
+    # Add domain-specific third prompt
+    if any(w in desc_lower for w in ["marketing", "content", "seo", "copy"]):
+        prompts.append(
+            "Create a 90-day plan with specific deliverables, metrics, and milestones."
+        )
+    elif any(w in desc_lower for w in ["engineer", "architect", "code", "technical"]):
+        prompts.append(
+            "Design a technical solution with architecture diagram, tech stack recommendations, and implementation plan."
+        )
+    elif any(w in desc_lower for w in ["advisor", "executive", "strategic", "leader"]):
+        prompts.append(
+            "Help me prepare a board presentation on this topic with key metrics and strategic recommendations."
+        )
+    else:
+        prompts.append(
+            f"What are the top 5 mistakes people make with {name.replace('-', ' ')} and how to avoid them?"
+        )
+
+    # Build YAML
+    config = textwrap.dedent(f"""\
+    # Eval: {name}
+    # Source: {skill_dir}/SKILL.md
+    # Run: npx promptfoo@latest eval -c eval/skills/{name}.yaml
+    # Auto-generated — customize test prompts and assertions for better coverage
+
+    description: "Evaluate {name} skill"
+
+    prompts:
+      - |
+        You are an expert AI assistant. You have the following skill loaded:
+
+        ---BEGIN SKILL---
+        {{{{skill_content}}}}
+        ---END SKILL---
+
+        Now complete this task: {{{{task}}}}
+
+    providers:
+      - id: anthropic:messages:claude-sonnet-4-6
+        config:
+          max_tokens: 4096
+          temperature: 0.7
+
+    tests:
+    """)
+
+    for i, prompt in enumerate(prompts):
+        test_block = textwrap.dedent(f"""\
+      - vars:
+          skill_content: file://{rel_path}
+          task: "{prompt}"
+        assert:
+          - type: llm-rubric
+            value: "Response demonstrates specific expertise in {name.replace('-', ' ')}, not generic advice"
+          - type: llm-rubric
+            value: "Response is actionable with concrete steps or deliverables"
+          - type: javascript
+            value: "output.length > 300"
+    """)
+        config += test_block
+
+    # Write
+    os.makedirs(os.path.dirname(output_path), exist_ok=True)
+    with open(output_path, "w", encoding="utf-8") as f:
+        f.write(config)
+
+    print(f"✅ Generated: {output_path}")
+    print(f"   Skill: {name}")
+    print(f"   Tests: {len(prompts)}")
+    print(f"   Edit the file to customize prompts and assertions.")
+
+
+if __name__ == "__main__":
+    if len(sys.argv) < 2:
+        print("Usage: python eval/scripts/generate-eval-config.py <skill-directory>")
+        print("       python eval/scripts/generate-eval-config.py marketing-skill/copywriting --force")
+        sys.exit(1)
+
+    skill_dir = sys.argv[1].rstrip("/")
+    force = "--force" in sys.argv
+
+    generate_config(skill_dir, force)
--- a/eval/skills/agile-product-owner.yaml
+++ b/eval/skills/agile-product-owner.yaml
@@ -0,0 +1,41 @@
+# Eval: agile-product-owner
+# Source: product-team/agile-product-owner/SKILL.md
+
+description: "Evaluate agile product owner skill"
+
+# {{skill_content}} and {{task}} are filled in from each test's `vars`.
+prompts:
+  - |
+    You are an expert AI assistant. You have the following skill loaded:
+
+    ---BEGIN SKILL---
+    {{skill_content}}
+    ---END SKILL---
+
+    Now complete this task: {{task}}
+
+providers:
+  - id: anthropic:messages:claude-sonnet-4-6
+    config:
+      max_tokens: 4096
+      temperature: 0.7
+
+tests:
+  # Test 1: user stories with acceptance criteria for a team-invite feature.
+  - vars:
+      skill_content: file://../../product-team/agile-product-owner/SKILL.md
+      task: "Write user stories with acceptance criteria for an 'invite team members' feature in a project management tool. Users should be able to invite by email, set roles (admin/member/viewer), and revoke access."
+    assert:
+      - type: llm-rubric
+        value: "Output uses proper user story format (As a..., I want..., So that...) with testable acceptance criteria"
+      - type: llm-rubric
+        value: "Stories cover the three main flows: invite, role assignment, and access revocation"
+      - type: llm-rubric
+        value: "Acceptance criteria are specific and testable, not vague requirements"
+
+  # Test 2: backlog prioritization under a fixed sprint capacity.
+  - vars:
+      skill_content: file://../../product-team/agile-product-owner/SKILL.md
+      task: "We have 30 items in our backlog. Help me prioritize for a 2-week sprint with 2 developers (40 story points capacity). The items range from bug fixes to new features to tech debt."
+    assert:
+      - type: llm-rubric
+        value: "Response uses a prioritization framework (RICE, MoSCoW, or similar) with clear reasoning"
+      - type: llm-rubric
+        value: "Response respects the 40 story point capacity constraint"
--- a/eval/skills/aws-solution-architect.yaml
+++ b/eval/skills/aws-solution-architect.yaml
@@ -0,0 +1,41 @@
+# Eval: aws-solution-architect
+# Source: engineering-team/aws-solution-architect/SKILL.md
+
+description: "Evaluate AWS solution architect skill"
+
+# {{skill_content}} and {{task}} are filled in from each test's `vars`.
+prompts:
+  - |
+    You are an expert AI assistant. You have the following skill loaded:
+
+    ---BEGIN SKILL---
+    {{skill_content}}
+    ---END SKILL---
+
+    Now complete this task: {{task}}
+
+providers:
+  - id: anthropic:messages:claude-sonnet-4-6
+    config:
+      max_tokens: 4096
+      temperature: 0.7
+
+tests:
+  # Test 1: serverless real-time notifications with throughput and budget limits.
+  - vars:
+      skill_content: file://../../engineering-team/aws-solution-architect/SKILL.md
+      task: "Design a serverless architecture for a real-time notification system that needs to handle 10K messages per second with sub-200ms delivery. Users connect via WebSocket. Budget is $500/month."
+    assert:
+      - type: llm-rubric
+        value: "Response uses specific AWS services (API Gateway WebSocket, Lambda, DynamoDB, etc.) not generic cloud patterns"
+      - type: llm-rubric
+        value: "Response addresses the throughput requirement (10K msg/s) with concrete scaling strategy"
+      - type: llm-rubric
+        value: "Response includes cost estimation relative to the $500/month budget constraint"
+
+  # Test 2: Heroku-to-AWS migration for a small team without DevOps experience.
+  - vars:
+      skill_content: file://../../engineering-team/aws-solution-architect/SKILL.md
+      task: "We're migrating a Django monolith from Heroku to AWS. We have PostgreSQL, Redis, Celery workers, and S3 for file storage. Team of 3 devs, no DevOps experience. What's the simplest production-ready setup?"
+    assert:
+      - type: llm-rubric
+        value: "Response recommends managed services appropriate for a small team without DevOps (e.g., ECS Fargate, RDS, ElastiCache)"
+      - type: llm-rubric
+        value: "Response includes a migration plan with phases, not just target architecture"
--- a/eval/skills/content-strategy.yaml
+++ b/eval/skills/content-strategy.yaml
@@ -0,0 +1,41 @@
+# Eval: content-strategy
+# Source: marketing-skill/content-strategy/SKILL.md
+
+description: "Evaluate content strategy skill"
+
+# {{skill_content}} and {{task}} are filled in from each test's `vars`.
+prompts:
+  - |
+    You are an expert AI assistant. You have the following skill loaded:
+
+    ---BEGIN SKILL---
+    {{skill_content}}
+    ---END SKILL---
+
+    Now complete this task: {{task}}
+
+providers:
+  - id: anthropic:messages:claude-sonnet-4-6
+    config:
+      max_tokens: 4096
+      temperature: 0.7
+
+tests:
+  # Test 1: content strategy from scratch for a newly launched developer tool.
+  - vars:
+      skill_content: file://../../marketing-skill/content-strategy/SKILL.md
+      task: "Build a 3-month content strategy for a developer tools startup that just launched. We have zero blog posts and a small Twitter following of 500. Our product is an open-source database migration tool."
+    assert:
+      - type: llm-rubric
+        value: "Response includes a phased plan with specific content types, topics, and publishing cadence"
+      - type: llm-rubric
+        value: "Strategy addresses developer audience specifically with appropriate channels (dev blogs, GitHub, HN)"
+      - type: llm-rubric
+        value: "Response includes measurable goals or KPIs for the content program"
+
+  # Test 2: growing organic traffic that has plateaued on an existing blog.
+  - vars:
+      skill_content: file://../../marketing-skill/content-strategy/SKILL.md
+      task: "We have 50 blog posts but traffic has plateaued at 10K monthly visits. What should we do to 3x our organic traffic in 6 months?"
+    assert:
+      - type: llm-rubric
+        value: "Response diagnoses potential issues with existing content before prescribing new content"
+      - type: llm-rubric
+        value: "Response includes specific tactics like content refresh, internal linking, or topic clusters"
--- a/eval/skills/copywriting.yaml
+++ b/eval/skills/copywriting.yaml
@@ -0,0 +1,57 @@
+# Eval: copywriting
+# Source: marketing-skill/copywriting/SKILL.md
+# Run: npx promptfoo@latest eval -c eval/skills/copywriting.yaml
+
+description: "Evaluate copywriting skill — marketing copy generation"
+
+# {{skill_content}} and {{task}} are filled in from each test's `vars`.
+prompts:
+  - |
+    You are an expert AI assistant. You have the following skill loaded:
+
+    ---BEGIN SKILL---
+    {{skill_content}}
+    ---END SKILL---
+
+    Now complete this task: {{task}}
+
+providers:
+  - id: anthropic:messages:claude-sonnet-4-6
+    config:
+      max_tokens: 4096
+      temperature: 0.7
+
+tests:
+  # Test 1: full homepage copy for a named product (InvoiceFlow).
+  - vars:
+      skill_content: file://../../marketing-skill/copywriting/SKILL.md
+      task: "Write homepage copy for a B2B SaaS that automates invoicing for freelancers called InvoiceFlow"
+    assert:
+      - type: llm-rubric
+        value: "Output includes a clear headline, subheadline, at least 3 value propositions, and a call-to-action"
+      - type: llm-rubric
+        value: "Copy is specific to InvoiceFlow and freelancer invoicing, not generic B2B marketing"
+      - type: llm-rubric
+        value: "Copy follows direct-response copywriting principles with benefit-driven language"
+      - type: javascript
+        value: "output.length > 500"
+
+  # Test 2: rewrite of a generic headline/subheadline for remote teams.
+  - vars:
+      skill_content: file://../../marketing-skill/copywriting/SKILL.md
+      task: "Rewrite this landing page headline and subheadline: 'Welcome to our platform. We help businesses grow with our comprehensive solution for managing operations.' Make it compelling for a project management tool targeting remote teams."
+    assert:
+      - type: llm-rubric
+        value: "The rewritten headline is specific, benefit-driven, and not generic"
+      - type: llm-rubric
+        value: "The output specifically addresses remote teams, not generic businesses"
+      - type: javascript
+        value: "output.length > 100"
+
+  # Test 3: three-tier pricing page for a named product (PingGuard).
+  - vars:
+      skill_content: file://../../marketing-skill/copywriting/SKILL.md
+      task: "Write a pricing page for a developer tool with 3 tiers: Free, Pro ($29/mo), and Enterprise (custom). The tool is an API monitoring service called PingGuard."
+    assert:
+      - type: llm-rubric
+        value: "Output includes copy for all three pricing tiers with differentiated value propositions"
+      - type: llm-rubric
+        value: "Each tier has clear feature descriptions and the copy encourages upgrade paths"
+      - type: javascript
+        value: "output.length > 400"
--- a/eval/skills/cto-advisor.yaml
+++ b/eval/skills/cto-advisor.yaml
@@ -0,0 +1,53 @@
+# Eval: cto-advisor
+# Source: c-level-advisor/cto-advisor/SKILL.md
+# Run: npx promptfoo@latest eval -c eval/skills/cto-advisor.yaml
+
+description: "Evaluate CTO advisor skill — technical leadership guidance"
+
+# {{skill_content}} and {{task}} are filled in from each test's `vars`.
+prompts:
+  - |
+    You are an expert AI assistant. You have the following skill loaded:
+
+    ---BEGIN SKILL---
+    {{skill_content}}
+    ---END SKILL---
+
+    Now complete this task: {{task}}
+
+providers:
+  - id: anthropic:messages:claude-sonnet-4-6
+    config:
+      max_tokens: 4096
+      temperature: 0.7
+
+tests:
+  # Test 1: monolith vs. microservices decision for a small engineering team.
+  - vars:
+      skill_content: file://../../c-level-advisor/cto-advisor/SKILL.md
+      task: "We're a 15-person startup with a monolithic Django app serving 50K users. Response times are growing. Should we move to microservices or optimize the monolith? We have 4 backend engineers."
+    assert:
+      - type: llm-rubric
+        value: "Response provides a clear recommendation with reasoning, not just listing pros and cons"
+      - type: llm-rubric
+        value: "Response considers team size (4 engineers) as a factor in the architecture decision"
+      - type: llm-rubric
+        value: "Response includes concrete next steps or an action plan"
+
+  # Test 2: making the case for tech debt work to the board.
+  - vars:
+      skill_content: file://../../c-level-advisor/cto-advisor/SKILL.md
+      task: "Our tech debt is slowing us down. Engineering velocity dropped 30% over 6 months. The CEO wants new features but we can barely maintain what we have. How do I make the case for a tech debt sprint to the board?"
+    assert:
+      - type: llm-rubric
+        value: "Response frames tech debt in business terms the board would understand, not just technical jargon"
+      - type: llm-rubric
+        value: "Response includes a strategy for balancing tech debt work with feature delivery"
+      - type: llm-rubric
+        value: "Response provides specific metrics or frameworks to measure tech debt impact"
+
+  # Test 3: hiring a first VP of Engineering as a technical founder.
+  - vars:
+      skill_content: file://../../c-level-advisor/cto-advisor/SKILL.md
+      task: "I'm hiring my first VP of Engineering. I'm a technical founder who has been CTO and lead dev. What should I look for, and how do I avoid hiring someone who will clash with me?"
+    assert:
+      - type: llm-rubric
+        value: "Response addresses the founder-VP dynamic specifically, not generic hiring advice"
+      - type: llm-rubric
+        value: "Response includes qualities to look for and red flags to watch for"
--- a/eval/skills/launch-strategy.yaml
+++ b/eval/skills/launch-strategy.yaml
@@ -0,0 +1,41 @@
+# Eval: launch-strategy (run: npx promptfoo@latest eval -c eval/skills/launch-strategy.yaml)
+# Skill under test: marketing-skill/launch-strategy/SKILL.md
+
+description: 'Evaluate launch strategy skill'
+
+prompts:  # one template; the skill_content and task placeholders come from each test's vars
+  - |
+    You are an expert AI assistant. You have the following skill loaded:
+
+    ---BEGIN SKILL---
+    {{skill_content}}
+    ---END SKILL---
+
+    Now complete this task: {{task}}
+
+providers:  # model that answers the rendered prompt
+  - id: 'anthropic:messages:claude-sonnet-4-6'
+    config:
+      max_tokens: 4096
+      temperature: 0.7
+
+tests:  # two launch scenarios, each checked with llm-rubric assertions
+  - vars:  # case 1: zero-budget Product Hunt launch
+      skill_content: 'file://../../marketing-skill/launch-strategy/SKILL.md'
+      task: 'Plan a Product Hunt launch for an AI writing assistant. We have 2,000 email subscribers, 500 Twitter followers, and the product has been in beta for 3 months with 200 active users. Budget: $0 (bootstrapped).'
+    assert:
+      - type: llm-rubric
+        value: 'Response includes a phased timeline (pre-launch, launch day, post-launch) with specific actions'
+      - type: llm-rubric
+        value: 'Strategy leverages existing assets (2K email list, 200 beta users, Twitter) concretely'
+      - type: llm-rubric
+        value: 'Response includes Product Hunt-specific tactics (hunter selection, timing, asset preparation)'
+
+  - vars:  # case 2: feature announcement to an existing customer base
+      skill_content: 'file://../../marketing-skill/launch-strategy/SKILL.md'
+      task: "We're launching a major feature update (AI-powered analytics) to our existing SaaS product with 5,000 paying customers. How should we announce it to maximize adoption and upsell opportunities?"
+    assert:
+      - type: llm-rubric
+        value: 'Response distinguishes between existing customer communication and new user acquisition'
+      - type: llm-rubric
+        value: 'Response includes specific channels and messaging for the announcement'
--- a/eval/skills/mcp-server-builder.yaml
+++ b/eval/skills/mcp-server-builder.yaml
@@ -0,0 +1,41 @@
+# Eval: mcp-server-builder (run: npx promptfoo@latest eval -c eval/skills/mcp-server-builder.yaml)
+# Skill under test: engineering/mcp-server-builder/SKILL.md
+
+description: 'Evaluate MCP server builder skill'
+
+prompts:  # one template; the skill_content and task placeholders come from each test's vars
+  - |
+    You are an expert AI assistant. You have the following skill loaded:
+
+    ---BEGIN SKILL---
+    {{skill_content}}
+    ---END SKILL---
+
+    Now complete this task: {{task}}
+
+providers:  # model that answers the rendered prompt
+  - id: 'anthropic:messages:claude-sonnet-4-6'
+    config:
+      max_tokens: 4096
+      temperature: 0.7
+
+tests:  # two MCP scenarios, each checked with llm-rubric assertions
+  - vars:  # case 1: implement a single GitHub search tool in Python
+      skill_content: 'file://../../engineering/mcp-server-builder/SKILL.md'
+      task: "Build an MCP server in Python that exposes a 'search_github_repos' tool. The tool should take a query string and return top 5 repos with name, stars, and description. Use the GitHub REST API (no auth required for public search)."
+    assert:
+      - type: llm-rubric
+        value: 'Output includes working Python code that follows MCP server patterns (tool registration, handler)'
+      - type: llm-rubric
+        value: 'Code includes proper error handling for API failures'
+      - type: llm-rubric
+        value: 'Tool definition includes proper input schema with type annotations'
+
+  - vars:  # case 2: design a five-tool CRM server
+      skill_content: 'file://../../engineering/mcp-server-builder/SKILL.md'
+      task: 'Design an MCP server architecture for a CRM system that exposes: list_contacts, get_contact, create_contact, search_contacts, and list_deals tools. Show the tool definitions and server structure.'
+    assert:
+      - type: llm-rubric
+        value: 'Response includes tool definitions with proper input/output schemas for all 5 tools'
+      - type: llm-rubric
+        value: 'Architecture follows MCP best practices (proper transport, error handling, resource definitions)'
--- a/eval/skills/senior-frontend.yaml
+++ b/eval/skills/senior-frontend.yaml
@@ -0,0 +1,41 @@
+# Eval: senior-frontend (used in place of frontend-design, which does not exist as a standalone skill)
+# Skill under test: engineering-team/senior-frontend/SKILL.md
+
+description: 'Evaluate senior frontend skill'
+
+prompts:  # one template; the skill_content and task placeholders come from each test's vars
+  - |
+    You are an expert AI assistant. You have the following skill loaded:
+
+    ---BEGIN SKILL---
+    {{skill_content}}
+    ---END SKILL---
+
+    Now complete this task: {{task}}
+
+providers:  # model that answers the rendered prompt
+  - id: 'anthropic:messages:claude-sonnet-4-6'
+    config:
+      max_tokens: 4096
+      temperature: 0.7
+
+tests:  # two frontend scenarios, each checked with llm-rubric assertions
+  - vars:  # case 1: build a responsive React/TypeScript dashboard
+      skill_content: 'file://../../engineering-team/senior-frontend/SKILL.md'
+      task: 'Build a responsive dashboard layout in React with TypeScript. It should have a sidebar navigation, a top bar with user menu, and a main content area with a grid of metric cards. Use Tailwind CSS.'
+    assert:
+      - type: llm-rubric
+        value: 'Output includes actual React/TypeScript code, not just descriptions'
+      - type: llm-rubric
+        value: 'Code uses Tailwind CSS classes for responsive design (sm:, md:, lg: breakpoints)'
+      - type: llm-rubric
+        value: 'Component structure follows React best practices (proper component decomposition)'
+
+  - vars:  # case 2: diagnose poor LCP/CLS/INP in a Next.js app
+      skill_content: 'file://../../engineering-team/senior-frontend/SKILL.md'
+      task: 'Our Next.js app has a Core Web Vitals score of 45. LCP is 4.2s, CLS is 0.25, and INP is 350ms. Diagnose the likely causes and provide a fix plan.'
+    assert:
+      - type: llm-rubric
+        value: 'Response addresses each specific metric (LCP, CLS, INP) with targeted fixes'
+      - type: llm-rubric
+        value: 'Response includes Next.js-specific optimizations (Image component, dynamic imports, etc.)'
--- a/eval/skills/senior-security.yaml
+++ b/eval/skills/senior-security.yaml
@@ -0,0 +1,41 @@
+# Eval: senior-security (run: npx promptfoo@latest eval -c eval/skills/senior-security.yaml)
+# Skill under test: engineering-team/senior-security/SKILL.md
+
+description: 'Evaluate senior security engineer skill'
+
+prompts:  # one template; the skill_content and task placeholders come from each test's vars
+  - |
+    You are an expert AI assistant. You have the following skill loaded:
+
+    ---BEGIN SKILL---
+    {{skill_content}}
+    ---END SKILL---
+
+    Now complete this task: {{task}}
+
+providers:  # model that answers the rendered prompt
+  - id: 'anthropic:messages:claude-sonnet-4-6'
+    config:
+      max_tokens: 4096
+      temperature: 0.7
+
+tests:  # two security scenarios, each checked with llm-rubric assertions
+  - vars:  # case 1: review an Express handler that builds SQL from request input
+      skill_content: 'file://../../engineering-team/senior-security/SKILL.md'
+      task: "Perform a security review of this Express.js API endpoint pattern: app.post('/api/users', (req, res) => { const query = `SELECT * FROM users WHERE email = '${req.body.email}'`; db.query(query).then(user => res.json(user)); })"
+    assert:
+      - type: llm-rubric
+        value: 'Response identifies SQL injection vulnerability as the primary critical issue'
+      - type: llm-rubric
+        value: 'Response provides a fixed code example using parameterized queries'
+      - type: llm-rubric
+        value: 'Response identifies additional issues beyond SQL injection (input validation, error handling, etc.)'
+
+  - vars:  # case 2: production hardening checklist for a PII/payments API
+      skill_content: 'file://../../engineering-team/senior-security/SKILL.md'
+      task: 'Create a security hardening checklist for a new Node.js API going to production. We handle user PII and payment data. Stack: Express, PostgreSQL, Redis, deployed on AWS ECS.'
+    assert:
+      - type: llm-rubric
+        value: 'Checklist covers OWASP Top 10 categories relevant to the stack'
+      - type: llm-rubric
+        value: 'Response includes PII and payment-specific requirements (encryption at rest, PCI considerations)'
--- a/eval/skills/seo-audit.yaml
+++ b/eval/skills/seo-audit.yaml
@@ -0,0 +1,42 @@
+# Eval config for the seo-audit skill (promptfoo).
+# Skill under test: marketing-skill/seo-audit/SKILL.md
+# Run: npx promptfoo@latest eval -c eval/skills/seo-audit.yaml
+
+description: 'Evaluate SEO audit skill'
+
+prompts:  # one template; the skill_content and task placeholders come from each test's vars
+  - |
+    You are an expert AI assistant. You have the following skill loaded:
+
+    ---BEGIN SKILL---
+    {{skill_content}}
+    ---END SKILL---
+
+    Now complete this task: {{task}}
+
+providers:  # model that answers the rendered prompt
+  - id: 'anthropic:messages:claude-sonnet-4-6'
+    config:
+      max_tokens: 4096
+      temperature: 0.7
+
+tests:  # two SEO scenarios, each checked with llm-rubric assertions
+  - vars:  # case 1: audit a thin, slow landing page with no meta description
+      skill_content: 'file://../../marketing-skill/seo-audit/SKILL.md'
+      task: "Perform an SEO audit checklist for a new SaaS landing page targeting the keyword 'AI code review tool'. The page has a 3-second load time, no meta description, and 200 words of content."
+    assert:
+      - type: llm-rubric
+        value: 'Response identifies specific SEO issues (load time, missing meta description, thin content) rather than generic advice'
+      - type: llm-rubric
+        value: 'Response provides actionable fixes with priority ordering'
+      - type: llm-rubric
+        value: 'Response references on-page SEO factors like title tags, headings, and internal linking'
+
+  - vars:  # case 2: keyword strategy against entrenched competitors
+      skill_content: 'file://../../marketing-skill/seo-audit/SKILL.md'
+      task: "Create a keyword strategy for a B2B SaaS in the project management space. We're a small startup competing against Asana, Monday.com, and Jira."
+    assert:
+      - type: llm-rubric
+        value: 'Response suggests long-tail keywords rather than only head terms where competition is impossible'
+      - type: llm-rubric
+        value: 'Response organizes keywords by intent (informational, commercial, transactional)'