From de0d7482886c89c873bdb4ac08c2825b8a745a51 Mon Sep 17 00:00:00 2001 From: Leo Date: Thu, 12 Mar 2026 09:43:03 +0100 Subject: [PATCH] revert: remove promptfoo eval pipeline Switching to native skill-creator eval workflow instead. No external API key dependency needed. Removes: eval/ directory, skill-eval.yml workflow. No other files affected. --- .github/workflows/skill-eval.yml | 235 ------------------------ eval/README.md | 142 -------------- eval/assertions/skill-quality.js | 54 ------ eval/promptfooconfig.yaml | 32 ---- eval/scripts/generate-eval-config.py | 153 --------------- eval/skills/agile-product-owner.yaml | 41 ----- eval/skills/aws-solution-architect.yaml | 41 ----- eval/skills/content-strategy.yaml | 41 ----- eval/skills/copywriting.yaml | 57 ------ eval/skills/cto-advisor.yaml | 53 ------ eval/skills/launch-strategy.yaml | 41 ----- eval/skills/mcp-server-builder.yaml | 41 ----- eval/skills/senior-frontend.yaml | 41 ----- eval/skills/senior-security.yaml | 41 ----- eval/skills/seo-audit.yaml | 42 ----- 15 files changed, 1055 deletions(-) delete mode 100644 .github/workflows/skill-eval.yml delete mode 100644 eval/README.md delete mode 100644 eval/assertions/skill-quality.js delete mode 100644 eval/promptfooconfig.yaml delete mode 100755 eval/scripts/generate-eval-config.py delete mode 100644 eval/skills/agile-product-owner.yaml delete mode 100644 eval/skills/aws-solution-architect.yaml delete mode 100644 eval/skills/content-strategy.yaml delete mode 100644 eval/skills/copywriting.yaml delete mode 100644 eval/skills/cto-advisor.yaml delete mode 100644 eval/skills/launch-strategy.yaml delete mode 100644 eval/skills/mcp-server-builder.yaml delete mode 100644 eval/skills/senior-frontend.yaml delete mode 100644 eval/skills/senior-security.yaml delete mode 100644 eval/skills/seo-audit.yaml diff --git a/.github/workflows/skill-eval.yml b/.github/workflows/skill-eval.yml deleted file mode 100644 index 5510232..0000000 --- a/.github/workflows/skill-eval.yml +++ /dev/null @@ -1,235 +0,0 @@ ---- -name: Skill Quality Eval (promptfoo) - -'on': - pull_request: - types: [opened, synchronize, reopened] - paths: - - '**/SKILL.md' - workflow_dispatch: - inputs: - skill: - description: 'Specific skill eval config to run (e.g. copywriting)' - required: false - -concurrency: - group: skill-eval-${{ github.event.pull_request.number || github.run_id }} - cancel-in-progress: true - -jobs: - detect-changes: - name: Detect changed skills - runs-on: ubuntu-latest - outputs: - skills: ${{ steps.find-evals.outputs.skills }} - has_evals: ${{ steps.find-evals.outputs.has_evals }} - steps: - - name: Checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Find eval configs for changed skills - id: find-evals - run: | - if [[ "${{ github.event_name }}" == "workflow_dispatch" && -n "${{ github.event.inputs.skill }}" ]]; then - SKILL="${{ github.event.inputs.skill }}" - if [[ -f "eval/skills/${SKILL}.yaml" ]]; then - echo "skills=[\"${SKILL}\"]" >> "$GITHUB_OUTPUT" - echo "has_evals=true" >> "$GITHUB_OUTPUT" - else - echo "No eval config found for: ${SKILL}" - echo "has_evals=false" >> "$GITHUB_OUTPUT" - fi - exit 0 - fi - - # Get changed SKILL.md files in this PR - CHANGED=$(git diff --name-only origin/${{ github.base_ref }}...HEAD -- '**/SKILL.md' | grep -v '.gemini/' | grep -v '.codex/' | grep -v 'sample') - - if [[ -z "$CHANGED" ]]; then - echo "No SKILL.md files changed." - echo "has_evals=false" >> "$GITHUB_OUTPUT" - exit 0 - fi - - echo "Changed SKILL.md files:" - echo "$CHANGED" - - # Map changed skills to eval configs - EVALS="[]" - for skill_path in $CHANGED; do - # Extract skill name from path (e.g. marketing-skill/copywriting/SKILL.md -> copywriting) - skill_name=$(basename $(dirname "$skill_path")) - eval_config="eval/skills/${skill_name}.yaml" - - if [[ -f "$eval_config" ]]; then - EVALS=$(echo "$EVALS" | python3 -c " - import json, sys - arr = json.load(sys.stdin) - name = '$skill_name' - if name not in arr: - arr.append(name) - print(json.dumps(arr)) - ") - echo " ✅ $skill_name → $eval_config" - else - echo " ⏭️ $skill_name → no eval config (skipping)" - fi - done - - echo "skills=$EVALS" >> "$GITHUB_OUTPUT" - if [[ "$EVALS" == "[]" ]]; then - echo "has_evals=false" >> "$GITHUB_OUTPUT" - else - echo "has_evals=true" >> "$GITHUB_OUTPUT" - fi - - eval: - name: "Eval: ${{ matrix.skill }}" - needs: detect-changes - if: needs.detect-changes.outputs.has_evals == 'true' - runs-on: ubuntu-latest - permissions: - contents: read - pull-requests: write - timeout-minutes: 15 - strategy: - fail-fast: false - matrix: - skill: ${{ fromJson(needs.detect-changes.outputs.skills) }} - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Set up Node.js - uses: actions/setup-node@v4 - with: - node-version: 20 - - - name: Run promptfoo eval - id: eval - continue-on-error: true - env: - ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} - run: | - npx promptfoo@latest eval \ - -c "eval/skills/${{ matrix.skill }}.yaml" \ - --no-cache \ - --output "/tmp/${{ matrix.skill }}-results.json" \ - --output-format json \ - 2>&1 | tee /tmp/eval-output.log - - echo "exit_code=$?" >> "$GITHUB_OUTPUT" - - - name: Parse results - id: parse - if: always() - run: | - RESULTS_FILE="/tmp/${{ matrix.skill }}-results.json" - if [[ ! -f "$RESULTS_FILE" ]]; then - echo "summary=⚠️ No results file generated" >> "$GITHUB_OUTPUT" - exit 0 - fi - - python3 << 'PYEOF' - import json, os - - with open(os.environ.get("RESULTS_FILE", f"/tmp/${{ matrix.skill }}-results.json")) as f: - data = json.load(f) - - results = data.get("results", data.get("evalResults", [])) - total = len(results) - passed = 0 - failed = 0 - details = [] - - for r in results: - test_pass = r.get("success", False) - if test_pass: - passed += 1 - else: - failed += 1 - - prompt_vars = r.get("vars", {}) - task = prompt_vars.get("task", "unknown")[:80] - - assertions = r.get("gradingResult", {}).get("componentResults", []) - for a in assertions: - status = "✅" if a.get("pass", False) else "❌" - reason = a.get("reason", a.get("assertion", {}).get("value", ""))[:100] - details.append(f" {status} {reason}") - - rate = (passed / total * 100) if total > 0 else 0 - icon = "✅" if rate >= 80 else "⚠️" if rate >= 50 else "❌" - - summary = f"{icon} **${{ matrix.skill }}**: {passed}/{total} tests passed ({rate:.0f}%)" - - # Write to file for comment step - with open("/tmp/eval-summary.md", "w") as f: - f.write(f"### {summary}\n\n") - if details: - f.write("
Assertion details\n\n") - f.write("\n".join(details)) - f.write("\n\n
\n") - - # Output for workflow - with open(os.environ["GITHUB_OUTPUT"], "a") as f: - f.write(f"summary={summary}\n") - f.write(f"pass_rate={rate:.0f}\n") - PYEOF - - env: - RESULTS_FILE: "/tmp/${{ matrix.skill }}-results.json" - - - name: Comment on PR - if: github.event_name == 'pull_request' && always() - uses: actions/github-script@v7 - with: - script: | - const fs = require('fs'); - let body = '### 🧪 Skill Eval: `${{ matrix.skill }}`\n\n'; - - try { - const summary = fs.readFileSync('/tmp/eval-summary.md', 'utf8'); - body += summary; - } catch { - body += '⚠️ Eval did not produce results. Check the workflow logs.\n'; - } - - body += '\n\n---\n*Powered by [promptfoo](https://promptfoo.dev) · [eval config](eval/skills/${{ matrix.skill }}.yaml)*'; - - // Find existing comment to update - const { data: comments } = await github.rest.issues.listComments({ - owner: context.repo.owner, - repo: context.repo.repo, - issue_number: context.issue.number, - }); - - const marker = `Skill Eval: \`${{ matrix.skill }}\``; - const existing = comments.find(c => c.body.includes(marker)); - - if (existing) { - await github.rest.issues.updateComment({ - owner: context.repo.owner, - repo: context.repo.repo, - comment_id: existing.id, - body, - }); - } else { - await github.rest.issues.createComment({ - owner: context.repo.owner, - repo: context.repo.repo, - issue_number: context.issue.number, - body, - }); - } - - - name: Upload results - if: always() - uses: actions/upload-artifact@v4 - with: - name: eval-results-${{ matrix.skill }} - path: /tmp/${{ matrix.skill }}-results.json - retention-days: 30 - if-no-files-found: ignore diff --git a/eval/README.md b/eval/README.md deleted file mode 100644 index ee604da..0000000 --- a/eval/README.md +++ /dev/null @@ -1,142 +0,0 @@ -# Skill Evaluation Pipeline - -Automated quality evaluation for skills using [promptfoo](https://promptfoo.dev). - -## Quick Start - -```bash -# Run a single skill eval -npx promptfoo@latest eval -c eval/skills/copywriting.yaml - -# View results in browser -npx promptfoo@latest view - -# Run all pilot skill evals -for config in eval/skills/*.yaml; do - npx promptfoo@latest eval -c "$config" --no-cache -done -``` - -## Requirements - -- Node.js 18+ -- `ANTHROPIC_API_KEY` environment variable set -- No additional dependencies (promptfoo runs via npx) - -## How It Works - -Each skill has an eval config in `eval/skills/.yaml` that: - -1. Loads the skill's `SKILL.md` content as context -2. Sends realistic task prompts to an LLM with the skill loaded -3. Evaluates outputs against quality assertions (LLM rubrics + programmatic checks) -4. Reports pass/fail per assertion - -### CI/CD Integration - -The GitHub Action (`.github/workflows/skill-eval.yml`) runs automatically when: -- A PR to `dev` changes any `SKILL.md` file -- The changed skill has an eval config in `eval/skills/` -- Results are posted as PR comments - -Currently **non-blocking** — evals are informational, not gates. - -## Adding Evals for a New Skill - -### Option 1: Auto-generate - -```bash -python eval/scripts/generate-eval-config.py marketing-skill/my-new-skill -``` - -This creates a boilerplate config with default prompts and assertions. **Always customize** the generated config with domain-specific test cases. - -### Option 2: Manual - -Copy an existing config and modify: - -```bash -cp eval/skills/copywriting.yaml eval/skills/my-skill.yaml -``` - -### Eval Config Structure - -```yaml -description: "What this eval tests" - -prompts: - - | - You are an expert AI assistant with this skill: - ---BEGIN SKILL--- - {{skill_content}} - ---END SKILL--- - Task: {{task}} - -providers: - - id: anthropic:messages:claude-sonnet-4-6 - config: - max_tokens: 4096 - -tests: - - vars: - skill_content: file://../../path/to/SKILL.md - task: "A realistic user request" - assert: - - type: llm-rubric - value: "What good output looks like" - - type: javascript - value: "output.length > 200" -``` - -### Assertion Types - -| Type | Use For | Example | -|------|---------|---------| -| `llm-rubric` | Qualitative checks (expertise, relevance) | `"Response includes actionable next steps"` | -| `contains` | Required terms | `"React"` | -| `javascript` | Programmatic checks | `"output.length > 500"` | -| `similar` | Semantic similarity | Compare against reference output | - -## Reading Results - -```bash -# Terminal output (after eval) -npx promptfoo@latest eval -c eval/skills/copywriting.yaml - -# Web UI (interactive) -npx promptfoo@latest view - -# JSON output (for scripting) -npx promptfoo@latest eval -c eval/skills/copywriting.yaml --output results.json -``` - -## File Structure - -``` -eval/ -├── promptfooconfig.yaml # Master config (reference) -├── skills/ # Per-skill eval configs -│ ├── copywriting.yaml # ← 10 pilot skills -│ ├── cto-advisor.yaml -│ └── ... -├── assertions/ -│ └── skill-quality.js # Reusable assertion helpers -├── scripts/ -│ └── generate-eval-config.py # Config generator -└── README.md # This file -``` - -## Running Locally vs CI - -| | Local | CI | -|---|---|---| -| **Command** | `npx promptfoo@latest eval -c eval/skills/X.yaml` | Automatic on PR | -| **Results** | Terminal + web viewer | PR comment + artifact | -| **Caching** | Enabled (faster iteration) | Disabled (`--no-cache`) | -| **Cost** | Your API key | Repo secret `ANTHROPIC_API_KEY` | - -## Cost Estimate - -Each skill eval runs 2-3 test cases × ~4K tokens output = ~12K tokens per skill. -At Sonnet pricing (~$3/M input, $15/M output): **~$0.05-0.10 per skill eval**. -Full 10-skill pilot batch: **~$0.50-1.00 per run**. diff --git a/eval/assertions/skill-quality.js b/eval/assertions/skill-quality.js deleted file mode 100644 index 00d19b3..0000000 --- a/eval/assertions/skill-quality.js +++ /dev/null @@ -1,54 +0,0 @@ -// Reusable assertion helpers for skill quality evaluation -// Used by promptfoo configs via: type: javascript, value: file://eval/assertions/skill-quality.js - -/** - * Check that output demonstrates domain expertise (not generic advice). - * Looks for specific terminology, frameworks, or tools mentioned. - */ -function hasDomainDepth(output, minTerms = 3) { - // Count domain-specific patterns: frameworks, tools, methodologies, metrics - const patterns = [ - /\b(RICE|MoSCoW|OKR|KPI|DORA|SLA|SLO|SLI)\b/gi, - /\b(React|Next\.js|Tailwind|TypeScript|PostgreSQL|Redis|Lambda|S3)\b/gi, - /\b(SEO|CRO|CTR|LTV|CAC|MRR|ARR|NPS|CSAT)\b/gi, - /\b(OWASP|CVE|GDPR|SOC\s?2|ISO\s?27001|PCI)\b/gi, - /\b(sprint|backlog|retrospective|standup|velocity)\b/gi, - ]; - - let termCount = 0; - for (const pattern of patterns) { - const matches = output.match(pattern); - if (matches) termCount += new Set(matches.map(m => m.toLowerCase())).size; - } - - return { - pass: termCount >= minTerms, - score: Math.min(1, termCount / (minTerms * 2)), - reason: `Found ${termCount} domain-specific terms (minimum: ${minTerms})`, - }; -} - -/** - * Check that output is actionable (contains concrete next steps, not just analysis). - */ -function isActionable(output) { - const actionPatterns = [ - /\b(step \d|first|second|third|next|then|finally)\b/gi, - /\b(implement|create|build|configure|set up|install|deploy|run)\b/gi, - /\b(action item|todo|checklist|recommendation)\b/gi, - /```[\s\S]*?```/g, // code blocks indicate concrete output - ]; - - let score = 0; - for (const pattern of actionPatterns) { - if (pattern.test(output)) score += 0.25; - } - - return { - pass: score >= 0.5, - score: Math.min(1, score), - reason: `Actionability score: ${score}/1.0`, - }; -} - -module.exports = { hasDomainDepth, isActionable }; diff --git a/eval/promptfooconfig.yaml b/eval/promptfooconfig.yaml deleted file mode 100644 index 89cb6ef..0000000 --- a/eval/promptfooconfig.yaml +++ /dev/null @@ -1,32 +0,0 @@ -# Promptfoo Master Config — claude-skills -# Run all pilot skill evals: npx promptfoo@latest eval -c eval/promptfooconfig.yaml -# Run a single skill: npx promptfoo@latest eval -c eval/skills/copywriting.yaml - -description: "claude-skills quality evaluation — pilot batch" - -prompts: - - | - You are an expert AI assistant. You have the following skill loaded that guides your behavior: - - ---BEGIN SKILL--- - {{skill_content}} - ---END SKILL--- - - Now complete this task: - {{task}} - -providers: - - id: anthropic:messages:claude-sonnet-4-6 - config: - max_tokens: 4096 - temperature: 0.7 - -defaultTest: - assert: - - type: javascript - value: "output.length > 200" - - type: llm-rubric - value: "The response demonstrates domain expertise relevant to the task, not generic advice" - -# Import per-skill test suites -tests: [] diff --git a/eval/scripts/generate-eval-config.py b/eval/scripts/generate-eval-config.py deleted file mode 100755 index 5cbdf61..0000000 --- a/eval/scripts/generate-eval-config.py +++ /dev/null @@ -1,153 +0,0 @@ -#!/usr/bin/env python3 -"""Generate a promptfoo eval config for any skill. - -Usage: - python eval/scripts/generate-eval-config.py marketing-skill/copywriting - python eval/scripts/generate-eval-config.py c-level-advisor/cto-advisor --force -""" - -import os -import re -import sys -import textwrap - - -def parse_frontmatter(skill_path): - """Extract name and description from SKILL.md YAML frontmatter.""" - with open(skill_path, "r", encoding="utf-8") as f: - content = f.read() - - # Match YAML frontmatter between --- delimiters - match = re.match(r"^---\s*\n(.*?)\n---", content, re.DOTALL) - if not match: - return None, None - - frontmatter = match.group(1) - name = None - description = None - - for line in frontmatter.split("\n"): - if line.startswith("name:"): - name = line.split(":", 1)[1].strip().strip("'\"") - elif line.startswith("description:"): - # Handle multi-line descriptions - desc = line.split(":", 1)[1].strip().strip("'\"") - description = desc - - return name, description - - -def generate_config(skill_dir, force=False): - """Generate a promptfoo eval YAML config for the given skill directory.""" - # Resolve SKILL.md path - skill_md = os.path.join(skill_dir, "SKILL.md") - if not os.path.exists(skill_md): - print(f"Error: {skill_md} not found", file=sys.stderr) - sys.exit(1) - - name, description = parse_frontmatter(skill_md) - if not name: - print(f"Error: Could not parse frontmatter from {skill_md}", file=sys.stderr) - sys.exit(1) - - # Output path - output_path = os.path.join("eval", "skills", f"{name}.yaml") - if os.path.exists(output_path) and not force: - print(f"Eval config already exists: {output_path}") - print("Use --force to overwrite.") - sys.exit(0) - - # Calculate relative path from eval/skills/ to the skill - rel_path = os.path.relpath(skill_md, os.path.join("eval", "skills")) - - # Generate test prompts based on description - desc_lower = (description or "").lower() - - # Default test prompts - prompts = [ - f"I need help with {name.replace('-', ' ')}. Give me a comprehensive approach for a mid-stage B2B SaaS startup.", - f"Act as an expert in {name.replace('-', ' ')} and review my current approach. I'm a solo founder building a developer tool.", - ] - - # Add domain-specific third prompt - if any(w in desc_lower for w in ["marketing", "content", "seo", "copy"]): - prompts.append( - "Create a 90-day plan with specific deliverables, metrics, and milestones." - ) - elif any(w in desc_lower for w in ["engineer", "architect", "code", "technical"]): - prompts.append( - "Design a technical solution with architecture diagram, tech stack recommendations, and implementation plan." - ) - elif any(w in desc_lower for w in ["advisor", "executive", "strategic", "leader"]): - prompts.append( - "Help me prepare a board presentation on this topic with key metrics and strategic recommendations." - ) - else: - prompts.append( - f"What are the top 5 mistakes people make with {name.replace('-', ' ')} and how to avoid them?" - ) - - # Build YAML - config = textwrap.dedent(f"""\ - # Eval: {name} - # Source: {skill_dir}/SKILL.md - # Run: npx promptfoo@latest eval -c eval/skills/{name}.yaml - # Auto-generated — customize test prompts and assertions for better coverage - - description: "Evaluate {name} skill" - - prompts: - - | - You are an expert AI assistant. You have the following skill loaded: - - ---BEGIN SKILL--- - {{{{skill_content}}}} - ---END SKILL--- - - Now complete this task: {{{{task}}}} - - providers: - - id: anthropic:messages:claude-sonnet-4-6 - config: - max_tokens: 4096 - temperature: 0.7 - - tests: - """) - - for i, prompt in enumerate(prompts): - test_block = textwrap.dedent(f"""\ - - vars: - skill_content: file://{rel_path} - task: "{prompt}" - assert: - - type: llm-rubric - value: "Response demonstrates specific expertise in {name.replace('-', ' ')}, not generic advice" - - type: llm-rubric - value: "Response is actionable with concrete steps or deliverables" - - type: javascript - value: "output.length > 300" - """) - config += test_block - - # Write - os.makedirs(os.path.dirname(output_path), exist_ok=True) - with open(output_path, "w", encoding="utf-8") as f: - f.write(config) - - print(f"✅ Generated: {output_path}") - print(f" Skill: {name}") - print(f" Tests: {len(prompts)}") - print(f" Edit the file to customize prompts and assertions.") - - -if __name__ == "__main__": - if len(sys.argv) < 2: - print("Usage: python eval/scripts/generate-eval-config.py ") - print(" python eval/scripts/generate-eval-config.py marketing-skill/copywriting --force") - sys.exit(1) - - skill_dir = sys.argv[1].rstrip("/") - force = "--force" in sys.argv - - generate_config(skill_dir, force) diff --git a/eval/skills/agile-product-owner.yaml b/eval/skills/agile-product-owner.yaml deleted file mode 100644 index a20fc42..0000000 --- a/eval/skills/agile-product-owner.yaml +++ /dev/null @@ -1,41 +0,0 @@ -# Eval: agile-product-owner -# Source: product-team/agile-product-owner/SKILL.md - -description: "Evaluate agile product owner skill" - -prompts: - - | - You are an expert AI assistant. You have the following skill loaded: - - ---BEGIN SKILL--- - {{skill_content}} - ---END SKILL--- - - Now complete this task: {{task}} - -providers: - - id: anthropic:messages:claude-sonnet-4-6 - config: - max_tokens: 4096 - temperature: 0.7 - -tests: - - vars: - skill_content: file://../../product-team/agile-product-owner/SKILL.md - task: "Write user stories with acceptance criteria for an 'invite team members' feature in a project management tool. Users should be able to invite by email, set roles (admin/member/viewer), and revoke access." - assert: - - type: llm-rubric - value: "Output uses proper user story format (As a..., I want..., So that...) with testable acceptance criteria" - - type: llm-rubric - value: "Stories cover the three main flows: invite, role assignment, and access revocation" - - type: llm-rubric - value: "Acceptance criteria are specific and testable, not vague requirements" - - - vars: - skill_content: file://../../product-team/agile-product-owner/SKILL.md - task: "We have 30 items in our backlog. Help me prioritize for a 2-week sprint with 2 developers (40 story points capacity). The items range from bug fixes to new features to tech debt." - assert: - - type: llm-rubric - value: "Response uses a prioritization framework (RICE, MoSCoW, or similar) with clear reasoning" - - type: llm-rubric - value: "Response respects the 40 story point capacity constraint" diff --git a/eval/skills/aws-solution-architect.yaml b/eval/skills/aws-solution-architect.yaml deleted file mode 100644 index ff1d0c7..0000000 --- a/eval/skills/aws-solution-architect.yaml +++ /dev/null @@ -1,41 +0,0 @@ -# Eval: aws-solution-architect -# Source: engineering-team/aws-solution-architect/SKILL.md - -description: "Evaluate AWS solution architect skill" - -prompts: - - | - You are an expert AI assistant. You have the following skill loaded: - - ---BEGIN SKILL--- - {{skill_content}} - ---END SKILL--- - - Now complete this task: {{task}} - -providers: - - id: anthropic:messages:claude-sonnet-4-6 - config: - max_tokens: 4096 - temperature: 0.7 - -tests: - - vars: - skill_content: file://../../engineering-team/aws-solution-architect/SKILL.md - task: "Design a serverless architecture for a real-time notification system that needs to handle 10K messages per second with sub-200ms delivery. Users connect via WebSocket. Budget is $500/month." - assert: - - type: llm-rubric - value: "Response uses specific AWS services (API Gateway WebSocket, Lambda, DynamoDB, etc.) not generic cloud patterns" - - type: llm-rubric - value: "Response addresses the throughput requirement (10K msg/s) with concrete scaling strategy" - - type: llm-rubric - value: "Response includes cost estimation relative to the $500/month budget constraint" - - - vars: - skill_content: file://../../engineering-team/aws-solution-architect/SKILL.md - task: "We're migrating a Django monolith from Heroku to AWS. We have PostgreSQL, Redis, Celery workers, and S3 for file storage. Team of 3 devs, no DevOps experience. What's the simplest production-ready setup?" - assert: - - type: llm-rubric - value: "Response recommends managed services appropriate for a small team without DevOps (e.g., ECS Fargate, RDS, ElastiCache)" - - type: llm-rubric - value: "Response includes a migration plan with phases, not just target architecture" diff --git a/eval/skills/content-strategy.yaml b/eval/skills/content-strategy.yaml deleted file mode 100644 index 73b6422..0000000 --- a/eval/skills/content-strategy.yaml +++ /dev/null @@ -1,41 +0,0 @@ -# Eval: content-strategy -# Source: marketing-skill/content-strategy/SKILL.md - -description: "Evaluate content strategy skill" - -prompts: - - | - You are an expert AI assistant. You have the following skill loaded: - - ---BEGIN SKILL--- - {{skill_content}} - ---END SKILL--- - - Now complete this task: {{task}} - -providers: - - id: anthropic:messages:claude-sonnet-4-6 - config: - max_tokens: 4096 - temperature: 0.7 - -tests: - - vars: - skill_content: file://../../marketing-skill/content-strategy/SKILL.md - task: "Build a 3-month content strategy for a developer tools startup that just launched. We have zero blog posts and a small Twitter following of 500. Our product is an open-source database migration tool." - assert: - - type: llm-rubric - value: "Response includes a phased plan with specific content types, topics, and publishing cadence" - - type: llm-rubric - value: "Strategy addresses developer audience specifically with appropriate channels (dev blogs, GitHub, HN)" - - type: llm-rubric - value: "Response includes measurable goals or KPIs for the content program" - - - vars: - skill_content: file://../../marketing-skill/content-strategy/SKILL.md - task: "We have 50 blog posts but traffic has plateaued at 10K monthly visits. What should we do to 3x our organic traffic in 6 months?" - assert: - - type: llm-rubric - value: "Response diagnoses potential issues with existing content before prescribing new content" - - type: llm-rubric - value: "Response includes specific tactics like content refresh, internal linking, or topic clusters" diff --git a/eval/skills/copywriting.yaml b/eval/skills/copywriting.yaml deleted file mode 100644 index bcea86e..0000000 --- a/eval/skills/copywriting.yaml +++ /dev/null @@ -1,57 +0,0 @@ -# Eval: copywriting -# Source: marketing-skill/copywriting/SKILL.md -# Run: npx promptfoo@latest eval -c eval/skills/copywriting.yaml - -description: "Evaluate copywriting skill — marketing copy generation" - -prompts: - - | - You are an expert AI assistant. You have the following skill loaded: - - ---BEGIN SKILL--- - {{skill_content}} - ---END SKILL--- - - Now complete this task: {{task}} - -providers: - - id: anthropic:messages:claude-sonnet-4-6 - config: - max_tokens: 4096 - temperature: 0.7 - -tests: - - vars: - skill_content: file://../../marketing-skill/copywriting/SKILL.md - task: "Write homepage copy for a B2B SaaS that automates invoicing for freelancers called InvoiceFlow" - assert: - - type: llm-rubric - value: "Output includes a clear headline, subheadline, at least 3 value propositions, and a call-to-action" - - type: llm-rubric - value: "Copy is specific to InvoiceFlow and freelancer invoicing, not generic B2B marketing" - - type: llm-rubric - value: "Copy follows direct-response copywriting principles with benefit-driven language" - - type: javascript - value: "output.length > 500" - - - vars: - skill_content: file://../../marketing-skill/copywriting/SKILL.md - task: "Rewrite this landing page headline and subheadline: 'Welcome to our platform. We help businesses grow with our comprehensive solution for managing operations.' Make it compelling for a project management tool targeting remote teams." - assert: - - type: llm-rubric - value: "The rewritten headline is specific, benefit-driven, and not generic" - - type: llm-rubric - value: "The output specifically addresses remote teams, not generic businesses" - - type: javascript - value: "output.length > 100" - - - vars: - skill_content: file://../../marketing-skill/copywriting/SKILL.md - task: "Write a pricing page for a developer tool with 3 tiers: Free, Pro ($29/mo), and Enterprise (custom). The tool is an API monitoring service called PingGuard." - assert: - - type: llm-rubric - value: "Output includes copy for all three pricing tiers with differentiated value propositions" - - type: llm-rubric - value: "Each tier has clear feature descriptions and the copy encourages upgrade paths" - - type: javascript - value: "output.length > 400" diff --git a/eval/skills/cto-advisor.yaml b/eval/skills/cto-advisor.yaml deleted file mode 100644 index 3cf5c44..0000000 --- a/eval/skills/cto-advisor.yaml +++ /dev/null @@ -1,53 +0,0 @@ -# Eval: cto-advisor -# Source: c-level-advisor/cto-advisor/SKILL.md -# Run: npx promptfoo@latest eval -c eval/skills/cto-advisor.yaml - -description: "Evaluate CTO advisor skill — technical leadership guidance" - -prompts: - - | - You are an expert AI assistant. You have the following skill loaded: - - ---BEGIN SKILL--- - {{skill_content}} - ---END SKILL--- - - Now complete this task: {{task}} - -providers: - - id: anthropic:messages:claude-sonnet-4-6 - config: - max_tokens: 4096 - temperature: 0.7 - -tests: - - vars: - skill_content: file://../../c-level-advisor/cto-advisor/SKILL.md - task: "We're a 15-person startup with a monolithic Django app serving 50K users. Response times are growing. Should we move to microservices or optimize the monolith? We have 4 backend engineers." - assert: - - type: llm-rubric - value: "Response provides a clear recommendation with reasoning, not just listing pros and cons" - - type: llm-rubric - value: "Response considers team size (4 engineers) as a factor in the architecture decision" - - type: llm-rubric - value: "Response includes concrete next steps or an action plan" - - - vars: - skill_content: file://../../c-level-advisor/cto-advisor/SKILL.md - task: "Our tech debt is slowing us down. Engineering velocity dropped 30% over 6 months. The CEO wants new features but we can barely maintain what we have. How do I make the case for a tech debt sprint to the board?" - assert: - - type: llm-rubric - value: "Response frames tech debt in business terms the board would understand, not just technical jargon" - - type: llm-rubric - value: "Response includes a strategy for balancing tech debt work with feature delivery" - - type: llm-rubric - value: "Response provides specific metrics or frameworks to measure tech debt impact" - - - vars: - skill_content: file://../../c-level-advisor/cto-advisor/SKILL.md - task: "I'm hiring my first VP of Engineering. I'm a technical founder who has been CTO and lead dev. What should I look for, and how do I avoid hiring someone who will clash with me?" - assert: - - type: llm-rubric - value: "Response addresses the founder-VP dynamic specifically, not generic hiring advice" - - type: llm-rubric - value: "Response includes qualities to look for and red flags to watch for" diff --git a/eval/skills/launch-strategy.yaml b/eval/skills/launch-strategy.yaml deleted file mode 100644 index 4242ca3..0000000 --- a/eval/skills/launch-strategy.yaml +++ /dev/null @@ -1,41 +0,0 @@ -# Eval: launch-strategy -# Source: marketing-skill/launch-strategy/SKILL.md - -description: "Evaluate launch strategy skill" - -prompts: - - | - You are an expert AI assistant. You have the following skill loaded: - - ---BEGIN SKILL--- - {{skill_content}} - ---END SKILL--- - - Now complete this task: {{task}} - -providers: - - id: anthropic:messages:claude-sonnet-4-6 - config: - max_tokens: 4096 - temperature: 0.7 - -tests: - - vars: - skill_content: file://../../marketing-skill/launch-strategy/SKILL.md - task: "Plan a Product Hunt launch for an AI writing assistant. We have 2,000 email subscribers, 500 Twitter followers, and the product has been in beta for 3 months with 200 active users. Budget: $0 (bootstrapped)." - assert: - - type: llm-rubric - value: "Response includes a phased timeline (pre-launch, launch day, post-launch) with specific actions" - - type: llm-rubric - value: "Strategy leverages existing assets (2K email list, 200 beta users, Twitter) concretely" - - type: llm-rubric - value: "Response includes Product Hunt-specific tactics (hunter selection, timing, asset preparation)" - - - vars: - skill_content: file://../../marketing-skill/launch-strategy/SKILL.md - task: "We're launching a major feature update (AI-powered analytics) to our existing SaaS product with 5,000 paying customers. How should we announce it to maximize adoption and upsell opportunities?" - assert: - - type: llm-rubric - value: "Response distinguishes between existing customer communication and new user acquisition" - - type: llm-rubric - value: "Response includes specific channels and messaging for the announcement" diff --git a/eval/skills/mcp-server-builder.yaml b/eval/skills/mcp-server-builder.yaml deleted file mode 100644 index 1f60a36..0000000 --- a/eval/skills/mcp-server-builder.yaml +++ /dev/null @@ -1,41 +0,0 @@ -# Eval: mcp-server-builder -# Source: engineering/mcp-server-builder/SKILL.md - -description: "Evaluate MCP server builder skill" - -prompts: - - | - You are an expert AI assistant. You have the following skill loaded: - - ---BEGIN SKILL--- - {{skill_content}} - ---END SKILL--- - - Now complete this task: {{task}} - -providers: - - id: anthropic:messages:claude-sonnet-4-6 - config: - max_tokens: 4096 - temperature: 0.7 - -tests: - - vars: - skill_content: file://../../engineering/mcp-server-builder/SKILL.md - task: "Build an MCP server in Python that exposes a 'search_github_repos' tool. The tool should take a query string and return top 5 repos with name, stars, and description. Use the GitHub REST API (no auth required for public search)." - assert: - - type: llm-rubric - value: "Output includes working Python code that follows MCP server patterns (tool registration, handler)" - - type: llm-rubric - value: "Code includes proper error handling for API failures" - - type: llm-rubric - value: "Tool definition includes proper input schema with type annotations" - - - vars: - skill_content: file://../../engineering/mcp-server-builder/SKILL.md - task: "Design an MCP server architecture for a CRM system that exposes: list_contacts, get_contact, create_contact, search_contacts, and list_deals tools. Show the tool definitions and server structure." - assert: - - type: llm-rubric - value: "Response includes tool definitions with proper input/output schemas for all 5 tools" - - type: llm-rubric - value: "Architecture follows MCP best practices (proper transport, error handling, resource definitions)" diff --git a/eval/skills/senior-frontend.yaml b/eval/skills/senior-frontend.yaml deleted file mode 100644 index 0a95b36..0000000 --- a/eval/skills/senior-frontend.yaml +++ /dev/null @@ -1,41 +0,0 @@ -# Eval: senior-frontend (replacing frontend-design which doesn't exist as standalone) -# Source: engineering-team/senior-frontend/SKILL.md - -description: "Evaluate senior frontend skill" - -prompts: - - | - You are an expert AI assistant. You have the following skill loaded: - - ---BEGIN SKILL--- - {{skill_content}} - ---END SKILL--- - - Now complete this task: {{task}} - -providers: - - id: anthropic:messages:claude-sonnet-4-6 - config: - max_tokens: 4096 - temperature: 0.7 - -tests: - - vars: - skill_content: file://../../engineering-team/senior-frontend/SKILL.md - task: "Build a responsive dashboard layout in React with TypeScript. It should have a sidebar navigation, a top bar with user menu, and a main content area with a grid of metric cards. Use Tailwind CSS." - assert: - - type: llm-rubric - value: "Output includes actual React/TypeScript code, not just descriptions" - - type: llm-rubric - value: "Code uses Tailwind CSS classes for responsive design (sm:, md:, lg: breakpoints)" - - type: llm-rubric - value: "Component structure follows React best practices (proper component decomposition)" - - - vars: - skill_content: file://../../engineering-team/senior-frontend/SKILL.md - task: "Our Next.js app has a Core Web Vitals score of 45. LCP is 4.2s, CLS is 0.25, and INP is 350ms. Diagnose the likely causes and provide a fix plan." - assert: - - type: llm-rubric - value: "Response addresses each specific metric (LCP, CLS, INP) with targeted fixes" - - type: llm-rubric - value: "Response includes Next.js-specific optimizations (Image component, dynamic imports, etc.)" diff --git a/eval/skills/senior-security.yaml b/eval/skills/senior-security.yaml deleted file mode 100644 index 5719d3c..0000000 --- a/eval/skills/senior-security.yaml +++ /dev/null @@ -1,41 +0,0 @@ -# Eval: senior-security -# Source: engineering-team/senior-security/SKILL.md - -description: "Evaluate senior security engineer skill" - -prompts: - - | - You are an expert AI assistant. You have the following skill loaded: - - ---BEGIN SKILL--- - {{skill_content}} - ---END SKILL--- - - Now complete this task: {{task}} - -providers: - - id: anthropic:messages:claude-sonnet-4-6 - config: - max_tokens: 4096 - temperature: 0.7 - -tests: - - vars: - skill_content: file://../../engineering-team/senior-security/SKILL.md - task: "Perform a security review of this Express.js API endpoint pattern: app.post('/api/users', (req, res) => { const query = `SELECT * FROM users WHERE email = '${req.body.email}'`; db.query(query).then(user => res.json(user)); })" - assert: - - type: llm-rubric - value: "Response identifies SQL injection vulnerability as the primary critical issue" - - type: llm-rubric - value: "Response provides a fixed code example using parameterized queries" - - type: llm-rubric - value: "Response identifies additional issues beyond SQL injection (input validation, error handling, etc.)" - - - vars: - skill_content: file://../../engineering-team/senior-security/SKILL.md - task: "Create a security hardening checklist for a new Node.js API going to production. We handle user PII and payment data. Stack: Express, PostgreSQL, Redis, deployed on AWS ECS." - assert: - - type: llm-rubric - value: "Checklist covers OWASP Top 10 categories relevant to the stack" - - type: llm-rubric - value: "Response includes PII and payment-specific requirements (encryption at rest, PCI considerations)" diff --git a/eval/skills/seo-audit.yaml b/eval/skills/seo-audit.yaml deleted file mode 100644 index d05900c..0000000 --- a/eval/skills/seo-audit.yaml +++ /dev/null @@ -1,42 +0,0 @@ -# Eval: seo-audit -# Source: marketing-skill/seo-audit/SKILL.md -# Run: npx promptfoo@latest eval -c eval/skills/seo-audit.yaml - -description: "Evaluate SEO audit skill" - -prompts: - - | - You are an expert AI assistant. You have the following skill loaded: - - ---BEGIN SKILL--- - {{skill_content}} - ---END SKILL--- - - Now complete this task: {{task}} - -providers: - - id: anthropic:messages:claude-sonnet-4-6 - config: - max_tokens: 4096 - temperature: 0.7 - -tests: - - vars: - skill_content: file://../../marketing-skill/seo-audit/SKILL.md - task: "Perform an SEO audit checklist for a new SaaS landing page targeting the keyword 'AI code review tool'. The page has a 3-second load time, no meta description, and 200 words of content." - assert: - - type: llm-rubric - value: "Response identifies specific SEO issues (load time, missing meta description, thin content) rather than generic advice" - - type: llm-rubric - value: "Response provides actionable fixes with priority ordering" - - type: llm-rubric - value: "Response references on-page SEO factors like title tags, headings, and internal linking" - - - vars: - skill_content: file://../../marketing-skill/seo-audit/SKILL.md - task: "Create a keyword strategy for a B2B SaaS in the project management space. We're a small startup competing against Asana, Monday.com, and Jira." - assert: - - type: llm-rubric - value: "Response suggests long-tail keywords rather than only head terms where competition is impossible" - - type: llm-rubric - value: "Response organizes keywords by intent (informational, commercial, transactional)"