From 6723bc69772772ed7c934f721c747e2a87353423 Mon Sep 17 00:00:00 2001 From: Alireza Rezvani Date: Mon, 26 Jan 2026 11:03:37 +0100 Subject: [PATCH] fix(skill): rewrite senior-prompt-engineer with unique, actionable content (#91) Issue #49 feedback implementation: SKILL.md: - Added YAML frontmatter with trigger phrases - Removed marketing language ("world-class", etc.) - Added Table of Contents - Converted vague bullets to concrete workflows - Added input/output examples for all tools Reference files (all 3 previously 100% identical): - prompt_engineering_patterns.md: 10 patterns with examples (Zero-Shot, Few-Shot, CoT, Role, Structured Output, etc.) - llm_evaluation_frameworks.md: 7 sections on metrics (BLEU, ROUGE, BERTScore, RAG metrics, A/B testing) - agentic_system_design.md: 6 agent architecture sections (ReAct, Plan-Execute, Tool Use, Multi-Agent, Memory) Python scripts (all 3 previously identical placeholders): - prompt_optimizer.py: Token counting, clarity analysis, few-shot extraction, optimization suggestions - rag_evaluator.py: Context relevance, faithfulness, retrieval metrics (Precision@K, MRR, NDCG) - agent_orchestrator.py: Config parsing, validation, ASCII/Mermaid visualization, cost estimation Total: 3,571 lines added, 587 deleted Before: ~785 lines duplicate boilerplate After: 3,750 lines unique, actionable content Closes #49 Co-authored-by: Claude Opus 4.5 --- .../senior-prompt-engineer/SKILL.md | 517 +++++++++----- .../references/agentic_system_design.md | 676 ++++++++++++++++-- .../references/llm_evaluation_frameworks.md | 556 ++++++++++++-- .../references/prompt_engineering_patterns.md | 602 ++++++++++++++-- .../scripts/agent_orchestrator.py | 624 +++++++++++++--- .../scripts/prompt_optimizer.py | 585 ++++++++++++--- .../scripts/rag_evaluator.py | 638 ++++++++++++++--- 7 files changed, 3591 insertions(+), 607 deletions(-) diff --git a/engineering-team/senior-prompt-engineer/SKILL.md b/engineering-team/senior-prompt-engineer/SKILL.md index 
3c3b30d..561ec3c 100644 --- a/engineering-team/senior-prompt-engineer/SKILL.md +++ b/engineering-team/senior-prompt-engineer/SKILL.md @@ -1,226 +1,355 @@ --- name: senior-prompt-engineer -description: World-class prompt engineering skill for LLM optimization, prompt patterns, structured outputs, and AI product development. Expertise in Claude, GPT-4, prompt design patterns, few-shot learning, chain-of-thought, and AI evaluation. Includes RAG optimization, agent design, and LLM system architecture. Use when building AI products, optimizing LLM performance, designing agentic systems, or implementing advanced prompting techniques. +description: This skill should be used when the user asks to "optimize prompts", "design prompt templates", "evaluate LLM outputs", "build agentic systems", "implement RAG", "create few-shot examples", "analyze token usage", or "design AI workflows". Use for prompt engineering patterns, LLM evaluation frameworks, agent architectures, and structured output design. --- # Senior Prompt Engineer -World-class senior prompt engineer skill for production-grade AI/ML/Data systems. +Prompt engineering patterns, LLM evaluation frameworks, and agentic system design. 
+ +## Table of Contents + +- [Quick Start](#quick-start) +- [Tools Overview](#tools-overview) + - [Prompt Optimizer](#1-prompt-optimizer) + - [RAG Evaluator](#2-rag-evaluator) + - [Agent Orchestrator](#3-agent-orchestrator) +- [Prompt Engineering Workflows](#prompt-engineering-workflows) + - [Prompt Optimization Workflow](#prompt-optimization-workflow) + - [Few-Shot Example Design](#few-shot-example-design-workflow) + - [Structured Output Design](#structured-output-design-workflow) +- [Reference Documentation](#reference-documentation) +- [Common Patterns Quick Reference](#common-patterns-quick-reference) + +--- ## Quick Start -### Main Capabilities - ```bash -# Core Tool 1 -python scripts/prompt_optimizer.py --input data/ --output results/ +# Analyze and optimize a prompt file +python scripts/prompt_optimizer.py prompts/my_prompt.txt --analyze -# Core Tool 2 -python scripts/rag_evaluator.py --target project/ --analyze +# Evaluate RAG retrieval quality +python scripts/rag_evaluator.py --contexts contexts.json --questions questions.json -# Core Tool 3 -python scripts/agent_orchestrator.py --config config.yaml --deploy +# Visualize agent workflow from definition +python scripts/agent_orchestrator.py agent_config.yaml --visualize ``` -## Core Expertise +--- -This skill covers world-class capabilities in: +## Tools Overview -- Advanced production patterns and architectures -- Scalable system design and implementation -- Performance optimization at scale -- MLOps and DataOps best practices -- Real-time processing and inference -- Distributed computing frameworks -- Model deployment and monitoring -- Security and compliance -- Cost optimization -- Team leadership and mentoring +### 1. Prompt Optimizer -## Tech Stack +Analyzes prompts for token efficiency, clarity, and structure. Generates optimized versions. 
-**Languages:** Python, SQL, R, Scala, Go -**ML Frameworks:** PyTorch, TensorFlow, Scikit-learn, XGBoost -**Data Tools:** Spark, Airflow, dbt, Kafka, Databricks -**LLM Frameworks:** LangChain, LlamaIndex, DSPy -**Deployment:** Docker, Kubernetes, AWS/GCP/Azure -**Monitoring:** MLflow, Weights & Biases, Prometheus -**Databases:** PostgreSQL, BigQuery, Snowflake, Pinecone +**Input:** Prompt text file or string +**Output:** Analysis report with optimization suggestions + +**Usage:** +```bash +# Analyze a prompt file +python scripts/prompt_optimizer.py prompt.txt --analyze + +# Output: +# Token count: 847 +# Estimated cost: $0.0025 (GPT-4) +# Clarity score: 72/100 +# Issues found: +# - Ambiguous instruction at line 3 +# - Missing output format specification +# - Redundant context (lines 12-15 repeat lines 5-8) +# Suggestions: +# 1. Add explicit output format: "Respond in JSON with keys: ..." +# 2. Remove redundant context to save 89 tokens +# 3. Clarify "analyze" -> "list the top 3 issues with severity ratings" + +# Generate optimized version +python scripts/prompt_optimizer.py prompt.txt --optimize --output optimized.txt + +# Count tokens for cost estimation +python scripts/prompt_optimizer.py prompt.txt --tokens --model gpt-4 + +# Extract and manage few-shot examples +python scripts/prompt_optimizer.py prompt.txt --extract-examples --output examples.json +``` + +--- + +### 2. RAG Evaluator + +Evaluates Retrieval-Augmented Generation quality by measuring context relevance and answer faithfulness. 
+ +**Input:** Retrieved contexts (JSON) and questions/answers +**Output:** Evaluation metrics and quality report + +**Usage:** +```bash +# Evaluate retrieval quality +python scripts/rag_evaluator.py --contexts retrieved.json --questions eval_set.json + +# Output: +# === RAG Evaluation Report === +# Questions evaluated: 50 +# +# Retrieval Metrics: +# Context Relevance: 0.78 (target: >0.80) +# Retrieval Precision@5: 0.72 +# Coverage: 0.85 +# +# Generation Metrics: +# Answer Faithfulness: 0.91 +# Groundedness: 0.88 +# +# Issues Found: +# - 8 questions had no relevant context in top-5 +# - 3 answers contained information not in context +# +# Recommendations: +# 1. Improve chunking strategy for technical documents +# 2. Add metadata filtering for date-sensitive queries + +# Evaluate with custom metrics +python scripts/rag_evaluator.py --contexts retrieved.json --questions eval_set.json \ + --metrics relevance,faithfulness,coverage + +# Export detailed results +python scripts/rag_evaluator.py --contexts retrieved.json --questions eval_set.json \ + --output report.json --verbose +``` + +--- + +### 3. Agent Orchestrator + +Parses agent definitions and visualizes execution flows. Validates tool configurations. + +**Input:** Agent configuration (YAML/JSON) +**Output:** Workflow visualization, validation report + +**Usage:** +```bash +# Validate agent configuration +python scripts/agent_orchestrator.py agent.yaml --validate + +# Output: +# === Agent Validation Report === +# Agent: research_assistant +# Pattern: ReAct +# +# Tools (4 registered): +# [OK] web_search - API key configured +# [OK] calculator - No config needed +# [WARN] file_reader - Missing allowed_paths +# [OK] summarizer - Prompt template valid +# +# Flow Analysis: +# Max depth: 5 iterations +# Estimated tokens/run: 2,400-4,800 +# Potential infinite loop: No +# +# Recommendations: +# 1. Add allowed_paths to file_reader for security +# 2. 
Consider adding early exit condition for simple queries + +# Visualize agent workflow (ASCII) +python scripts/agent_orchestrator.py agent.yaml --visualize + +# Output: +# ┌─────────────────────────────────────────┐ +# │ research_assistant │ +# │ (ReAct Pattern) │ +# └─────────────────┬───────────────────────┘ +# │ +# ┌────────▼────────┐ +# │ User Query │ +# └────────┬────────┘ +# │ +# ┌────────▼────────┐ +# │ Think │◄──────┐ +# └────────┬────────┘ │ +# │ │ +# ┌────────▼────────┐ │ +# │ Select Tool │ │ +# └────────┬────────┘ │ +# │ │ +# ┌─────────────┼─────────────┐ │ +# ▼ ▼ ▼ │ +# [web_search] [calculator] [file_reader] +# │ │ │ │ +# └─────────────┼─────────────┘ │ +# │ │ +# ┌────────▼────────┐ │ +# │ Observe │───────┘ +# └────────┬────────┘ +# │ +# ┌────────▼────────┐ +# │ Final Answer │ +# └─────────────────┘ + +# Export workflow as Mermaid diagram +python scripts/agent_orchestrator.py agent.yaml --visualize --format mermaid +``` + +--- + +## Prompt Engineering Workflows + +### Prompt Optimization Workflow + +Use when improving an existing prompt's performance or reducing token costs. 
+ +**Step 1: Baseline current prompt** +```bash +python scripts/prompt_optimizer.py current_prompt.txt --analyze --output baseline.json +``` + +**Step 2: Identify issues** +Review the analysis report for: +- Token waste (redundant instructions, verbose examples) +- Ambiguous instructions (unclear output format, vague verbs) +- Missing constraints (no length limits, no format specification) + +**Step 3: Apply optimization patterns** +| Issue | Pattern to Apply | +|-------|------------------| +| Ambiguous output | Add explicit format specification | +| Too verbose | Extract to few-shot examples | +| Inconsistent results | Add role/persona framing | +| Missing edge cases | Add constraint boundaries | + +**Step 4: Generate optimized version** +```bash +python scripts/prompt_optimizer.py current_prompt.txt --optimize --output optimized.txt +``` + +**Step 5: Compare results** +```bash +python scripts/prompt_optimizer.py optimized.txt --analyze --compare baseline.json +# Shows: token reduction, clarity improvement, issues resolved +``` + +**Step 6: Validate with test cases** +Run both prompts against your evaluation set and compare outputs. + +--- + +### Few-Shot Example Design Workflow + +Use when creating examples for in-context learning. + +**Step 1: Define the task clearly** +``` +Task: Extract product entities from customer reviews +Input: Review text +Output: JSON with {product_name, sentiment, features_mentioned} +``` + +**Step 2: Select diverse examples (3-5 recommended)** +| Example Type | Purpose | +|--------------|---------| +| Simple case | Shows basic pattern | +| Edge case | Handles ambiguity | +| Complex case | Multiple entities | +| Negative case | What NOT to extract | + +**Step 3: Format consistently** +``` +Example 1: +Input: "Love my new iPhone 15, the camera is amazing!" +Output: {"product_name": "iPhone 15", "sentiment": "positive", "features_mentioned": ["camera"]} + +Example 2: +Input: "The laptop was okay but battery life is terrible." 
+Output: {"product_name": "laptop", "sentiment": "mixed", "features_mentioned": ["battery life"]} +``` + +**Step 4: Validate example quality** +```bash +python scripts/prompt_optimizer.py prompt_with_examples.txt --validate-examples +# Checks: consistency, coverage, format alignment +``` + +**Step 5: Test with held-out cases** +Ensure model generalizes beyond your examples. + +--- + +### Structured Output Design Workflow + +Use when you need reliable JSON/XML/structured responses. + +**Step 1: Define schema** +```json +{ + "type": "object", + "properties": { + "summary": {"type": "string", "maxLength": 200}, + "sentiment": {"enum": ["positive", "negative", "neutral"]}, + "confidence": {"type": "number", "minimum": 0, "maximum": 1} + }, + "required": ["summary", "sentiment"] +} +``` + +**Step 2: Include schema in prompt** +``` +Respond with JSON matching this schema: +- summary (string, max 200 chars): Brief summary of the content +- sentiment (enum): One of "positive", "negative", "neutral" +- confidence (number 0-1): Your confidence in the sentiment +``` + +**Step 3: Add format enforcement** +``` +IMPORTANT: Respond ONLY with valid JSON. No markdown, no explanation. +Start your response with { and end with } +``` + +**Step 4: Validate outputs** +```bash +python scripts/prompt_optimizer.py structured_prompt.txt --validate-schema schema.json +``` + +--- ## Reference Documentation -### 1. 
Prompt Engineering Patterns +| File | Contains | Load when user asks about | +|------|----------|---------------------------| +| `references/prompt_engineering_patterns.md` | 10 prompt patterns with input/output examples | "which pattern?", "few-shot", "chain-of-thought", "role prompting" | +| `references/llm_evaluation_frameworks.md` | Evaluation metrics, scoring methods, A/B testing | "how to evaluate?", "measure quality", "compare prompts" | +| `references/agentic_system_design.md` | Agent architectures (ReAct, Plan-Execute, Tool Use) | "build agent", "tool calling", "multi-agent" | -Comprehensive guide available in `references/prompt_engineering_patterns.md` covering: +--- -- Advanced patterns and best practices -- Production implementation strategies -- Performance optimization techniques -- Scalability considerations -- Security and compliance -- Real-world case studies +## Common Patterns Quick Reference -### 2. Llm Evaluation Frameworks +| Pattern | When to Use | Example | +|---------|-------------|---------| +| **Zero-shot** | Simple, well-defined tasks | "Classify this email as spam or not spam" | +| **Few-shot** | Complex tasks, consistent format needed | Provide 3-5 examples before the task | +| **Chain-of-Thought** | Reasoning, math, multi-step logic | "Think step by step..." | +| **Role Prompting** | Expertise needed, specific perspective | "You are an expert tax accountant..." | +| **Structured Output** | Need parseable JSON/XML | Include schema + format enforcement | -Complete workflow documentation in `references/llm_evaluation_frameworks.md` including: - -- Step-by-step processes -- Architecture design patterns -- Tool integration guides -- Performance tuning strategies -- Troubleshooting procedures - -### 3. 
Agentic System Design - -Technical reference guide in `references/agentic_system_design.md` with: - -- System design principles -- Implementation examples -- Configuration best practices -- Deployment strategies -- Monitoring and observability - -## Production Patterns - -### Pattern 1: Scalable Data Processing - -Enterprise-scale data processing with distributed computing: - -- Horizontal scaling architecture -- Fault-tolerant design -- Real-time and batch processing -- Data quality validation -- Performance monitoring - -### Pattern 2: ML Model Deployment - -Production ML system with high availability: - -- Model serving with low latency -- A/B testing infrastructure -- Feature store integration -- Model monitoring and drift detection -- Automated retraining pipelines - -### Pattern 3: Real-Time Inference - -High-throughput inference system: - -- Batching and caching strategies -- Load balancing -- Auto-scaling -- Latency optimization -- Cost optimization - -## Best Practices - -### Development - -- Test-driven development -- Code reviews and pair programming -- Documentation as code -- Version control everything -- Continuous integration - -### Production - -- Monitor everything critical -- Automate deployments -- Feature flags for releases -- Canary deployments -- Comprehensive logging - -### Team Leadership - -- Mentor junior engineers -- Drive technical decisions -- Establish coding standards -- Foster learning culture -- Cross-functional collaboration - -## Performance Targets - -**Latency:** -- P50: < 50ms -- P95: < 100ms -- P99: < 200ms - -**Throughput:** -- Requests/second: > 1000 -- Concurrent users: > 10,000 - -**Availability:** -- Uptime: 99.9% -- Error rate: < 0.1% - -## Security & Compliance - -- Authentication & authorization -- Data encryption (at rest & in transit) -- PII handling and anonymization -- GDPR/CCPA compliance -- Regular security audits -- Vulnerability management +--- ## Common Commands ```bash -# Development -python -m pytest tests/ 
-v --cov -python -m black src/ -python -m pylint src/ +# Prompt Analysis +python scripts/prompt_optimizer.py prompt.txt --analyze # Full analysis +python scripts/prompt_optimizer.py prompt.txt --tokens # Token count only +python scripts/prompt_optimizer.py prompt.txt --optimize # Generate optimized version -# Training -python scripts/train.py --config prod.yaml -python scripts/evaluate.py --model best.pth +# RAG Evaluation +python scripts/rag_evaluator.py --contexts ctx.json --questions q.json # Evaluate +python scripts/rag_evaluator.py --contexts ctx.json --compare baseline # Compare to baseline -# Deployment -docker build -t service:v1 . -kubectl apply -f k8s/ -helm upgrade service ./charts/ - -# Monitoring -kubectl logs -f deployment/service -python scripts/health_check.py +# Agent Development +python scripts/agent_orchestrator.py agent.yaml --validate # Validate config +python scripts/agent_orchestrator.py agent.yaml --visualize # Show workflow +python scripts/agent_orchestrator.py agent.yaml --estimate-cost # Token estimation ``` - -## Resources - -- Advanced Patterns: `references/prompt_engineering_patterns.md` -- Implementation Guide: `references/llm_evaluation_frameworks.md` -- Technical Reference: `references/agentic_system_design.md` -- Automation Scripts: `scripts/` directory - -## Senior-Level Responsibilities - -As a world-class senior professional: - -1. **Technical Leadership** - - Drive architectural decisions - - Mentor team members - - Establish best practices - - Ensure code quality - -2. **Strategic Thinking** - - Align with business goals - - Evaluate trade-offs - - Plan for scale - - Manage technical debt - -3. **Collaboration** - - Work across teams - - Communicate effectively - - Build consensus - - Share knowledge - -4. **Innovation** - - Stay current with research - - Experiment with new approaches - - Contribute to community - - Drive continuous improvement - -5. 
**Production Excellence** - - Ensure high availability - - Monitor proactively - - Optimize performance - - Respond to incidents diff --git a/engineering-team/senior-prompt-engineer/references/agentic_system_design.md b/engineering-team/senior-prompt-engineer/references/agentic_system_design.md index 8c91ba3..bcfe500 100644 --- a/engineering-team/senior-prompt-engineer/references/agentic_system_design.md +++ b/engineering-team/senior-prompt-engineer/references/agentic_system_design.md @@ -1,80 +1,646 @@ # Agentic System Design -## Overview +Agent architectures, tool use patterns, and multi-agent orchestration with pseudocode. -World-class agentic system design for senior prompt engineer. +## Architectures Index -## Core Principles +1. [ReAct Pattern](#1-react-pattern) +2. [Plan-and-Execute](#2-plan-and-execute) +3. [Tool Use / Function Calling](#3-tool-use--function-calling) +4. [Multi-Agent Collaboration](#4-multi-agent-collaboration) +5. [Memory and State Management](#5-memory-and-state-management) +6. [Agent Design Patterns](#6-agent-design-patterns) -### Production-First Design +--- -Always design with production in mind: -- Scalability: Handle 10x current load -- Reliability: 99.9% uptime target -- Maintainability: Clear, documented code -- Observability: Monitor everything +## 1. ReAct Pattern -### Performance by Design +**Reasoning + Acting**: The agent alternates between thinking about what to do and taking actions. 
-Optimize from the start: -- Efficient algorithms -- Resource awareness -- Strategic caching -- Batch processing +### Architecture -### Security & Privacy +``` +┌─────────────────────────────────────────────────────────────┐ +│ ReAct Loop │ +├─────────────────────────────────────────────────────────────┤ +│ │ +│ ┌─────────┐ ┌─────────┐ ┌─────────┐ ┌─────────┐ │ +│ │ Thought │───▶│ Action │───▶│ Tool │───▶│Observat.│ │ +│ └─────────┘ └─────────┘ └─────────┘ └────┬────┘ │ +│ ▲ │ │ +│ └────────────────────────────────────────────┘ │ +│ (loop until done) │ +└─────────────────────────────────────────────────────────────┘ +``` -Build security in: -- Input validation -- Data encryption -- Access control -- Audit logging +### Pseudocode -## Advanced Patterns +```python +def react_agent(query, tools, max_iterations=10): + """ + ReAct agent implementation. -### Pattern 1: Distributed Processing + Args: + query: User question + tools: Dict of available tools {name: function} + max_iterations: Safety limit + """ + context = f"Question: {query}\n" -Enterprise-scale data processing with fault tolerance. + for i in range(max_iterations): + # Generate thought and action + response = llm.generate( + REACT_PROMPT.format( + tools=format_tools(tools), + context=context + ) + ) -### Pattern 2: Real-Time Systems + # Parse response + thought = extract_thought(response) + action = extract_action(response) -Low-latency, high-throughput systems. + context += f"Thought: {thought}\n" -### Pattern 3: ML at Scale + # Check for final answer + if action.name == "finish": + return action.argument -Production ML with monitoring and automation. 
+ # Execute tool + if action.name in tools: + observation = tools[action.name](action.argument) + context += f"Action: {action.name}({action.argument})\n" + context += f"Observation: {observation}\n" + else: + context += f"Error: Unknown tool {action.name}\n" -## Best Practices + return "Max iterations reached" +``` -### Code Quality -- Comprehensive testing -- Clear documentation -- Code reviews -- Type hints +### Prompt Template -### Performance -- Profile before optimizing -- Monitor continuously -- Cache strategically -- Batch operations +``` +You are a helpful assistant that can use tools to answer questions. -### Reliability -- Design for failure -- Implement retries -- Use circuit breakers -- Monitor health +Available tools: +{tools} -## Tools & Technologies +Answer format: +Thought: [your reasoning about what to do next] +Action: [tool_name(argument)] OR finish(final_answer) -Essential tools for this domain: -- Development frameworks -- Testing libraries -- Deployment platforms -- Monitoring solutions +{context} -## Further Reading +Continue: +``` -- Research papers -- Industry blogs -- Conference talks -- Open source projects +### When to Use + +| Scenario | ReAct Fit | +|----------|-----------| +| Simple Q&A with lookup | Good | +| Multi-step research | Good | +| Math calculations | Good | +| Creative writing | Poor | +| Real-time conversation | Poor | + +--- + +## 2. Plan-and-Execute + +**Two-phase approach**: First create a plan, then execute each step. 
+ +### Architecture + +``` +┌──────────────────────────────────────────────────────────────┐ +│ Plan-and-Execute │ +├──────────────────────────────────────────────────────────────┤ +│ │ +│ Phase 1: Planning │ +│ ┌──────────┐ ┌──────────────────────────────────────┐ │ +│ │ Query │───▶│ Generate step-by-step plan │ │ +│ └──────────┘ └──────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌──────────────────────┐ │ +│ │ Plan: [S1, S2, S3] │ │ +│ └──────────┬───────────┘ │ +│ │ │ +│ Phase 2: Execution │ │ +│ ┌──────────▼───────────┐ │ +│ │ Execute Step 1 │ │ +│ └──────────┬───────────┘ │ +│ │ │ +│ ┌──────────▼───────────┐ │ +│ │ Execute Step 2 │──▶ Replan? │ +│ └──────────┬───────────┘ │ +│ │ │ +│ ┌──────────▼───────────┐ │ +│ │ Execute Step 3 │ │ +│ └──────────┬───────────┘ │ +│ │ │ +│ ┌──────────▼───────────┐ │ +│ │ Final Answer │ │ +│ └──────────────────────┘ │ +└──────────────────────────────────────────────────────────────┘ +``` + +### Pseudocode + +```python +def plan_and_execute(query, tools): + """ + Plan-and-Execute agent. + + Separates planning from execution for complex tasks. + """ + # Phase 1: Generate plan + plan = generate_plan(query) + + results = [] + + # Phase 2: Execute each step + for i, step in enumerate(plan.steps): + # Execute step + result = execute_step(step, tools, results) + results.append(result) + + # Optional: Check if replanning needed + if should_replan(step, result, plan): + remaining_steps = plan.steps[i+1:] + new_plan = replan(query, results, remaining_steps) + plan.steps = plan.steps[:i+1] + new_plan.steps + + # Synthesize final answer + return synthesize_answer(query, results) + + +def generate_plan(query): + """Generate execution plan from query.""" + prompt = f""" + Create a step-by-step plan to answer this question: + {query} + + Format each step as: + Step N: [action description] + + Keep the plan concise (3-7 steps). 
+ """ + response = llm.generate(prompt) + return parse_plan(response) + + +def execute_step(step, tools, previous_results): + """Execute a single step using available tools.""" + prompt = f""" + Execute this step: {step.description} + + Previous results: + {format_results(previous_results)} + + Available tools: {format_tools(tools)} + + Provide the result of this step. + """ + return llm.generate(prompt) +``` + +### When to Use + +| Task Complexity | Recommendation | +|-----------------|----------------| +| Simple (1-2 steps) | Use ReAct | +| Medium (3-5 steps) | Plan-and-Execute | +| Complex (6+ steps) | Plan-and-Execute with replanning | +| Highly dynamic | ReAct with adaptive planning | + +--- + +## 3. Tool Use / Function Calling + +**Structured tool invocation**: LLM generates structured calls that are executed externally. + +### Tool Definition Schema + +```json +{ + "name": "search_web", + "description": "Search the web for current information", + "parameters": { + "type": "object", + "properties": { + "query": { + "type": "string", + "description": "Search query" + }, + "num_results": { + "type": "integer", + "default": 5, + "description": "Number of results to return" + } + }, + "required": ["query"] + } +} +``` + +### Implementation Pattern + +```python +class ToolRegistry: + """Registry for agent tools.""" + + def __init__(self): + self.tools = {} + + def register(self, name, func, schema): + """Register a tool with its schema.""" + self.tools[name] = { + "function": func, + "schema": schema + } + + def get_schemas(self): + """Get all tool schemas for LLM.""" + return [t["schema"] for t in self.tools.values()] + + def execute(self, name, arguments): + """Execute a tool by name.""" + if name not in self.tools: + raise ValueError(f"Unknown tool: {name}") + + func = self.tools[name]["function"] + return func(**arguments) + + +def tool_use_agent(query, registry): + """Agent with function calling.""" + messages = [{"role": "user", "content": query}] + + while 
True: + # Call LLM with tools + response = llm.chat( + messages=messages, + tools=registry.get_schemas(), + tool_choice="auto" + ) + + # Check if done + if response.finish_reason == "stop": + return response.content + + # Execute tool calls + if response.tool_calls: + for call in response.tool_calls: + result = registry.execute( + call.function.name, + json.loads(call.function.arguments) + ) + messages.append({ + "role": "tool", + "tool_call_id": call.id, + "content": str(result) + }) +``` + +### Tool Design Best Practices + +| Practice | Example | +|----------|---------| +| Clear descriptions | "Search web for query" not "search" | +| Type hints | Use JSON Schema types | +| Default values | Provide sensible defaults | +| Error handling | Return error messages, not exceptions | +| Idempotency | Same input = same output | + +--- + +## 4. Multi-Agent Collaboration + +### Orchestration Patterns + +**Pattern 1: Sequential Pipeline** +``` +Agent A → Agent B → Agent C → Output + +Use case: Research → Analysis → Writing +``` + +**Pattern 2: Hierarchical** +``` + ┌─────────────┐ + │ Coordinator │ + └──────┬──────┘ + ┌──────────┼──────────┐ + ▼ ▼ ▼ +┌───────┐ ┌───────┐ ┌───────┐ +│Agent A│ │Agent B│ │Agent C│ +└───────┘ └───────┘ └───────┘ + +Use case: Complex task decomposition +``` + +**Pattern 3: Debate/Consensus** +``` +┌───────┐ ┌───────┐ +│Agent A│◄───▶│Agent B│ +└───┬───┘ └───┬───┘ + │ │ + └──────┬──────┘ + ▼ + ┌─────────────┐ + │ Arbiter │ + └─────────────┘ + +Use case: Critical decisions, fact-checking +``` + +### Pseudocode: Hierarchical Multi-Agent + +```python +class CoordinatorAgent: + """Coordinates multiple specialized agents.""" + + def __init__(self, agents): + self.agents = agents # Dict[str, Agent] + + def process(self, query): + # Decompose task + subtasks = self.decompose(query) + + # Assign to agents + results = {} + for subtask in subtasks: + agent_name = self.select_agent(subtask) + result = self.agents[agent_name].execute(subtask) + 
results[subtask.id] = result + + # Synthesize + return self.synthesize(query, results) + + def decompose(self, query): + """Break query into subtasks.""" + prompt = f""" + Break this task into subtasks for specialized agents: + + Task: {query} + + Available agents: + - researcher: Gathers information + - analyst: Analyzes data + - writer: Produces content + + Format: + 1. [agent]: [subtask description] + """ + response = llm.generate(prompt) + return parse_subtasks(response) + + def select_agent(self, subtask): + """Select best agent for subtask.""" + return subtask.assigned_agent + + def synthesize(self, query, results): + """Combine agent results into final answer.""" + prompt = f""" + Combine these results to answer: {query} + + Results: + {format_results(results)} + + Provide a coherent final answer. + """ + return llm.generate(prompt) +``` + +### Communication Protocols + +| Protocol | Description | Use When | +|----------|-------------|----------| +| Direct | Agent calls agent | Simple pipelines | +| Message queue | Async message passing | High throughput | +| Shared state | Shared memory/database | Collaborative editing | +| Broadcast | One-to-many | Status updates | + +--- + +## 5. 
Memory and State Management + +### Memory Types + +``` +┌─────────────────────────────────────────────────────────────┐ +│ Agent Memory System │ +├─────────────────────────────────────────────────────────────┤ +│ │ +│ ┌─────────────────┐ ┌─────────────────┐ │ +│ │ Working Memory │ │ Episodic Memory │ │ +│ │ (Current task) │ │ (Past sessions) │ │ +│ └────────┬────────┘ └────────┬─────────┘ │ +│ │ │ │ +│ └────────┬───────────┘ │ +│ ▼ │ +│ ┌─────────────────────────────────────────┐ │ +│ │ Semantic Memory │ │ +│ │ (Long-term knowledge, embeddings) │ │ +│ └─────────────────────────────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────┘ +``` + +### Implementation + +```python +class AgentMemory: + """Memory system for conversational agents.""" + + def __init__(self, embedding_model, vector_store): + self.embedding_model = embedding_model + self.vector_store = vector_store + self.working_memory = [] # Current conversation + self.buffer_size = 10 # Recent messages to keep + + def add_message(self, role, content): + """Add message to working memory.""" + self.working_memory.append({ + "role": role, + "content": content, + "timestamp": datetime.now() + }) + + # Trim if too long + if len(self.working_memory) > self.buffer_size: + # Summarize old messages before removing + old_messages = self.working_memory[:5] + summary = self.summarize(old_messages) + self.store_long_term(summary) + self.working_memory = self.working_memory[5:] + + def store_long_term(self, content): + """Store in semantic memory (vector store).""" + embedding = self.embedding_model.embed(content) + self.vector_store.add( + embedding=embedding, + metadata={"content": content, "type": "summary"} + ) + + def retrieve_relevant(self, query, k=5): + """Retrieve relevant memories for context.""" + query_embedding = self.embedding_model.embed(query) + results = self.vector_store.search(query_embedding, k=k) + return [r.metadata["content"] for r in results] + + def 
get_context(self, query): + """Build context for LLM from memories.""" + relevant = self.retrieve_relevant(query) + recent = self.working_memory[-self.buffer_size:] + + return { + "relevant_memories": relevant, + "recent_conversation": recent + } + + def summarize(self, messages): + """Summarize messages for long-term storage.""" + content = "\n".join([ + f"{m['role']}: {m['content']}" + for m in messages + ]) + prompt = f"Summarize this conversation:\n{content}" + return llm.generate(prompt) +``` + +### State Persistence Patterns + +| Pattern | Storage | Use Case | +|---------|---------|----------| +| In-memory | Dict/List | Single session | +| Redis | Key-value | Multi-session, fast | +| PostgreSQL | Relational | Complex queries | +| Vector DB | Embeddings | Semantic search | + +--- + +## 6. Agent Design Patterns + +### Pattern: Reflection + +Agent reviews and critiques its own output. + +```python +def reflective_agent(query, tools): + """Agent that reflects on its answers.""" + # Initial response + response = react_agent(query, tools) + + # Reflection + critique = llm.generate(f""" + Review this answer for: + 1. Accuracy - Is the information correct? + 2. Completeness - Does it fully answer the question? + 3. Clarity - Is it easy to understand? + + Question: {query} + Answer: {response} + + Critique: + """) + + # Check if revision needed + if needs_revision(critique): + revised = llm.generate(f""" + Improve this answer based on the critique: + + Original: {response} + Critique: {critique} + + Improved answer: + """) + return revised + + return response +``` + +### Pattern: Self-Ask + +Break complex questions into simpler sub-questions. + +```python +def self_ask_agent(query, tools): + """Agent that asks itself follow-up questions.""" + context = [] + + while True: + prompt = f""" + Question: {query} + + Previous Q&A: + {format_qa(context)} + + Do you need to ask a follow-up question to answer this? 
+ If yes: "Follow-up: [question]" + If no: "Final Answer: [answer]" + """ + + response = llm.generate(prompt) + + if response.startswith("Final Answer:"): + return response.replace("Final Answer:", "").strip() + + # Answer follow-up question + follow_up = response.replace("Follow-up:", "").strip() + answer = simple_qa(follow_up, tools) + context.append({"q": follow_up, "a": answer}) +``` + +### Pattern: Expert Routing + +Route queries to specialized sub-agents. + +```python +class ExpertRouter: + """Routes queries to expert agents.""" + + def __init__(self): + self.experts = { + "code": CodeAgent(), + "math": MathAgent(), + "research": ResearchAgent(), + "general": GeneralAgent() + } + + def route(self, query): + """Determine best expert for query.""" + prompt = f""" + Classify this query into one category: + - code: Programming questions + - math: Mathematical calculations + - research: Fact-finding, current events + - general: Everything else + + Query: {query} + Category: + """ + category = llm.generate(prompt).strip().lower() + return self.experts.get(category, self.experts["general"]) + + def process(self, query): + expert = self.route(query) + return expert.execute(query) +``` + +--- + +## Quick Reference: Pattern Selection + +| Need | Pattern | +|------|---------| +| Simple tool use | ReAct | +| Complex multi-step | Plan-and-Execute | +| API integration | Function Calling | +| Multiple perspectives | Multi-Agent Debate | +| Quality assurance | Reflection | +| Complex reasoning | Self-Ask | +| Domain expertise | Expert Routing | +| Conversation continuity | Memory System | diff --git a/engineering-team/senior-prompt-engineer/references/llm_evaluation_frameworks.md b/engineering-team/senior-prompt-engineer/references/llm_evaluation_frameworks.md index 6d0be7e..e31a34e 100644 --- a/engineering-team/senior-prompt-engineer/references/llm_evaluation_frameworks.md +++ b/engineering-team/senior-prompt-engineer/references/llm_evaluation_frameworks.md @@ -1,80 +1,524 
@@ -# Llm Evaluation Frameworks +# LLM Evaluation Frameworks -## Overview +Concrete metrics, scoring methods, comparison tables, and A/B testing frameworks. -World-class llm evaluation frameworks for senior prompt engineer. +## Frameworks Index -## Core Principles +1. [Evaluation Metrics Overview](#1-evaluation-metrics-overview) +2. [Text Generation Metrics](#2-text-generation-metrics) +3. [RAG-Specific Metrics](#3-rag-specific-metrics) +4. [Human Evaluation Frameworks](#4-human-evaluation-frameworks) +5. [A/B Testing for Prompts](#5-ab-testing-for-prompts) +6. [Benchmark Datasets](#6-benchmark-datasets) +7. [Evaluation Pipeline Design](#7-evaluation-pipeline-design) -### Production-First Design +--- -Always design with production in mind: -- Scalability: Handle 10x current load -- Reliability: 99.9% uptime target -- Maintainability: Clear, documented code -- Observability: Monitor everything +## 1. Evaluation Metrics Overview -### Performance by Design +### Metric Categories -Optimize from the start: -- Efficient algorithms -- Resource awareness -- Strategic caching -- Batch processing +| Category | Metrics | When to Use | +|----------|---------|-------------| +| **Lexical** | BLEU, ROUGE, Exact Match | Reference-based comparison | +| **Semantic** | BERTScore, Embedding similarity | Meaning preservation | +| **Task-specific** | F1, Accuracy, Precision/Recall | Classification, extraction | +| **Quality** | Coherence, Fluency, Relevance | Open-ended generation | +| **Safety** | Toxicity, Bias scores | Content moderation | -### Security & Privacy +### Choosing the Right Metric -Build security in: -- Input validation -- Data encryption -- Access control -- Audit logging +``` +Is there a single correct answer? +├── Yes → Exact Match or F1 +└── No + └── Is there a reference output? + ├── Yes → BLEU, ROUGE, or BERTScore + └── No + └── Can you define quality criteria? 
+ ├── Yes → Human evaluation + LLM-as-judge + └── No → A/B testing with user metrics +``` -## Advanced Patterns +--- -### Pattern 1: Distributed Processing +## 2. Text Generation Metrics -Enterprise-scale data processing with fault tolerance. +### BLEU (Bilingual Evaluation Understudy) -### Pattern 2: Real-Time Systems +**What it measures:** N-gram overlap between generated and reference text. -Low-latency, high-throughput systems. +**Score range:** 0 to 1 (higher is better) -### Pattern 3: ML at Scale +**Calculation:** +``` +BLEU = BP × exp(Σ wn × log(pn)) -Production ML with monitoring and automation. +Where: +- BP = brevity penalty (penalizes short outputs) +- pn = precision of n-grams +- wn = weight (typically 0.25 for BLEU-4) +``` -## Best Practices +**Interpretation:** +| BLEU Score | Quality | +|------------|---------| +| > 0.6 | Excellent | +| 0.4 - 0.6 | Good | +| 0.2 - 0.4 | Acceptable | +| < 0.2 | Poor | -### Code Quality -- Comprehensive testing -- Clear documentation -- Code reviews -- Type hints +**Example:** +``` +Reference: "The quick brown fox jumps over the lazy dog" +Generated: "A fast brown fox leaps over the lazy dog" -### Performance -- Profile before optimizing -- Monitor continuously -- Cache strategically -- Batch operations +1-gram precision: 7/9 = 0.78 (matched: brown, fox, over, the, lazy, dog) +2-gram precision: 4/8 = 0.50 (matched: brown fox, the lazy, lazy dog) +BLEU-4: ~0.35 +``` -### Reliability -- Design for failure -- Implement retries -- Use circuit breakers -- Monitor health +**Limitations:** +- Doesn't capture meaning (synonyms penalized) +- Position-independent +- Requires reference text -## Tools & Technologies +--- -Essential tools for this domain: -- Development frameworks -- Testing libraries -- Deployment platforms -- Monitoring solutions +### ROUGE (Recall-Oriented Understudy for Gisting Evaluation) -## Further Reading +**What it measures:** Overlap focused on recall (coverage of reference). 
-- Research papers -- Industry blogs -- Conference talks -- Open source projects +**Variants:** +| Variant | Measures | +|---------|----------| +| ROUGE-1 | Unigram overlap | +| ROUGE-2 | Bigram overlap | +| ROUGE-L | Longest common subsequence | +| ROUGE-Lsum | LCS with sentence-level computation | + +**Calculation:** +``` +ROUGE-N Recall = (matching n-grams) / (n-grams in reference) +ROUGE-N Precision = (matching n-grams) / (n-grams in generated) +ROUGE-N F1 = 2 × (Precision × Recall) / (Precision + Recall) +``` + +**Example:** +``` +Reference: "The cat sat on the mat" +Generated: "The cat was sitting on the mat" + +ROUGE-1: + Recall: 5/6 = 0.83 (matched: the, cat, on, the, mat) + Precision: 5/7 = 0.71 + F1: 0.77 + +ROUGE-2: + Recall: 2/5 = 0.40 (matched: "the cat", "the mat") + Precision: 2/6 = 0.33 + F1: 0.36 +``` + +**Best for:** Summarization, text compression + +--- + +### BERTScore + +**What it measures:** Semantic similarity using contextual embeddings. + +**How it works:** +1. Generate BERT embeddings for each token +2. Compute cosine similarity between token pairs +3. Apply greedy matching to find best alignment +4. Aggregate into Precision, Recall, F1 + +**Advantages over lexical metrics:** +- Captures synonyms and paraphrases +- Context-aware matching +- Better correlation with human judgment + +**Example:** +``` +Reference: "The movie was excellent" +Generated: "The film was outstanding" + +Lexical (BLEU): Low score (only "The" and "was" match) +BERTScore: High score (semantic meaning preserved) +``` + +**Interpretation:** +| BERTScore F1 | Quality | +|--------------|---------| +| > 0.9 | Excellent | +| 0.8 - 0.9 | Good | +| 0.7 - 0.8 | Acceptable | +| < 0.7 | Review needed | + +--- + +## 3. RAG-Specific Metrics + +### Context Relevance + +**What it measures:** How relevant retrieved documents are to the query. 
+ +**Calculation methods:** + +**Method 1: Embedding similarity** +```python +relevance = cosine_similarity( + embed(query), + embed(context) +) +``` + +**Method 2: LLM-as-judge** +``` +Prompt: "Rate the relevance of this context to the question. +Question: {question} +Context: {context} +Rate from 1-5 where 5 is highly relevant." +``` + +**Target:** > 0.8 for top-k contexts + +--- + +### Answer Faithfulness + +**What it measures:** Whether the answer is supported by the context (no hallucination). + +**Evaluation prompt:** +``` +Given the context and answer, determine if every claim in the +answer is supported by the context. + +Context: {context} +Answer: {answer} + +For each claim in the answer: +1. Identify the claim +2. Find supporting evidence in context (or mark as unsupported) +3. Rate: Supported / Partially Supported / Not Supported + +Overall faithfulness score: [0-1] +``` + +**Scoring:** +``` +Faithfulness = (supported claims) / (total claims) +``` + +**Target:** > 0.95 for production systems + +--- + +### Retrieval Metrics + +| Metric | Formula | What it measures | +|--------|---------|------------------| +| **Precision@k** | (relevant in top-k) / k | Quality of top results | +| **Recall@k** | (relevant in top-k) / (total relevant) | Coverage | +| **MRR** | 1 / (rank of first relevant) | Position of first hit | +| **NDCG@k** | DCG@k / IDCG@k | Ranking quality | + +**Example:** +``` +Query: "What is photosynthesis?" +Retrieved docs (k=5): [R, N, R, N, R] (R=relevant, N=not relevant) +Total relevant in corpus: 10 + +Precision@5 = 3/5 = 0.6 +Recall@5 = 3/10 = 0.3 +MRR = 1/1 = 1.0 (first doc is relevant) +``` + +--- + +## 4. Human Evaluation Frameworks + +### Likert Scale Evaluation + +**Setup:** +``` +Rate the following response on a scale of 1-5: + +Response: {generated_response} + +Criteria: +- Relevance (1-5): Does it address the question? +- Accuracy (1-5): Is the information correct? +- Fluency (1-5): Is it well-written? 
+- Helpfulness (1-5): Would this be useful to the user? +``` + +**Sample size guidance:** +| Confidence Level | Margin of Error | Required Samples | +|-----------------|-----------------|------------------| +| 95% | ±5% | 385 | +| 95% | ±10% | 97 | +| 90% | ±10% | 68 | + +--- + +### Comparative Evaluation (Side-by-Side) + +**Setup:** +``` +Compare these two responses to the question: + +Question: {question} + +Response A: {response_a} +Response B: {response_b} + +Which response is better? +[ ] A is much better +[ ] A is slightly better +[ ] About the same +[ ] B is slightly better +[ ] B is much better + +Why? _______________ +``` + +**Advantages:** +- Easier for humans than absolute scoring +- Reduces calibration issues +- Clear winner for A/B decisions + +**Analysis:** +``` +Win rate = (A wins + 0.5 × ties) / total +Bradley-Terry model for ranking multiple variants +``` + +--- + +### LLM-as-Judge + +**Setup:** +``` +You are an expert evaluator. Rate the quality of this response. + +Question: {question} +Response: {response} +Reference (if available): {reference} + +Evaluate on: +1. Correctness (0-10): Is the information accurate? +2. Completeness (0-10): Does it fully address the question? +3. Clarity (0-10): Is it easy to understand? +4. Conciseness (0-10): Is it appropriately brief? + +Provide scores and brief justification for each. +Overall score (0-10): +``` + +**Calibration techniques:** +- Include reference responses with known scores +- Use chain-of-thought for reasoning +- Compare against human baseline periodically + +**Known biases:** +| Bias | Mitigation | +|------|------------| +| Position bias | Randomize order | +| Length bias | Normalize or specify length | +| Self-preference | Use different model as judge | +| Verbosity preference | Penalize unnecessary length | + +--- + +## 5. 
A/B Testing for Prompts + +### Experiment Design + +**Hypothesis template:** +``` +H0: Prompt A and Prompt B have equal performance on [metric] +H1: Prompt B improves [metric] by at least [minimum detectable effect] +``` + +**Sample size calculation:** +``` +n = 2 × ((z_α + z_β)² × σ²) / δ² + +Where: +- z_α = 1.96 for 95% confidence +- z_β = 0.84 for 80% power +- σ = standard deviation of metric +- δ = minimum detectable effect +``` + +**Quick reference:** +| MDE | Baseline Rate | Required n/variant | +|-----|---------------|-------------------| +| 5% relative | 50% | 3,200 | +| 10% relative | 50% | 800 | +| 20% relative | 50% | 200 | + +--- + +### Metrics to Track + +**Primary metrics:** +| Metric | Measurement | +|--------|-------------| +| Task success rate | % of queries with correct/helpful response | +| User satisfaction | Thumbs up/down or 1-5 rating | +| Engagement | Follow-up questions, session length | + +**Guardrail metrics:** +| Metric | Threshold | +|--------|-----------| +| Error rate | < 1% | +| Latency P95 | < 2s | +| Toxicity rate | < 0.1% | +| Cost per query | Within budget | + +--- + +### Analysis Framework + +**Statistical test selection:** +``` +Is the metric binary (success/failure)? +├── Yes → Chi-squared test or Z-test for proportions +└── No + └── Is the data normally distributed? + ├── Yes → Two-sample t-test + └── No → Mann-Whitney U test +``` + +**Interpreting results:** +``` +p-value < 0.05: Statistically significant +Effect size (Cohen's d): + - Small: 0.2 + - Medium: 0.5 + - Large: 0.8 + +Decision: Ship if p < 0.05 AND effect size meets threshold AND guardrails pass +``` + +--- + +## 6. 
Benchmark Datasets + +### General NLP Benchmarks + +| Benchmark | Task | Size | Metric | +|-----------|------|------|--------| +| **MMLU** | Knowledge QA | 14K | Accuracy | +| **HellaSwag** | Commonsense | 10K | Accuracy | +| **TruthfulQA** | Factuality | 817 | % Truthful | +| **HumanEval** | Code generation | 164 | pass@k | +| **GSM8K** | Math reasoning | 8.5K | Accuracy | + +### RAG Benchmarks + +| Benchmark | Focus | Metrics | +|-----------|-------|---------| +| **Natural Questions** | Wikipedia QA | EM, F1 | +| **HotpotQA** | Multi-hop reasoning | EM, F1 | +| **MS MARCO** | Web search | MRR, Recall | +| **BEIR** | Zero-shot retrieval | NDCG@10 | + +### Creating Custom Benchmarks + +**Template:** +```json +{ + "id": "custom-001", + "input": "What are the symptoms of diabetes?", + "expected_output": "Common symptoms include...", + "metadata": { + "category": "medical", + "difficulty": "easy", + "source": "internal docs" + }, + "evaluation": { + "type": "semantic_similarity", + "threshold": 0.85 + } +} +``` + +**Best practices:** +- Minimum 100 examples per category +- Include edge cases (10-20%) +- Balance difficulty levels +- Version control your benchmark +- Update quarterly + +--- + +## 7. 
Evaluation Pipeline Design + +### Automated Evaluation Pipeline + +``` +┌─────────────┐ ┌─────────────┐ ┌─────────────┐ +│ Prompt │────▶│ LLM API │────▶│ Output │ +│ Version │ │ │ │ Storage │ +└─────────────┘ └─────────────┘ └──────┬──────┘ + │ + ┌──────────────────────────┘ + ▼ +┌─────────────┐ ┌─────────────┐ ┌─────────────┐ +│ Metrics │◀────│ Evaluator │◀────│ Benchmark │ +│ Dashboard │ │ Service │ │ Dataset │ +└─────────────┘ └─────────────┘ └─────────────┘ +``` + +### Implementation Checklist + +``` +□ Define success metrics + □ Primary metric (what you're optimizing) + □ Guardrail metrics (what must not regress) + □ Monitoring metrics (operational health) + +□ Create benchmark dataset + □ Representative samples from production + □ Edge cases and failure modes + □ Golden answers or human labels + +□ Set up evaluation infrastructure + □ Automated scoring pipeline + □ Version control for prompts + □ Results tracking and comparison + +□ Establish baseline + □ Run current prompt against benchmark + □ Document scores for all metrics + □ Set improvement targets + +□ Run experiments + □ Test one change at a time + □ Use statistical significance testing + □ Check all guardrail metrics + +□ Deploy and monitor + □ Gradual rollout (canary) + □ Real-time metric monitoring + □ Rollback plan if regression +``` + +--- + +## Quick Reference: Metric Selection + +| Use Case | Primary Metric | Secondary Metrics | +|----------|---------------|-------------------| +| Summarization | ROUGE-L | BERTScore, Compression ratio | +| Translation | BLEU | chrF, Human pref | +| QA (extractive) | Exact Match, F1 | | +| QA (generative) | BERTScore | Faithfulness, Relevance | +| Code generation | pass@k | Syntax errors | +| Classification | Accuracy, F1 | Precision, Recall | +| RAG | Faithfulness | Context relevance, MRR | +| Open-ended chat | Human eval | Helpfulness, Safety | diff --git a/engineering-team/senior-prompt-engineer/references/prompt_engineering_patterns.md 
b/engineering-team/senior-prompt-engineer/references/prompt_engineering_patterns.md index 15c2430..d95f948 100644 --- a/engineering-team/senior-prompt-engineer/references/prompt_engineering_patterns.md +++ b/engineering-team/senior-prompt-engineer/references/prompt_engineering_patterns.md @@ -1,80 +1,572 @@ # Prompt Engineering Patterns -## Overview +Specific prompt techniques with example inputs and expected outputs. -World-class prompt engineering patterns for senior prompt engineer. +## Patterns Index -## Core Principles +1. [Zero-Shot Prompting](#1-zero-shot-prompting) +2. [Few-Shot Prompting](#2-few-shot-prompting) +3. [Chain-of-Thought (CoT)](#3-chain-of-thought-cot) +4. [Role Prompting](#4-role-prompting) +5. [Structured Output](#5-structured-output) +6. [Self-Consistency](#6-self-consistency) +7. [ReAct (Reasoning + Acting)](#7-react-reasoning--acting) +8. [Tree of Thoughts](#8-tree-of-thoughts) +9. [Retrieval-Augmented Generation](#9-retrieval-augmented-generation) +10. [Meta-Prompting](#10-meta-prompting) -### Production-First Design +--- -Always design with production in mind: -- Scalability: Handle 10x current load -- Reliability: 99.9% uptime target -- Maintainability: Clear, documented code -- Observability: Monitor everything +## 1. Zero-Shot Prompting -### Performance by Design +**When to use:** Simple, well-defined tasks where the model has sufficient training knowledge. -Optimize from the start: -- Efficient algorithms -- Resource awareness -- Strategic caching -- Batch processing +**Pattern:** +``` +[Task instruction] +[Input] +``` -### Security & Privacy +**Example:** -Build security in: -- Input validation -- Data encryption -- Access control -- Audit logging +Input: +``` +Classify the following customer review as positive, negative, or neutral. -## Advanced Patterns +Review: "The shipping was fast but the product quality was disappointing." 
+``` -### Pattern 1: Distributed Processing +Expected Output: +``` +negative +``` -Enterprise-scale data processing with fault tolerance. +**Best practices:** +- Be explicit about output format +- Use clear, unambiguous verbs (classify, extract, summarize) +- Specify constraints (word limits, format requirements) -### Pattern 2: Real-Time Systems +**When to avoid:** +- Tasks requiring specific formatting the model hasn't seen +- Domain-specific tasks requiring specialized knowledge +- Tasks where consistency is critical -Low-latency, high-throughput systems. +--- -### Pattern 3: ML at Scale +## 2. Few-Shot Prompting -Production ML with monitoring and automation. +**When to use:** Tasks requiring consistent formatting or domain-specific patterns. -## Best Practices +**Pattern:** +``` +[Task description] -### Code Quality -- Comprehensive testing -- Clear documentation -- Code reviews -- Type hints +Example 1: +Input: [example input] +Output: [example output] -### Performance -- Profile before optimizing -- Monitor continuously -- Cache strategically -- Batch operations +Example 2: +Input: [example input] +Output: [example output] -### Reliability -- Design for failure -- Implement retries -- Use circuit breakers -- Monitor health +Now process: +Input: [actual input] +Output: +``` -## Tools & Technologies +**Example:** -Essential tools for this domain: -- Development frameworks -- Testing libraries -- Deployment platforms -- Monitoring solutions +Input: +``` +Extract the company name and founding year from the text. -## Further Reading +Example 1: +Input: "Apple Inc. was founded in 1976 by Steve Jobs." +Output: {"company": "Apple Inc.", "year": 1976} -- Research papers -- Industry blogs -- Conference talks -- Open source projects +Example 2: +Input: "Microsoft Corporation started in 1975." +Output: {"company": "Microsoft Corporation", "year": 1975} + +Example 3: +Input: "Founded in 1994, Amazon has grown into a tech giant." 
+Output: {"company": "Amazon", "year": 1994} + +Now process: +Input: "Tesla, Inc. was established in 2003 by Martin Eberhard." +Output: +``` + +Expected Output: +``` +{"company": "Tesla, Inc.", "year": 2003} +``` + +**Example selection guidelines:** +| Example Type | Purpose | Count | +|--------------|---------|-------| +| Simple/typical | Establish basic pattern | 1-2 | +| Edge case | Handle ambiguity | 1 | +| Different format | Show variations | 1 | +| **Total** | | **3-5** | + +**Common mistakes:** +- Too many examples (wastes tokens, may confuse) +- Inconsistent formatting between examples +- Examples too similar (model doesn't learn variation) + +--- + +## 3. Chain-of-Thought (CoT) + +**When to use:** Math problems, logical reasoning, multi-step analysis. + +**Pattern:** +``` +[Problem] + +Think through this step by step: +``` + +**Example:** + +Input: +``` +A store sells apples for $2 each. If you buy 5 or more, you get 20% off. +How much do 7 apples cost? + +Think through this step by step: +``` + +Expected Output: +``` +Step 1: Calculate the regular price for 7 apples +7 apples × $2 = $14 + +Step 2: Check if discount applies +7 apples ≥ 5 apples, so 20% discount applies + +Step 3: Calculate the discount amount +20% of $14 = 0.20 × $14 = $2.80 + +Step 4: Calculate final price +$14 - $2.80 = $11.20 + +The 7 apples cost $11.20 +``` + +**Variations:** +| Variation | Trigger Phrase | Use Case | +|-----------|----------------|----------| +| Basic CoT | "Think step by step" | General reasoning | +| Explicit steps | "Break this into numbered steps" | Complex procedures | +| Show work | "Show your calculations" | Math problems | +| Reasoning chain | "Explain your reasoning" | Justification needed | + +**Zero-shot CoT:** +Simply append "Let's think step by step." to any prompt. + +--- + +## 4. Role Prompting + +**When to use:** Tasks requiring expertise, specific perspective, or consistent persona. + +**Pattern:** +``` +You are a [role/expert description]. 
+ +[Context about the task] + +[Specific instruction] +``` + +**Example:** + +Input: +``` +You are a senior security engineer with 15 years of experience in web application security. + +Review the following code snippet for security vulnerabilities: + +```python +def login(username, password): + query = f"SELECT * FROM users WHERE username='{username}' AND password='{password}'" + result = db.execute(query) + return result +``` + +Identify vulnerabilities and provide specific fixes. +``` + +Expected Output: +``` +Security Vulnerabilities Identified: + +1. SQL Injection (Critical) + - The code directly interpolates user input into SQL query + - Attacker can input: username=' OR '1'='1' -- + - This bypasses authentication entirely + + Fix: + ```python + def login(username, password): + query = "SELECT * FROM users WHERE username=? AND password=?" + result = db.execute(query, (username, password)) + return result + ``` + +2. Plain-text Password Storage (Critical) + - Passwords appear to be stored and compared in plain text + - Fix: Use bcrypt or argon2 for password hashing + +3. Missing Rate Limiting (Medium) + - No protection against brute force attacks + - Fix: Implement login attempt throttling +``` + +**Effective role descriptions:** +| Weak | Strong | +|------|--------| +| "You are an expert" | "You are a senior data scientist with 10 years of experience in NLP" | +| "You are helpful" | "You are a patient teacher explaining concepts to beginners" | +| "You know about X" | "You are a certified AWS solutions architect specializing in serverless" | + +--- + +## 5. Structured Output + +**When to use:** When you need parseable responses (JSON, XML, CSV). + +**Pattern:** +``` +[Task instruction] + +Respond in JSON format with exactly these fields: +- field1 (type): description +- field2 (type): description + +[Input] + +Return ONLY valid JSON, no markdown or explanation. +``` + +**Example:** + +Input: +``` +Extract meeting details from this email. 
+ +Respond in JSON format with exactly these fields: +- date (string, ISO format): Meeting date +- time (string, 24h format): Meeting time +- attendees (array of strings): List of attendees +- topic (string): Meeting topic +- location (string or null): Meeting location if mentioned + +Email: "Hi team, let's meet tomorrow at 2pm to discuss Q4 planning. +Sarah, Mike, and Lisa should attend. We'll use Conference Room B." + +Today's date is 2024-01-15. + +Return ONLY valid JSON, no markdown or explanation. +``` + +Expected Output: +```json +{ + "date": "2024-01-16", + "time": "14:00", + "attendees": ["Sarah", "Mike", "Lisa"], + "topic": "Q4 planning", + "location": "Conference Room B" +} +``` + +**Format enforcement techniques:** +``` +# Strong enforcement +"Return ONLY valid JSON. Start with { and end with }" + +# Schema validation hint +"The output must be valid JSON matching this TypeScript type: +type Output = { name: string; age: number; active: boolean }" + +# Negative instruction +"Do NOT include markdown code blocks. Do NOT add explanations." +``` + +--- + +## 6. Self-Consistency + +**When to use:** Complex reasoning where multiple valid paths exist. + +**Pattern:** +1. Generate multiple reasoning paths (temperature > 0) +2. Extract final answers from each path +3. Select most common answer (majority vote) + +**Example approach:** + +``` +# Run this prompt 5 times with temperature=0.7 + +Solve this logic puzzle. Think through it step by step. + +Three friends (Alice, Bob, Carol) each have a different pet (cat, dog, bird). +- Alice doesn't have the dog +- The person with the bird is not Carol +- Bob's pet has four legs + +What pet does each person have? 
+``` + +**Implementation:** +```python +# Pseudo-code for self-consistency +answers = [] +for i in range(5): + response = llm.generate(prompt, temperature=0.7) + answer = extract_final_answer(response) + answers.append(answer) + +final_answer = majority_vote(answers) +``` + +**When to use:** +- Math word problems +- Logic puzzles +- Any task where you can verify consistency + +--- + +## 7. ReAct (Reasoning + Acting) + +**When to use:** Tasks requiring external tool use or information gathering. + +**Pattern:** +``` +Answer the question using the available tools. + +Tools available: +- search(query): Search the web +- calculate(expression): Evaluate math +- lookup(term): Look up in knowledge base + +Format your response as: +Thought: [reasoning about what to do] +Action: [tool_name(arguments)] +Observation: [result from tool] +... (repeat as needed) +Thought: I now have enough information +Final Answer: [answer] + +Question: [user question] +``` + +**Example:** + +Input: +``` +Question: What is the population of France divided by the population of Belgium? +``` + +Expected Output: +``` +Thought: I need to find the population of France and Belgium, then calculate the ratio. + +Action: search("population of France 2024") +Observation: France has a population of approximately 67.75 million. + +Thought: Now I need Belgium's population. + +Action: search("population of Belgium 2024") +Observation: Belgium has a population of approximately 11.7 million. + +Thought: Now I can calculate the ratio. + +Action: calculate(67.75 / 11.7) +Observation: 5.79 + +Thought: I now have enough information to answer. + +Final Answer: The population of France is approximately 5.79 times the population of Belgium. +``` + +--- + +## 8. Tree of Thoughts + +**When to use:** Complex problems requiring exploration of multiple solution paths. 
+ +**Pattern:** +``` +Problem: [complex problem] + +Generate 3 different approaches to solve this: + +Approach 1: [description] +Evaluation: [pros/cons] + +Approach 2: [description] +Evaluation: [pros/cons] + +Approach 3: [description] +Evaluation: [pros/cons] + +Best approach: [selection with reasoning] +Solution using best approach: [detailed solution] +``` + +**Example:** + +Input: +``` +Problem: Design a system to handle 10,000 concurrent user sessions with sub-100ms response times. + +Generate 3 different architectural approaches: +``` + +Expected Output: +``` +Approach 1: Horizontal scaling with load balancer +- Multiple stateless app servers behind load balancer +- Session state in Redis cluster +Evaluation: Simple, proven pattern. May have Redis as bottleneck. + +Approach 2: Serverless with edge computing +- AWS Lambda@Edge or Cloudflare Workers +- Session state in DynamoDB Global Tables +Evaluation: Auto-scales, but cold starts may impact latency. + +Approach 3: Actor model with sticky sessions +- Akka/Orleans cluster with virtual actors +- Session state co-located with actor +Evaluation: Lowest latency, but complex to operate. + +Best approach: Approach 1 (Horizontal scaling) +Reasoning: Best balance of simplicity, reliability, and performance for the given requirements. + +Solution: [detailed architecture follows] +``` + +--- + +## 9. Retrieval-Augmented Generation + +**When to use:** Tasks requiring factual accuracy or current information. + +**Pattern:** +``` +Use the following context to answer the question. If the answer is not in the context, say "I don't have information about that." + +Context: +[retrieved documents] + +Question: [user question] + +Answer based only on the context provided: +``` + +**Example:** + +Input: +``` +Use the following context to answer the question. + +Context: +Document 1: "The Acme Widget 3000 was released in March 2024. It features +a 12-hour battery life and weighs 250 grams." 
+ +Document 2: "Pricing for the Acme Widget 3000 starts at $299 for the base +model. The Pro version with extended battery is $399." + +Question: How much does the Acme Widget 3000 cost and what is its battery life? + +Answer based only on the context provided: +``` + +Expected Output: +``` +The Acme Widget 3000 starts at $299 for the base model, with a Pro version +available at $399. The standard model has a 12-hour battery life. +``` + +**Chunking strategies:** +| Strategy | Chunk Size | Overlap | Best For | +|----------|------------|---------|----------| +| Fixed | 500 tokens | 50 tokens | General purpose | +| Semantic | Paragraph | None | Well-structured docs | +| Sentence | 3-5 sentences | 1 sentence | Q&A systems | + +--- + +## 10. Meta-Prompting + +**When to use:** Generating or improving prompts programmatically. + +**Pattern:** +``` +You are a prompt engineering expert. + +Task: [description of what the prompt should do] +Target model: [GPT-4/Claude/etc.] +Constraints: [length limits, format requirements] + +Generate an optimized prompt for this task. +``` + +**Example:** + +Input: +``` +You are a prompt engineering expert. + +Task: Create a prompt that extracts action items from meeting notes +Target model: GPT-4 +Constraints: +- Output must be valid JSON +- Each action item needs: task, owner, due_date +- Handle cases where information is missing + +Generate an optimized prompt for this task. +``` + +Expected Output: +``` +Extract action items from the following meeting notes. + +For each action item, identify: +- task: The specific action to be taken +- owner: Person responsible (use "Unassigned" if not mentioned) +- due_date: Deadline in YYYY-MM-DD format (use null if not specified) + +Meeting Notes: +{meeting_notes} + +Respond with a JSON array. 
Example format: +[ + {"task": "Review proposal", "owner": "Sarah", "due_date": "2024-01-20"}, + {"task": "Send update", "owner": "Unassigned", "due_date": null} +] + +Return ONLY the JSON array, no additional text. +``` + +--- + +## Pattern Selection Guide + +| Task Type | Recommended Pattern | +|-----------|---------------------| +| Simple classification | Zero-shot | +| Consistent formatting needed | Few-shot | +| Math/logic problems | Chain-of-Thought | +| Need expertise/perspective | Role Prompting | +| API integration | Structured Output | +| High-stakes decisions | Self-Consistency | +| Tool use required | ReAct | +| Complex problem solving | Tree of Thoughts | +| Factual Q&A | RAG | +| Prompt generation | Meta-Prompting | diff --git a/engineering-team/senior-prompt-engineer/scripts/agent_orchestrator.py b/engineering-team/senior-prompt-engineer/scripts/agent_orchestrator.py index 52052a2..c54596a 100755 --- a/engineering-team/senior-prompt-engineer/scripts/agent_orchestrator.py +++ b/engineering-team/senior-prompt-engineer/scripts/agent_orchestrator.py @@ -1,100 +1,560 @@ #!/usr/bin/env python3 """ -Agent Orchestrator -Production-grade tool for senior prompt engineer +Agent Orchestrator - Tool for designing and validating agent workflows + +Features: +- Parse agent configurations (YAML/JSON) +- Validate tool registrations +- Visualize execution flows (ASCII/Mermaid) +- Estimate token usage per run +- Detect potential issues (loops, missing tools) + +Usage: + python agent_orchestrator.py agent.yaml --validate + python agent_orchestrator.py agent.yaml --visualize + python agent_orchestrator.py agent.yaml --visualize --format mermaid + python agent_orchestrator.py agent.yaml --estimate-cost """ -import os -import sys -import json -import logging import argparse +import json +import re +import sys from pathlib import Path -from typing import Dict, List, Optional -from datetime import datetime +from typing import Dict, List, Optional, Set, Tuple, Any +from 
class AgentPattern(Enum):
    """Agent orchestration patterns this tool knows how to validate and draw."""
    REACT = "react"
    PLAN_EXECUTE = "plan-execute"
    TOOL_USE = "tool-use"
    MULTI_AGENT = "multi-agent"
    CUSTOM = "custom"


@dataclass
class ToolDefinition:
    """One tool an agent may invoke during a run."""
    name: str
    description: str
    # Arbitrary parameter schema; empty dict when the tool takes no arguments.
    parameters: Dict[str, Any] = field(default_factory=dict)
    # Config keys the tool needs at runtime (e.g. api_key); empty = none.
    required_config: List[str] = field(default_factory=list)
    # Rough token footprint of one tool call, used for cost estimation.
    estimated_tokens: int = 100


@dataclass
class AgentConfig:
    """Top-level agent description parsed from a YAML/JSON config file."""
    name: str
    pattern: AgentPattern
    description: str
    tools: List[ToolDefinition]
    max_iterations: int = 10
    system_prompt: str = ""
    temperature: float = 0.7
    model: str = "gpt-4"


@dataclass
class ValidationResult:
    """Outcome of validating an AgentConfig."""
    is_valid: bool
    errors: List[str]
    warnings: List[str]
    # Per-tool status string keyed by tool name ("OK", "WARN: ...").
    tool_status: Dict[str, str]
    estimated_tokens_per_run: Tuple[int, int]  # (min, max)
    potential_infinite_loop: bool
    max_depth: int
content.split('\n') + + for line in lines: + # Skip empty lines and comments + stripped = line.strip() + if not stripped or stripped.startswith('#'): + continue + + # Calculate indent + indent = len(line) - len(line.lstrip()) + + # Check for list item + if stripped.startswith('- '): + item = stripped[2:].strip() + if current_list is not None: + # Check if it's a key-value pair + if ':' in item and not item.startswith('{'): + key, _, value = item.partition(':') + current_list.append({key.strip(): value.strip().strip('"\'')}) + else: + current_list.append(item.strip('"\'')) + continue + + # Check for key-value pair + if ':' in stripped: + key, _, value = stripped.partition(':') + key = key.strip() + value = value.strip().strip('"\'') + + # Pop indent stack as needed + while indent_stack and indent <= indent_stack[-1][0] and len(indent_stack) > 1: + indent_stack.pop() + + current_dict = indent_stack[-1][1] + + if value: + # Simple key-value + current_dict[key] = value + current_list = None + else: + # Start of nested structure or list + # Peek ahead to see if it's a list + next_line_idx = lines.index(line) + 1 + if next_line_idx < len(lines): + next_stripped = lines[next_line_idx].strip() + if next_stripped.startswith('- '): + current_dict[key] = [] + current_list = current_dict[key] + else: + current_dict[key] = {} + indent_stack.append((indent + 2, current_dict[key])) + current_list = None + + return result + + +def load_config(path: Path) -> AgentConfig: + """Load agent configuration from file""" + content = path.read_text(encoding='utf-8') + + # Try JSON first + if path.suffix == '.json': + data = json.loads(content) + else: + # Try YAML try: - self.validate_config() - - # Main processing - result = self._execute() - - self.results['status'] = 'completed' - self.results['end_time'] = datetime.now().isoformat() - - logger.info("Processing completed successfully") - return self.results - - except Exception as e: - self.results['status'] = 'failed' - 
self.results['error'] = str(e) - logger.error(f"Processing failed: {e}") - raise - - def _execute(self) -> Dict: - """Execute main logic""" - # Implementation here - return {'success': True} + data = parse_yaml_simple(content) + except Exception: + # Fallback to JSON if YAML parsing fails + data = json.loads(content) + + # Parse pattern + pattern_str = data.get('pattern', 'react').lower() + try: + pattern = AgentPattern(pattern_str) + except ValueError: + pattern = AgentPattern.CUSTOM + + # Parse tools + tools = [] + for tool_data in data.get('tools', []): + if isinstance(tool_data, dict): + tools.append(ToolDefinition( + name=tool_data.get('name', 'unknown'), + description=tool_data.get('description', ''), + parameters=tool_data.get('parameters', {}), + required_config=tool_data.get('required_config', []), + estimated_tokens=tool_data.get('estimated_tokens', 100) + )) + elif isinstance(tool_data, str): + tools.append(ToolDefinition(name=tool_data, description='')) + + return AgentConfig( + name=data.get('name', 'agent'), + pattern=pattern, + description=data.get('description', ''), + tools=tools, + max_iterations=int(data.get('max_iterations', 10)), + system_prompt=data.get('system_prompt', ''), + temperature=float(data.get('temperature', 0.7)), + model=data.get('model', 'gpt-4') + ) + + +def validate_agent(config: AgentConfig) -> ValidationResult: + """Validate agent configuration""" + errors = [] + warnings = [] + tool_status = {} + + # Validate name + if not config.name: + errors.append("Agent name is required") + + # Validate tools + if not config.tools: + warnings.append("No tools defined - agent will have limited capabilities") + + tool_names = set() + for tool in config.tools: + # Check for duplicates + if tool.name in tool_names: + errors.append(f"Duplicate tool name: {tool.name}") + tool_names.add(tool.name) + + # Check required config + if tool.required_config: + missing = [c for c in tool.required_config if not c.startswith('$')] + if missing: + 
tool_status[tool.name] = f"WARN: Missing config: {missing}" + else: + tool_status[tool.name] = "OK" + else: + tool_status[tool.name] = "OK - No config needed" + + # Check description + if not tool.description: + warnings.append(f"Tool '{tool.name}' has no description") + + # Validate pattern-specific requirements + if config.pattern == AgentPattern.MULTI_AGENT: + if len(config.tools) < 2: + warnings.append("Multi-agent pattern typically requires 2+ specialized tools") + + # Check for potential infinite loops + potential_loop = config.max_iterations > 50 + + # Estimate tokens + base_tokens = len(config.system_prompt.split()) * 1.3 if config.system_prompt else 200 + tool_tokens = sum(t.estimated_tokens for t in config.tools) + + min_tokens = int(base_tokens + tool_tokens) + max_tokens = int((base_tokens + tool_tokens * 2) * config.max_iterations) + + return ValidationResult( + is_valid=len(errors) == 0, + errors=errors, + warnings=warnings, + tool_status=tool_status, + estimated_tokens_per_run=(min_tokens, max_tokens), + potential_infinite_loop=potential_loop, + max_depth=config.max_iterations + ) + + +def generate_ascii_diagram(config: AgentConfig) -> str: + """Generate ASCII workflow diagram""" + lines = [] + + # Header + width = max(40, len(config.name) + 10) + lines.append("┌" + "─" * width + "┐") + lines.append("│" + config.name.center(width) + "│") + lines.append("│" + f"({config.pattern.value} Pattern)".center(width) + "│") + lines.append("└" + "─" * (width // 2 - 1) + "┬" + "─" * (width // 2) + "┘") + lines.append(" " * (width // 2) + "│") + + # User Query + lines.append(" " * (width // 2 - 8) + "┌───────────────┐") + lines.append(" " * (width // 2 - 8) + "│ User Query │") + lines.append(" " * (width // 2 - 8) + "└───────┬───────┘") + lines.append(" " * (width // 2) + "│") + + if config.pattern == AgentPattern.REACT: + # ReAct loop + lines.append(" " * (width // 2 - 8) + "┌───────────────┐") + lines.append(" " * (width // 2 - 8) + "│ Think │◄──────┐") + 
lines.append(" " * (width // 2 - 8) + "└───────┬───────┘ │") + lines.append(" " * (width // 2) + "│ │") + lines.append(" " * (width // 2 - 8) + "┌───────────────┐ │") + lines.append(" " * (width // 2 - 8) + "│ Select Tool │ │") + lines.append(" " * (width // 2 - 8) + "└───────┬───────┘ │") + lines.append(" " * (width // 2) + "│ │") + + # Tools + if config.tools: + tool_line = " ".join([f"[{t.name}]" for t in config.tools[:4]]) + if len(config.tools) > 4: + tool_line += " ..." + lines.append(" " * 4 + tool_line) + lines.append(" " * (width // 2) + "│ │") + + lines.append(" " * (width // 2 - 8) + "┌───────────────┐ │") + lines.append(" " * (width // 2 - 8) + "│ Observe │───────┘") + lines.append(" " * (width // 2 - 8) + "└───────┬───────┘") + + elif config.pattern == AgentPattern.PLAN_EXECUTE: + # Plan phase + lines.append(" " * (width // 2 - 8) + "┌───────────────┐") + lines.append(" " * (width // 2 - 8) + "│ Create Plan │") + lines.append(" " * (width // 2 - 8) + "└───────┬───────┘") + lines.append(" " * (width // 2) + "│") + + # Execute loop + lines.append(" " * (width // 2 - 8) + "┌───────────────┐") + lines.append(" " * (width // 2 - 8) + "│ Execute Step │◄──────┐") + lines.append(" " * (width // 2 - 8) + "└───────┬───────┘ │") + lines.append(" " * (width // 2) + "│ │") + + if config.tools: + tool_line = " ".join([f"[{t.name}]" for t in config.tools[:4]]) + lines.append(" " * 4 + tool_line) + lines.append(" " * (width // 2) + "│ │") + + lines.append(" " * (width // 2 - 8) + "┌───────────────┐ │") + lines.append(" " * (width // 2 - 8) + "│ Check Done? 
│───────┘") + lines.append(" " * (width // 2 - 8) + "└───────┬───────┘") + + else: + # Generic tool use + lines.append(" " * (width // 2 - 8) + "┌───────────────┐") + lines.append(" " * (width // 2 - 8) + "│ Process Query │") + lines.append(" " * (width // 2 - 8) + "└───────┬───────┘") + lines.append(" " * (width // 2) + "│") + + if config.tools: + for tool in config.tools[:6]: + lines.append(" " * (width // 2 - 8) + f"├──▶ [{tool.name}]") + if len(config.tools) > 6: + lines.append(" " * (width // 2 - 8) + "├──▶ [...]") + + # Final answer + lines.append(" " * (width // 2) + "│") + lines.append(" " * (width // 2 - 8) + "┌───────────────┐") + lines.append(" " * (width // 2 - 8) + "│ Final Answer │") + lines.append(" " * (width // 2 - 8) + "└───────────────┘") + + return '\n'.join(lines) + + +def generate_mermaid_diagram(config: AgentConfig) -> str: + """Generate Mermaid flowchart""" + lines = ["```mermaid", "flowchart TD"] + + # Start and query + lines.append(f" subgraph {config.name}[{config.name}]") + lines.append(" direction TB") + lines.append(" A[User Query] --> B{Process}") + + if config.pattern == AgentPattern.REACT: + lines.append(" B --> C[Think]") + lines.append(" C --> D{Select Tool}") + + for i, tool in enumerate(config.tools[:6]): + lines.append(f" D -->|{tool.name}| T{i}[{tool.name}]") + lines.append(f" T{i} --> E[Observe]") + + lines.append(" E -->|Continue| C") + lines.append(" E -->|Done| F[Final Answer]") + + elif config.pattern == AgentPattern.PLAN_EXECUTE: + lines.append(" B --> P[Create Plan]") + lines.append(" P --> X{Execute Step}") + + for i, tool in enumerate(config.tools[:6]): + lines.append(f" X -->|{tool.name}| T{i}[{tool.name}]") + lines.append(f" T{i} --> R[Review]") + + lines.append(" R -->|More Steps| X") + lines.append(" R -->|Complete| F[Final Answer]") + + else: + for i, tool in enumerate(config.tools[:6]): + lines.append(f" B -->|use| T{i}[{tool.name}]") + lines.append(f" T{i} --> F[Final Answer]") + + lines.append(" end") + 
def estimate_cost(config: 'AgentConfig', runs: int = 100) -> Dict[str, Any]:
    """Estimate token usage and USD cost for running this agent.

    Args:
        config: Parsed agent configuration.
        runs: Assumed runs per day; monthly figures use runs * 30.

    Token bounds come from validate_agent(); cost assumes 60% of tokens
    are billed at the input rate and 40% at the output rate.
    """
    bounds = validate_agent(config).estimated_tokens_per_run
    min_tokens, max_tokens = bounds

    # USD per 1K tokens, per model; unknown models fall back to gpt-4 rates.
    costs = {
        'gpt-4': {'input': 0.03, 'output': 0.06},
        'gpt-4-turbo': {'input': 0.01, 'output': 0.03},
        'gpt-3.5-turbo': {'input': 0.0005, 'output': 0.0015},
        'claude-3-opus': {'input': 0.015, 'output': 0.075},
        'claude-3-sonnet': {'input': 0.003, 'output': 0.015},
    }
    model_cost = costs.get(config.model, costs['gpt-4'])

    def per_run(tokens: int) -> float:
        # 60/40 input/output token split.
        return (tokens * 0.6 / 1000 * model_cost['input'] +
                tokens * 0.4 / 1000 * model_cost['output'])

    cost_min = per_run(min_tokens)
    cost_max = per_run(max_tokens)

    return {
        'model': config.model,
        'tokens_per_run': {'min': min_tokens, 'max': max_tokens},
        'cost_per_run': {'min': round(cost_min, 4), 'max': round(cost_max, 4)},
        'estimated_monthly': {
            'runs': runs * 30,
            'cost_min': round(cost_min * runs * 30, 2),
            'cost_max': round(cost_max * runs * 30, 2)
        }
    }
"Unknown") + emoji = "✅" if status.startswith("OK") else "⚠️" + lines.append(f" {emoji} {tool.name} - {status}") + lines.append("") + + lines.append("📊 FLOW ANALYSIS") + lines.append(f" Max iterations: {result.max_depth}") + lines.append(f" Estimated tokens: {result.estimated_tokens_per_run[0]:,} - {result.estimated_tokens_per_run[1]:,}") + lines.append(f" Potential loop: {'⚠️ Yes' if result.potential_infinite_loop else '✅ No'}") + lines.append("") + + if result.errors: + lines.append(f"❌ ERRORS ({len(result.errors)})") + for error in result.errors: + lines.append(f" • {error}") + lines.append("") + + if result.warnings: + lines.append(f"⚠️ WARNINGS ({len(result.warnings)})") + for warning in result.warnings: + lines.append(f" • {warning}") + lines.append("") + + # Overall status + if result.is_valid: + lines.append("✅ VALIDATION PASSED") + else: + lines.append("❌ VALIDATION FAILED") + + lines.append("") + lines.append("=" * 50) + + return '\n'.join(lines) + def main(): - """Main entry point""" parser = argparse.ArgumentParser( - description="Agent Orchestrator" + description="Agent Orchestrator - Design and validate agent workflows", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + %(prog)s agent.yaml --validate + %(prog)s agent.yaml --visualize + %(prog)s agent.yaml --visualize --format mermaid + %(prog)s agent.yaml --estimate-cost --runs 100 + +Agent config format (YAML): + +name: research_assistant +pattern: react +model: gpt-4 +max_iterations: 10 +tools: + - name: web_search + description: Search the web + required_config: [api_key] + - name: calculator + description: Evaluate math expressions + """ ) - parser.add_argument('--input', '-i', required=True, help='Input path') - parser.add_argument('--output', '-o', required=True, help='Output path') - parser.add_argument('--config', '-c', help='Configuration file') - parser.add_argument('--verbose', '-v', action='store_true', help='Verbose output') - + + 
parser.add_argument('config', help='Agent configuration file (YAML or JSON)') + parser.add_argument('--validate', '-V', action='store_true', help='Validate agent configuration') + parser.add_argument('--visualize', '-v', action='store_true', help='Visualize agent workflow') + parser.add_argument('--format', '-f', choices=['ascii', 'mermaid'], default='ascii', + help='Visualization format (default: ascii)') + parser.add_argument('--estimate-cost', '-e', action='store_true', help='Estimate token costs') + parser.add_argument('--runs', '-r', type=int, default=100, help='Daily runs for cost estimation') + parser.add_argument('--output', '-o', help='Output file path') + parser.add_argument('--json', '-j', action='store_true', help='Output as JSON') + args = parser.parse_args() - - if args.verbose: - logging.getLogger().setLevel(logging.DEBUG) - - try: - config = { - 'input': args.input, - 'output': args.output - } - - processor = AgentOrchestrator(config) - results = processor.process() - - print(json.dumps(results, indent=2)) - sys.exit(0) - - except Exception as e: - logger.error(f"Fatal error: {e}") + + # Load config + config_path = Path(args.config) + if not config_path.exists(): + print(f"Error: Config file not found: {args.config}", file=sys.stderr) sys.exit(1) + try: + config = load_config(config_path) + except Exception as e: + print(f"Error parsing config: {e}", file=sys.stderr) + sys.exit(1) + + # Default to validate if no action specified + if not any([args.validate, args.visualize, args.estimate_cost]): + args.validate = True + + output_parts = [] + + # Validate + if args.validate: + result = validate_agent(config) + if args.json: + output_parts.append(json.dumps(asdict(result), indent=2)) + else: + output_parts.append(format_validation_report(config, result)) + + # Visualize + if args.visualize: + if args.format == 'mermaid': + diagram = generate_mermaid_diagram(config) + else: + diagram = generate_ascii_diagram(config) + output_parts.append(diagram) + + # 
Cost estimation + if args.estimate_cost: + costs = estimate_cost(config, args.runs) + if args.json: + output_parts.append(json.dumps(costs, indent=2)) + else: + output_parts.append("") + output_parts.append("💰 COST ESTIMATION") + output_parts.append(f" Model: {costs['model']}") + output_parts.append(f" Tokens per run: {costs['tokens_per_run']['min']:,} - {costs['tokens_per_run']['max']:,}") + output_parts.append(f" Cost per run: ${costs['cost_per_run']['min']:.4f} - ${costs['cost_per_run']['max']:.4f}") + output_parts.append(f" Monthly ({costs['estimated_monthly']['runs']:,} runs):") + output_parts.append(f" Min: ${costs['estimated_monthly']['cost_min']:.2f}") + output_parts.append(f" Max: ${costs['estimated_monthly']['cost_max']:.2f}") + + # Output + output = '\n'.join(output_parts) + print(output) + + if args.output: + Path(args.output).write_text(output) + print(f"\nOutput saved to {args.output}") + + if __name__ == '__main__': main() diff --git a/engineering-team/senior-prompt-engineer/scripts/prompt_optimizer.py b/engineering-team/senior-prompt-engineer/scripts/prompt_optimizer.py index 512e025..700093b 100755 --- a/engineering-team/senior-prompt-engineer/scripts/prompt_optimizer.py +++ b/engineering-team/senior-prompt-engineer/scripts/prompt_optimizer.py @@ -1,100 +1,519 @@ #!/usr/bin/env python3 """ -Prompt Optimizer -Production-grade tool for senior prompt engineer +Prompt Optimizer - Static analysis tool for prompt engineering + +Features: +- Token estimation (GPT-4/Claude approximation) +- Prompt structure analysis +- Clarity scoring +- Few-shot example extraction and management +- Optimization suggestions + +Usage: + python prompt_optimizer.py prompt.txt --analyze + python prompt_optimizer.py prompt.txt --tokens --model gpt-4 + python prompt_optimizer.py prompt.txt --optimize --output optimized.txt + python prompt_optimizer.py prompt.txt --extract-examples --output examples.json """ -import os -import sys -import json -import logging import argparse 
+import json +import re +import sys from pathlib import Path -from typing import Dict, List, Optional -from datetime import datetime +from typing import Dict, List, Optional, Tuple +from dataclasses import dataclass, asdict -logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(levelname)s - %(message)s' -) -logger = logging.getLogger(__name__) -class PromptOptimizer: - """Production-grade prompt optimizer""" - - def __init__(self, config: Dict): - self.config = config - self.results = { - 'status': 'initialized', - 'start_time': datetime.now().isoformat(), - 'processed_items': 0 - } - logger.info(f"Initialized {self.__class__.__name__}") - - def validate_config(self) -> bool: - """Validate configuration""" - logger.info("Validating configuration...") - # Add validation logic - logger.info("Configuration validated") - return True - - def process(self) -> Dict: - """Main processing logic""" - logger.info("Starting processing...") - - try: - self.validate_config() - - # Main processing - result = self._execute() - - self.results['status'] = 'completed' - self.results['end_time'] = datetime.now().isoformat() - - logger.info("Processing completed successfully") - return self.results - - except Exception as e: - self.results['status'] = 'failed' - self.results['error'] = str(e) - logger.error(f"Processing failed: {e}") - raise - - def _execute(self) -> Dict: - """Execute main logic""" - # Implementation here - return {'success': True} +# Token estimation ratios (chars per token approximation) +TOKEN_RATIOS = { + 'gpt-4': 4.0, + 'gpt-3.5': 4.0, + 'claude': 3.5, + 'default': 4.0 +} + +# Cost per 1K tokens (input) +COST_PER_1K = { + 'gpt-4': 0.03, + 'gpt-4-turbo': 0.01, + 'gpt-3.5-turbo': 0.0005, + 'claude-3-opus': 0.015, + 'claude-3-sonnet': 0.003, + 'claude-3-haiku': 0.00025, + 'default': 0.01 +} + + +@dataclass +class PromptAnalysis: + """Results of prompt analysis""" + token_count: int + estimated_cost: float + model: str + clarity_score: int + 
@dataclass
class FewShotExample:
    """One input/output demonstration pair extracted from a prompt."""
    input_text: str
    output_text: str
    index: int  # 1-based position of the example within the prompt


def estimate_tokens(text: str, model: str = 'default') -> int:
    """Approximate the token count of *text* via a chars-per-token ratio.

    Ratios come from the module-level TOKEN_RATIOS table; unknown model
    names fall back to the 'default' ratio.
    """
    ratio = TOKEN_RATIOS.get(model, TOKEN_RATIOS['default'])
    return int(len(text) / ratio)


def estimate_cost(token_count: int, model: str = 'default') -> float:
    """Estimate input-side USD cost for *token_count* tokens.

    Prices come from the module-level COST_PER_1K table (per 1K tokens);
    unknown models fall back to 'default'. Rounded to 6 decimal places.
    """
    cost_per_1k = COST_PER_1K.get(model, COST_PER_1K['default'])
    return round((token_count / 1000) * cost_per_1k, 6)


def find_ambiguous_instructions(text: str) -> List[Dict[str, str]]:
    """Flag vague wording that tends to produce inconsistent LLM outputs.

    Returns one issue dict per match with keys: type ('ambiguity'),
    line (1-based int, despite the Dict[str, str] hint), text (the
    offending words), message (how to fix it), and context (first 80
    chars of the stripped line).
    """
    issues: List[Dict[str, str]] = []

    # (compiled pattern, reviewer guidance). Patterns are compiled once
    # here instead of being re-matched as raw strings for every line of
    # the prompt, which avoids repeated cache lookups in the hot loop.
    vague_patterns = [
        (re.compile(r'\b(analyze|process|handle|deal with)\b', re.IGNORECASE),
         'Vague verb - specify the exact action'),
        (re.compile(r'\b(good|nice|appropriate|suitable)\b', re.IGNORECASE),
         'Subjective term - define specific criteria'),
        (re.compile(r'\b(etc\.|and so on|and more)\b', re.IGNORECASE),
         'Open-ended list - enumerate all items explicitly'),
        (re.compile(r'\b(if needed|as necessary|when appropriate)\b', re.IGNORECASE),
         'Conditional without criteria - specify when'),
        (re.compile(r'\b(some|several|many|few|various)\b', re.IGNORECASE),
         'Vague quantity - use specific numbers'),
    ]

    for i, line in enumerate(text.split('\n'), 1):
        for pattern, message in vague_patterns:
            for match in pattern.finditer(line):
                issues.append({
                    'type': 'ambiguity',
                    'line': i,
                    'text': match.group(),
                    'message': message,
                    'context': line.strip()[:80]
                })

    return issues
def check_output_format(text: str) -> Tuple[bool, List[str]]:
    """Detect whether the prompt pins down an output format.

    Returns (has_format, suggestions); suggestions holds a single fix-it
    hint when no format specification is detected, otherwise it is empty.
    """
    format_indicators = [
        r'respond\s+(in|with)\s+(json|xml|csv|markdown)',
        r'output\s+format',
        r'return\s+(only|just)',
        r'format:\s*\n',
        r'\{["\']?\w+["\']?\s*:',  # JSON-like structure
        r'```\w*\n',  # Code block
    ]

    # Stop at the first indicator that matches.
    has_format = False
    for indicator in format_indicators:
        if re.search(indicator, text, re.IGNORECASE):
            has_format = True
            break

    advice: List[str] = []
    if not has_format:
        advice.append('Add explicit output format specification (e.g., "Respond in JSON with keys: ...")')

    return has_format, advice
def extract_few_shot_examples(text: str) -> List['FewShotExample']:
    """Pull few-shot demonstrations out of a prompt.

    Labelled "Example N:" blocks are tried first; if none match, bare
    Input:/Output: pairs are used as a fallback. Indices are 1-based.
    """
    found: List['FewShotExample'] = []

    # Pattern 1: "Example N:" or "Example:" blocks
    labelled = r'Example\s*\d*:\s*\n(Input:\s*(.+?)\n(?:Output:\s*(.+?)(?=\n\nExample|\n\n[A-Z]|\Z)))'
    for idx, hit in enumerate(re.finditer(labelled, text, re.DOTALL | re.IGNORECASE), 1):
        found.append(FewShotExample(
            input_text=hit.group(2).strip() if hit.group(2) else '',
            output_text=hit.group(3).strip() if hit.group(3) else '',
            index=idx
        ))

    if found:
        return found

    # Pattern 2: Input/Output pairs without an "Example" label
    bare = r'Input:\s*["\']?(.+?)["\']?\s*\nOutput:\s*(.+?)(?=\nInput:|\Z)'
    for idx, hit in enumerate(re.finditer(bare, text, re.DOTALL), 1):
        found.append(FewShotExample(
            input_text=hit.group(1).strip(),
            output_text=hit.group(2).strip(),
            index=idx
        ))

    return found


def calculate_clarity_score(text: str, issues: List[Dict]) -> int:
    """Score prompt clarity 0-100.

    Deducts 5 per ambiguity issue and 3 per redundancy issue, then 10 if
    the prompt has no visible section structure and 5 if it contains no
    clear directive. Clamped to [0, 100].
    """
    ambiguity_count = sum(1 for issue in issues if issue['type'] == 'ambiguity')
    redundancy_count = sum(1 for issue in issues if issue['type'] == 'redundancy')
    total = 100 - 5 * ambiguity_count - 3 * redundancy_count

    # No markdown headers or "Label:" lines -> unstructured prompt.
    if not re.search(r'^#+\s|^[A-Z][a-z]+:', text, re.MULTILINE):
        total -= 10

    # No explicit directive ("you should", "please", "your task", ...).
    if not re.search(r'(you (should|must|will)|please|your task)', text, re.IGNORECASE):
        total -= 5

    return max(0, min(100, total))
def generate_suggestions(analysis: 'PromptAnalysis', text: str = '') -> List[str]:
    """Generate actionable optimization suggestions from an analysis.

    Args:
        analysis: Metrics produced by analyze_prompt().
        text: Optional raw prompt text. This fixes a latent bug: entries
            in analysis.sections carry only 'name'/'lines' keys (never
            'content'), so the role-prompt check previously always ran
            against '' and the "add role context" tip fired
            unconditionally. Passing the prompt text makes the check
            real; omitting it preserves the old behavior for existing
            callers.

    Returns:
        Human-readable suggestion strings (possibly empty).
    """
    suggestions: List[str] = []

    if not analysis.has_output_format:
        suggestions.append('Add explicit output format: "Respond in JSON with keys: ..."')

    # Few-shot guidance: 2-5 examples is the sweet spot.
    if analysis.example_count == 0:
        suggestions.append('Consider adding 2-3 few-shot examples for consistent outputs')
    elif analysis.example_count == 1:
        suggestions.append('Add 1-2 more examples to improve consistency')
    elif analysis.example_count > 5:
        suggestions.append(f'Consider reducing examples from {analysis.example_count} to 3-5 to save tokens')

    if analysis.clarity_score < 70:
        suggestions.append('Improve clarity: replace vague terms with specific instructions')

    if analysis.token_count > 2000:
        suggestions.append(f'Prompt is {analysis.token_count} tokens - consider condensing for cost efficiency')

    # Role-prompting check: prefer the raw text; fall back to the legacy
    # (always-empty) section-content lookup for backward compatibility.
    role_source = text or (analysis.sections[0].get('content', [''])[0] if analysis.sections else '')
    if not re.search(r'you are|act as|as a\s+\w+', role_source, re.IGNORECASE):
        suggestions.append('Consider adding role context: "You are an expert..."')

    return suggestions
def optimize_prompt(text: str) -> str:
    """Return a lightly compressed copy of the prompt.

    Collapses runs of 3+ newlines to a blank line, squeezes repeated
    spaces to one, strips trailing whitespace per line, and trims the
    ends. Wording is never changed.
    """
    collapsed = re.sub(r' {2,}', ' ', re.sub(r'\n{3,}', '\n\n', text))
    trimmed = '\n'.join(part.rstrip() for part in collapsed.split('\n'))
    return trimmed.strip()
+ report.append("") + + report.append("📋 STRUCTURE") + report.append(f" Sections: {len(analysis.sections)}") + report.append(f" Examples: {analysis.example_count} {'✅' if analysis.has_examples else '❌'}") + report.append(f" Output format: {'✅ Specified' if analysis.has_output_format else '❌ Missing'}") + report.append("") + + if analysis.sections: + report.append(" Detected sections:") + for section in analysis.sections: + report.append(f" - {section['name']} (lines {section['lines']})") + report.append("") + + if analysis.issues: + report.append(f"⚠️ ISSUES FOUND ({len(analysis.issues)})") + for issue in analysis.issues[:10]: # Limit to first 10 + report.append(f" Line {issue['line']}: {issue['message']}") + report.append(f" Found: \"{issue['text']}\"") + if len(analysis.issues) > 10: + report.append(f" ... and {len(analysis.issues) - 10} more issues") + report.append("") + + if analysis.suggestions: + report.append("💡 SUGGESTIONS") + for i, suggestion in enumerate(analysis.suggestions, 1): + report.append(f" {i}. 
def main():
    """CLI entry point: analyze, optimize, or extract examples from a prompt file.

    Modes (mutually exclusive, first match wins): --tokens, --extract-examples,
    --optimize; with none of these, a full analysis report is produced.
    Exits 1 when the prompt file is missing.
    """
    parser = argparse.ArgumentParser(
        description="Prompt Optimizer - Analyze and optimize prompts",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  %(prog)s prompt.txt --analyze
  %(prog)s prompt.txt --tokens --model claude-3-sonnet
  %(prog)s prompt.txt --optimize --output optimized.txt
  %(prog)s prompt.txt --extract-examples --output examples.json
"""
    )

    parser.add_argument('prompt', help='Prompt file to analyze')
    parser.add_argument('--analyze', '-a', action='store_true', help='Run full analysis')
    parser.add_argument('--tokens', '-t', action='store_true', help='Count tokens only')
    parser.add_argument('--optimize', '-O', action='store_true', help='Generate optimized version')
    parser.add_argument('--extract-examples', '-e', action='store_true', help='Extract few-shot examples')
    parser.add_argument('--model', '-m', default='gpt-4',
                        choices=['gpt-4', 'gpt-4-turbo', 'gpt-3.5-turbo',
                                 'claude-3-opus', 'claude-3-sonnet', 'claude-3-haiku'],
                        help='Model for token/cost estimation')
    parser.add_argument('--output', '-o', help='Output file path')
    parser.add_argument('--json', '-j', action='store_true', help='Output as JSON')
    parser.add_argument('--compare', '-c', help='Compare with baseline analysis JSON')

    args = parser.parse_args()

    # Read prompt file; fail fast with a clear message.
    prompt_path = Path(args.prompt)
    if not prompt_path.exists():
        print(f"Error: File not found: {args.prompt}", file=sys.stderr)
        sys.exit(1)

    text = prompt_path.read_text(encoding='utf-8')

    # --tokens: token/cost estimate only.
    if args.tokens:
        token_count = estimate_tokens(text, args.model)
        cost = estimate_cost(token_count, args.model)
        if args.json:
            print(json.dumps({
                'tokens': token_count,
                'cost': cost,
                'model': args.model
            }, indent=2))
        else:
            print(f"Tokens: {token_count:,}")
            print(f"Estimated cost: ${cost:.4f} ({args.model})")
        sys.exit(0)

    # --extract-examples: dump detected few-shot examples as JSON.
    if args.extract_examples:
        examples = extract_few_shot_examples(text)
        output = [asdict(ex) for ex in examples]

        if args.output:
            Path(args.output).write_text(json.dumps(output, indent=2))
            print(f"Extracted {len(examples)} examples to {args.output}")
        else:
            print(json.dumps(output, indent=2))
        sys.exit(0)

    # --optimize: emit a whitespace-compacted prompt.
    if args.optimize:
        optimized = optimize_prompt(text)

        if args.output:
            Path(args.output).write_text(optimized)
            print(f"Optimized prompt written to {args.output}")

            # Show before/after token comparison.
            orig_tokens = estimate_tokens(text, args.model)
            new_tokens = estimate_tokens(optimized, args.model)
            saved = orig_tokens - new_tokens
            print(f"Tokens: {orig_tokens:,} -> {new_tokens:,} (saved {saved:,})")
        else:
            print(optimized)
        sys.exit(0)

    # Default: full analysis.
    analysis = analyze_prompt(text, args.model)

    # Compare with baseline, if requested.
    if args.compare:
        baseline_path = Path(args.compare)
        if baseline_path.exists():
            baseline = json.loads(baseline_path.read_text())
            print("\n📊 COMPARISON WITH BASELINE")
            print(f"  Tokens: {baseline.get('token_count', 0):,} -> {analysis.token_count:,}")
            print(f"  Clarity: {baseline.get('clarity_score', 0)} -> {analysis.clarity_score}")
            print(f"  Issues: {len(baseline.get('issues', []))} -> {len(analysis.issues)}")
            print()
        else:
            # FIX: a missing baseline file used to be silently ignored,
            # making typos in --compare indistinguishable from "no baseline".
            print(f"Warning: baseline file not found: {args.compare}", file=sys.stderr)

    if args.json:
        print(json.dumps(asdict(analysis), indent=2))
    else:
        print(format_report(analysis))

    # Persist the full analysis when an output path is given.
    if args.output:
        output_data = asdict(analysis)
        Path(args.output).write_text(json.dumps(output_data, indent=2))
        print(f"\nAnalysis saved to {args.output}")


if __name__ == '__main__':
    main()
@dataclass
class RetrievalMetrics:
    """Retrieval quality metrics for one ranked result list."""
    precision_at_k: float
    recall_at_k: float
    mrr: float  # Mean Reciprocal Rank
    ndcg_at_k: float
    k: int


@dataclass
class ContextEvaluation:
    """Evaluation of a single retrieved context against a question."""
    context_id: str
    relevance_score: float
    token_overlap: float
    key_terms_covered: List[str]
    missing_terms: List[str]


@dataclass
class AnswerEvaluation:
    """Evaluation of an answer against its retrieved contexts."""
    question_id: str
    faithfulness_score: float
    groundedness_score: float
    # FIX: was List[Dict[str, any]] — lowercase `any` is the builtin function,
    # not a type. `object` is the correct "anything" value type here.
    claims: List[Dict[str, object]]
    unsupported_claims: List[str]
    context_used: List[str]


@dataclass
class RAGEvaluationReport:
    """Complete RAG evaluation report aggregated over all questions."""
    total_questions: int
    avg_context_relevance: float
    avg_faithfulness: float
    avg_groundedness: float
    retrieval_metrics: Dict[str, float]
    coverage: float
    issues: List[Dict[str, str]]
    recommendations: List[str]
    # FIX: was List[Dict[str, any]] (builtin function used as a type).
    question_details: List[Dict[str, object]] = field(default_factory=list)


# Hoisted to module level so the set is built once, not on every tokenize()
# call. Single-character and two-character stopwords are kept for clarity even
# though the len(t) > 2 filter below would drop them anyway.
_STOPWORDS = frozenset({
    'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been',
    'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
    'would', 'could', 'should', 'may', 'might', 'must', 'shall',
    'can', 'to', 'of', 'in', 'for', 'on', 'with', 'at', 'by',
    'from', 'as', 'into', 'through', 'during', 'before', 'after',
    'above', 'below', 'up', 'down', 'out', 'off', 'over', 'under',
    'again', 'further', 'then', 'once', 'here', 'there', 'when',
    'where', 'why', 'how', 'all', 'each', 'few', 'more', 'most',
    'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own',
    'same', 'so', 'than', 'too', 'very', 'just', 'and', 'but',
    'if', 'or', 'because', 'until', 'while', 'it', 'this', 'that',
    'these', 'those', 'i', 'you', 'he', 'she', 'we', 'they',
})


def tokenize(text: str) -> List[str]:
    """Simple tokenization for text comparison.

    Lowercases, splits on word boundaries, and drops stopwords and tokens
    of three characters or fewer.
    """
    tokens = re.findall(r'\b\w+\b', text.lower())
    return [t for t in tokens if t not in _STOPWORDS and len(t) > 2]


def extract_key_terms(text: str, top_n: int = 10) -> List[str]:
    """Extract the *top_n* most frequent content terms from *text*."""
    freq = Counter(tokenize(text))
    return [term for term, _ in freq.most_common(top_n)]


def calculate_token_overlap(text1: str, text2: str) -> float:
    """Calculate Jaccard similarity between the token sets of two texts.

    Returns 0.0 when either text yields no content tokens.
    """
    tokens1 = set(tokenize(text1))
    tokens2 = set(tokenize(text2))

    if not tokens1 or not tokens2:
        return 0.0

    return len(tokens1 & tokens2) / len(tokens1 | tokens2)


def calculate_rouge_l(reference: str, candidate: str) -> float:
    """Calculate a ROUGE-L style F1 score (Longest Common Subsequence).

    Precision is LCS/len(candidate) and recall is LCS/len(reference), both
    over content tokens; returns their harmonic mean, or 0.0 for empty input.
    """
    ref_tokens = tokenize(reference)
    cand_tokens = tokenize(candidate)

    if not ref_tokens or not cand_tokens:
        return 0.0

    # LCS length via the standard dynamic program.
    m, n = len(ref_tokens), len(cand_tokens)
    dp = [[0] * (n + 1) for _ in range(m + 1)]

    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if ref_tokens[i - 1] == cand_tokens[j - 1]:
                dp[i][j] = dp[i - 1][j - 1] + 1
            else:
                dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])

    lcs_length = dp[m][n]

    precision = lcs_length / n if n > 0 else 0
    recall = lcs_length / m if m > 0 else 0

    if precision + recall == 0:
        return 0.0

    return 2 * precision * recall / (precision + recall)
def extract_claims(answer: str) -> List[str]:
    """Split *answer* into sentence-level claims, dropping short fragments.

    Sentences are delimited by runs of '.', '!' or '?'; anything that strips
    down to ten characters or fewer is treated as noise and discarded.
    """
    return [t for s in re.split(r'[.!?]+', answer) if len(t := s.strip()) > 10]


def check_claim_support(claim: str, context: str) -> Tuple[bool, float]:
    """Score how well *context* supports *claim*.

    Returns (supported, score) where score blends term-overlap ratio with an
    LCS-based ROUGE-L match, and supported means score > 0.3. An empty claim
    is treated as trivially supported.
    """
    claim_terms = set(tokenize(claim))
    if not claim_terms:
        return True, 1.0

    overlap_ratio = len(claim_terms & set(tokenize(context))) / len(claim_terms)
    score = 0.5 * overlap_ratio + 0.5 * calculate_rouge_l(context, claim)
    return score > 0.3, score


def evaluate_answer_faithfulness(
    question: str,
    answer: str,
    contexts: List[str],
    question_id: str = ""
) -> AnswerEvaluation:
    """Evaluate whether *answer* is faithful to the provided *contexts*.

    Each claim is scored against the pooled contexts (faithfulness = share of
    supported claims; groundedness = mean support score) and against each
    individual context to record which contexts were actually used.
    """
    pooled = ' '.join(contexts)

    evaluations = []
    unsupported = []
    used_contexts = []
    supported_count = 0

    for claim in extract_claims(answer):
        supported, score = check_claim_support(claim, pooled)
        shortened = claim[:100] + '...' if len(claim) > 100 else claim
        entry = {
            'claim': shortened,
            'supported': supported,
            'score': round(score, 3)
        }

        # Record which individual contexts back this claim.
        for idx, ctx in enumerate(contexts):
            _, per_ctx_score = check_claim_support(claim, ctx)
            if per_ctx_score > 0.3:
                label = f'context_{idx}'
                entry[label] = round(per_ctx_score, 3)
                if label not in used_contexts:
                    used_contexts.append(label)

        evaluations.append(entry)
        if supported:
            supported_count += 1
        else:
            unsupported.append(claim[:100])

    n_claims = len(evaluations)
    # An answer with no extractable claims is vacuously faithful.
    faithfulness = supported_count / n_claims if n_claims else 1.0
    groundedness = sum(e['score'] for e in evaluations) / n_claims if n_claims else 1.0

    return AnswerEvaluation(
        question_id=question_id,
        faithfulness_score=round(faithfulness, 3),
        groundedness_score=round(groundedness, 3),
        claims=evaluations,
        unsupported_claims=unsupported,
        context_used=used_contexts
    )


def calculate_retrieval_metrics(
    retrieved: List[str],
    relevant: Set[str],
    k: int = 5
) -> RetrievalMetrics:
    """Compute Precision@K, Recall@K, MRR and NDCG@K for one ranked list."""
    top_k = retrieved[:k]
    hits = sum(doc in relevant for doc in top_k)

    precision = hits / k if k > 0 else 0
    recall = hits / len(relevant) if relevant else 0

    # Reciprocal rank of the first relevant document anywhere in the ranking.
    reciprocal = 0.0
    for rank, doc in enumerate(retrieved, start=1):
        if doc in relevant:
            reciprocal = 1.0 / rank
            break

    # Binary-relevance DCG; the ideal DCG assumes all relevant docs on top.
    dcg = sum(1 / math.log2(pos + 2) for pos, doc in enumerate(top_k) if doc in relevant)
    idcg = sum(1 / math.log2(pos + 2) for pos in range(min(len(relevant), k)))
    ndcg = dcg / idcg if idcg > 0 else 0

    return RetrievalMetrics(
        precision_at_k=round(precision, 3),
        recall_at_k=round(recall, 3),
        mrr=round(reciprocal, 3),
        ndcg_at_k=round(ndcg, 3),
        k=k
    )
def evaluate_rag_system(
    questions: List[Dict],
    contexts: List[Dict],
    k: int = 5,
    verbose: bool = False
) -> RAGEvaluationReport:
    """Comprehensive RAG system evaluation.

    Scores per-question context relevance and answer faithfulness /
    groundedness, estimates coverage and retrieval precision, and attaches
    actionable recommendations.

    Args:
        questions: Items with 'question'/'query', optional 'answer'/'response'
            and optional 'id'.
        contexts: Items with 'content'/'text', optionally linked to a question
            via 'question_id'/'query_id'.
        k: Number of top contexts evaluated per question.
        verbose: Include per-question detail records in the report.
    """
    all_context_scores = []
    all_faithfulness_scores = []
    all_groundedness_scores = []
    issues = []
    question_details = []

    questions_with_context = 0

    # FIX: was questions.index(q_data) — O(n) per question and wrong when two
    # question dicts compare equal. enumerate gives the correct position.
    for q_index, q_data in enumerate(questions):
        question = q_data.get('question', q_data.get('query', ''))
        question_id = q_data.get('id', str(q_index))
        answer = q_data.get('answer', q_data.get('response', ''))

        # Collect the contexts retrieved for this question.
        q_contexts = [
            ctx.get('content', ctx.get('text', ''))
            for ctx in contexts
            if ctx.get('question_id') == question_id or ctx.get('query_id') == question_id
        ]

        # If no specific contexts, use the first k (for simple, unlinked datasets).
        if not q_contexts:
            q_contexts = [ctx.get('content', ctx.get('text', ''))
                          for ctx in contexts[:k]]

        if q_contexts:
            questions_with_context += 1

        # Evaluate context relevance.
        context_evals = []
        for i, ctx in enumerate(q_contexts[:k]):
            eval_result = evaluate_context_relevance(question, ctx, f"ctx_{i}")
            context_evals.append(eval_result)
            all_context_scores.append(eval_result.relevance_score)

        # Evaluate answer faithfulness (only when there is an answer to judge).
        # FIX: track this question's own score; the verbose block below used to
        # read all_faithfulness_scores[-1], leaking a previous question's score
        # whenever the current question had no answer.
        question_faithfulness = None
        if answer and q_contexts:
            answer_eval = evaluate_answer_faithfulness(question, answer, q_contexts, question_id)
            question_faithfulness = answer_eval.faithfulness_score
            all_faithfulness_scores.append(answer_eval.faithfulness_score)
            all_groundedness_scores.append(answer_eval.groundedness_score)

            # Track unsupported-claim issues.
            if answer_eval.unsupported_claims:
                issues.append({
                    'type': 'unsupported_claim',
                    'question_id': question_id,
                    'claims': answer_eval.unsupported_claims[:3]
                })

        # Check for low-relevance contexts.
        low_relevance = [e for e in context_evals if e.relevance_score < 0.5]
        if low_relevance:
            issues.append({
                'type': 'low_relevance',
                'question_id': question_id,
                'contexts': [e.context_id for e in low_relevance]
            })

        if verbose:
            question_details.append({
                'question_id': question_id,
                'question': question[:100],
                'context_scores': [asdict(e) for e in context_evals],
                'answer_faithfulness': question_faithfulness
            })

    # Aggregate across all questions.
    avg_context_relevance = sum(all_context_scores) / len(all_context_scores) if all_context_scores else 0
    avg_faithfulness = sum(all_faithfulness_scores) / len(all_faithfulness_scores) if all_faithfulness_scores else 0
    avg_groundedness = sum(all_groundedness_scores) / len(all_groundedness_scores) if all_groundedness_scores else 0
    coverage = questions_with_context / len(questions) if questions else 0

    # Approximate retrieval precision: share of evaluated contexts whose
    # relevance cleared 0.5 (no gold relevance labels are available here).
    high_relevance = sum(1 for s in all_context_scores if s > 0.5)
    retrieval_metrics = {
        'precision_at_k': round(high_relevance / len(all_context_scores) if all_context_scores else 0, 3),
        'estimated_recall': round(coverage, 3),
        'k': k
    }

    report = RAGEvaluationReport(
        total_questions=len(questions),
        avg_context_relevance=round(avg_context_relevance, 3),
        avg_faithfulness=round(avg_faithfulness, 3),
        avg_groundedness=round(avg_groundedness, 3),
        retrieval_metrics=retrieval_metrics,
        coverage=round(coverage, 3),
        issues=issues[:20],  # cap the issue list at 20 entries
        recommendations=[],
        question_details=question_details if verbose else []
    )

    report.recommendations = generate_recommendations(report)

    return report
def main():
    """CLI entry point: load question/context files and print an evaluation report.

    Exits 1 when an input file is missing or contains invalid JSON.
    """
    parser = argparse.ArgumentParser(
        description="RAG Evaluator - Evaluate Retrieval-Augmented Generation systems",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  %(prog)s --contexts contexts.json --questions questions.json
  %(prog)s --contexts ctx.json --questions q.json --k 10
  %(prog)s --contexts ctx.json --questions q.json --output report.json --verbose

Input file formats:

questions.json:
[
  {"id": "q1", "question": "What is X?", "answer": "X is..."},
  {"id": "q2", "question": "How does Y work?", "answer": "Y works by..."}
]

contexts.json:
[
  {"question_id": "q1", "content": "Retrieved context text..."},
  {"question_id": "q2", "content": "Another context..."}
]
"""
    )

    parser.add_argument('--contexts', '-c', required=True, help='JSON file with retrieved contexts')
    parser.add_argument('--questions', '-q', required=True, help='JSON file with questions and answers')
    parser.add_argument('--k', type=int, default=5, help='Number of top contexts to evaluate (default: 5)')
    parser.add_argument('--output', '-o', help='Output file for detailed report (JSON)')
    parser.add_argument('--json', '-j', action='store_true', help='Output as JSON instead of text')
    parser.add_argument('--verbose', '-v', action='store_true', help='Include per-question details')
    parser.add_argument('--compare', help='Compare with baseline report JSON')

    args = parser.parse_args()

    # Validate input paths up front so the user gets one clear error.
    contexts_path = Path(args.contexts)
    questions_path = Path(args.questions)

    if not contexts_path.exists():
        print(f"Error: Contexts file not found: {args.contexts}", file=sys.stderr)
        sys.exit(1)
    if not questions_path.exists():
        print(f"Error: Questions file not found: {args.questions}", file=sys.stderr)
        sys.exit(1)

    try:
        contexts = json.loads(contexts_path.read_text(encoding='utf-8'))
        questions = json.loads(questions_path.read_text(encoding='utf-8'))
    except json.JSONDecodeError as e:
        print(f"Error: Invalid JSON format: {e}", file=sys.stderr)
        sys.exit(1)

    # Run evaluation.
    report = evaluate_rag_system(questions, contexts, k=args.k, verbose=args.verbose)

    # Compare with baseline, if requested.
    if args.compare:
        baseline_path = Path(args.compare)
        if baseline_path.exists():
            baseline = json.loads(baseline_path.read_text())
            print("\n📊 COMPARISON WITH BASELINE")
            print(f"  Relevance: {baseline.get('avg_context_relevance', 0):.2f} -> {report.avg_context_relevance:.2f}")
            print(f"  Faithfulness: {baseline.get('avg_faithfulness', 0):.2f} -> {report.avg_faithfulness:.2f}")
            print(f"  Groundedness: {baseline.get('avg_groundedness', 0):.2f} -> {report.avg_groundedness:.2f}")
            print()
        else:
            # FIX: a missing baseline file used to be silently ignored,
            # making typos in --compare indistinguishable from "no baseline".
            print(f"Warning: baseline file not found: {args.compare}", file=sys.stderr)

    # Output.
    if args.json:
        print(json.dumps(asdict(report), indent=2))
    else:
        print(format_report(report))

    # Save the full report when an output path is given.
    if args.output:
        Path(args.output).write_text(json.dumps(asdict(report), indent=2))
        print(f"\nDetailed report saved to {args.output}")


if __name__ == '__main__':
    main()