claude-skills-reference/engineering-team/senior-prompt-engineer/scripts/rag_evaluator.py
Alireza Rezvani 6723bc6977 fix(skill): rewrite senior-prompt-engineer with unique, actionable content (#91)
Issue #49 feedback implementation:

SKILL.md:
- Added YAML frontmatter with trigger phrases
- Removed marketing language ("world-class", etc.)
- Added Table of Contents
- Converted vague bullets to concrete workflows
- Added input/output examples for all tools

Reference files (all 3 previously 100% identical):
- prompt_engineering_patterns.md: 10 patterns with examples
  (Zero-Shot, Few-Shot, CoT, Role, Structured Output, etc.)
- llm_evaluation_frameworks.md: 7 sections on metrics
  (BLEU, ROUGE, BERTScore, RAG metrics, A/B testing)
- agentic_system_design.md: 6 agent architecture sections
  (ReAct, Plan-Execute, Tool Use, Multi-Agent, Memory)

Python scripts (all 3 previously identical placeholders):
- prompt_optimizer.py: Token counting, clarity analysis,
  few-shot extraction, optimization suggestions
- rag_evaluator.py: Context relevance, faithfulness,
  retrieval metrics (Precision@K, MRR, NDCG)
- agent_orchestrator.py: Config parsing, validation,
  ASCII/Mermaid visualization, cost estimation

Total: 3,571 lines added, 587 deleted
Before: ~785 lines duplicate boilerplate
After: 3,750 lines unique, actionable content

Closes #49

Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-26 11:03:37 +01:00

575 lines · 20 KiB · Python · Executable File

#!/usr/bin/env python3
"""
RAG Evaluator - Evaluation tool for Retrieval-Augmented Generation systems
Features:
- Context relevance scoring (lexical overlap)
- Answer faithfulness checking
- Retrieval metrics (Precision@K, Recall@K, MRR)
- Coverage analysis
- Quality report generation
Usage:
python rag_evaluator.py --contexts contexts.json --questions questions.json
python rag_evaluator.py --contexts ctx.json --questions q.json --metrics relevance,faithfulness
python rag_evaluator.py --contexts ctx.json --questions q.json --output report.json --verbose
"""
import argparse
import json
import re
import sys
from pathlib import Path
from typing import Any, Dict, List, Optional, Set, Tuple
from dataclasses import dataclass, asdict, field
from collections import Counter
import math

@dataclass
class RetrievalMetrics:
    """Retrieval quality metrics"""
    precision_at_k: float
    recall_at_k: float
    mrr: float  # Mean Reciprocal Rank
    ndcg_at_k: float
    k: int


@dataclass
class ContextEvaluation:
    """Evaluation of a single context"""
    context_id: str
    relevance_score: float
    token_overlap: float
    key_terms_covered: List[str]
    missing_terms: List[str]


@dataclass
class AnswerEvaluation:
    """Evaluation of an answer against context"""
    question_id: str
    faithfulness_score: float
    groundedness_score: float
    claims: List[Dict[str, Any]]
    unsupported_claims: List[str]
    context_used: List[str]


@dataclass
class RAGEvaluationReport:
    """Complete RAG evaluation report"""
    total_questions: int
    avg_context_relevance: float
    avg_faithfulness: float
    avg_groundedness: float
    retrieval_metrics: Dict[str, float]
    coverage: float
    issues: List[Dict[str, str]]
    recommendations: List[str]
    question_details: List[Dict[str, Any]] = field(default_factory=list)

def tokenize(text: str) -> List[str]:
    """Simple tokenization for text comparison"""
    # Lowercase and split on non-alphanumeric
    text = text.lower()
    tokens = re.findall(r'\b\w+\b', text)
    # Remove common stopwords
    stopwords = {'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been',
                 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
                 'would', 'could', 'should', 'may', 'might', 'must', 'shall',
                 'can', 'to', 'of', 'in', 'for', 'on', 'with', 'at', 'by',
                 'from', 'as', 'into', 'through', 'during', 'before', 'after',
                 'above', 'below', 'up', 'down', 'out', 'off', 'over', 'under',
                 'again', 'further', 'then', 'once', 'here', 'there', 'when',
                 'where', 'why', 'how', 'all', 'each', 'few', 'more', 'most',
                 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own',
                 'same', 'so', 'than', 'too', 'very', 'just', 'and', 'but',
                 'if', 'or', 'because', 'until', 'while', 'it', 'this', 'that',
                 'these', 'those', 'i', 'you', 'he', 'she', 'we', 'they'}
    return [t for t in tokens if t not in stopwords and len(t) > 2]


def extract_key_terms(text: str, top_n: int = 10) -> List[str]:
    """Extract key terms from text based on frequency"""
    tokens = tokenize(text)
    freq = Counter(tokens)
    return [term for term, _ in freq.most_common(top_n)]

def calculate_token_overlap(text1: str, text2: str) -> float:
    """Calculate Jaccard similarity between two texts"""
    tokens1 = set(tokenize(text1))
    tokens2 = set(tokenize(text2))
    if not tokens1 or not tokens2:
        return 0.0
    intersection = tokens1 & tokens2
    union = tokens1 | tokens2
    return len(intersection) / len(union) if union else 0.0


def calculate_rouge_l(reference: str, candidate: str) -> float:
    """Calculate ROUGE-L score (Longest Common Subsequence)"""
    ref_tokens = tokenize(reference)
    cand_tokens = tokenize(candidate)
    if not ref_tokens or not cand_tokens:
        return 0.0
    # LCS using dynamic programming
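    # dp[i][j] holds the LCS length of the first i reference tokens and the
    # first j candidate tokens; the full LCS length ends up in dp[m][n].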
    m, n = len(ref_tokens), len(cand_tokens)
    dp = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if ref_tokens[i-1] == cand_tokens[j-1]:
                dp[i][j] = dp[i-1][j-1] + 1
            else:
                dp[i][j] = max(dp[i-1][j], dp[i][j-1])
    lcs_length = dp[m][n]
    # F1-like score
    precision = lcs_length / n if n > 0 else 0
    recall = lcs_length / m if m > 0 else 0
    if precision + recall == 0:
        return 0.0
    return 2 * precision * recall / (precision + recall)

def evaluate_context_relevance(question: str, context: str, context_id: str = "") -> ContextEvaluation:
    """Evaluate how relevant a context is to a question"""
    question_terms = set(extract_key_terms(question, 15))
    context_terms = set(extract_key_terms(context, 30))
    covered = question_terms & context_terms
    missing = question_terms - context_terms
    # Calculate relevance based on term coverage and overlap
    term_coverage = len(covered) / len(question_terms) if question_terms else 0
    token_overlap = calculate_token_overlap(question, context)
    # Combined relevance score
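    # Heuristic weighting: term coverage (0.6) measures how much of the
    # question's vocabulary the context covers; Jaccard overlap (0.4)
    # measures overall lexical similarity.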
    relevance = 0.6 * term_coverage + 0.4 * token_overlap
    return ContextEvaluation(
        context_id=context_id,
        relevance_score=round(relevance, 3),
        token_overlap=round(token_overlap, 3),
        key_terms_covered=list(covered),
        missing_terms=list(missing)
    )

def extract_claims(answer: str) -> List[str]:
    """Extract individual claims from an answer"""
    # Split on sentence boundaries
    sentences = re.split(r'[.!?]+', answer)
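    # Note: this is a naive split; abbreviations and decimal numbers are
    # over-segmented, which is acceptable for coarse claim extraction.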
    claims = []
    for sentence in sentences:
        sentence = sentence.strip()
        if len(sentence) > 10:  # Filter out very short fragments
            claims.append(sentence)
    return claims


def check_claim_support(claim: str, context: str) -> Tuple[bool, float]:
    """Check if a claim is supported by the context"""
    claim_terms = set(tokenize(claim))
    context_terms = set(tokenize(context))
    if not claim_terms:
        return True, 1.0  # Empty claim is "supported"
    # Check term overlap
    overlap = claim_terms & context_terms
    support_ratio = len(overlap) / len(claim_terms)
    # Also check for ROUGE-L style matching
    rouge_score = calculate_rouge_l(context, claim)
    # Combined support score
    support_score = 0.5 * support_ratio + 0.5 * rouge_score
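    # 0.3 is a heuristic support threshold: raise it for stricter grounding
    # checks, lower it if heavy paraphrasing causes false negatives.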
    return support_score > 0.3, support_score

def evaluate_answer_faithfulness(
    question: str,
    answer: str,
    contexts: List[str],
    question_id: str = ""
) -> AnswerEvaluation:
    """Evaluate if answer is faithful to the provided contexts"""
    claims = extract_claims(answer)
    combined_context = ' '.join(contexts)
    claim_evaluations = []
    supported_claims = 0
    unsupported = []
    context_used = []
    for claim in claims:
        is_supported, score = check_claim_support(claim, combined_context)
        claim_eval = {
            'claim': claim[:100] + '...' if len(claim) > 100 else claim,
            'supported': is_supported,
            'score': round(score, 3)
        }
        # Track which contexts support this claim
        for i, ctx in enumerate(contexts):
            _, ctx_score = check_claim_support(claim, ctx)
            if ctx_score > 0.3:
                claim_eval[f'context_{i}'] = round(ctx_score, 3)
                if f'context_{i}' not in context_used:
                    context_used.append(f'context_{i}')
        claim_evaluations.append(claim_eval)
        if is_supported:
            supported_claims += 1
        else:
            unsupported.append(claim[:100])
    # Faithfulness = % of claims supported
    faithfulness = supported_claims / len(claims) if claims else 1.0
    # Groundedness = average support score
    avg_score = sum(c['score'] for c in claim_evaluations) / len(claim_evaluations) if claim_evaluations else 1.0
    return AnswerEvaluation(
        question_id=question_id,
        faithfulness_score=round(faithfulness, 3),
        groundedness_score=round(avg_score, 3),
        claims=claim_evaluations,
        unsupported_claims=unsupported,
        context_used=context_used
    )

def calculate_retrieval_metrics(
    retrieved: List[str],
    relevant: Set[str],
    k: int = 5
) -> RetrievalMetrics:
    """Calculate standard retrieval metrics"""
    retrieved_k = retrieved[:k]
    # Precision@K
    relevant_in_k = sum(1 for doc in retrieved_k if doc in relevant)
    precision = relevant_in_k / k if k > 0 else 0
    # Recall@K
    recall = relevant_in_k / len(relevant) if relevant else 0
    # MRR (Mean Reciprocal Rank)
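    # Reciprocal rank of the first relevant document in the full retrieved
    # list (1/rank), or 0.0 if no relevant document was retrieved.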
    mrr = 0.0
    for i, doc in enumerate(retrieved):
        if doc in relevant:
            mrr = 1.0 / (i + 1)
            break
    # NDCG@K
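    # DCG discounts each hit by log2(rank + 1); i is 0-indexed here, hence
    # log2(i + 2). NDCG normalises by the ideal ordering (all relevant first).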
    dcg = 0.0
    for i, doc in enumerate(retrieved_k):
        rel = 1 if doc in relevant else 0
        dcg += rel / math.log2(i + 2)
    # Ideal DCG (all relevant at top)
    idcg = sum(1 / math.log2(i + 2) for i in range(min(len(relevant), k)))
    ndcg = dcg / idcg if idcg > 0 else 0
    return RetrievalMetrics(
        precision_at_k=round(precision, 3),
        recall_at_k=round(recall, 3),
        mrr=round(mrr, 3),
        ndcg_at_k=round(ndcg, 3),
        k=k
    )

def generate_recommendations(report: RAGEvaluationReport) -> List[str]:
    """Generate actionable recommendations based on evaluation"""
    recommendations = []
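    # The hard-coded targets below (0.80 relevance, 0.95 faithfulness,
    # 0.85 groundedness, 0.90 coverage, 0.70 precision) are starting points;
    # tune them for your application.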
    if report.avg_context_relevance < 0.8:
        recommendations.append(
            f"Context relevance ({report.avg_context_relevance:.2f}) is below target (0.80). "
            "Consider: improving chunking strategy, adding metadata filtering, or using hybrid search."
        )
    if report.avg_faithfulness < 0.95:
        recommendations.append(
            f"Faithfulness ({report.avg_faithfulness:.2f}) is below target (0.95). "
            "Consider: adding source citations, implementing fact-checking, or adjusting temperature."
        )
    if report.avg_groundedness < 0.85:
        recommendations.append(
            f"Groundedness ({report.avg_groundedness:.2f}) is below target (0.85). "
            "Consider: using more restrictive prompts, adding 'only use provided context' instructions."
        )
    if report.coverage < 0.9:
        recommendations.append(
            f"Coverage ({report.coverage:.2f}) indicates some questions lack relevant context. "
            "Consider: expanding document corpus, improving embedding model, or adding fallback responses."
        )
    retrieval = report.retrieval_metrics
    if retrieval.get('precision_at_k', 0) < 0.7:
        recommendations.append(
            "Retrieval precision is low. Consider: re-ranking retrieved documents, "
            "using cross-encoder for reranking, or adjusting similarity threshold."
        )
    if not recommendations:
        recommendations.append("All metrics meet targets. Consider A/B testing new improvements.")
    return recommendations

def evaluate_rag_system(
    questions: List[Dict],
    contexts: List[Dict],
    k: int = 5,
    verbose: bool = False
) -> RAGEvaluationReport:
    """Comprehensive RAG system evaluation"""
    all_context_scores = []
    all_faithfulness_scores = []
    all_groundedness_scores = []
    issues = []
    question_details = []
    questions_with_context = 0
    for q_data in questions:
        question = q_data.get('question', q_data.get('query', ''))
        question_id = q_data.get('id', str(questions.index(q_data)))
        answer = q_data.get('answer', q_data.get('response', ''))
        expected = q_data.get('expected', q_data.get('ground_truth', ''))
        # Find contexts for this question
        q_contexts = []
        for ctx in contexts:
            if ctx.get('question_id') == question_id or ctx.get('query_id') == question_id:
                q_contexts.append(ctx.get('content', ctx.get('text', '')))
        # If no specific contexts, use all contexts (for simple datasets)
        if not q_contexts:
            q_contexts = [ctx.get('content', ctx.get('text', ''))
                          for ctx in contexts[:k]]
        if q_contexts:
            questions_with_context += 1
        # Evaluate context relevance
        context_evals = []
        for i, ctx in enumerate(q_contexts[:k]):
            eval_result = evaluate_context_relevance(question, ctx, f"ctx_{i}")
            context_evals.append(eval_result)
            all_context_scores.append(eval_result.relevance_score)
        # Evaluate answer faithfulness
        answer_eval = None
        if answer and q_contexts:
            answer_eval = evaluate_answer_faithfulness(question, answer, q_contexts, question_id)
            all_faithfulness_scores.append(answer_eval.faithfulness_score)
            all_groundedness_scores.append(answer_eval.groundedness_score)
            # Track issues
            if answer_eval.unsupported_claims:
                issues.append({
                    'type': 'unsupported_claim',
                    'question_id': question_id,
                    'claims': answer_eval.unsupported_claims[:3]
                })
        # Check for low relevance contexts
        low_relevance = [e for e in context_evals if e.relevance_score < 0.5]
        if low_relevance:
            issues.append({
                'type': 'low_relevance',
                'question_id': question_id,
                'contexts': [e.context_id for e in low_relevance]
            })
        if verbose:
            question_details.append({
                'question_id': question_id,
                'question': question[:100],
                'context_scores': [asdict(e) for e in context_evals],
                'answer_faithfulness': answer_eval.faithfulness_score if answer_eval else None
            })
    # Calculate aggregates
    avg_context_relevance = sum(all_context_scores) / len(all_context_scores) if all_context_scores else 0
    avg_faithfulness = sum(all_faithfulness_scores) / len(all_faithfulness_scores) if all_faithfulness_scores else 0
    avg_groundedness = sum(all_groundedness_scores) / len(all_groundedness_scores) if all_groundedness_scores else 0
    coverage = questions_with_context / len(questions) if questions else 0
    # Simulated retrieval metrics (based on relevance scores)
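    # These are proxies: true Precision@K and recall need labelled relevant
    # document IDs, so a relevance score above 0.5 stands in for a judgement.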
    high_relevance = sum(1 for s in all_context_scores if s > 0.5)
    retrieval_metrics = {
        'precision_at_k': round(high_relevance / len(all_context_scores) if all_context_scores else 0, 3),
        'estimated_recall': round(coverage, 3),
        'k': k
    }
    report = RAGEvaluationReport(
        total_questions=len(questions),
        avg_context_relevance=round(avg_context_relevance, 3),
        avg_faithfulness=round(avg_faithfulness, 3),
        avg_groundedness=round(avg_groundedness, 3),
        retrieval_metrics=retrieval_metrics,
        coverage=round(coverage, 3),
        issues=issues[:20],  # Limit to 20 issues
        recommendations=[],
        question_details=question_details if verbose else []
    )
    report.recommendations = generate_recommendations(report)
    return report

def format_report(report: RAGEvaluationReport) -> str:
    """Format report as human-readable text"""
    lines = []
    lines.append("=" * 60)
    lines.append("RAG EVALUATION REPORT")
    lines.append("=" * 60)
    lines.append("")
    lines.append("📊 SUMMARY")
    lines.append(f" Questions evaluated: {report.total_questions}")
    lines.append(f" Coverage: {report.coverage:.1%}")
    lines.append("")
    lines.append("📈 RETRIEVAL METRICS")
    lines.append(f" Context Relevance: {report.avg_context_relevance:.2f} {'' if report.avg_context_relevance >= 0.8 else '⚠️'} (target: >0.80)")
    lines.append(f" Precision@{report.retrieval_metrics.get('k', 5)}: {report.retrieval_metrics.get('precision_at_k', 0):.2f}")
    lines.append("")
    lines.append("📝 GENERATION METRICS")
    lines.append(f" Answer Faithfulness: {report.avg_faithfulness:.2f} {'' if report.avg_faithfulness >= 0.95 else '⚠️'} (target: >0.95)")
    lines.append(f" Groundedness: {report.avg_groundedness:.2f} {'' if report.avg_groundedness >= 0.85 else '⚠️'} (target: >0.85)")
    lines.append("")
    if report.issues:
        lines.append(f"⚠️ ISSUES FOUND ({len(report.issues)})")
        for issue in report.issues[:10]:
            if issue['type'] == 'unsupported_claim':
                lines.append(f" Q{issue['question_id']}: {len(issue.get('claims', []))} unsupported claim(s)")
            elif issue['type'] == 'low_relevance':
                lines.append(f" Q{issue['question_id']}: Low relevance contexts: {issue.get('contexts', [])}")
        if len(report.issues) > 10:
            lines.append(f" ... and {len(report.issues) - 10} more issues")
        lines.append("")
    lines.append("💡 RECOMMENDATIONS")
    for i, rec in enumerate(report.recommendations, 1):
        lines.append(f" {i}. {rec}")
    lines.append("")
    lines.append("=" * 60)
    return '\n'.join(lines)

def main():
    parser = argparse.ArgumentParser(
        description="RAG Evaluator - Evaluate Retrieval-Augmented Generation systems",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  %(prog)s --contexts contexts.json --questions questions.json
  %(prog)s --contexts ctx.json --questions q.json --k 10
  %(prog)s --contexts ctx.json --questions q.json --output report.json --verbose

Input file formats:
  questions.json:
    [
      {"id": "q1", "question": "What is X?", "answer": "X is..."},
      {"id": "q2", "question": "How does Y work?", "answer": "Y works by..."}
    ]
  contexts.json:
    [
      {"question_id": "q1", "content": "Retrieved context text..."},
      {"question_id": "q2", "content": "Another context..."}
    ]
"""
    )
    parser.add_argument('--contexts', '-c', required=True, help='JSON file with retrieved contexts')
    parser.add_argument('--questions', '-q', required=True, help='JSON file with questions and answers')
    parser.add_argument('--k', type=int, default=5, help='Number of top contexts to evaluate (default: 5)')
    parser.add_argument('--output', '-o', help='Output file for detailed report (JSON)')
    parser.add_argument('--json', '-j', action='store_true', help='Output as JSON instead of text')
    parser.add_argument('--verbose', '-v', action='store_true', help='Include per-question details')
    parser.add_argument('--compare', help='Compare with baseline report JSON')
    args = parser.parse_args()
    # Load input files
    contexts_path = Path(args.contexts)
    questions_path = Path(args.questions)
    if not contexts_path.exists():
        print(f"Error: Contexts file not found: {args.contexts}", file=sys.stderr)
        sys.exit(1)
    if not questions_path.exists():
        print(f"Error: Questions file not found: {args.questions}", file=sys.stderr)
        sys.exit(1)
    try:
        contexts = json.loads(contexts_path.read_text(encoding='utf-8'))
        questions = json.loads(questions_path.read_text(encoding='utf-8'))
    except json.JSONDecodeError as e:
        print(f"Error: Invalid JSON format: {e}", file=sys.stderr)
        sys.exit(1)
    # Run evaluation
    report = evaluate_rag_system(questions, contexts, k=args.k, verbose=args.verbose)
    # Compare with baseline
    if args.compare:
        baseline_path = Path(args.compare)
        if baseline_path.exists():
            baseline = json.loads(baseline_path.read_text())
            print("\n📊 COMPARISON WITH BASELINE")
            print(f" Relevance: {baseline.get('avg_context_relevance', 0):.2f} -> {report.avg_context_relevance:.2f}")
            print(f" Faithfulness: {baseline.get('avg_faithfulness', 0):.2f} -> {report.avg_faithfulness:.2f}")
            print(f" Groundedness: {baseline.get('avg_groundedness', 0):.2f} -> {report.avg_groundedness:.2f}")
            print()
    # Output
    if args.json:
        print(json.dumps(asdict(report), indent=2))
    else:
        print(format_report(report))
    # Save to file
    if args.output:
        Path(args.output).write_text(json.dumps(asdict(report), indent=2))
        print(f"\nDetailed report saved to {args.output}")


if __name__ == '__main__':
    main()