# RAG System Architecture

Retrieval-Augmented Generation patterns for production applications.

---

## Table of Contents

- [RAG Pipeline Architecture](#rag-pipeline-architecture)
- [Vector Database Selection](#vector-database-selection)
- [Chunking Strategies](#chunking-strategies)
- [Embedding Models](#embedding-models)
- [Retrieval Optimization](#retrieval-optimization)

---

## RAG Pipeline Architecture

### Basic RAG Flow

1. Receive user query
2. Generate query embedding
3. Search vector database for relevant chunks
4. Rerank retrieved chunks by relevance
5. Format context with retrieved chunks
6. Send prompt to LLM with context
7. Return generated response
8. **Validation:** the response references the retrieved context and contains no unsupported claims (see the sketch below)
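
One lightweight way to implement the validation step is a lexical groundedness check: flag answers whose sentences share little vocabulary with the retrieved context. A minimal sketch (a heuristic stand-in; production systems often use an NLI model or an LLM-as-judge prompt instead):

```python
import re


def grounded_ratio(answer: str, context: str, min_overlap: float = 0.6) -> float:
    """Fraction of answer sentences whose content words also appear in the context."""
    context_words = set(re.findall(r"\w+", context.lower()))
    sentences = [s for s in re.split(r"(?<=[.!?])\s+", answer) if s.strip()]
    if not sentences:
        return 0.0

    grounded = 0
    for sentence in sentences:
        words = [w for w in re.findall(r"\w+", sentence.lower()) if len(w) > 3]
        # Sentences with no substantive words (e.g. "Yes.") are not penalized
        if not words or sum(w in context_words for w in words) / len(words) >= min_overlap:
            grounded += 1

    return grounded / len(sentences)
```

Responses scoring below a chosen threshold (for example 0.8) can be routed to a fallback answer or flagged for review.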

### Pipeline Components

```python
from dataclasses import dataclass
from typing import List, Optional

# Embedder, VectorStore, LLMProvider, and Reranker are the pluggable pipeline
# components; concrete examples of several appear later in this guide.


@dataclass
class Document:
    content: str
    metadata: dict
    embedding: Optional[List[float]] = None


@dataclass
class RetrievalResult:
    document: Document
    score: float


class RAGPipeline:
    def __init__(
        self,
        embedder: Embedder,
        vector_store: VectorStore,
        llm: LLMProvider,
        reranker: Optional[Reranker] = None,
    ):
        self.embedder = embedder
        self.vector_store = vector_store
        self.llm = llm
        self.reranker = reranker

    def query(self, question: str, top_k: int = 5) -> str:
        # 1. Embed query
        query_embedding = self.embedder.embed(question)

        # 2. Retrieve candidate documents (over-fetch to give the reranker headroom)
        results = self.vector_store.search(query_embedding, top_k=top_k * 2)

        # 3. Rerank if available
        if self.reranker:
            results = self.reranker.rerank(question, results)[:top_k]
        else:
            results = results[:top_k]

        # 4. Build context
        context = self._build_context(results)

        # 5. Generate response
        prompt = self._build_prompt(question, context)
        return self.llm.complete(prompt)

    def _build_context(self, results: List[RetrievalResult]) -> str:
        return "\n\n".join(
            f"[Source {i+1}]: {r.document.content}"
            for i, r in enumerate(results)
        )

    def _build_prompt(self, question: str, context: str) -> str:
        return f"""Answer the question based on the context provided.

Context:
{context}

Question: {question}

Answer:"""
```
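
Wiring it together, with the concrete components defined later in this guide and a hypothetical `OpenAIProvider` standing in for the `LLMProvider` interface:

```python
# CachedEmbedder, PineconeVectorStore, and Reranker are defined later in this guide;
# OpenAIProvider is a hypothetical LLMProvider implementation with a complete(prompt) method.
pipeline = RAGPipeline(
    embedder=CachedEmbedder(),
    vector_store=PineconeVectorStore(
        api_key="...", environment="us-east-1", index_name="docs"
    ),
    llm=OpenAIProvider(model="gpt-4o-mini"),
    reranker=Reranker(),
)

answer = pipeline.query("How does the ingestion job batch upserts?", top_k=5)
```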

---

## Vector Database Selection

### Comparison Matrix

| Database | Hosting | Scale | Latency | Cost | Best For |
|----------|---------|-------|---------|------|----------|
| Pinecone | Managed | High | Low | $$ | Production, managed |
| Weaviate | Both | High | Low | $ | Hybrid search |
| Qdrant | Both | High | Very Low | $ | Performance-critical |
| Chroma | Self-hosted | Medium | Low | Free | Prototyping |
| pgvector | Self-hosted | Medium | Medium | Free | Existing Postgres |
| Milvus | Both | Very High | Low | $ | Large-scale |

### Pinecone Integration

```python
from typing import List

import pinecone  # pinecone-client v2-style init; newer SDK versions use the Pinecone class


class PineconeVectorStore:
    def __init__(self, api_key: str, environment: str, index_name: str):
        pinecone.init(api_key=api_key, environment=environment)
        self.index = pinecone.Index(index_name)

    def upsert(self, documents: List[Document], batch_size: int = 100):
        """Upsert documents in batches."""
        vectors = [
            (doc.metadata["id"], doc.embedding, doc.metadata)
            for doc in documents
        ]

        for i in range(0, len(vectors), batch_size):
            batch = vectors[i:i + batch_size]
            self.index.upsert(vectors=batch)

    def search(self, embedding: List[float], top_k: int = 5) -> List[RetrievalResult]:
        """Search for similar vectors."""
        results = self.index.query(
            vector=embedding,
            top_k=top_k,
            include_metadata=True
        )

        return [
            RetrievalResult(
                document=Document(
                    content=match.metadata.get("content", ""),
                    metadata=match.metadata
                ),
                score=match.score
            )
            for match in results.matches
        ]
```
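
For teams already running Postgres, the pgvector row in the table above can be served from the existing database rather than a separate service. A minimal sketch, assuming the `pgvector` extension is installed, a `documents(content text, embedding vector(1536))` table already exists, and the `psycopg` and `pgvector` Python packages are available:

```python
from typing import List, Tuple

import numpy as np
import psycopg
from pgvector.psycopg import register_vector


def pgvector_search(
    conn: psycopg.Connection,
    query_embedding: List[float],
    top_k: int = 5,
) -> List[Tuple[str, float]]:
    """Cosine-similarity search over an existing documents table."""
    register_vector(conn)
    # <=> is pgvector's cosine distance operator; 1 - distance gives a similarity score
    rows = conn.execute(
        "SELECT content, 1 - (embedding <=> %s) AS score "
        "FROM documents ORDER BY embedding <=> %s LIMIT %s",
        (np.array(query_embedding), np.array(query_embedding), top_k),
    ).fetchall()
    return rows
```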

---

## Chunking Strategies

### Strategy Comparison

| Strategy | Chunk Size | Overlap | Best For |
|----------|------------|---------|----------|
| Fixed | 500-1000 tokens | 50-100 | General text |
| Sentence | 3-5 sentences | 1 sentence | Structured text |
| Paragraph | Natural breaks | None | Documents with clear structure |
| Semantic | Variable | Based on meaning | Research papers |
| Recursive | Hierarchical | Parent-child | Long documents |
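
The recursive and semantic strategies have full examples below; as a concrete instance of the fixed-size row, here is a minimal sketch that counts tokens with `tiktoken` (the encoding name is an assumption; match it to whatever model produces your embeddings):

```python
from typing import List

import tiktoken


def fixed_size_chunks(
    text: str,
    chunk_size: int = 800,
    overlap: int = 80,
    encoding_name: str = "cl100k_base",  # assumption: use the encoding of your embedding model
) -> List[str]:
    """Split text into fixed-size token windows with overlap."""
    enc = tiktoken.get_encoding(encoding_name)
    tokens = enc.encode(text)

    chunks = []
    step = chunk_size - overlap
    for start in range(0, len(tokens), step):
        window = tokens[start:start + chunk_size]
        chunks.append(enc.decode(window))
        if start + chunk_size >= len(tokens):
            break
    return chunks
```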

### Recursive Character Splitter

```python
from typing import List

from langchain.text_splitter import RecursiveCharacterTextSplitter


def create_chunks(
    text: str,
    chunk_size: int = 1000,
    chunk_overlap: int = 100
) -> List[str]:
    """Split text using recursive character splitting."""
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", ". ", " ", ""]
    )

    return splitter.split_text(text)
```

### Semantic Chunking

```python
from typing import List

import numpy as np
from sentence_transformers import SentenceTransformer


def semantic_chunk(
    sentences: List[str],
    embedder: SentenceTransformer,
    threshold: float = 0.7
) -> List[List[str]]:
    """Group consecutive sentences into chunks by semantic similarity."""
    if not sentences:
        return []

    embeddings = embedder.encode(sentences)

    chunks = []
    current_chunk = [sentences[0]]
    current_embedding = embeddings[0]

    for i in range(1, len(sentences)):
        # Cosine similarity between the running chunk embedding and the next sentence
        similarity = np.dot(current_embedding, embeddings[i]) / (
            np.linalg.norm(current_embedding) * np.linalg.norm(embeddings[i])
        )

        if similarity >= threshold:
            current_chunk.append(sentences[i])
            current_embedding = np.mean(
                [current_embedding, embeddings[i]], axis=0
            )
        else:
            chunks.append(current_chunk)
            current_chunk = [sentences[i]]
            current_embedding = embeddings[i]

    chunks.append(current_chunk)
    return chunks
```
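
A quick usage sketch, with a naive regex splitter standing in for a proper sentence tokenizer (the file path and model name are placeholders):

```python
import re

from sentence_transformers import SentenceTransformer

text = open("paper.txt").read()  # assumption: a local document to chunk
sentences = [s.strip() for s in re.split(r"(?<=[.!?])\s+", text) if s.strip()]

embedder = SentenceTransformer("all-MiniLM-L6-v2")
chunks = semantic_chunk(sentences, embedder, threshold=0.7)
print(f"{len(sentences)} sentences grouped into {len(chunks)} chunks")
```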

---

## Embedding Models

### Model Comparison

| Model | Dimensions | Quality | Speed | Cost |
|-------|------------|---------|-------|------|
| text-embedding-3-large | 3072 | Excellent | Medium | $0.13/1M tokens |
| text-embedding-3-small | 1536 | Good | Fast | $0.02/1M tokens |
| BGE-large | 1024 | Excellent | Medium | Free (self-hosted) |
| all-MiniLM-L6-v2 | 384 | Good | Very Fast | Free (self-hosted) |
| Cohere embed-v3 | 1024 | Excellent | Medium | $0.10/1M tokens |

### Embedding with Caching

```python
import hashlib
from typing import List

from openai import OpenAI


class CachedEmbedder:
    def __init__(self, model_name: str = "text-embedding-3-small"):
        self.client = OpenAI()
        self.model = model_name
        self._cache = {}  # in-memory, per-process cache; use a shared store (e.g. Redis) for multi-worker setups

    def embed(self, text: str) -> List[float]:
        """Embed a single text, reusing cached results for repeated inputs."""
        cache_key = hashlib.md5(text.encode()).hexdigest()

        if cache_key in self._cache:
            return self._cache[cache_key]

        response = self.client.embeddings.create(
            model=self.model,
            input=text
        )

        embedding = response.data[0].embedding
        self._cache[cache_key] = embedding

        return embedding

    def embed_batch(self, texts: List[str]) -> List[List[float]]:
        """Embed multiple texts in one API call (bypasses the cache)."""
        response = self.client.embeddings.create(
            model=self.model,
            input=texts
        )

        return [item.embedding for item in response.data]
```

---

## Retrieval Optimization

### Hybrid Search

Combine dense (vector) and sparse (keyword) retrieval:

```python
from typing import Dict, List

from rank_bm25 import BM25Okapi


class HybridRetriever:
    def __init__(
        self,
        vector_store: VectorStore,
        documents: List[Document],
        alpha: float = 0.5
    ):
        self.vector_store = vector_store
        self.alpha = alpha  # Weight for vector search; (1 - alpha) weights BM25

        # Build BM25 index over whitespace-tokenized, lowercased documents
        tokenized = [doc.content.lower().split() for doc in documents]
        self.bm25 = BM25Okapi(tokenized)
        self.documents = documents

    def search(self, query: str, query_embedding: List[float], top_k: int = 5) -> List[str]:
        # Vector search (over-fetch so BM25 can still promote documents it ranks highly)
        vector_results = self.vector_store.search(query_embedding, top_k=top_k * 2)

        # BM25 search
        tokenized_query = query.lower().split()
        bm25_scores = self.bm25.get_scores(tokenized_query)

        # Normalize BM25 scores to [0, 1] so they are comparable to similarity scores
        max_bm25 = float(max(bm25_scores)) if len(bm25_scores) else 0.0
        if max_bm25 > 0:
            bm25_scores = [score / max_bm25 for score in bm25_scores]

        # Combine scores
        combined: Dict[str, float] = {}
        for result in vector_results:
            doc_id = result.document.metadata["id"]
            combined[doc_id] = self.alpha * result.score

        for i, score in enumerate(bm25_scores):
            doc_id = self.documents[i].metadata["id"]
            if doc_id in combined:
                combined[doc_id] += (1 - self.alpha) * score
            else:
                combined[doc_id] = (1 - self.alpha) * score

        # Sort and return top_k document ids
        sorted_ids = sorted(combined.keys(), key=lambda x: combined[x], reverse=True)
        return sorted_ids[:top_k]
```

### Reranking

```python
from typing import List

from sentence_transformers import CrossEncoder


class Reranker:
    def __init__(self, model_name: str = "cross-encoder/ms-marco-MiniLM-L-12-v2"):
        self.model = CrossEncoder(model_name)

    def rerank(
        self,
        query: str,
        results: List[RetrievalResult],
        top_k: int = 5
    ) -> List[RetrievalResult]:
        """Rerank results with a cross-encoder that scores each (query, document) pair."""
        pairs = [(query, r.document.content) for r in results]
        scores = self.model.predict(pairs)

        # Replace retrieval scores with cross-encoder scores and sort
        for i, score in enumerate(scores):
            results[i].score = float(score)

        return sorted(results, key=lambda x: x.score, reverse=True)[:top_k]
```

### Query Expansion

```python
from typing import List


def expand_query(query: str, llm: LLMProvider) -> List[str]:
    """Generate query variations for better retrieval."""
    prompt = f"""Generate 3 alternative phrasings of this question for search.
Return only the questions, one per line.

Original: {query}

Alternatives:"""

    response = llm.complete(prompt, max_tokens=150)
    alternatives = [q.strip() for q in response.strip().split("\n") if q.strip()]

    return [query] + alternatives[:3]
```
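
Each expanded query produces its own ranked list, which then needs to be merged back into a single ranking. One common option is reciprocal rank fusion; a minimal sketch, assuming a hypothetical `retrieve(query)` helper that returns ranked document ids:

```python
from collections import defaultdict
from typing import Callable, Dict, List


def fuse_expanded_results(
    queries: List[str],
    retrieve: Callable[[str], List[str]],  # hypothetical helper: query -> ranked doc ids
    top_k: int = 5,
    k: int = 60,  # standard RRF damping constant
) -> List[str]:
    """Merge per-query rankings with reciprocal rank fusion."""
    scores: Dict[str, float] = defaultdict(float)

    for query in queries:
        for rank, doc_id in enumerate(retrieve(query)):
            scores[doc_id] += 1.0 / (k + rank + 1)

    ranked = sorted(scores, key=scores.get, reverse=True)
    return ranked[:top_k]
```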