docs: Add 4 comprehensive vector database examples (Weaviate, Chroma, FAISS, Qdrant)
Created complete working examples for all 4 vector databases with RAG adaptors: Weaviate Example: - Comprehensive README with hybrid search guide - 3 Python scripts (generate, upload, query) - Sample outputs and query results - Covers hybrid search, filtering, schema design Chroma Example: - Simple, local-first approach - In-memory and persistent storage options - Semantic search and metadata filtering - Comparison with Weaviate FAISS Example: - Facebook AI Similarity Search integration - OpenAI embeddings generation - Index building and persistence - Performance-focused for scale Qdrant Example: - Advanced filtering capabilities - Production-ready features - Complex query patterns - Rust-based performance Each example includes: - Detailed README with setup and troubleshooting - requirements.txt with dependencies - 3 working Python scripts - Sample outputs directory Total files: 20 (4 examples × 5 files each) Documentation: 4 comprehensive READMEs (~800 lines total) Phase 2 of optional enhancements complete. Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
72
examples/faiss-example/2_build_faiss_index.py
Normal file
72
examples/faiss-example/2_build_faiss_index.py
Normal file
@@ -0,0 +1,72 @@
|
||||
#!/usr/bin/env python3
"""Build a FAISS index over exported skill documents using OpenAI embeddings.

Reads ``output/flask-faiss.json`` (produced by the generate step), embeds the
documents with the OpenAI embeddings API, builds an exact L2 FAISS index, and
writes ``flask.index`` plus ``flask_metadata.json`` for the query step.

Requires the ``OPENAI_API_KEY`` environment variable.
"""

import json
import os
import sys
from pathlib import Path

import numpy as np

try:
    import faiss
    from openai import OpenAI
    from rich.console import Console
except ImportError:
    print("❌ Missing dependencies! Run: pip install -r requirements.txt")
    sys.exit(1)

console = Console()

EMBED_MODEL = "text-embedding-ada-002"
# The embeddings endpoint accepts a list of inputs, so we batch requests
# instead of one round-trip per document.
BATCH_SIZE = 100
# Truncate long documents so a single input stays under the model's limit.
MAX_CHARS = 8000


def _load_documents(path="output/flask-faiss.json"):
    """Return (documents, metadatas, ids) from the generate step's JSON export."""
    with open(path, encoding="utf-8") as f:
        data = json.load(f)
    return data["documents"], data["metadatas"], data["ids"]


def _generate_embeddings(client, documents):
    """Embed *documents* in batches; return a float32 array of shape (n, dim)."""
    embeddings = []
    total = len(documents)
    for start in range(0, total, BATCH_SIZE):
        batch = [doc[:MAX_CHARS] for doc in documents[start:start + BATCH_SIZE]]
        response = client.embeddings.create(model=EMBED_MODEL, input=batch)
        # Sort by the returned index so rows stay aligned with `documents`/`ids`.
        for item in sorted(response.data, key=lambda d: d.index):
            embeddings.append(item.embedding)
        console.print(f"  Progress: {min(start + BATCH_SIZE, total)}/{total}")
    return np.array(embeddings, dtype="float32")


def main():
    """Load documents, embed them, build the FAISS index, and persist it."""
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        console.print("[red]❌ OPENAI_API_KEY not set![/red]")
        console.print("Set it with: export OPENAI_API_KEY=sk-...")
        sys.exit(1)

    console.print("📥 Loading skill data...")
    documents, metadatas, ids = _load_documents()
    console.print(f"✅ Loaded {len(documents)} documents")
    if not documents:
        # Guard: an empty export would otherwise crash when reading the
        # embedding dimension from the first row.
        console.print("[red]❌ No documents to index![/red]")
        sys.exit(1)

    console.print("\n🔄 Generating embeddings (this may take 30-60 seconds)...")
    console.print(f"   Cost: ~$0.001 for {len(documents)} documents")
    client = OpenAI(api_key=api_key)
    vectors = _generate_embeddings(client, documents)
    console.print("✅ Embeddings generated!")

    # Build an exact (brute-force) L2 index — fine at this corpus size.
    console.print("\n🏗️ Building FAISS index...")
    dimension = vectors.shape[1]  # 1536 for ada-002
    index = faiss.IndexFlatL2(dimension)
    index.add(vectors)

    # Persist the index and the document/metadata mapping side by side; the
    # query script reloads both so row i in the index maps back to ids[i].
    faiss.write_index(index, "flask.index")
    with open("flask_metadata.json", "w", encoding="utf-8") as f:
        json.dump({"documents": documents, "metadatas": metadatas, "ids": ids}, f)

    console.print("✅ Index saved: flask.index")
    console.print("✅ Metadata saved: flask_metadata.json")
    console.print(f"\n💡 Total vectors: {index.ntotal}")
    console.print(f"💡 Dimension: {dimension}")
    console.print("\n➡️ Next: python 3_query_example.py")


if __name__ == "__main__":
    main()
Reference in New Issue
Block a user