Created complete working examples for all 4 vector databases with RAG adapters: Weaviate Example: - Comprehensive README with hybrid search guide - 3 Python scripts (generate, upload, query) - Sample outputs and query results - Covers hybrid search, filtering, schema design Chroma Example: - Simple, local-first approach - In-memory and persistent storage options - Semantic search and metadata filtering - Comparison with Weaviate FAISS Example: - Facebook AI Similarity Search integration - OpenAI embeddings generation - Index building and persistence - Performance-focused for scale Qdrant Example: - Advanced filtering capabilities - Production-ready features - Complex query patterns - Rust-based performance Each example includes: - Detailed README with setup and troubleshooting - requirements.txt with dependencies - 3 working Python scripts - Sample outputs directory Total files: 20 (4 examples × 5 files each) Documentation: 4 comprehensive READMEs (~800 lines total) Phase 2 of optional enhancements complete. Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
73 lines
2.1 KiB
Python
#!/usr/bin/env python3
"""Build a FAISS index over skill documents using OpenAI embeddings.

Reads documents/metadatas/ids from ``output/flask-faiss.json``, embeds
each document with ``text-embedding-ada-002``, builds an exact
(IndexFlatL2) FAISS index, and persists the index (``flask.index``)
plus a metadata sidecar (``flask_metadata.json``) for the query script.
"""
import json, sys, os
import numpy as np
from pathlib import Path

try:
    import faiss
    from openai import OpenAI
    from rich.console import Console
except ImportError:
    print("❌ Missing dependencies! Run: pip install -r requirements.txt")
    sys.exit(1)

console = Console()

# Check API key
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    console.print("[red]❌ OPENAI_API_KEY not set![/red]")
    console.print("Set it with: export OPENAI_API_KEY=sk-...")
    sys.exit(1)

# Load data — fail with a friendly message (not a traceback) if the
# generate step hasn't been run yet.
console.print("📥 Loading skill data...")
input_path = Path("output/flask-faiss.json")
if not input_path.exists():
    console.print(f"[red]❌ {input_path} not found — run the generate script first.[/red]")
    sys.exit(1)
with input_path.open() as f:
    data = json.load(f)

documents = data["documents"]
metadatas = data["metadatas"]
ids = data["ids"]

if not documents:
    # Guard: an empty corpus would crash later on embeddings[0].
    console.print("[red]❌ No documents to index![/red]")
    sys.exit(1)

console.print(f"✅ Loaded {len(documents)} documents")

# Generate embeddings
console.print("\n🔄 Generating embeddings (this may take 30-60 seconds)...")
console.print(f"   Cost: ~$0.001 for {len(documents)} documents")

client = OpenAI(api_key=api_key)
embeddings = []

# The embeddings endpoint accepts a list of inputs and returns results
# in the same order, so one call per batch replaces one call per
# document — same vectors, far fewer round trips.
BATCH_SIZE = 100
for start in range(0, len(documents), BATCH_SIZE):
    # Truncate each document to stay under the model's input limit.
    batch = [doc[:8000] for doc in documents[start:start + BATCH_SIZE]]
    response = client.embeddings.create(
        model="text-embedding-ada-002",
        input=batch,
    )
    embeddings.extend(item.embedding for item in response.data)
    console.print(f"   Progress: {len(embeddings)}/{len(documents)}")

console.print("✅ Embeddings generated!")

# Build FAISS index
console.print("\n🏗️ Building FAISS index...")
dimension = len(embeddings[0])  # 1536 for ada-002
vectors = np.asarray(embeddings, dtype="float32")  # FAISS requires float32

# Create index (exact L2 distance — no training step needed at this scale)
index = faiss.IndexFlatL2(dimension)
index.add(vectors)

# Save everything: the index itself plus a JSON sidecar mapping row
# positions back to documents/metadata/ids (FAISS stores only vectors).
faiss.write_index(index, "flask.index")
with open("flask_metadata.json", "w") as f:
    json.dump({"documents": documents, "metadatas": metadatas, "ids": ids}, f)

console.print("✅ Index saved: flask.index")
console.print("✅ Metadata saved: flask_metadata.json")
console.print(f"\n💡 Total vectors: {index.ntotal}")
console.print(f"💡 Dimension: {dimension}")
console.print("\n➡️ Next: python 3_query_example.py")