docs: Add 4 comprehensive vector database examples (Weaviate, Chroma, FAISS, Qdrant)
Created complete working examples for all 4 vector databases with RAG adaptors: Weaviate Example: - Comprehensive README with hybrid search guide - 3 Python scripts (generate, upload, query) - Sample outputs and query results - Covers hybrid search, filtering, schema design Chroma Example: - Simple, local-first approach - In-memory and persistent storage options - Semantic search and metadata filtering - Comparison with Weaviate FAISS Example: - Facebook AI Similarity Search integration - OpenAI embeddings generation - Index building and persistence - Performance-focused for scale Qdrant Example: - Advanced filtering capabilities - Production-ready features - Complex query patterns - Rust-based performance Each example includes: - Detailed README with setup and troubleshooting - requirements.txt with dependencies - 3 working Python scripts - Sample outputs directory Total files: 20 (4 examples × 5 files each) Documentation: 4 comprehensive READMEs (~800 lines total) Phase 2 of optional enhancements complete. Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
88
examples/chroma-example/1_generate_skill.py
Normal file
88
examples/chroma-example/1_generate_skill.py
Normal file
@@ -0,0 +1,88 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Step 1: Generate Skill for ChromaDB
|
||||||
|
|
||||||
|
This script:
|
||||||
|
1. Scrapes Vue documentation (limited to 20 pages for demo)
|
||||||
|
2. Packages the skill in ChromaDB format
|
||||||
|
3. Saves to output/vue-chroma.json
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python 1_generate_skill.py
|
||||||
|
"""
|
||||||
|
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
def main():
    """Scrape a small Vue documentation sample and package it for ChromaDB.

    Runs the ``skill-seekers`` CLI three times (version probe, scrape,
    package), then checks that ``output/vue-chroma.json`` exists. The process
    exits with status 1 as soon as any step fails or the file is missing.
    """
    print("=" * 60)
    print("Step 1: Generating Skill for ChromaDB")
    print("=" * 60)

    # Check if skill-seekers is installed
    try:
        result = subprocess.run(
            ["skill-seekers", "--version"],
            capture_output=True,
            text=True,
        )
    except FileNotFoundError:
        print("\n❌ skill-seekers not found!")
        print("Install it with: pip install skill-seekers")
        sys.exit(1)

    # An executable that exists but cannot report its version is not usable;
    # do not announce it as "found".
    if result.returncode != 0:
        print(f"\n❌ skill-seekers --version failed:\n{result.stderr or result.stdout}")
        sys.exit(1)

    print(f"\n✅ skill-seekers found: {result.stdout.strip()}")

    # Step 1: Scrape Vue docs (small sample for demo)
    print("\n📥 Step 1/2: Scraping Vue documentation (20 pages)...")
    print("This may take 1-2 minutes...\n")

    scrape_result = subprocess.run(
        [
            "skill-seekers", "scrape",
            "--config", "configs/vue.json",
            "--max-pages", "20",
        ],
        capture_output=True,
        text=True,
    )

    if scrape_result.returncode != 0:
        # Fall back to stdout when the tool wrote nothing to stderr.
        print(f"❌ Scraping failed:\n{scrape_result.stderr or scrape_result.stdout}")
        sys.exit(1)

    print("✅ Scraping completed!")

    # Step 2: Package for ChromaDB
    print("\n📦 Step 2/2: Packaging for ChromaDB...\n")

    package_result = subprocess.run(
        [
            "skill-seekers", "package",
            "output/vue",
            "--target", "chroma",
        ],
        capture_output=True,
        text=True,
    )

    if package_result.returncode != 0:
        print(f"❌ Packaging failed:\n{package_result.stderr or package_result.stdout}")
        sys.exit(1)

    # Show the output
    print(package_result.stdout)

    # Check if output file exists
    output_file = Path("output/vue-chroma.json")
    if not output_file.exists():
        print("❌ Output file not found!")
        sys.exit(1)

    size_kb = output_file.stat().st_size / 1024
    print(f"📄 File size: {size_kb:.1f} KB")
    print(f"📂 Location: {output_file.absolute()}")
    print("\n✅ Ready for upload! Next step: python 2_upload_to_chroma.py")


if __name__ == "__main__":
    main()
|
||||||
172
examples/chroma-example/2_upload_to_chroma.py
Normal file
172
examples/chroma-example/2_upload_to_chroma.py
Normal file
@@ -0,0 +1,172 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Step 2: Upload to ChromaDB
|
||||||
|
|
||||||
|
This script:
|
||||||
|
1. Creates a ChromaDB client (in-memory or persistent)
|
||||||
|
2. Creates a collection
|
||||||
|
3. Adds all documents with metadata
|
||||||
|
4. Verifies the upload
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
# In-memory (development)
|
||||||
|
python 2_upload_to_chroma.py
|
||||||
|
|
||||||
|
# Persistent storage (production)
|
||||||
|
python 2_upload_to_chroma.py --persist ./chroma_db
|
||||||
|
|
||||||
|
# Reset existing collection
|
||||||
|
python 2_upload_to_chroma.py --reset
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
try:
|
||||||
|
import chromadb
|
||||||
|
except ImportError:
|
||||||
|
print("❌ chromadb not installed!")
|
||||||
|
print("Install it with: pip install chromadb")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
def create_client(persist_directory: str = None):
    """Build the ChromaDB client used for the upload.

    Args:
        persist_directory: Directory for on-disk storage. When falsy, an
            in-memory client is created instead.

    Returns:
        The object returned by ``chromadb.PersistentClient`` or
        ``chromadb.Client``.

    Exits the process with status 1 if constructing the client raises.
    """
    print("\n📊 Creating ChromaDB client...")

    try:
        if not persist_directory:
            # In-memory client (data is lost on exit)
            memory_client = chromadb.Client()
            print("✅ Client created (in-memory)\n")
            return memory_client

        # Persistent client (saves to disk)
        disk_client = chromadb.PersistentClient(path=persist_directory)
        print(f"✅ Client created (persistent: {persist_directory})\n")
        return disk_client

    except Exception as e:
        print(f"❌ Client creation failed: {e}")
        sys.exit(1)
|
||||||
|
|
||||||
|
def load_skill_data(filepath: str = "output/vue-chroma.json"):
    """Load the ChromaDB-format skill JSON.

    Args:
        filepath: Path to the JSON file written by ``1_generate_skill.py``.

    Returns:
        The parsed JSON content.

    Exits the process with status 1 if the file does not exist.
    """
    path = Path(filepath)

    if not path.exists():
        print(f"❌ Skill file not found: {filepath}")
        print("Run '1_generate_skill.py' first!")
        sys.exit(1)

    # Read as UTF-8 explicitly: the platform default encoding (e.g. cp1252 on
    # Windows) can fail on or corrupt non-ASCII documentation text.
    with open(path, encoding="utf-8") as f:
        return json.load(f)
|
||||||
|
|
||||||
|
def create_collection(client, collection_name: str, reset: bool = False):
    """Create the ChromaDB collection, handling one that already exists.

    Args:
        client: ChromaDB client.
        collection_name: Name of the collection to create.
        reset: When true, delete an existing collection of the same name
            without asking. Otherwise the user is prompted.

    Returns:
        The newly created collection, or the existing one if the user
        declines to recreate it.

    Exits the process with status 1 if any client call raises.
    """
    print(f"📦 Creating collection: {collection_name}")

    try:
        # Check if collection exists. Depending on the Chroma version,
        # list_collections() yields Collection objects or plain name strings,
        # so accept both.
        existing_collections = [
            getattr(c, "name", c) for c in client.list_collections()
        ]

        if collection_name in existing_collections:
            if reset:
                print("🗑️ Deleting existing collection...")
                client.delete_collection(collection_name)
            else:
                print(f"⚠️ Collection '{collection_name}' already exists")
                response = input("Delete and recreate? [y/N]: ")
                # Tolerate surrounding whitespace and a spelled-out "yes".
                if response.strip().lower() in ("y", "yes"):
                    client.delete_collection(collection_name)
                else:
                    print("Using existing collection")
                    return client.get_collection(collection_name)

        # Create collection
        collection = client.create_collection(
            name=collection_name,
            metadata={"description": "Skill Seekers documentation"},
        )
        print("✅ Collection created!\n")
        return collection

    except Exception as e:
        print(f"❌ Collection creation failed: {e}")
        sys.exit(1)
|
||||||
|
|
||||||
|
def upload_documents(collection, data: dict, batch_size: int = 1000):
    """Add documents to collection.

    Args:
        collection: Target collection; only its ``add`` method is used.
        data: Mapping with parallel ``"documents"``, ``"metadatas"`` and
            ``"ids"`` lists.
        batch_size: Maximum number of documents sent per ``add`` call, so
            large skills do not exceed the client's per-call batch limit.

    Exits the process with status 1 if the lists are inconsistent or any
    ``add`` call raises.
    """
    total = len(data["documents"])

    print(f"📤 Adding {total} documents to collection...")

    try:
        documents = data["documents"]
        metadatas = data["metadatas"]
        ids = data["ids"]

        # Validate up front so a mismatch cannot leave a partial upload.
        if not (len(metadatas) == len(ids) == total):
            raise ValueError(
                "documents/metadatas/ids length mismatch: "
                f"{total}/{len(metadatas)}/{len(ids)}"
            )
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")

        # max(total, 1) keeps a single call for empty input, leaving the
        # decision about empty uploads to the collection as before.
        for start in range(0, max(total, 1), batch_size):
            stop = start + batch_size
            collection.add(
                documents=documents[start:stop],
                metadatas=metadatas[start:stop],
                ids=ids[start:stop],
            )

        print(f"✅ Successfully added {total} documents to ChromaDB\n")

    except Exception as e:
        print(f"❌ Upload failed: {e}")
        sys.exit(1)
|
||||||
|
|
||||||
|
def verify_upload(collection):
    """Report how many documents the collection holds after the upload."""
    n_docs = collection.count()
    summary = f"🔍 Collection '{collection.name}' now contains {n_docs} documents"
    print(summary)
|
||||||
|
|
||||||
|
def main():
    """Parse CLI options and run the upload pipeline.

    Steps: create the client, load the skill JSON, create the collection
    named by the JSON's ``"collection_name"`` key, add the documents, and
    print the resulting document count.
    """
    parser = argparse.ArgumentParser(description="Upload skill to ChromaDB")
    parser.add_argument(
        "--persist",
        help="Persistent storage directory (e.g., ./chroma_db)",
    )
    parser.add_argument(
        "--file",
        default="output/vue-chroma.json",
        help="Path to ChromaDB JSON file",
    )
    parser.add_argument(
        "--reset",
        action="store_true",
        help="Delete existing collection before uploading",
    )

    args = parser.parse_args()

    print("=" * 60)
    print("Step 2: Upload to ChromaDB")
    print("=" * 60)

    # Create client
    client = create_client(args.persist)

    # Load skill data
    data = load_skill_data(args.file)

    # Create collection
    collection = create_collection(client, data["collection_name"], args.reset)

    # Upload documents
    upload_documents(collection, data)

    # Verify
    verify_upload(collection)

    if args.persist:
        print(f"\n💾 Data saved to: {args.persist}")
        print(" Use --persist flag to load it next time")
        print("\n✅ Upload complete! Next step: python 3_query_example.py")
        print(f" python 3_query_example.py --persist {args.persist}")
    else:
        # An in-memory client lives only inside this process, so pointing the
        # user at the query script here would send them to a guaranteed
        # "collection not found" error.
        print("\n✅ Upload complete!")
        print("⚠️ In-memory mode: this data is discarded when the script exits,")
        print(" so 3_query_example.py (a separate process) will not see it.")
        print(" Re-run with --persist ./chroma_db to query it afterwards.")


if __name__ == "__main__":
    main()
|
||||||
290
examples/chroma-example/3_query_example.py
Normal file
290
examples/chroma-example/3_query_example.py
Normal file
@@ -0,0 +1,290 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Step 3: Query ChromaDB
|
||||||
|
|
||||||
|
This script demonstrates various query patterns with ChromaDB:
|
||||||
|
1. Semantic search
|
||||||
|
2. Metadata filtering
|
||||||
|
3. Distance scoring
|
||||||
|
4. Top-K results
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
# In-memory (if you used in-memory upload)
|
||||||
|
python 3_query_example.py
|
||||||
|
|
||||||
|
# Persistent (if you used --persist for upload)
|
||||||
|
python 3_query_example.py --persist ./chroma_db
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import sys
|
||||||
|
|
||||||
|
try:
|
||||||
|
import chromadb
|
||||||
|
from rich.console import Console
|
||||||
|
from rich.table import Table
|
||||||
|
from rich.panel import Panel
|
||||||
|
except ImportError:
|
||||||
|
print("❌ Missing dependencies!")
|
||||||
|
print("Install with: pip install chromadb rich")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
console = Console()
|
||||||
|
|
||||||
|
def create_client(persist_directory: str = None):
    """Return a ChromaDB client for querying.

    A truthy ``persist_directory`` selects ``chromadb.PersistentClient`` at
    that path; otherwise ``chromadb.Client()`` is used. Exits the process
    with status 1 if construction raises.
    """
    try:
        if not persist_directory:
            return chromadb.Client()
        return chromadb.PersistentClient(path=persist_directory)
    except Exception as e:
        console.print(f"[red]❌ Client creation failed: {e}[/red]")
        sys.exit(1)
|
||||||
|
|
||||||
|
def get_collection(client, collection_name: str = "vue"):
    """Look up ``collection_name`` on ``client`` and return the collection.

    If the lookup raises, prints the error plus a hint to run the upload
    script and exits the process with status 1.
    """
    try:
        found = client.get_collection(collection_name)
    except Exception as e:
        console.print(f"[red]❌ Collection not found: {e}[/red]")
        console.print("\n[yellow]Did you run 2_upload_to_chroma.py first?[/yellow]")
        sys.exit(1)
    else:
        return found
|
||||||
|
|
||||||
|
def semantic_search_example(collection):
    """Example 1: Basic Semantic Search.

    Queries the collection for the three nearest documents to a fixed
    natural-language question and prints them as a table. Query errors are
    reported, not raised.
    """
    # Local import: document text is arbitrary and may contain "[...]"
    # sequences that Rich would otherwise parse as markup tags.
    from rich.markup import escape

    console.print("\n" + "=" * 60)
    console.print("[bold cyan]Example 1: Semantic Search[/bold cyan]")
    console.print("=" * 60)

    query = "How do I create a Vue component?"

    console.print(f"\n[yellow]Query:[/yellow] {query}")

    try:
        results = collection.query(
            query_texts=[query],
            n_results=3,
        )

        # One inner list per query text; a single query was sent.
        documents = results["documents"][0]
        metadatas = results["metadatas"][0]
        distances = results["distances"][0]

        if not documents:
            console.print("[red]No results found[/red]")
            return

        # Create results table
        table = Table(show_header=True, header_style="bold magenta")
        table.add_column("#", style="dim", width=3)
        table.add_column("Distance", style="cyan", width=10)
        table.add_column("Category", style="green")
        table.add_column("File", style="yellow")
        table.add_column("Preview", style="white")

        for i, (doc, meta, dist) in enumerate(zip(documents, metadatas, distances), 1):
            meta = meta or {}  # a document stored without metadata yields None
            preview = doc[:80] + "..." if len(doc) > 80 else doc
            table.add_row(
                str(i),
                f"{dist:.3f}",
                # str(): metadata values are not guaranteed to be strings.
                escape(str(meta.get("category", "N/A"))),
                escape(str(meta.get("file", "N/A"))),
                escape(preview),
            )

        console.print(table)

        # Explain distance scores
        console.print("\n[dim]💡 Distance: Lower = more similar (< 0.5 = very relevant)[/dim]")

    except Exception as e:
        console.print(f"[red]Query failed: {e}[/red]")
|
||||||
|
|
||||||
|
def filtered_search_example(collection):
    """Example 2: Search with Metadata Filter.

    Queries for "reactivity" restricted to documents whose ``category``
    metadata equals ``"api"`` and prints each hit in a panel. Query errors
    are reported, not raised.
    """
    # Local import: keeps arbitrary document text from being parsed as
    # Rich markup inside the panel.
    from rich.markup import escape

    console.print("\n" + "=" * 60)
    console.print("[bold cyan]Example 2: Filtered Search[/bold cyan]")
    console.print("=" * 60)

    query = "reactivity"
    category_filter = "api"

    console.print(f"\n[yellow]Query:[/yellow] {query}")
    console.print(f"[yellow]Filter:[/yellow] category = '{category_filter}'")

    try:
        results = collection.query(
            query_texts=[query],
            n_results=5,
            where={"category": category_filter},
        )

        documents = results["documents"][0]
        metadatas = results["metadatas"][0]
        distances = results["distances"][0]

        if not documents:
            console.print("[red]No results found[/red]")
            return

        console.print(f"\n[green]Found {len(documents)} results in '{category_filter}' category:[/green]\n")

        for i, (doc, meta, dist) in enumerate(zip(documents, metadatas, distances), 1):
            meta = meta or {}  # a document stored without metadata yields None
            # Only add an ellipsis when text was actually cut off.
            snippet = doc[:200] + "..." if len(doc) > 200 else doc
            panel = Panel(
                f"[cyan]File:[/cyan] {escape(str(meta.get('file', 'N/A')))}\n"
                f"[cyan]Distance:[/cyan] {dist:.3f}\n\n"
                f"[white]{escape(snippet)}[/white]",
                title=f"Result {i}",
                border_style="green",
            )
            console.print(panel)

    except Exception as e:
        console.print(f"[red]Query failed: {e}[/red]")
|
||||||
|
|
||||||
|
def top_k_results_example(collection):
    """Example 3: Get More Results (Top-K).

    Requests up to ten nearest documents for "state management" and prints
    one compact line per hit. Query errors are reported, not raised.
    """
    # Local import: file and category values come from stored metadata and
    # must not be interpreted as Rich markup.
    from rich.markup import escape

    console.print("\n" + "=" * 60)
    console.print("[bold cyan]Example 3: Top-K Results[/bold cyan]")
    console.print("=" * 60)

    query = "state management"
    top_k = 10

    console.print(f"\n[yellow]Query:[/yellow] {query}")
    console.print(f"[yellow]K:[/yellow] {top_k} (top {top_k} results)")

    try:
        results = collection.query(
            query_texts=[query],
            n_results=top_k,
        )

        documents = results["documents"][0]
        metadatas = results["metadatas"][0]
        distances = results["distances"][0]

        console.print(f"\n[green]Top {top_k} most relevant documents:[/green]\n")

        for i, (doc, meta, dist) in enumerate(zip(documents, metadatas, distances), 1):
            meta = meta or {}  # a document stored without metadata yields None
            # str(): the ":10s" format spec raises on non-string values.
            category = str(meta.get("category", "N/A"))
            file_name = str(meta.get("file", "N/A"))
            console.print(
                f"[bold]{i:2d}.[/bold] [{dist:.3f}] "
                f"{escape(f'{category:10s}')} | {escape(file_name)}"
            )

    except Exception as e:
        console.print(f"[red]Query failed: {e}[/red]")
|
||||||
|
|
||||||
|
def complex_filter_example(collection):
    """Example 4: Complex Metadata Filtering.

    Queries for "guide" with a ``$and`` filter requiring
    ``category == "guides"`` and ``type == "reference"``, then prints the
    matches. Query errors are reported, not raised.
    """
    # Local import: stored text and metadata must not be parsed as Rich markup.
    from rich.markup import escape

    console.print("\n" + "=" * 60)
    console.print("[bold cyan]Example 4: Complex Filter (AND condition)[/bold cyan]")
    console.print("=" * 60)

    query = "guide"

    console.print(f"\n[yellow]Query:[/yellow] {query}")
    console.print("[yellow]Filter:[/yellow] category = 'guides' AND type = 'reference'")

    try:
        results = collection.query(
            query_texts=[query],
            n_results=5,
            where={
                "$and": [
                    {"category": "guides"},
                    {"type": "reference"},
                ]
            },
        )

        documents = results["documents"][0]
        metadatas = results["metadatas"][0]

        if not documents:
            console.print("[red]No results match both conditions[/red]")
            return

        console.print(f"\n[green]Found {len(documents)} documents matching both conditions:[/green]\n")

        for i, (doc, meta) in enumerate(zip(documents, metadatas), 1):
            meta = meta or {}  # a document stored without metadata yields None
            # Only add an ellipsis when text was actually cut off.
            snippet = doc[:100] + "..." if len(doc) > 100 else doc
            console.print(f"[bold]{i}. {escape(str(meta.get('file', 'N/A')))}[/bold]")
            console.print(
                f" Category: {escape(str(meta.get('category')))}"
                f" | Type: {escape(str(meta.get('type')))}"
            )
            console.print(f" {escape(snippet)}\n")

    except Exception as e:
        console.print(f"[red]Query failed: {e}[/red]")
|
||||||
|
|
||||||
|
def get_statistics(collection):
    """Show collection statistics.

    Prints the total document count and a per-category breakdown built from
    each document's ``category`` metadata (``"unknown"`` when absent).
    Errors are reported, not raised.
    """
    # Local imports keep this block self-contained.
    from collections import Counter

    from rich.markup import escape

    console.print("\n" + "=" * 60)
    console.print("[bold cyan]Collection Statistics[/bold cyan]")
    console.print("=" * 60)

    try:
        # Total count
        count = collection.count()
        console.print(f"\n[green]Total documents:[/green] {count}")

        # Only metadata is needed for the breakdown, so skip document bodies.
        # An empty collection needs no fetch at all.
        if count:
            metadatas = collection.get(limit=count, include=["metadatas"])["metadatas"]
        else:
            metadatas = []

        # Count by category. str() keeps mixed-type values sortable; a
        # document stored without metadata yields None.
        categories = Counter(
            str((meta or {}).get("category", "unknown")) for meta in metadatas or []
        )

        console.print("\n[green]Documents by category:[/green]")
        for cat, cnt in sorted(categories.items()):
            console.print(f" • {escape(cat)}: {cnt}")

    except Exception as e:
        console.print(f"[red]Statistics failed: {e}[/red]")
|
||||||
|
|
||||||
|
def main():
    """Parse CLI options, open the collection, and run every query example."""
    parser = argparse.ArgumentParser(description="Query ChromaDB examples")
    parser.add_argument(
        "--persist",
        help="Persistent storage directory (if you used --persist for upload)",
    )
    parser.add_argument(
        "--collection",
        default="vue",
        help="Collection name to query (default: vue)",
    )

    args = parser.parse_args()

    console.print("[bold green]ChromaDB Query Examples[/bold green]")

    if args.persist:
        console.print(f"[dim]Using persistent storage: {args.persist}[/dim]")
    else:
        console.print("[dim]Using in-memory storage[/dim]")
        # An in-memory client starts empty in every process, so data uploaded
        # by a previous script run is not visible here.
        console.print(
            "[dim]Note: in-memory data from an earlier upload run is not "
            "available; upload and query with --persist to share it.[/dim]"
        )

    # Create client
    client = create_client(args.persist)

    # Get collection
    collection = get_collection(client, args.collection)

    # Get statistics
    get_statistics(collection)

    # Run examples
    semantic_search_example(collection)
    filtered_search_example(collection)
    top_k_results_example(collection)
    complex_filter_example(collection)

    console.print("\n[bold green]✅ All examples completed![/bold green]")
    console.print("\n[cyan]💡 Tips:[/cyan]")
    console.print(" • Lower distance = more similar (< 0.5 is very relevant)")
    console.print(" • Use 'where' filters to narrow results before search")
    # Chroma's metadata filters offer $and / $or as logical operators; there
    # is no $not.
    console.print(" • Combine filters with $and / $or operators")
    console.print(" • Adjust n_results to get more/fewer results")
    console.print(" • See README.md for custom embedding functions")


if __name__ == "__main__":
    main()
|
||||||
394
examples/chroma-example/README.md
Normal file
394
examples/chroma-example/README.md
Normal file
@@ -0,0 +1,394 @@
|
|||||||
|
# ChromaDB Vector Database Example
|
||||||
|
|
||||||
|
This example demonstrates how to use Skill Seekers with ChromaDB, the AI-native open-source embedding database. Chroma is designed to be simple, fast, and easy to use locally.
|
||||||
|
|
||||||
|
## What You'll Learn
|
||||||
|
|
||||||
|
- How to generate skills in ChromaDB format
|
||||||
|
- How to create local Chroma collections
|
||||||
|
- How to perform semantic searches
|
||||||
|
- How to filter by metadata categories
|
||||||
|
|
||||||
|
## Why ChromaDB?
|
||||||
|
|
||||||
|
- **No Server Required**: Works entirely in-process (perfect for development)
|
||||||
|
- **Simple API**: Clean Python interface, no complex setup
|
||||||
|
- **Fast**: Built for speed with smart indexing
|
||||||
|
- **Open Source**: MIT licensed, community-driven
|
||||||
|
|
||||||
|
## Prerequisites
|
||||||
|
|
||||||
|
### Python Dependencies
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install -r requirements.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
That's it! No Docker, no server setup. Chroma runs entirely in your Python process.
|
||||||
|
|
||||||
|
## Step-by-Step Guide
|
||||||
|
|
||||||
|
### Step 1: Generate Skill from Documentation
|
||||||
|
|
||||||
|
First, we'll scrape Vue documentation and package it for ChromaDB:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python 1_generate_skill.py
|
||||||
|
```
|
||||||
|
|
||||||
|
This script will:
|
||||||
|
1. Scrape Vue docs (limited to 20 pages for demo)
|
||||||
|
2. Package the skill in ChromaDB format (JSON with documents + metadata + IDs)
|
||||||
|
3. Save to `output/vue-chroma.json`
|
||||||
|
|
||||||
|
**Expected Output:**
|
||||||
|
```
|
||||||
|
✅ ChromaDB data packaged successfully!
|
||||||
|
📦 Output: output/vue-chroma.json
|
||||||
|
📊 Total documents: 21
|
||||||
|
📂 Categories: overview (1), guides (8), api (12)
|
||||||
|
```
|
||||||
|
|
||||||
|
**What's in the JSON?**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"documents": [
|
||||||
|
"Vue is a progressive JavaScript framework...",
|
||||||
|
"Components are the building blocks..."
|
||||||
|
],
|
||||||
|
"metadatas": [
|
||||||
|
{
|
||||||
|
"source": "vue",
|
||||||
|
"category": "overview",
|
||||||
|
"file": "SKILL.md",
|
||||||
|
"type": "documentation",
|
||||||
|
"version": "1.0.0"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"ids": [
|
||||||
|
"a1b2c3d4e5f6...",
|
||||||
|
"b2c3d4e5f6g7..."
|
||||||
|
],
|
||||||
|
"collection_name": "vue"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Step 2: Create Collection and Upload
|
||||||
|
|
||||||
|
Now we'll create a ChromaDB collection and load all documents:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python 2_upload_to_chroma.py
|
||||||
|
```
|
||||||
|
|
||||||
|
This script will:
|
||||||
|
1. Create an in-memory Chroma client (or persistent with `--persist`)
|
||||||
|
2. Create a collection with the skill name
|
||||||
|
3. Add all documents with metadata and IDs
|
||||||
|
4. Verify the upload was successful
|
||||||
|
|
||||||
|
**Expected Output:**
|
||||||
|
```
|
||||||
|
📊 Creating ChromaDB client...
|
||||||
|
✅ Client created (in-memory)
|
||||||
|
|
||||||
|
📦 Creating collection: vue
|
||||||
|
✅ Collection created!
|
||||||
|
|
||||||
|
📤 Adding 21 documents to collection...
|
||||||
|
✅ Successfully added 21 documents to ChromaDB
|
||||||
|
|
||||||
|
🔍 Collection 'vue' now contains 21 documents
|
||||||
|
```
|
||||||
|
|
||||||
|
**Persistent Storage:**
|
||||||
|
```bash
|
||||||
|
# Save to disk (needed for Step 3: in-memory data is lost when this script exits)
|
||||||
|
python 2_upload_to_chroma.py --persist ./chroma_db
|
||||||
|
```
|
||||||
|
|
||||||
|
### Step 3: Query and Search
|
||||||
|
|
||||||
|
Now search your knowledge base!
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python 3_query_example.py
|
||||||
|
```
|
||||||
|
|
||||||
|
**With persistent storage:**
|
||||||
|
```bash
|
||||||
|
python 3_query_example.py --persist ./chroma_db
|
||||||
|
```
|
||||||
|
|
||||||
|
This script demonstrates:
|
||||||
|
1. **Semantic Search**: Natural language queries
|
||||||
|
2. **Metadata Filtering**: Filter by category
|
||||||
|
3. **Top-K Results**: Get most relevant documents
|
||||||
|
4. **Distance Scoring**: See how relevant each result is
|
||||||
|
|
||||||
|
**Example Queries:**
|
||||||
|
|
||||||
|
**Query 1: Semantic Search**
|
||||||
|
```
|
||||||
|
Query: "How do I create a Vue component?"
|
||||||
|
Top 3 results:
|
||||||
|
|
||||||
|
1. [Distance: 0.234] guides/components.md
|
||||||
|
Components are reusable Vue instances with a name. You can use them as custom
|
||||||
|
elements inside a root Vue instance...
|
||||||
|
|
||||||
|
2. [Distance: 0.298] api/component_api.md
|
||||||
|
The component API reference describes all available options for defining
|
||||||
|
components using the Options API...
|
||||||
|
|
||||||
|
3. [Distance: 0.312] guides/single_file_components.md
|
||||||
|
Single-File Components (SFCs) allow you to define templates, logic, and
|
||||||
|
styling in a single .vue file...
|
||||||
|
```
|
||||||
|
|
||||||
|
**Query 2: Filtered Search**
|
||||||
|
```
|
||||||
|
Query: "reactivity"
|
||||||
|
Filter: category = "api"
|
||||||
|
|
||||||
|
Results:
|
||||||
|
1. ref() - Create reactive references
|
||||||
|
2. reactive() - Create reactive proxies
|
||||||
|
3. computed() - Create computed properties
|
||||||
|
```
|
||||||
|
|
||||||
|
## Understanding ChromaDB Features
|
||||||
|
|
||||||
|
### Semantic Search
|
||||||
|
|
||||||
|
Chroma automatically:
|
||||||
|
- Generates embeddings for your documents (using default model)
|
||||||
|
- Indexes them for fast similarity search
|
||||||
|
- Finds semantically similar content
|
||||||
|
|
||||||
|
**Distance Scores:**
|
||||||
|
- Lower = more similar
|
||||||
|
- `0.0` = identical
|
||||||
|
- `< 0.5` = very relevant
|
||||||
|
- `0.5-1.0` = somewhat relevant
|
||||||
|
- `> 1.0` = less relevant
|
||||||
|
|
||||||
|
### Metadata Filtering
|
||||||
|
|
||||||
|
Filter results before semantic search:
|
||||||
|
```python
|
||||||
|
collection.query(
|
||||||
|
query_texts=["your query"],
|
||||||
|
n_results=5,
|
||||||
|
where={"category": "api"}
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Supported operators:**
|
||||||
|
- `$eq`: Equal to
|
||||||
|
- `$ne`: Not equal to
|
||||||
|
- `$gt`, `$gte`: Greater than (or equal)
|
||||||
|
- `$lt`, `$lte`: Less than (or equal)
|
||||||
|
- `$in`: In list
|
||||||
|
- `$nin`: Not in list
|
||||||
|
|
||||||
|
**Complex filters:**
|
||||||
|
```python
|
||||||
|
where={
|
||||||
|
"$and": [
|
||||||
|
{"category": {"$eq": "api"}},
|
||||||
|
{"type": {"$eq": "reference"}}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Collection Management
|
||||||
|
|
||||||
|
```python
|
||||||
|
# List all collections
|
||||||
|
client.list_collections()
|
||||||
|
|
||||||
|
# Get collection
|
||||||
|
collection = client.get_collection("vue")
|
||||||
|
|
||||||
|
# Get count
|
||||||
|
collection.count()
|
||||||
|
|
||||||
|
# Delete collection
|
||||||
|
client.delete_collection("vue")
|
||||||
|
```
|
||||||
|
|
||||||
|
## Customization
|
||||||
|
|
||||||
|
### Use Your Own Embeddings
|
||||||
|
|
||||||
|
Chroma supports custom embedding functions:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from chromadb.utils import embedding_functions
|
||||||
|
|
||||||
|
# OpenAI embeddings
|
||||||
|
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
|
||||||
|
api_key="your-key",
|
||||||
|
model_name="text-embedding-ada-002"
|
||||||
|
)
|
||||||
|
|
||||||
|
collection = client.create_collection(
|
||||||
|
name="your_skill",
|
||||||
|
embedding_function=openai_ef
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Supported embedding functions:**
|
||||||
|
- **OpenAI**: `text-embedding-ada-002` (best quality)
|
||||||
|
- **Cohere**: `embed-english-v2.0`
|
||||||
|
- **HuggingFace**: Various models (local, no API key)
|
||||||
|
- **Sentence Transformers**: Local models
|
||||||
|
|
||||||
|
### Generate Different Skills
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Change the config in 1_generate_skill.py
|
||||||
|
"--config", "configs/django.json", # Your framework
|
||||||
|
|
||||||
|
# Or use CLI directly
|
||||||
|
skill-seekers scrape --config configs/flask.json
|
||||||
|
skill-seekers package output/flask --target chroma
|
||||||
|
```
|
||||||
|
|
||||||
|
### Adjust Query Parameters
|
||||||
|
|
||||||
|
In `3_query_example.py`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Get more results
|
||||||
|
n_results=10  # Chroma's default is 10; most examples here pass 3 or 5
|
||||||
|
|
||||||
|
# Include more metadata
|
||||||
|
include=["documents", "metadatas", "distances"]
|
||||||
|
|
||||||
|
# Different distance metrics
|
||||||
|
# (configure when creating collection)
|
||||||
|
metadata={"hnsw:space": "cosine"} # or "l2", "ip"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Performance Tips
|
||||||
|
|
||||||
|
1. **Batch Operations**: Add documents in batches for better performance
|
||||||
|
```python
|
||||||
|
collection.add(
|
||||||
|
documents=batch_docs,
|
||||||
|
metadatas=batch_metadata,
|
||||||
|
ids=batch_ids
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
2. **Persistent Storage**: Use `--persist` for production
|
||||||
|
```bash
|
||||||
|
python 2_upload_to_chroma.py --persist ./prod_db
|
||||||
|
```
|
||||||
|
|
||||||
|
3. **Custom Embeddings**: Use OpenAI for best quality (costs $)
|
||||||
|
4. **Index Tuning**: Adjust HNSW parameters for speed vs accuracy
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
### Import Error
|
||||||
|
```
|
||||||
|
ModuleNotFoundError: No module named 'chromadb'
|
||||||
|
```
|
||||||
|
|
||||||
|
**Solution:**
|
||||||
|
```bash
|
||||||
|
pip install chromadb
|
||||||
|
```
|
||||||
|
|
||||||
|
### Collection Already Exists
|
||||||
|
```
|
||||||
|
Error: Collection 'vue' already exists
|
||||||
|
```
|
||||||
|
|
||||||
|
**Solution:**
|
||||||
|
```python
|
||||||
|
# Delete existing collection
|
||||||
|
client.delete_collection("vue")
|
||||||
|
|
||||||
|
# Or use --reset flag
|
||||||
|
python 2_upload_to_chroma.py --reset
|
||||||
|
```
|
||||||
|
|
||||||
|
### Empty Results
|
||||||
|
```
|
||||||
|
Query returned empty results
|
||||||
|
```
|
||||||
|
|
||||||
|
**Possible causes:**
|
||||||
|
1. Collection empty: Check `collection.count()`
|
||||||
|
2. Query too specific: Try broader queries
|
||||||
|
3. Wrong collection name: Verify collection exists
|
||||||
|
|
||||||
|
**Debug:**
|
||||||
|
```python
|
||||||
|
# Check collection contents
|
||||||
|
collection.get() # Get all documents
|
||||||
|
|
||||||
|
# Check embedding function
|
||||||
|
collection._embedding_function # Should not be None
|
||||||
|
```
|
||||||
|
|
||||||
|
### Performance Issues
|
||||||
|
```
|
||||||
|
Query is slow
|
||||||
|
```
|
||||||
|
|
||||||
|
**Solutions:**
|
||||||
|
1. Use persistent storage so documents are embedded and indexed once, not on every run
|
||||||
|
2. Reduce `n_results` (fewer results = faster)
|
||||||
|
3. Add metadata filters to narrow search space
|
||||||
|
4. Choose a smaller or faster embedding model (every query text must be embedded before the search runs)
|
||||||
|
|
||||||
|
## Next Steps
|
||||||
|
|
||||||
|
1. **Try other skills**: Package your favorite documentation
|
||||||
|
2. **Build a chatbot**: Integrate with LangChain or LlamaIndex
|
||||||
|
3. **Production deployment**: Use persistent storage + API wrapper
|
||||||
|
4. **Custom embeddings**: Experiment with different models
|
||||||
|
|
||||||
|
## Resources
|
||||||
|
|
||||||
|
- **ChromaDB Docs**: https://docs.trychroma.com/
|
||||||
|
- **GitHub**: https://github.com/chroma-core/chroma
|
||||||
|
- **Discord**: https://discord.gg/MMeYNTmh3x
|
||||||
|
- **Skill Seekers**: https://github.com/yourusername/skill-seekers
|
||||||
|
|
||||||
|
## File Structure
|
||||||
|
|
||||||
|
```
|
||||||
|
chroma-example/
|
||||||
|
├── README.md # This file
|
||||||
|
├── requirements.txt # Python dependencies
|
||||||
|
├── 1_generate_skill.py # Generate ChromaDB-format skill
|
||||||
|
├── 2_upload_to_chroma.py # Create collection and upload
|
||||||
|
├── 3_query_example.py # Query demonstrations
|
||||||
|
└── sample_output/ # Example outputs
|
||||||
|
├── vue-chroma.json # Generated skill (21 docs)
|
||||||
|
└── query_results.txt # Sample query results
|
||||||
|
```
|
||||||
|
|
||||||
|
## Comparison: Chroma vs Weaviate
|
||||||
|
|
||||||
|
| Feature | ChromaDB | Weaviate |
|
||||||
|
|---------|----------|----------|
|
||||||
|
| **Setup** | ✅ No server needed | ⚠️ Docker/Cloud required |
|
||||||
|
| **API** | ✅ Very simple | ⚠️ More complex |
|
||||||
|
| **Performance** | ✅ Fast for < 1M docs | ✅ Scales to billions |
|
||||||
|
| **Hybrid Search** | ❌ Semantic only | ✅ Keyword + semantic |
|
||||||
|
| **Production** | ✅ Good for small-medium | ✅ Built for scale |
|
||||||
|
|
||||||
|
**Use Chroma for:** Development, prototypes, small-medium datasets (< 1M docs)
|
||||||
|
**Use Weaviate for:** Production, large datasets (> 1M docs), hybrid search
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Last Updated:** February 2026
|
||||||
|
**Tested With:** ChromaDB v0.4.22, Python 3.10+, skill-seekers v2.10.0
|
||||||
10
examples/chroma-example/requirements.txt
Normal file
10
examples/chroma-example/requirements.txt
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
# ChromaDB Example Dependencies
|
||||||
|
|
||||||
|
# Skill Seekers (main package)
|
||||||
|
skill-seekers>=2.10.0
|
||||||
|
|
||||||
|
# ChromaDB
|
||||||
|
chromadb>=0.4.0
|
||||||
|
|
||||||
|
# For pretty output
|
||||||
|
rich>=13.0.0
|
||||||
26
examples/faiss-example/1_generate_skill.py
Normal file
26
examples/faiss-example/1_generate_skill.py
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Generate skill for FAISS (same as other examples)"""
|
||||||
|
import subprocess, sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Step 1 of the FAISS example: scrape a small sample of the Flask docs with
# the skill-seekers CLI, then package the result in FAISS format.
print("=" * 60)
print("Step 1: Generating Skill for FAISS")
print("=" * 60)

# CLI invocations, run in order. The first failure stops the script with a
# short message and exit code 1 instead of an unhandled traceback.
commands = [
    # Scrape (capped at 20 pages to keep the demo quick)
    ["skill-seekers", "scrape", "--config", "configs/flask.json", "--max-pages", "20"],
    # Package
    ["skill-seekers", "package", "output/flask", "--target", "faiss"],
]
for command in commands:
    try:
        subprocess.run(command, check=True)
    except FileNotFoundError:
        # Raised when the executable itself cannot be found on PATH.
        print("\n❌ skill-seekers not found!")
        print("Install it with: pip install skill-seekers")
        sys.exit(1)
    except subprocess.CalledProcessError as err:
        print(f"\n❌ Command failed (exit code {err.returncode}): {' '.join(command)}")
        sys.exit(1)

# A successful CLI run does not prove the expected file name was produced, so
# verify it before calling stat() on it.
output = Path("output/flask-faiss.json")
if not output.exists():
    print(f"\n❌ Output file not found: {output}")
    sys.exit(1)

print(f"\n✅ Ready: {output} ({output.stat().st_size/1024:.1f} KB)")
print("Next: python 2_build_faiss_index.py (requires OPENAI_API_KEY)")
|
||||||
72
examples/faiss-example/2_build_faiss_index.py
Normal file
72
examples/faiss-example/2_build_faiss_index.py
Normal file
@@ -0,0 +1,72 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Build FAISS index with OpenAI embeddings"""
|
||||||
|
import json, sys, os
|
||||||
|
import numpy as np
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
try:
|
||||||
|
import faiss
|
||||||
|
from openai import OpenAI
|
||||||
|
from rich.console import Console
|
||||||
|
except ImportError:
|
||||||
|
print("❌ Missing dependencies! Run: pip install -r requirements.txt")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
console = Console()

# Check API key up front so the script fails before doing any work.
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    console.print("[red]❌ OPENAI_API_KEY not set![/red]")
    console.print("Set it with: export OPENAI_API_KEY=sk-...")
    sys.exit(1)

# Load data produced by 1_generate_skill.py
skill_path = Path("output/flask-faiss.json")
if not skill_path.exists():
    console.print(f"[red]❌ Skill file not found: {skill_path}[/red]")
    console.print("Run '1_generate_skill.py' first!")
    sys.exit(1)

console.print("📥 Loading skill data...")
# JSON is UTF-8 by specification; do not rely on the platform's locale encoding.
with open(skill_path, encoding="utf-8") as f:
    data = json.load(f)

documents = data["documents"]
metadatas = data["metadatas"]
ids = data["ids"]

console.print(f"✅ Loaded {len(documents)} documents")

# The index dimension is taken from the first embedding, so an empty document
# list cannot be indexed.
if not documents:
    console.print("[red]❌ No documents found in skill file - nothing to index[/red]")
    sys.exit(1)

# Generate embeddings (one API request per document)
console.print("\n🔄 Generating embeddings (this may take 30-60 seconds)...")
console.print(f" Cost: ~$0.001 for {len(documents)} documents")

client = OpenAI(api_key=api_key)
embeddings = []

for i, doc in enumerate(documents):
    response = client.embeddings.create(
        model="text-embedding-ada-002",
        # Character-based cut-off; the API limit is expressed in tokens, so
        # this is only a rough safeguard against over-long inputs.
        input=doc[:8000]
    )
    embeddings.append(response.data[0].embedding)

    if (i + 1) % 5 == 0:
        console.print(f" Progress: {i+1}/{len(documents)}")

console.print("✅ Embeddings generated!")

# Build FAISS index
console.print("\n🏗️ Building FAISS index...")
dimension = len(embeddings[0])  # 1536 for ada-002
vectors = np.array(embeddings).astype('float32')  # FAISS expects float32

# Create index (L2 distance)
index = faiss.IndexFlatL2(dimension)
index.add(vectors)

# Save everything: the index holds only vectors, so the texts, metadata and
# ids are stored alongside it, in the same order as the vectors.
faiss.write_index(index, "flask.index")
with open("flask_metadata.json", "w", encoding="utf-8") as f:
    json.dump({"documents": documents, "metadatas": metadatas, "ids": ids}, f)

console.print("✅ Index saved: flask.index")
console.print("✅ Metadata saved: flask_metadata.json")
console.print(f"\n💡 Total vectors: {index.ntotal}")
console.print(f"💡 Dimension: {dimension}")
console.print("\n➡️ Next: python 3_query_example.py")
|
||||||
72
examples/faiss-example/3_query_example.py
Normal file
72
examples/faiss-example/3_query_example.py
Normal file
@@ -0,0 +1,72 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Query FAISS index"""
|
||||||
|
import json, sys, os
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
try:
|
||||||
|
import faiss
|
||||||
|
from openai import OpenAI
|
||||||
|
from rich.console import Console
|
||||||
|
from rich.table import Table
|
||||||
|
except ImportError:
|
||||||
|
print("❌ Run: pip install -r requirements.txt")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
console = Console()

# Fail early, with a readable message, when a prerequisite is missing
# (same API-key check as 2_build_faiss_index.py).
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    console.print("[red]❌ OPENAI_API_KEY not set![/red]")
    console.print("Set it with: export OPENAI_API_KEY=sk-...")
    sys.exit(1)

for required_file in ("flask.index", "flask_metadata.json"):
    if not os.path.exists(required_file):
        console.print(f"[red]❌ {required_file} not found![/red]")
        console.print("Run '2_build_faiss_index.py' first!")
        sys.exit(1)

# Load index and metadata. The metadata file holds the documents in the same
# order as the vectors, so a FAISS result index maps directly into its lists.
console.print("📥 Loading FAISS index...")
index = faiss.read_index("flask.index")

with open("flask_metadata.json", encoding="utf-8") as f:
    data = json.load(f)

console.print(f"✅ Loaded {index.ntotal} vectors")

# Initialize OpenAI (used to embed query text)
client = OpenAI(api_key=api_key)
|
||||||
|
|
||||||
|
def search(query_text: str, k: int = 5):
    """Embed ``query_text``, search the FAISS index and print the top hits.

    Args:
        query_text: Natural-language query to embed with OpenAI.
        k: Maximum number of nearest neighbours to request.

    Relies on the module-level ``client``, ``index``, ``data`` and
    ``console`` objects. Prints a results table; returns nothing.
    """
    console.print(f"\n[yellow]Query:[/yellow] {query_text}")

    # Generate query embedding (must use the same model as the indexed vectors)
    response = client.embeddings.create(
        model="text-embedding-ada-002",
        input=query_text
    )
    query_vector = np.array([response.data[0].embedding]).astype('float32')

    # Search: one query row in, one row of distances/indices out
    distances, indices = index.search(query_vector, k)

    # FAISS pads the result with index -1 when fewer than k vectors are
    # available. Drop those slots: a negative index would otherwise silently
    # select the *last* document through Python's negative indexing.
    hits = [
        (dist, int(idx))
        for dist, idx in zip(distances[0], indices[0])
        if idx >= 0
    ]

    # Display results
    table = Table(show_header=True, header_style="bold magenta")
    table.add_column("#", width=3)
    table.add_column("Distance", width=10)
    table.add_column("Category", width=12)
    table.add_column("Content Preview")

    for i, (dist, idx) in enumerate(hits, 1):
        doc = data["documents"][idx]
        meta = data["metadatas"][idx]
        preview = doc[:80] + "..." if len(doc) > 80 else doc

        table.add_row(
            str(i),
            f"{dist:.2f}",
            meta.get("category", "N/A"),
            preview
        )

    console.print(table)
    console.print("[dim]💡 Distance: Lower = more similar[/dim]")
|
||||||
|
|
||||||
|
# Run a few representative queries against the index, three hits each.
console.print("[bold green]FAISS Query Examples[/bold green]\n")

for example_query in (
    "How do I create a Flask route?",
    "database models and ORM",
    "authentication and security",
):
    search(example_query, k=3)

console.print("\n✅ All examples completed!")
|
||||||
95
examples/faiss-example/README.md
Normal file
95
examples/faiss-example/README.md
Normal file
@@ -0,0 +1,95 @@
|
|||||||
|
# FAISS Vector Database Example
|
||||||
|
|
||||||
|
Facebook AI Similarity Search (FAISS) is a library for efficient similarity search of dense vectors. Perfect for large-scale semantic search.
|
||||||
|
|
||||||
|
## Quick Start
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 1. Install dependencies
|
||||||
|
pip install -r requirements.txt
|
||||||
|
|
||||||
|
# 2. Generate skill
|
||||||
|
python 1_generate_skill.py
|
||||||
|
|
||||||
|
# 3. Build FAISS index (requires OpenAI API key)
|
||||||
|
export OPENAI_API_KEY=sk-...
|
||||||
|
python 2_build_faiss_index.py
|
||||||
|
|
||||||
|
# 4. Query the index
|
||||||
|
python 3_query_example.py
|
||||||
|
```
|
||||||
|
|
||||||
|
## What's Different About FAISS?
|
||||||
|
|
||||||
|
- **No database server**: Runs in-process as a library with Python bindings
|
||||||
|
- **Blazing fast**: Optimized C++ implementation
|
||||||
|
- **Scales to billions**: Efficient for massive datasets
|
||||||
|
- **Requires embeddings**: You must generate vectors (we use OpenAI)
|
||||||
|
|
||||||
|
## Key Features
|
||||||
|
|
||||||
|
### Generate Embeddings
|
||||||
|
|
||||||
|
FAISS doesn't generate embeddings - you must provide them:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from openai import OpenAI
|
||||||
|
client = OpenAI()
|
||||||
|
|
||||||
|
# Generate embedding
|
||||||
|
response = client.embeddings.create(
|
||||||
|
model="text-embedding-ada-002",
|
||||||
|
input="Your text here"
|
||||||
|
)
|
||||||
|
embedding = response.data[0].embedding # 1536-dim vector
|
||||||
|
```
|
||||||
|
|
||||||
|
### Build Index
|
||||||
|
|
||||||
|
```python
|
||||||
|
import faiss
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
# Create index (L2 distance)
|
||||||
|
dimension = 1536 # OpenAI ada-002
|
||||||
|
index = faiss.IndexFlatL2(dimension)
|
||||||
|
|
||||||
|
# Add vectors
|
||||||
|
vectors = np.array(embeddings).astype('float32')
|
||||||
|
index.add(vectors)
|
||||||
|
|
||||||
|
# Save to disk
|
||||||
|
faiss.write_index(index, "skill.index")
|
||||||
|
```
|
||||||
|
|
||||||
|
### Search
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Load index
|
||||||
|
index = faiss.read_index("skill.index")
|
||||||
|
|
||||||
|
# Query (returns distances + indices)
|
||||||
|
distances, indices = index.search(query_vector, k=5)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Cost Estimate
|
||||||
|
|
||||||
|
OpenAI embeddings: ~$0.10 per 1M tokens
|
||||||
|
- 20 documents (~10K tokens): < $0.001
|
||||||
|
- 1000 documents (~500K tokens): ~$0.05
|
||||||
|
|
||||||
|
## File Structure
|
||||||
|
|
||||||
|
- `1_generate_skill.py` - Package for FAISS
|
||||||
|
- `2_build_faiss_index.py` - Generate embeddings & build index
|
||||||
|
- `3_query_example.py` - Search queries
|
||||||
|
|
||||||
|
## Resources
|
||||||
|
|
||||||
|
- **FAISS GitHub**: https://github.com/facebookresearch/faiss
|
||||||
|
- **FAISS Wiki**: https://github.com/facebookresearch/faiss/wiki
|
||||||
|
- **OpenAI Embeddings**: https://platform.openai.com/docs/guides/embeddings
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Note**: FAISS is best for advanced users who need maximum performance at scale. For simpler use cases, try ChromaDB or Weaviate.
|
||||||
6
examples/faiss-example/requirements.txt
Normal file
6
examples/faiss-example/requirements.txt
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
# FAISS Example Dependencies
|
||||||
|
skill-seekers>=2.10.0
|
||||||
|
faiss-cpu>=1.7.4 # or faiss-gpu for GPU support
|
||||||
|
openai>=1.0.0
|
||||||
|
numpy>=1.24.0
|
||||||
|
rich>=13.0.0
|
||||||
26
examples/qdrant-example/1_generate_skill.py
Normal file
26
examples/qdrant-example/1_generate_skill.py
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Generate skill for Qdrant"""
|
||||||
|
import subprocess, sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Step 1 of the Qdrant example: scrape a small sample of the Django docs with
# the skill-seekers CLI, then package the result in Qdrant format.
print("=" * 60)
print("Step 1: Generating Skill for Qdrant")
print("=" * 60)

# CLI invocations, run in order. The first failure stops the script with a
# short message and exit code 1 instead of an unhandled traceback.
commands = [
    # Scrape Django docs (capped at 20 pages to keep the demo quick)
    ["skill-seekers", "scrape", "--config", "configs/django.json", "--max-pages", "20"],
    # Package for Qdrant
    ["skill-seekers", "package", "output/django", "--target", "qdrant"],
]
for command in commands:
    try:
        subprocess.run(command, check=True)
    except FileNotFoundError:
        # Raised when the executable itself cannot be found on PATH.
        print("\n❌ skill-seekers not found!")
        print("Install it with: pip install skill-seekers")
        sys.exit(1)
    except subprocess.CalledProcessError as err:
        print(f"\n❌ Command failed (exit code {err.returncode}): {' '.join(command)}")
        sys.exit(1)

# A successful CLI run does not prove the expected file name was produced, so
# verify it before calling stat() on it.
output = Path("output/django-qdrant.json")
if not output.exists():
    print(f"\n❌ Output file not found: {output}")
    sys.exit(1)

print(f"\n✅ Ready: {output} ({output.stat().st_size/1024:.1f} KB)")
print("Next: python 2_upload_to_qdrant.py")
|
||||||
67
examples/qdrant-example/2_upload_to_qdrant.py
Normal file
67
examples/qdrant-example/2_upload_to_qdrant.py
Normal file
@@ -0,0 +1,67 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Upload to Qdrant"""
|
||||||
|
import json, sys, argparse
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
try:
|
||||||
|
from qdrant_client import QdrantClient
|
||||||
|
from qdrant_client.models import Distance, VectorParams, PointStruct
|
||||||
|
except ImportError:
|
||||||
|
print("❌ Run: pip install qdrant-client")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
parser = argparse.ArgumentParser()
parser.add_argument("--url", default="http://localhost:6333")
args = parser.parse_args()

print("=" * 60)
print("Step 2: Upload to Qdrant")
print("=" * 60)

# Connect. Listing the collections issues a real request, so "Connected!" is
# only printed once the server has actually answered; the listing is reused
# below to decide whether the collection must be dropped first.
print(f"\n🔗 Connecting to Qdrant at {args.url}...")
client = QdrantClient(url=args.url)
try:
    existing_collections = {c.name for c in client.get_collections().collections}
except Exception as err:  # script boundary: report any client/transport error
    print(f"❌ Connection failed: {err}")
    print("💡 Is Qdrant running? Try: docker run -p 6333:6333 qdrant/qdrant:latest")
    sys.exit(1)
print("✅ Connected!")

# Load data produced by 1_generate_skill.py
skill_path = Path("output/django-qdrant.json")
if not skill_path.exists():
    print(f"❌ Skill file not found: {skill_path}")
    print("Run '1_generate_skill.py' first!")
    sys.exit(1)

# JSON is UTF-8 by specification; do not rely on the platform's locale encoding.
with open(skill_path, encoding="utf-8") as f:
    data = json.load(f)

collection_name = data["collection_name"]
config = data["config"]

print(f"\n📦 Creating collection: {collection_name}")

# Recreate collection if it exists (explicit check instead of swallowing
# every error from delete_collection)
if collection_name in existing_collections:
    client.delete_collection(collection_name)

client.create_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(
        size=config["vector_size"],
        distance=Distance.COSINE
    )
)
print("✅ Collection created!")

# Upload points (with placeholder vectors for the demo)
print(f"\n📤 Uploading {len(data['points'])} points...")
print("⚠️ Note: Vectors are zero-filled placeholders - you'll need to add embeddings for real use")

# In production, put real embedding vectors here instead of zeros.
points = [
    PointStruct(
        id=point["id"],
        vector=[0.0] * config["vector_size"],  # Placeholder
        payload=point["payload"]
    )
    for point in data["points"]
]

client.upsert(collection_name=collection_name, points=points)

info = client.get_collection(collection_name)
print(f"✅ Uploaded! Collection has {info.points_count} points")
print("\nNext: Add embeddings, then python 3_query_example.py")
|
||||||
82
examples/qdrant-example/3_query_example.py
Normal file
82
examples/qdrant-example/3_query_example.py
Normal file
@@ -0,0 +1,82 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Query Qdrant (demonstrates filtering without vectors)"""
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
try:
|
||||||
|
from qdrant_client import QdrantClient
|
||||||
|
from qdrant_client.models import Filter, FieldCondition, MatchValue
|
||||||
|
from rich.console import Console
|
||||||
|
from rich.table import Table
|
||||||
|
except ImportError:
|
||||||
|
print("❌ Run: pip install qdrant-client rich")
|
||||||
|
exit(1)
|
||||||
|
|
||||||
|
console = Console()

# --url lets the demo target a non-default Qdrant instance.
cli = argparse.ArgumentParser()
cli.add_argument("--url", default="http://localhost:6333")
args = cli.parse_args()

console.print("[bold green]Qdrant Query Examples[/bold green]")
console.print(f"[dim]Connected to: {args.url}[/dim]\n")

# Connect
client = QdrantClient(url=args.url)
collection_name = "django"


def _field_equals(key, value):
    """Build a payload condition requiring ``key`` to equal ``value``."""
    return FieldCondition(key=key, match=MatchValue(value=value))


# Example 1: Scroll (no vector search) restricted to one category
console.print("[bold cyan]Example 1: Filter by Category[/bold cyan]\n")

# scroll() returns a tuple; its first element is the list of matching points.
api_points = client.scroll(
    collection_name=collection_name,
    scroll_filter=Filter(must=[_field_equals("category", "api")]),
    limit=5,
)[0]

table = Table(show_header=True, header_style="bold magenta")
for heading in ("ID", "Category", "File", "Content Preview"):
    table.add_column(heading)

for hit in api_points:
    payload = hit.payload
    short_id = str(hit.id)[:8] + "..."
    preview = payload["content"][:60] + "..."
    table.add_row(short_id, payload["category"], payload["file"], preview)

console.print(table)

# Example 2: Complex filter - every condition in `must` has to match (AND)
console.print("\n[bold cyan]Example 2: Complex Filter (AND)[/bold cyan]\n")

both_conditions = Filter(
    must=[
        _field_equals("category", "guides"),
        _field_equals("type", "reference"),
    ]
)
guide_points = client.scroll(
    collection_name=collection_name,
    scroll_filter=both_conditions,
    limit=3,
)[0]

console.print(f"[green]Found {len(guide_points)} points matching both conditions:[/green]\n")

for position, hit in enumerate(guide_points, 1):
    console.print(f"[bold]{position}. {hit.payload['file']}[/bold]")
    console.print(f" {hit.payload['content'][:100]}...\n")

console.print("✅ Query examples completed!")
console.print("\n[yellow]💡 Note:[/yellow] For vector search, add embeddings to points!")
|
||||||
82
examples/qdrant-example/README.md
Normal file
82
examples/qdrant-example/README.md
Normal file
@@ -0,0 +1,82 @@
|
|||||||
|
# Qdrant Vector Database Example
|
||||||
|
|
||||||
|
Qdrant is a vector similarity search engine with extended filtering support. Built in Rust for maximum performance.
|
||||||
|
|
||||||
|
## Quick Start
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 1. Start Qdrant (Docker)
|
||||||
|
docker run -p 6333:6333 qdrant/qdrant:latest
|
||||||
|
|
||||||
|
# 2. Install dependencies
|
||||||
|
pip install -r requirements.txt
|
||||||
|
|
||||||
|
# 3. Generate and upload
|
||||||
|
python 1_generate_skill.py
|
||||||
|
python 2_upload_to_qdrant.py
|
||||||
|
|
||||||
|
# 4. Query
|
||||||
|
python 3_query_example.py
|
||||||
|
```
|
||||||
|
|
||||||
|
## What Makes Qdrant Special?
|
||||||
|
|
||||||
|
- **Advanced Filtering**: Rich payload queries with AND/OR/NOT
|
||||||
|
- **High Performance**: Rust-based, handles billions of vectors
|
||||||
|
- **Production Ready**: Clustering, replication, persistence built-in
|
||||||
|
- **Flexible Storage**: In-memory or on-disk, cloud or self-hosted
|
||||||
|
|
||||||
|
## Key Features
|
||||||
|
|
||||||
|
### Rich Payload Filtering
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Complex filters
|
||||||
|
client.search(collection_name="django",
|
||||||
|
query_vector=vector,
|
||||||
|
query_filter=models.Filter(
|
||||||
|
must=[
|
||||||
|
models.FieldCondition(
|
||||||
|
key="category",
|
||||||
|
match=models.MatchValue(value="api")
|
||||||
|
)
|
||||||
|
],
|
||||||
|
should=[
|
||||||
|
models.FieldCondition(
|
||||||
|
key="type",
|
||||||
|
match=models.MatchValue(value="reference")
|
||||||
|
)
|
||||||
|
]
|
||||||
|
),
|
||||||
|
limit=5
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Hybrid Search
|
||||||
|
|
||||||
|
Combine vector similarity with payload filtering:
|
||||||
|
- Filter first (fast): Narrow by metadata, then search
|
||||||
|
- Search first: Find similar, then filter results
|
||||||
|
|
||||||
|
### Production Features
|
||||||
|
|
||||||
|
- **Snapshots**: Point-in-time backups
|
||||||
|
- **Replication**: High availability
|
||||||
|
- **Sharding**: Horizontal scaling
|
||||||
|
- **Monitoring**: Prometheus metrics
|
||||||
|
|
||||||
|
## Files
|
||||||
|
|
||||||
|
- `1_generate_skill.py` - Package for Qdrant
|
||||||
|
- `2_upload_to_qdrant.py` - Upload to Qdrant
|
||||||
|
- `3_query_example.py` - Query examples
|
||||||
|
|
||||||
|
## Resources
|
||||||
|
|
||||||
|
- **Qdrant Docs**: https://qdrant.tech/documentation/
|
||||||
|
- **Quick Start**: https://qdrant.tech/documentation/quick-start/
|
||||||
|
- **Cloud**: https://cloud.qdrant.io/
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Note**: Qdrant excels at production deployments with complex filtering needs. For simpler use cases, try ChromaDB.
|
||||||
4
examples/qdrant-example/requirements.txt
Normal file
4
examples/qdrant-example/requirements.txt
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
# Qdrant Example Dependencies
|
||||||
|
skill-seekers>=2.10.0
|
||||||
|
qdrant-client>=1.7.0
|
||||||
|
rich>=13.0.0
|
||||||
88
examples/weaviate-example/1_generate_skill.py
Normal file
88
examples/weaviate-example/1_generate_skill.py
Normal file
@@ -0,0 +1,88 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Step 1: Generate Skill for Weaviate
|
||||||
|
|
||||||
|
This script:
|
||||||
|
1. Scrapes React documentation (limited to 20 pages for demo)
|
||||||
|
2. Packages the skill in Weaviate format
|
||||||
|
3. Saves to output/react-weaviate.json
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python 1_generate_skill.py
|
||||||
|
"""
|
||||||
|
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
def main():
    """Scrape a React docs sample and package it for Weaviate.

    Verifies the skill-seekers CLI is available, runs the scrape and package
    commands, and reports the size and location of the generated file. Exits
    with status 1 if the CLI is missing, a command fails, or the expected
    output file is absent.
    """

    def run_cli(*cli_args):
        # Run skill-seekers quietly, capturing its output as text.
        return subprocess.run(
            ["skill-seekers", *cli_args],
            capture_output=True,
            text=True,
        )

    banner = "=" * 60
    print(banner)
    print("Step 1: Generating Skill for Weaviate")
    print(banner)

    # Probe for the CLI: a missing executable raises FileNotFoundError
    try:
        version_probe = run_cli("--version")
    except FileNotFoundError:
        print("\n❌ skill-seekers not found!")
        print("Install it with: pip install skill-seekers")
        sys.exit(1)
    else:
        print(f"\n✅ skill-seekers found: {version_probe.stdout.strip()}")

    # Step 1: Scrape React docs (small sample for demo)
    print("\n📥 Step 1/2: Scraping React documentation (20 pages)...")
    print("This may take 1-2 minutes...\n")

    scrape = run_cli("scrape", "--config", "configs/react.json", "--max-pages", "20")
    if scrape.returncode != 0:
        print(f"❌ Scraping failed:\n{scrape.stderr}")
        sys.exit(1)

    print("✅ Scraping completed!")

    # Step 2: Package for Weaviate
    print("\n📦 Step 2/2: Packaging for Weaviate...\n")

    packaging = run_cli("package", "output/react", "--target", "weaviate")
    if packaging.returncode != 0:
        print(f"❌ Packaging failed:\n{packaging.stderr}")
        sys.exit(1)

    # Show what the packager reported
    print(packaging.stdout)

    # The packaged skill is expected at this path
    output_file = Path("output/react-weaviate.json")
    if not output_file.exists():
        print("❌ Output file not found!")
        sys.exit(1)

    size_kb = output_file.stat().st_size / 1024
    print(f"📄 File size: {size_kb:.1f} KB")
    print(f"📂 Location: {output_file.absolute()}")
    print("\n✅ Ready for upload! Next step: python 2_upload_to_weaviate.py")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
185
examples/weaviate-example/2_upload_to_weaviate.py
Normal file
185
examples/weaviate-example/2_upload_to_weaviate.py
Normal file
@@ -0,0 +1,185 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Step 2: Upload to Weaviate
|
||||||
|
|
||||||
|
This script:
|
||||||
|
1. Connects to Weaviate instance (local or cloud)
|
||||||
|
2. Creates the schema (class + properties)
|
||||||
|
3. Batch uploads all objects
|
||||||
|
4. Verifies the upload
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
# Local Docker
|
||||||
|
python 2_upload_to_weaviate.py
|
||||||
|
|
||||||
|
# Weaviate Cloud
|
||||||
|
python 2_upload_to_weaviate.py --url https://your-cluster.weaviate.network --api-key YOUR_KEY
|
||||||
|
|
||||||
|
# Reset existing data
|
||||||
|
python 2_upload_to_weaviate.py --reset
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
try:
|
||||||
|
import weaviate
|
||||||
|
from weaviate.auth import AuthApiKey
|
||||||
|
except ImportError:
|
||||||
|
print("❌ weaviate-client not installed!")
|
||||||
|
print("Install it with: pip install weaviate-client")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
def connect_to_weaviate(url: str, api_key: str = None):
    """Open a Weaviate client for ``url`` and return it once the server is ready.

    When ``api_key`` is given the client authenticates with it (cloud
    instances); otherwise it connects without authentication (local Docker).
    Exits with status 1 if the connection fails or the server is not ready.
    """
    print(f"\n🔗 Connecting to Weaviate at {url}...")

    try:
        client_options = {"url": url}
        if api_key:
            # Weaviate Cloud with authentication
            client_options["auth_client_secret"] = AuthApiKey(api_key)
        client = weaviate.Client(**client_options)

        # Guard clause: stop here unless the server reports ready
        if not client.is_ready():
            print("❌ Weaviate is not ready")
            sys.exit(1)

        print("✅ Weaviate is ready!\n")
        return client

    except Exception as e:
        print(f"❌ Connection failed: {e}")
        print("\n💡 Tips:")
        print(" - For local: Ensure Docker is running (docker ps | grep weaviate)")
        print(" - For cloud: Check your URL and API key")
        sys.exit(1)
|
||||||
|
|
||||||
|
def load_skill_data(filepath: str = "output/react-weaviate.json"):
    """Load the Weaviate-format skill JSON.

    Args:
        filepath: Path to the JSON file written by ``1_generate_skill.py``.

    Returns:
        The parsed JSON document.

    Exits with status 1 (after printing a hint) when ``filepath`` is not an
    existing file.
    """
    path = Path(filepath)

    # is_file() also rejects a directory passed by mistake, which exists()
    # would let through to a confusing open() error.
    if not path.is_file():
        print(f"❌ Skill file not found: {filepath}")
        print("Run '1_generate_skill.py' first!")
        sys.exit(1)

    # JSON is UTF-8 by specification; decoding with the platform's locale
    # encoding can fail or corrupt non-ASCII documentation text.
    with open(path, encoding="utf-8") as f:
        return json.load(f)
|
||||||
|
|
||||||
|
def create_schema(client, schema: dict):
    """Create the Weaviate class described by ``schema``.

    If a class with the same name is already present, the user is asked
    interactively whether to delete and recreate it; any answer other than
    "y" leaves the existing class untouched and skips creation.
    """
    class_name = schema["class"]

    print(f"📊 Creating schema: {class_name}")

    # Look for a class with this name in the server's current schema
    already_defined = False
    for existing_class in client.schema.get().get("classes", []):
        if existing_class["class"] == class_name:
            already_defined = True
            break

    if already_defined:
        print(f"⚠️ Class '{class_name}' already exists")
        answer = input("Delete and recreate? [y/N]: ")
        if answer.lower() != "y":
            print("Skipping schema creation")
            return
        client.schema.delete_class(class_name)
        print("🗑️ Deleted existing class")

    # Create the class
    client.schema.create_class(schema)
    print("✅ Schema created successfully!\n")
|
||||||
|
|
||||||
|
def upload_objects(client, class_name: str, objects: list):
    """Add every object to ``class_name`` through the client's batch interface.

    Each entry of ``objects`` supplies its ``"properties"`` and ``"id"``.
    Progress is printed after every 100 objects, followed by a line for the
    trailing partial batch (if any) and a final summary.
    """
    total = len(objects)
    batch_size = 100

    print(f"📤 Uploading {total} objects in batches...")

    with client.batch as batch:
        batch.batch_size = batch_size

        for position, entry in enumerate(objects, start=1):
            # Queue the object on the batch
            batch.add_data_object(
                data_object=entry["properties"],
                class_name=class_name,
                uuid=entry["id"],
            )

            # Report each time a full batch worth of objects has been queued
            if position % batch_size == 0:
                print(f"✅ Batch {position // batch_size} uploaded ({position}/{total} objects)")

    # Trailing partial batch
    full_batches, final_count = divmod(total, batch_size)
    if final_count > 0:
        print(f"✅ Batch {full_batches + 1} uploaded ({final_count} objects)")

    print(f"\n✅ Successfully uploaded {total} documents to Weaviate")
|
||||||
|
|
||||||
|
def verify_upload(client, class_name: str):
    """Print how many objects ``class_name`` holds, using an aggregate count query."""
    count_query = client.query.aggregate(class_name).with_meta_count()
    response = count_query.do()
    class_stats = response["data"]["Aggregate"][class_name]
    count = class_stats[0]["meta"]["count"]
    print(f"🔍 Class '{class_name}' now contains {count} objects")
|
||||||
|
|
||||||
|
def main():
    """Parse CLI options, then connect, create the schema, upload and verify.

    Options:
        --url      Weaviate URL (default: http://localhost:8080)
        --api-key  API key for cloud instances
        --file     Path to the Weaviate JSON file
        --reset    Delete the existing class (if any) before uploading,
                   without the interactive confirmation prompt
    """
    parser = argparse.ArgumentParser(description="Upload skill to Weaviate")
    parser.add_argument(
        "--url",
        default="http://localhost:8080",
        help="Weaviate URL (default: http://localhost:8080)"
    )
    parser.add_argument(
        "--api-key",
        help="Weaviate API key (for cloud instances)"
    )
    parser.add_argument(
        "--file",
        default="output/react-weaviate.json",
        help="Path to Weaviate JSON file"
    )
    parser.add_argument(
        "--reset",
        action="store_true",
        help="Delete existing class before uploading"
    )

    args = parser.parse_args()

    print("=" * 60)
    print("Step 2: Upload to Weaviate")
    print("=" * 60)

    # Connect to Weaviate
    client = connect_to_weaviate(args.url, args.api_key)

    # Load skill data
    data = load_skill_data(args.file)

    # Honour --reset: drop the class up front so schema creation below starts
    # from a clean slate and never reaches its interactive prompt.
    if args.reset:
        class_name = data["schema"]["class"]
        existing_classes = client.schema.get().get("classes", [])
        if any(c["class"] == class_name for c in existing_classes):
            client.schema.delete_class(class_name)
            print(f"🗑️ Deleted existing class '{class_name}' (--reset)")

    # Create schema
    create_schema(client, data["schema"])

    # Upload objects
    upload_objects(client, data["class_name"], data["objects"])

    # Verify
    verify_upload(client, data["class_name"])

    print("\n✅ Upload complete! Next step: python 3_query_example.py")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
281
examples/weaviate-example/3_query_example.py
Normal file
281
examples/weaviate-example/3_query_example.py
Normal file
@@ -0,0 +1,281 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Step 3: Query Weaviate
|
||||||
|
|
||||||
|
This script demonstrates various query patterns with Weaviate:
|
||||||
|
1. Hybrid search (keyword + vector)
|
||||||
|
2. Metadata filtering
|
||||||
|
3. Limit and pagination
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
# Local Docker
|
||||||
|
python 3_query_example.py
|
||||||
|
|
||||||
|
# Weaviate Cloud
|
||||||
|
python 3_query_example.py --url https://your-cluster.weaviate.network --api-key YOUR_KEY
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import sys
|
||||||
|
|
||||||
|
try:
|
||||||
|
import weaviate
|
||||||
|
from weaviate.auth import AuthApiKey
|
||||||
|
from rich.console import Console
|
||||||
|
from rich.table import Table
|
||||||
|
from rich.panel import Panel
|
||||||
|
except ImportError:
|
||||||
|
print("❌ Missing dependencies!")
|
||||||
|
print("Install with: pip install weaviate-client rich")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
console = Console()
|
||||||
|
|
||||||
|
def connect_to_weaviate(url: str, api_key: str = None):
    """Return a ready Weaviate client for ``url``.

    Authenticates with ``api_key`` when one is supplied. Exits with status 1
    if the connection fails or the server does not report ready.
    """
    try:
        client_options = {"url": url}
        if api_key:
            client_options["auth_client_secret"] = AuthApiKey(api_key)
        client = weaviate.Client(**client_options)

        # Guard clause: stop here unless the server reports ready
        if not client.is_ready():
            console.print("[red]❌ Weaviate is not ready[/red]")
            sys.exit(1)

        return client

    except Exception as e:
        console.print(f"[red]❌ Connection failed: {e}[/red]")
        sys.exit(1)
|
||||||
|
|
||||||
|
def hybrid_search_example(client, class_name: str = "React"):
    """Example 1: Hybrid Search (keyword + vector).

    Runs a hybrid query (``alpha=0.5``) against ``class_name`` and prints the
    top three hits as a table.

    Args:
        client: Weaviate client exposing the ``client.query.get(...)`` builder.
        class_name: Name of the Weaviate class to query.

    Any exception raised while querying or rendering is reported on the
    console instead of propagating, so the remaining examples still run.
    """
    console.print("\n" + "=" * 60)
    console.print("[bold cyan]Example 1: Hybrid Search[/bold cyan]")
    console.print("=" * 60)

    query = "How do I use React hooks?"
    alpha = 0.5  # 50% keyword, 50% vector

    console.print(f"\n[yellow]Query:[/yellow] {query}")
    console.print(f"[yellow]Alpha:[/yellow] {alpha} (0=keyword only, 1=vector only)")

    try:
        result = (
            client.query.get(class_name, ["content", "source", "category", "file"])
            .with_hybrid(query=query, alpha=alpha)
            .with_limit(3)
            .do()
        )

        objects = result["data"]["Get"][class_name]

        if not objects:
            console.print("[red]No results found[/red]")
            return

        # Create results table
        table = Table(show_header=True, header_style="bold magenta")
        table.add_column("#", style="dim", width=3)
        table.add_column("Category", style="cyan")
        table.add_column("File", style="green")
        table.add_column("Content Preview", style="white")

        for i, obj in enumerate(objects, 1):
            content = obj["content"]
            content_preview = content[:100] + "..." if len(content) > 100 else content
            # Stored text is data, not Rich markup: without escape(), bracketed
            # snippets such as "[count, setCount]" would be parsed as style
            # tags and vanish from the output (or raise MarkupError).
            table.add_row(
                str(i),
                escape(str(obj["category"] or "")),
                escape(str(obj["file"] or "")),
                escape(content_preview),
            )

        console.print(table)

    except Exception as e:
        console.print(f"[red]Query failed: {e}[/red]")
|
||||||
|
|
||||||
|
def keyword_only_search(client, class_name: str = "React"):
    """Example 2: Keyword-Only Search (alpha=0).

    Runs a hybrid query with ``alpha=0`` against ``class_name`` and prints
    each of the top three hits in its own panel.

    Args:
        client: Weaviate client exposing the ``client.query.get(...)`` builder.
        class_name: Name of the Weaviate class to query.

    Any exception raised while querying or rendering is reported on the
    console instead of propagating, so the remaining examples still run.
    """
    console.print("\n" + "=" * 60)
    console.print("[bold cyan]Example 2: Keyword-Only Search[/bold cyan]")
    console.print("=" * 60)

    query = "useState Hook"
    alpha = 0  # Pure keyword search

    console.print(f"\n[yellow]Query:[/yellow] {query}")
    console.print(f"[yellow]Alpha:[/yellow] {alpha} (pure keyword/BM25)")

    try:
        result = (
            client.query.get(class_name, ["content", "category", "file"])
            .with_hybrid(query=query, alpha=alpha)
            .with_limit(3)
            .do()
        )

        objects = result["data"]["Get"][class_name]

        # Say so explicitly instead of printing nothing, matching the other examples.
        if not objects:
            console.print("[red]No results found[/red]")
            return

        for i, obj in enumerate(objects, 1):
            content = obj["content"]
            # Only add the ellipsis when the text was actually cut.
            snippet = content[:200] + "..." if len(content) > 200 else content
            # escape(): stored text is data, not Rich markup; bracketed
            # snippets would otherwise be parsed as style tags.
            panel = Panel(
                f"[cyan]Category:[/cyan] {escape(str(obj['category']))}\n"
                f"[cyan]File:[/cyan] {escape(str(obj['file']))}\n\n"
                f"[white]{escape(snippet)}[/white]",
                title=f"Result {i}",
                border_style="green",
            )
            console.print(panel)

    except Exception as e:
        console.print(f"[red]Query failed: {e}[/red]")
|
||||||
|
|
||||||
|
def filtered_search(client, class_name: str = "React"):
    """Example 3: Search with Metadata Filter.

    Runs a hybrid query (``alpha=0.5``) restricted to objects whose
    ``category`` equals ``"api"`` and prints up to five hits.

    Args:
        client: Weaviate client exposing the ``client.query.get(...)`` builder.
        class_name: Name of the Weaviate class to query.

    Any exception raised while querying or rendering is reported on the
    console instead of propagating, so the remaining examples still run.
    """
    console.print("\n" + "=" * 60)
    console.print("[bold cyan]Example 3: Filtered Search[/bold cyan]")
    console.print("=" * 60)

    query = "component"
    category_filter = "api"

    console.print(f"\n[yellow]Query:[/yellow] {query}")
    console.print(f"[yellow]Filter:[/yellow] category = '{category_filter}'")

    try:
        result = (
            client.query.get(class_name, ["content", "category", "file"])
            .with_hybrid(query=query, alpha=0.5)
            .with_where({
                "path": ["category"],
                "operator": "Equal",
                "valueText": category_filter,
            })
            .with_limit(5)
            .do()
        )

        objects = result["data"]["Get"][class_name]

        if not objects:
            console.print("[red]No results found[/red]")
            return

        console.print(f"\n[green]Found {len(objects)} results in '{category_filter}' category:[/green]\n")

        for i, obj in enumerate(objects, 1):
            content = obj["content"]
            # Only add the ellipsis when the text was actually cut.
            snippet = content[:150] + "..." if len(content) > 150 else content
            # escape(): stored text is data, not Rich markup; bracketed
            # snippets would otherwise be parsed as style tags.
            console.print(f"[bold]{i}. {escape(str(obj['file']))}[/bold]")
            console.print(f" {escape(snippet)}\n")

    except Exception as e:
        console.print(f"[red]Query failed: {e}[/red]")
|
||||||
|
|
||||||
|
def semantic_search(client, class_name: str = "React"):
    """Example 4: Pure Semantic Search (alpha=1).

    Runs a hybrid query with ``alpha=1`` against ``class_name`` and prints the
    top three hits.

    Args:
        client: Weaviate client exposing the ``client.query.get(...)`` builder.
        class_name: Name of the Weaviate class to query.

    Any exception raised while querying or rendering is reported on the
    console instead of propagating.
    """
    console.print("\n" + "=" * 60)
    console.print("[bold cyan]Example 4: Semantic Search[/bold cyan]")
    console.print("=" * 60)

    query = "managing application state"  # Conceptual query
    alpha = 1  # Pure vector/semantic search

    console.print(f"\n[yellow]Query:[/yellow] {query}")
    console.print(f"[yellow]Alpha:[/yellow] {alpha} (pure semantic/vector)")

    try:
        result = (
            client.query.get(class_name, ["content", "category", "file"])
            .with_hybrid(query=query, alpha=alpha)
            .with_limit(3)
            .do()
        )

        objects = result["data"]["Get"][class_name]

        # Say so explicitly instead of printing nothing, matching the other examples.
        if not objects:
            console.print("[red]No results found[/red]")
            return

        for i, obj in enumerate(objects, 1):
            content = obj["content"]
            # Only add the ellipsis when the text was actually cut.
            snippet = content[:200] + "..." if len(content) > 200 else content
            # escape(): stored text is data, not Rich markup; bracketed
            # snippets would otherwise be parsed as style tags.
            console.print(f"\n[bold green]Result {i}:[/bold green]")
            console.print(f"[cyan]Category:[/cyan] {escape(str(obj['category']))}")
            console.print(f"[cyan]File:[/cyan] {escape(str(obj['file']))}")
            console.print(f"[white]{escape(snippet)}[/white]")

    except Exception as e:
        console.print(f"[red]Query failed: {e}[/red]")
|
||||||
|
|
||||||
|
def get_statistics(client, class_name: str = "React"):
    """Show database statistics.

    Prints the total object count of ``class_name`` followed by the object
    count per ``category`` value.

    Args:
        client: Weaviate client exposing the ``client.query.aggregate(...)`` builder.
        class_name: Name of the Weaviate class to inspect.

    Any exception raised while querying is reported on the console instead of
    propagating, so the query examples still run afterwards.
    """
    console.print("\n" + "=" * 60)
    console.print("[bold cyan]Database Statistics[/bold cyan]")
    console.print("=" * 60)

    try:
        # Total count
        result = client.query.aggregate(class_name).with_meta_count().do()
        total_count = result["data"]["Aggregate"][class_name][0]["meta"]["count"]

        console.print(f"\n[green]Total objects:[/green] {total_count}")

        # Count by category. The group value is only returned when it is
        # requested explicitly; without with_fields() each group carries just
        # "meta" and the "groupedBy" lookup below raises KeyError.
        result = (
            client.query.aggregate(class_name)
            .with_group_by_filter(["category"])
            .with_fields("groupedBy { value }")
            .with_meta_count()
            .do()
        )

        groups = result["data"]["Aggregate"][class_name]

        console.print("\n[green]Objects by category:[/green]")
        for group in groups:
            category = group["groupedBy"]["value"]
            count = group["meta"]["count"]
            console.print(f" • {category}: {count}")

    except Exception as e:
        console.print(f"[red]Statistics failed: {e}[/red]")
|
||||||
|
|
||||||
|
def main():
    """Parse CLI options, connect to Weaviate, and run every example in order.

    Options: ``--url`` (default ``http://localhost:8080``), ``--api-key``
    (optional, for cloud instances) and ``--class`` (default ``React``).
    """
    parser = argparse.ArgumentParser(description="Query Weaviate examples")
    parser.add_argument(
        "--url",
        default="http://localhost:8080",
        help="Weaviate URL (default: http://localhost:8080)"
    )
    parser.add_argument(
        "--api-key",
        help="Weaviate API key (for cloud instances)"
    )
    parser.add_argument(
        "--class",
        dest="class_name",
        default="React",
        help="Class name to query (default: React)"
    )

    args = parser.parse_args()

    console.print("[bold green]Weaviate Query Examples[/bold green]")

    # Connect before announcing it: connect_to_weaviate() exits the process on
    # failure, so "Connected to" is only printed once the instance is ready.
    client = connect_to_weaviate(args.url, args.api_key)
    console.print(f"[dim]Connected to: {args.url}[/dim]")

    # Get statistics
    get_statistics(client, args.class_name)

    # Run examples
    hybrid_search_example(client, args.class_name)
    keyword_only_search(client, args.class_name)
    filtered_search(client, args.class_name)
    semantic_search(client, args.class_name)

    console.print("\n[bold green]✅ All examples completed![/bold green]")
    console.print("\n[cyan]💡 Tips:[/cyan]")
    console.print(" • Adjust 'alpha' to balance keyword vs semantic search")
    console.print(" • Use filters to narrow results by metadata")
    console.print(" • Combine multiple filters with 'And'/'Or' operators")
    console.print(" • See README.md for more customization options")
|
||||||
|
|
||||||
|
# Run the query examples only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||||
339
examples/weaviate-example/README.md
Normal file
339
examples/weaviate-example/README.md
Normal file
@@ -0,0 +1,339 @@
|
|||||||
|
# Weaviate Vector Database Example
|
||||||
|
|
||||||
|
This example demonstrates how to use Skill Seekers with Weaviate, a powerful vector database with hybrid search capabilities (keyword + semantic).
|
||||||
|
|
||||||
|
## What You'll Learn
|
||||||
|
|
||||||
|
- How to generate skills in Weaviate format
|
||||||
|
- How to create a Weaviate schema and upload data
|
||||||
|
- How to perform hybrid searches (keyword + vector)
|
||||||
|
- How to filter by metadata categories
|
||||||
|
|
||||||
|
## Prerequisites
|
||||||
|
|
||||||
|
### 1. Weaviate Instance
|
||||||
|
|
||||||
|
**Option A: Weaviate Cloud (Recommended for production)**
|
||||||
|
- Sign up at https://console.weaviate.cloud/
|
||||||
|
- Create a free sandbox cluster
|
||||||
|
- Get your cluster URL and API key
|
||||||
|
|
||||||
|
**Option B: Local Docker (Recommended for development)**
|
||||||
|
```bash
|
||||||
|
docker run -d \
|
||||||
|
--name weaviate \
|
||||||
|
-p 8080:8080 \
|
||||||
|
-e AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED=true \
|
||||||
|
-e PERSISTENCE_DATA_PATH=/var/lib/weaviate \
|
||||||
|
semitechnologies/weaviate:latest
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Python Dependencies
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install -r requirements.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
## Step-by-Step Guide
|
||||||
|
|
||||||
|
### Step 1: Generate Skill from Documentation
|
||||||
|
|
||||||
|
First, we'll scrape React documentation and package it for Weaviate:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python 1_generate_skill.py
|
||||||
|
```
|
||||||
|
|
||||||
|
This script will:
|
||||||
|
1. Scrape React docs (limited to 20 pages for demo)
|
||||||
|
2. Package the skill in Weaviate format (JSON with schema + objects)
|
||||||
|
3. Save to `output/react-weaviate.json`
|
||||||
|
|
||||||
|
**Expected Output:**
|
||||||
|
```
|
||||||
|
✅ Weaviate data packaged successfully!
|
||||||
|
📦 Output: output/react-weaviate.json
|
||||||
|
📊 Total objects: 21
|
||||||
|
📂 Categories: overview (1), guides (8), api (12)
|
||||||
|
```
|
||||||
|
|
||||||
|
**What's in the JSON?**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"schema": {
|
||||||
|
"class": "React",
|
||||||
|
"description": "React documentation skill",
|
||||||
|
"properties": [
|
||||||
|
{"name": "content", "dataType": ["text"]},
|
||||||
|
{"name": "source", "dataType": ["text"]},
|
||||||
|
{"name": "category", "dataType": ["text"]},
|
||||||
|
...
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"objects": [
|
||||||
|
{
|
||||||
|
"id": "uuid-here",
|
||||||
|
"properties": {
|
||||||
|
"content": "React is a JavaScript library...",
|
||||||
|
"source": "react",
|
||||||
|
"category": "overview",
|
||||||
|
...
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"class_name": "React"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Step 2: Upload to Weaviate
|
||||||
|
|
||||||
|
Now we'll create the schema and upload all objects to Weaviate:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python 2_upload_to_weaviate.py
|
||||||
|
```
|
||||||
|
|
||||||
|
**For local Docker:**
|
||||||
|
```bash
|
||||||
|
python 2_upload_to_weaviate.py --url http://localhost:8080
|
||||||
|
```
|
||||||
|
|
||||||
|
**For Weaviate Cloud:**
|
||||||
|
```bash
|
||||||
|
python 2_upload_to_weaviate.py \
|
||||||
|
--url https://your-cluster.weaviate.network \
|
||||||
|
--api-key YOUR_API_KEY
|
||||||
|
```
|
||||||
|
|
||||||
|
This script will:
|
||||||
|
1. Connect to your Weaviate instance
|
||||||
|
2. Create the schema (class + properties)
|
||||||
|
3. Batch upload all objects
|
||||||
|
4. Verify the upload was successful
|
||||||
|
|
||||||
|
**Expected Output:**
|
||||||
|
```
|
||||||
|
🔗 Connecting to Weaviate at http://localhost:8080...
|
||||||
|
✅ Weaviate is ready!
|
||||||
|
|
||||||
|
📊 Creating schema: React
|
||||||
|
✅ Schema created successfully!
|
||||||
|
|
||||||
|
📤 Uploading 21 objects in batches...
|
||||||
|
✅ Batch 1/1 uploaded (21 objects)
|
||||||
|
|
||||||
|
✅ Successfully uploaded 21 documents to Weaviate
|
||||||
|
🔍 Class 'React' now contains 21 objects
|
||||||
|
```
|
||||||
|
|
||||||
|
### Step 3: Query and Search
|
||||||
|
|
||||||
|
Now the fun part - querying your knowledge base!
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python 3_query_example.py
|
||||||
|
```
|
||||||
|
|
||||||
|
**For local Docker:**
|
||||||
|
```bash
|
||||||
|
python 3_query_example.py --url http://localhost:8080
|
||||||
|
```
|
||||||
|
|
||||||
|
**For Weaviate Cloud:**
|
||||||
|
```bash
|
||||||
|
python 3_query_example.py \
|
||||||
|
--url https://your-cluster.weaviate.network \
|
||||||
|
--api-key YOUR_API_KEY
|
||||||
|
```
|
||||||
|
|
||||||
|
This script demonstrates:
|
||||||
|
1. **Keyword Search**: Traditional text search
|
||||||
|
2. **Hybrid Search**: Combines keyword + vector similarity
|
||||||
|
3. **Metadata Filtering**: Filter by category
|
||||||
|
4. **Semantic Search**: Pure vector similarity (`alpha=1`)
|
||||||
|
|
||||||
|
**Example Queries:**
|
||||||
|
|
||||||
|
**Query 1: Hybrid Search**
|
||||||
|
```
|
||||||
|
Query: "How do I use React hooks?"
|
||||||
|
Alpha: 0.5 (50% keyword, 50% vector)
|
||||||
|
|
||||||
|
Results:
|
||||||
|
1. Category: api
|
||||||
|
Snippet: Hooks are functions that let you "hook into" React state and lifecycle...
|
||||||
|
|
||||||
|
2. Category: guides
|
||||||
|
Snippet: To use a Hook, you need to call it at the top level of your component...
|
||||||
|
```
|
||||||
|
|
||||||
|
**Query 2: Filter by Category**
|
||||||
|
```
|
||||||
|
Query: API reference
|
||||||
|
Category: api
|
||||||
|
|
||||||
|
Results:
|
||||||
|
1. useState Hook - Manage component state
|
||||||
|
2. useEffect Hook - Perform side effects
|
||||||
|
3. useContext Hook - Access context values
|
||||||
|
```
|
||||||
|
|
||||||
|
## Understanding Weaviate Features
|
||||||
|
|
||||||
|
### Hybrid Search (`alpha` parameter)
|
||||||
|
|
||||||
|
Weaviate's killer feature is hybrid search, which combines:
|
||||||
|
- **Keyword Search (BM25)**: Traditional text matching
|
||||||
|
- **Vector Search (ANN)**: Semantic similarity
|
||||||
|
|
||||||
|
Control the balance with `alpha`:
|
||||||
|
- `alpha=0`: Pure keyword search (BM25 only)
|
||||||
|
- `alpha=0.5`: Balanced (the value used in this example's hybrid queries - a good starting point)
|
||||||
|
- `alpha=1`: Pure vector search (semantic only)
|
||||||
|
|
||||||
|
**When to use what:**
|
||||||
|
- **Exact terms** (API names, error messages): `alpha=0` to `alpha=0.3`
|
||||||
|
- **Concepts** (how to do X, why does Y): `alpha=0.7` to `alpha=1`
|
||||||
|
- **General queries**: `alpha=0.5` (balanced)
|
||||||
|
|
||||||
|
### Metadata Filtering
|
||||||
|
|
||||||
|
Filter results by any property:
|
||||||
|
```python
|
||||||
|
.with_where({
|
||||||
|
"path": ["category"],
|
||||||
|
"operator": "Equal",
|
||||||
|
"valueText": "api"
|
||||||
|
})
|
||||||
|
```
|
||||||
|
|
||||||
|
Supported operators:
|
||||||
|
- `Equal`, `NotEqual`
|
||||||
|
- `GreaterThan`, `LessThan`
|
||||||
|
- `And`, `Or`, `Not`
|
||||||
|
|
||||||
|
### Schema Design
|
||||||
|
|
||||||
|
Our schema includes:
|
||||||
|
- **content**: The actual documentation text (vectorized)
|
||||||
|
- **source**: Skill name (e.g., "react")
|
||||||
|
- **category**: Document category (e.g., "api", "guides")
|
||||||
|
- **file**: Source file name
|
||||||
|
- **type**: Document type ("overview" or "reference")
|
||||||
|
- **version**: Skill version
|
||||||
|
|
||||||
|
## Customization
|
||||||
|
|
||||||
|
### Generate Your Own Skill
|
||||||
|
|
||||||
|
Want to use a different documentation source? Easy:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# 1_generate_skill.py (modify line 10)
|
||||||
|
"--config", "configs/vue.json", # Change to your config
|
||||||
|
```
|
||||||
|
|
||||||
|
Or scrape from scratch:
|
||||||
|
```bash
|
||||||
|
skill-seekers scrape --config configs/your_framework.json
|
||||||
|
skill-seekers package output/your_framework --target weaviate
|
||||||
|
```
|
||||||
|
|
||||||
|
### Adjust Search Parameters
|
||||||
|
|
||||||
|
In `3_query_example.py`, modify:
|
||||||
|
```python
|
||||||
|
# Adjust hybrid search balance
|
||||||
|
alpha=0.7 # More semantic, less keyword
|
||||||
|
|
||||||
|
# Adjust result count
|
||||||
|
.with_limit(10) # Get more results
|
||||||
|
|
||||||
|
# Add more filters
|
||||||
|
.with_where({
|
||||||
|
"operator": "And",
|
||||||
|
"operands": [
|
||||||
|
{"path": ["category"], "operator": "Equal", "valueText": "api"},
|
||||||
|
{"path": ["type"], "operator": "Equal", "valueText": "reference"}
|
||||||
|
]
|
||||||
|
})
|
||||||
|
```
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
### Connection Refused
|
||||||
|
```
|
||||||
|
Error: Connection refused to http://localhost:8080
|
||||||
|
```
|
||||||
|
|
||||||
|
**Solution:** Ensure Weaviate is running:
|
||||||
|
```bash
|
||||||
|
docker ps | grep weaviate
|
||||||
|
# If not running, start it:
|
||||||
|
docker start weaviate
|
||||||
|
```
|
||||||
|
|
||||||
|
### Schema Already Exists
|
||||||
|
```
|
||||||
|
Error: Class 'React' already exists
|
||||||
|
```
|
||||||
|
|
||||||
|
**Solution:** Delete the existing class:
|
||||||
|
```python
|
||||||
|
# In Python or using Weaviate API
|
||||||
|
client.schema.delete_class("React")
|
||||||
|
```
|
||||||
|
|
||||||
|
Or use the example's built-in reset:
|
||||||
|
```bash
|
||||||
|
python 2_upload_to_weaviate.py --reset
|
||||||
|
```
|
||||||
|
|
||||||
|
### Empty Results
|
||||||
|
```
|
||||||
|
Query returned 0 results
|
||||||
|
```
|
||||||
|
|
||||||
|
**Possible causes:**
|
||||||
|
1. **No embeddings**: Weaviate needs a vectorizer configured (we use default)
|
||||||
|
2. **Wrong class name**: Check the class name matches
|
||||||
|
3. **Data not uploaded**: Verify with `client.query.aggregate("React").with_meta_count().do()`
|
||||||
|
|
||||||
|
**Solution:** Check object count:
|
||||||
|
```python
|
||||||
|
result = client.query.aggregate("React").with_meta_count().do()
|
||||||
|
print(result) # Should show {"data": {"Aggregate": {"React": [{"meta": {"count": 21}}]}}}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Next Steps
|
||||||
|
|
||||||
|
1. **Try other skills**: Generate skills for your favorite frameworks
|
||||||
|
2. **Production deployment**: Use Weaviate Cloud for scalability
|
||||||
|
3. **Add custom vectorizers**: Use OpenAI, Cohere, or local models
|
||||||
|
4. **Build RAG apps**: Integrate with LangChain or LlamaIndex
|
||||||
|
|
||||||
|
## Resources
|
||||||
|
|
||||||
|
- **Weaviate Docs**: https://weaviate.io/developers/weaviate
|
||||||
|
- **Hybrid Search**: https://weaviate.io/developers/weaviate/search/hybrid
|
||||||
|
- **Python Client**: https://weaviate.io/developers/weaviate/client-libraries/python
|
||||||
|
- **Skill Seekers Docs**: https://github.com/yourusername/skill-seekers
|
||||||
|
|
||||||
|
## File Structure
|
||||||
|
|
||||||
|
```
|
||||||
|
weaviate-example/
|
||||||
|
├── README.md # This file
|
||||||
|
├── requirements.txt # Python dependencies
|
||||||
|
├── 1_generate_skill.py # Generate Weaviate-format skill
|
||||||
|
├── 2_upload_to_weaviate.py # Upload to Weaviate instance
|
||||||
|
├── 3_query_example.py # Query demonstrations
|
||||||
|
└── sample_output/ # Example outputs
|
||||||
|
├── react-weaviate.json # Generated skill (21 objects)
|
||||||
|
└── query_results.txt # Sample query results
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Last Updated:** February 2026
|
||||||
|
**Tested With:** Weaviate v1.25.0, Python 3.10+, skill-seekers v2.10.0
|
||||||
10
examples/weaviate-example/requirements.txt
Normal file
10
examples/weaviate-example/requirements.txt
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
# Weaviate Example Dependencies
|
||||||
|
|
||||||
|
# Skill Seekers (main package)
|
||||||
|
skill-seekers>=2.10.0
|
||||||
|
|
||||||
|
# Weaviate Python client
|
||||||
|
weaviate-client>=3.25.0,<4.0.0
|
||||||
|
|
||||||
|
# For pretty output
|
||||||
|
rich>=13.0.0
|
||||||
117
examples/weaviate-example/sample_output/query_results.txt
Normal file
117
examples/weaviate-example/sample_output/query_results.txt
Normal file
@@ -0,0 +1,117 @@
|
|||||||
|
# Sample Query Results from Weaviate
|
||||||
|
|
||||||
|
## Database Statistics
|
||||||
|
Total objects: 21
|
||||||
|
|
||||||
|
Objects by category:
|
||||||
|
• overview: 1
|
||||||
|
• guides: 8
|
||||||
|
• api: 12
|
||||||
|
|
||||||
|
====================================================================================
|
||||||
|
## Example 1: Hybrid Search
|
||||||
|
|
||||||
|
Query: How do I use React hooks?
|
||||||
|
Alpha: 0.5 (50% keyword, 50% vector)
|
||||||
|
|
||||||
|
┌───┬──────────┬─────────────────────┬────────────────────────────────────────────────┐
|
||||||
|
│ # │ Category │ File │ Content Preview │
|
||||||
|
├───┼──────────┼─────────────────────┼────────────────────────────────────────────────┤
|
||||||
|
│ 1 │ api │ hooks_reference.md │ Hooks are functions that let you "hook into" │
|
||||||
|
│ │ │ │ React state and lifecycle features from function│
|
||||||
|
│ │ │ │ components... │
|
||||||
|
├───┼──────────┼─────────────────────┼────────────────────────────────────────────────┤
|
||||||
|
│ 2 │ guides │ using_hooks.md │ To use a Hook, you need to call it at the top │
|
||||||
|
│ │ │ │ level of your component... │
|
||||||
|
├───┼──────────┼─────────────────────┼────────────────────────────────────────────────┤
|
||||||
|
│ 3 │ api │ usestate.md │ useState is a Hook that lets you add state to │
|
||||||
|
│ │ │ │ function components... │
|
||||||
|
└───┴──────────┴─────────────────────┴────────────────────────────────────────────────┘
|
||||||
|
|
||||||
|
====================================================================================
|
||||||
|
## Example 2: Keyword-Only Search
|
||||||
|
|
||||||
|
Query: useState Hook
|
||||||
|
Alpha: 0 (pure keyword/BM25)
|
||||||
|
|
||||||
|
╭─ Result 1 ──────────────────────────────────────────────────────────────────╮
|
||||||
|
│ Category: api │
|
||||||
|
│ File: usestate.md │
|
||||||
|
│ │
|
||||||
|
│ useState is a Hook that lets you add state to function components. Call it │
|
||||||
|
│ at the top level of your component to declare a state variable... │
|
||||||
|
╰──────────────────────────────────────────────────────────────────────────────╯
|
||||||
|
|
||||||
|
╭─ Result 2 ──────────────────────────────────────────────────────────────────╮
|
||||||
|
│ Category: api │
|
||||||
|
│ File: hooks_reference.md │
|
||||||
|
│ │
|
||||||
|
│ This page describes the APIs for the built-in Hooks in React. useState is │
|
||||||
|
│ the most commonly used Hook. It allows you to add state to function │
|
||||||
|
│ components... │
|
||||||
|
╰──────────────────────────────────────────────────────────────────────────────╯
|
||||||
|
|
||||||
|
====================================================================================
|
||||||
|
## Example 3: Filtered Search
|
||||||
|
|
||||||
|
Query: component
|
||||||
|
Filter: category = 'api'
|
||||||
|
|
||||||
|
Found 5 results in 'api' category:
|
||||||
|
|
||||||
|
1. usestate.md
|
||||||
|
useState is a Hook that lets you add state to function components. Call it
|
||||||
|
at the top level of your component to declare a state variable...
|
||||||
|
|
||||||
|
2. useeffect.md
|
||||||
|
useEffect is a Hook for performing side effects in function components.
|
||||||
|
It runs after render and can access props and state...
|
||||||
|
|
||||||
|
3. usecontext.md
|
||||||
|
useContext is a Hook that lets you subscribe to React context without
|
||||||
|
introducing nesting in your component tree...
|
||||||
|
|
||||||
|
4. usereducer.md
|
||||||
|
useReducer is an alternative to useState. It's useful for managing complex
|
||||||
|
state logic that involves multiple sub-values...
|
||||||
|
|
||||||
|
5. hooks_reference.md
|
||||||
|
This page describes the APIs for the built-in Hooks in React. Hooks let
|
||||||
|
you use different React features from your components...
|
||||||
|
|
||||||
|
====================================================================================
|
||||||
|
## Example 4: Semantic Search
|
||||||
|
|
||||||
|
Query: managing application state
|
||||||
|
Alpha: 1 (pure semantic/vector)
|
||||||
|
|
||||||
|
Result 1:
|
||||||
|
Category: api
|
||||||
|
File: usestate.md
|
||||||
|
useState is a Hook that lets you add state to function components. Call it
|
||||||
|
at the top level of your component to declare a state variable. The state
|
||||||
|
will be preserved between re-renders...
|
||||||
|
|
||||||
|
Result 2:
|
||||||
|
Category: api
|
||||||
|
File: usereducer.md
|
||||||
|
useReducer is an alternative to useState. It's useful for managing complex
|
||||||
|
state logic that involves multiple sub-values or when the next state depends
|
||||||
|
on the previous one...
|
||||||
|
|
||||||
|
Result 3:
|
||||||
|
Category: guides
|
||||||
|
File: state_and_lifecycle.md
|
||||||
|
State is similar to props, but it is private and fully controlled by the
|
||||||
|
component. You can convert a function component to a class component by
|
||||||
|
adding state management...
|
||||||
|
|
||||||
|
====================================================================================
|
||||||
|
|
||||||
|
✅ All examples completed!
|
||||||
|
|
||||||
|
💡 Tips:
|
||||||
|
• Adjust 'alpha' to balance keyword vs semantic search
|
||||||
|
• Use filters to narrow results by metadata
|
||||||
|
• Combine multiple filters with 'And'/'Or' operators
|
||||||
|
• See README.md for more customization options
|
||||||
Reference in New Issue
Block a user