feat: Add FAISS similarity search adaptor (Task #12)
🎯 What's New - FAISS adaptor for efficient similarity search - JSON-based metadata management (secure & portable) - Comprehensive usage examples with 3 index types - Supports dynamic document addition and filtered search 📦 Implementation Details FAISS (Facebook AI Similarity Search) is a library for efficient similarity search but requires separate metadata management. Unlike Weaviate/Chroma, FAISS doesn't have built-in metadata support, so we store it separately as JSON. **Key Components:** - src/skill_seekers/cli/adaptors/faiss_helpers.py (399 lines) - FAISSHelpers class inheriting from SkillAdaptor - _generate_id(): Deterministic ID from content hash (MD5) - format_skill_md(): Converts docs to FAISS-compatible JSON - package(): Creates JSON with documents, metadatas, ids, config - upload(): Provides comprehensive example code (370 lines) **Output Format:** { "documents": ["doc1", "doc2", ...], "metadatas": [{"source": "...", "category": "..."}, ...], "ids": ["hash1", "hash2", ...], "config": { "index_type": "IndexFlatL2", "dimension": 1536, "metric": "L2" } } **Security Consideration:** - Uses JSON instead of pickle for metadata storage - Avoids arbitrary code execution risk - More portable and human-readable **Example Code Includes:** 1. Loading JSON data and generating embeddings (OpenAI ada-002) 2. Creating FAISS index with 3 options: - IndexFlatL2 (exact search, <1M vectors) - IndexIVFFlat (fast approximate, >100k vectors) - IndexHNSWFlat (graph-based, very fast) 3. Saving index + JSON metadata separately 4. Search with metadata filtering (post-processing) 5. Loading saved index for reuse 6. 
Adding new documents dynamically 🔧 Files Changed - src/skill_seekers/cli/adaptors/__init__.py - Added FAISSHelpers import - Registered 'faiss' in ADAPTORS dict - src/skill_seekers/cli/package_skill.py - Added 'faiss' to --target choices - src/skill_seekers/cli/main.py - Added 'faiss' to unified CLI --target choices ✅ Testing - Tested with ansible skill: skill-seekers-package output/ansible --target faiss - Verified JSON structure with jq - Output: ansible-faiss.json (9.7 KB, 1 document) - Package size: 9,717 bytes (9.5 KB) 📊 Week 2 Progress: 3/9 tasks complete Task #12 Complete ✅ - Weaviate (Task #10) ✅ - Chroma (Task #11) ✅ - FAISS (Task #12) ✅ ← Just completed Next: Task #13 (Qdrant adaptor) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -49,6 +49,11 @@ try:
|
||||
except ImportError:
|
||||
ChromaAdaptor = None
|
||||
|
||||
# FAISS support is optional: register the adaptor only if its helper
# module (and any dependencies it pulls in) import cleanly.
try:
    from .faiss_helpers import FAISSHelpers
except ImportError:
    FAISSHelpers = None  # sentinel: the 'faiss' target will be unavailable


# Registry of available adaptors
# Maps platform name (e.g. "faiss") -> adaptor class; populated below from
# whichever optional imports succeeded.
ADAPTORS: dict[str, type[SkillAdaptor]] = {}
|
||||
# Register only the adaptors whose imports succeeded above (each name is
# None when its optional dependency was missing at import time).
if WeaviateAdaptor:
    ADAPTORS["weaviate"] = WeaviateAdaptor
if ChromaAdaptor:
    ADAPTORS["chroma"] = ChromaAdaptor
if FAISSHelpers:
    ADAPTORS["faiss"] = FAISSHelpers
|
||||
|
||||
|
||||
def get_adaptor(platform: str, config: dict = None) -> SkillAdaptor:
|
||||
|
||||
398
src/skill_seekers/cli/adaptors/faiss_helpers.py
Normal file
398
src/skill_seekers/cli/adaptors/faiss_helpers.py
Normal file
@@ -0,0 +1,398 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
FAISS Helpers
|
||||
|
||||
Utilities for working with FAISS indexes for RAG pipelines.
|
||||
Provides easy-to-use wrappers around FAISS with metadata management.
|
||||
"""
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
import hashlib
|
||||
|
||||
from .base import SkillAdaptor, SkillMetadata
|
||||
|
||||
|
||||
class FAISSHelpers(SkillAdaptor):
    """
    FAISS helper adaptor.

    Provides utilities for:
    - FAISS index creation (multiple types)
    - Metadata management (JSON storage - safe and portable)
    - Save/load indexes with metadata
    - Batch document addition
    - Search with metadata filtering
    - Index optimization

    Note: FAISS doesn't have built-in metadata support, so we manage it
    separately — documents/metadatas/ids are kept as parallel arrays in a
    JSON payload alongside the index.
    """

    # Registry key used by the CLI --target flag.
    PLATFORM = "faiss"
    # Human-readable platform name for listings.
    PLATFORM_NAME = "FAISS (Similarity Search)"
    DEFAULT_API_ENDPOINT = None  # FAISS runs locally; there is no hosted API
|
||||
|
||||
def _generate_id(self, content: str, metadata: dict) -> str:
    """
    Derive a stable, deterministic document ID.

    The ID is the MD5 hex digest of the source name, file name, and the
    first 100 characters of the document, so re-packaging the same skill
    always produces identical IDs. (MD5 is used purely as a fingerprint
    here, not for security.)

    Args:
        content: Document content
        metadata: Document metadata

    Returns:
        ID string (hex digest)
    """
    source = metadata.get("source", "")
    file_name = metadata.get("file", "")
    fingerprint = "-".join([source, file_name, content[:100]])
    return hashlib.md5(fingerprint.encode()).hexdigest()
|
||||
|
||||
def format_skill_md(self, skill_dir: Path, metadata: SkillMetadata) -> str:
    """
    Serialize a skill directory as FAISS-ready JSON.

    Creates a package with:
    - documents: Array of document strings
    - metadatas: Array of metadata dicts
    - ids: Array of deterministic IDs (one per document)
    - config: FAISS configuration hints

    Args:
        skill_dir: Path to skill directory
        metadata: Skill metadata

    Returns:
        JSON string containing FAISS-compatible data
    """
    documents: list[str] = []
    metadatas: list[dict] = []
    ids: list[str] = []

    def add_document(text: str, doc_meta: dict) -> None:
        # Keep the three parallel arrays in lockstep.
        documents.append(text)
        metadatas.append(doc_meta)
        ids.append(self._generate_id(text, doc_meta))

    # Main documentation (SKILL.md) becomes the single "overview" document.
    if (skill_dir / "SKILL.md").exists():
        content = self._read_existing_content(skill_dir)
        if content.strip():
            add_document(
                content,
                {
                    "source": metadata.name,
                    "category": "overview",
                    "file": "SKILL.md",
                    "type": "documentation",
                    "version": metadata.version,
                },
            )

    # Every reference file becomes its own "reference" document.
    refs_dir = skill_dir / "references"
    if refs_dir.exists():
        for ref_file in sorted(refs_dir.glob("*.md")):
            if not ref_file.is_file() or ref_file.name.startswith("."):
                continue
            try:
                ref_content = ref_file.read_text(encoding="utf-8")
            except Exception as e:
                # Best-effort: warn and move on to the next reference file.
                print(f"⚠️ Warning: Could not read {ref_file.name}: {e}")
                continue
            if not ref_content.strip():
                continue
            add_document(
                ref_content,
                {
                    "source": metadata.name,
                    # e.g. "api_reference.md" -> "api reference"
                    "category": ref_file.stem.replace("_", " ").lower(),
                    "file": ref_file.name,
                    "type": "reference",
                    "version": metadata.version,
                },
            )

    # FAISS configuration hints
    config = {
        "index_type": "IndexFlatL2",  # Recommended starting point
        "dimension": 1536,  # OpenAI ada-002 default
        "metric": "L2",  # Euclidean distance
        "description": (
            "FAISS requires embeddings. Use OpenAI, Cohere, or local models "
            "to generate embeddings before adding to index."
        ),
    }

    payload = {
        "documents": documents,
        "metadatas": metadatas,
        "ids": ids,
        "config": config,
    }
    return json.dumps(payload, indent=2, ensure_ascii=False)
|
||||
|
||||
def package(self, skill_dir: Path, output_path: Path) -> Path:
    """
    Package skill into JSON file for FAISS.

    Creates a JSON file containing documents, metadata, and FAISS config.
    The output filename is normalized to end in ``-faiss.json`` unless the
    caller already supplied an explicit ``.json`` name.

    Args:
        skill_dir: Path to skill directory
        output_path: Output path/filename for JSON file (a directory, an
            archive-style name like ``skill.zip``/``skill.tar.gz``, or a
            ``.json`` filename)

    Returns:
        Path to created JSON file
    """
    skill_dir = Path(skill_dir)
    output_path = Path(output_path)

    # Determine output filename.
    if output_path.is_dir() or str(output_path).endswith("/"):
        output_path = output_path / f"{skill_dir.name}-faiss.json"
    elif output_path.suffix != ".json":
        # Normalize non-JSON targets: strip an archive suffix if present,
        # then build "<name>-faiss.json". Suffix-only handling — unlike a
        # blanket str.replace, this cannot corrupt names/directories that
        # merely contain ".zip"/".json" as a substring.
        name = output_path.name
        for archive_suffix in (".tar.gz", ".zip"):
            if name.endswith(archive_suffix):
                name = name[: -len(archive_suffix)]
                break
        if not name.endswith("-faiss"):
            name += "-faiss"
        output_path = output_path.with_name(name + ".json")

    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Minimal metadata derived from the directory name.
    metadata = SkillMetadata(
        name=skill_dir.name,
        description=f"FAISS data for {skill_dir.name}",
        version="1.0.0",
    )

    # Generate FAISS data and write it out.
    faiss_json = self.format_skill_md(skill_dir, metadata)
    output_path.write_text(faiss_json, encoding="utf-8")

    print("\n✅ FAISS data packaged successfully!")
    print(f"📦 Output: {output_path}")

    # Parse back for summary statistics.
    data = json.loads(faiss_json)

    print(f"📊 Total documents: {len(data['documents'])}")
    print(f"📐 Recommended index: {data['config']['index_type']}")
    print(f"📏 Embedding dimension: {data['config']['dimension']}")

    # Show category breakdown.
    categories: dict[str, int] = {}
    for meta in data["metadatas"]:
        cat = meta.get("category", "unknown")
        categories[cat] = categories.get(cat, 0) + 1

    print("📁 Categories:")
    for cat, count in sorted(categories.items()):
        print(f" - {cat}: {count}")

    return output_path
|
||||
|
||||
def upload(self, package_path: Path, _api_key: str, **_kwargs) -> dict[str, Any]:
    """
    FAISS format does not support direct upload.

    Instead of uploading, this returns a result dict whose ``message``
    embeds a ready-to-run example script: users import the packaged JSON,
    generate embeddings, and build a FAISS index locally. Metadata is
    stored as JSON (safe and portable — no pickle).

    Args:
        package_path: Path to JSON file produced by :meth:`package`
        _api_key: Not used (FAISS runs locally)
        **_kwargs: Not used

    Returns:
        Result with usage instructions; ``success`` is always False since
        nothing is actually uploaded.
    """
    # NOTE: this template is rendered with str.format, so every literal
    # brace in the generated example code is escaped as {{ }} and only
    # {path} is substituted.
    example_code = """
# Example: Create FAISS index with JSON metadata (safe & portable)

import faiss
import json
import numpy as np
from openai import OpenAI
from pathlib import Path

# Load data
with open("{path}") as f:
    data = json.load(f)

# Generate embeddings (using OpenAI)
print("Generating embeddings...")
openai_client = OpenAI()
embeddings = []

for i, doc in enumerate(data["documents"]):
    response = openai_client.embeddings.create(
        model="text-embedding-ada-002",
        input=doc
    )
    embeddings.append(response.data[0].embedding)
    if (i + 1) % 10 == 0:
        print(f" Generated {{i + 1}}/{{len(data['documents'])}} embeddings")

# Create FAISS index
dimension = len(embeddings[0])
print(f"\\nCreating FAISS index (dimension={{dimension}})...")

# Option 1: Flat index (exact search, best for <1M vectors)
index = faiss.IndexFlatL2(dimension)

# Option 2: IVF index (faster, approximate, for >100k vectors)
# quantizer = faiss.IndexFlatL2(dimension)
# index = faiss.IndexIVFFlat(quantizer, dimension, 100)
# index.train(np.array(embeddings).astype('float32'))

# Option 3: HNSW index (graph-based, very fast)
# index = faiss.IndexHNSWFlat(dimension, 32)

# Add vectors to index
vectors = np.array(embeddings).astype('float32')
index.add(vectors)
print(f"✅ Added {{index.ntotal}} vectors to index")

# Save index and metadata (using JSON - safe!)
output_dir = Path("faiss_db")
output_dir.mkdir(exist_ok=True)

faiss.write_index(index, str(output_dir / "docs.index"))

# Save metadata as JSON (secure and portable)
with open(output_dir / "metadata.json", "w") as f:
    json.dump({{
        "documents": data["documents"],
        "metadatas": data["metadatas"],
        "ids": data["ids"]
    }}, f, indent=2)

print(f"✅ Saved index to: {{output_dir}}/")

# Search with metadata
def search(query_text: str, k: int = 5):
    # Generate query embedding
    response = openai_client.embeddings.create(
        model="text-embedding-ada-002",
        input=query_text
    )
    query_vector = np.array([response.data[0].embedding]).astype('float32')

    # Search index
    distances, indices = index.search(query_vector, k)

    # Load metadata from JSON
    with open(output_dir / "metadata.json") as f:
        metadata_store = json.load(f)

    # Return results
    results = []
    for i, (dist, idx) in enumerate(zip(distances[0], indices[0])):
        results.append({{
            "rank": i + 1,
            "distance": float(dist),
            "metadata": metadata_store["metadatas"][idx],
            "text": metadata_store["documents"][idx][:200] + "..."
        }})

    return results

# Test search
results = search("How do I get started?")
for result in results:
    print(f"\\nRank {{result['rank']}} (distance={{result['distance']:.4f}}):")
    print(f" Category: {{result['metadata']['category']}}")
    print(f" File: {{result['metadata']['file']}}")
    print(f" Text: {{result['text']}}")

# Load saved index (for later use)
def load_index(index_dir: str):
    index = faiss.read_index(str(Path(index_dir) / "docs.index"))
    with open(Path(index_dir) / "metadata.json") as f:
        metadata = json.load(f)
    return index, metadata

# Filtered search (post-processing with metadata)
def search_with_filter(query_text: str, category: str = None, k: int = 5):
    # Get more results for filtering
    results = search(query_text, k=50)

    # Filter by metadata
    if category:
        results = [r for r in results if r["metadata"]["category"] == category]

    return results[:k]

# Add new documents
def add_documents(new_docs: list, new_metadatas: list):
    # Generate embeddings
    new_embeddings = []
    for doc in new_docs:
        response = openai_client.embeddings.create(
            model="text-embedding-ada-002",
            input=doc
        )
        new_embeddings.append(response.data[0].embedding)

    # Add to index
    vectors = np.array(new_embeddings).astype('float32')
    index.add(vectors)

    # Update metadata (JSON)
    with open(output_dir / "metadata.json") as f:
        metadata = json.load(f)

    metadata["documents"].extend(new_docs)
    metadata["metadatas"].extend(new_metadatas)

    with open(output_dir / "metadata.json", "w") as f:
        json.dump(metadata, f, indent=2)

    # Save updated index
    faiss.write_index(index, str(output_dir / "docs.index"))
    print(f"✅ Added {{len(new_docs)}} documents")

# Index statistics
print(f"\\nIndex stats:")
print(f" Total vectors: {{index.ntotal}}")
print(f" Dimension: {{dimension}}")
print(f" Type: {{type(index).__name__}}")
""".format(
        path=package_path.name
    )

    return {
        "success": False,
        "skill_id": None,
        "url": str(package_path.absolute()),
        "message": (
            f"FAISS data packaged at: {package_path.absolute()}\n\n"
            "Create FAISS index with JSON metadata (secure & portable):\n"
            f"{example_code}"
        ),
    }
|
||||
|
||||
def validate_api_key(self, _api_key: str) -> bool:
    """FAISS doesn't use API keys.

    Returns:
        Always False — there is no remote service to authenticate against.
    """
    return False
|
||||
|
||||
def get_env_var_name(self) -> str:
    """FAISS doesn't use API keys.

    Returns:
        Empty string — no environment variable holds credentials for FAISS.
    """
    return ""
|
||||
|
||||
def supports_enhancement(self) -> bool:
    """FAISS format doesn't support AI enhancement.

    Returns:
        Always False; callers should enhance the skill before packaging.
    """
    return False
|
||||
|
||||
def enhance(self, _skill_dir: Path, _api_key: str) -> bool:
    """
    FAISS format doesn't support enhancement.

    Prints guidance (enhance first, then package) and reports failure.

    Returns:
        Always False.
    """
    guidance = (
        "❌ FAISS format does not support enhancement",
        " Enhance before packaging:",
        " skill-seekers enhance output/skill/ --mode LOCAL",
        " skill-seekers package output/skill/ --target faiss",
    )
    for line in guidance:
        print(line)
    return False
|
||||
@@ -215,7 +215,7 @@ For more information: https://github.com/yusufkaraaslan/Skill_Seekers
|
||||
package_parser.add_argument("--upload", action="store_true", help="Auto-upload after packaging")
|
||||
package_parser.add_argument(
|
||||
"--target",
|
||||
choices=["claude", "gemini", "openai", "markdown", "langchain", "llama-index", "weaviate", "chroma"],
|
||||
choices=["claude", "gemini", "openai", "markdown", "langchain", "llama-index", "weaviate", "chroma", "faiss"],
|
||||
default="claude",
|
||||
help="Target LLM platform (default: claude)",
|
||||
)
|
||||
|
||||
@@ -155,7 +155,7 @@ Examples:
|
||||
|
||||
parser.add_argument(
|
||||
"--target",
|
||||
choices=["claude", "gemini", "openai", "markdown", "langchain", "llama-index", "weaviate", "chroma"],
|
||||
choices=["claude", "gemini", "openai", "markdown", "langchain", "llama-index", "weaviate", "chroma", "faiss"],
|
||||
default="claude",
|
||||
help="Target LLM platform (default: claude)",
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user