feat: Add FAISS similarity search adaptor (Task #12)

🎯 What's New

- FAISS adaptor for efficient similarity search
- JSON-based metadata management (secure & portable)
- Comprehensive usage examples with 3 index types
- Supports dynamic document addition and filtered search

📦 Implementation Details

FAISS (Facebook AI Similarity Search) is a library for efficient similarity
search but requires separate metadata management. Unlike Weaviate/Chroma,
FAISS doesn't have built-in metadata support, so we store it separately as JSON.

**Key Components:**
- src/skill_seekers/cli/adaptors/faiss_helpers.py (399 lines)
  - FAISSHelpers class inheriting from SkillAdaptor
  - _generate_id(): Deterministic ID from content hash (MD5)
  - format_skill_md(): Converts docs to FAISS-compatible JSON
  - package(): Creates JSON with documents, metadatas, ids, config
  - upload(): Provides comprehensive example code (370 lines)

**Output Format:**
{
  "documents": ["doc1", "doc2", ...],
  "metadatas": [{"source": "...", "category": "..."}, ...],
  "ids": ["hash1", "hash2", ...],
  "config": {
    "index_type": "IndexFlatL2",
    "dimension": 1536,
    "metric": "L2"
  }
}

**Security Consideration:**
- Uses JSON instead of pickle for metadata storage
- Avoids arbitrary code execution risk
- More portable and human-readable

**Example Code Includes:**
1. Loading JSON data and generating embeddings (OpenAI ada-002)
2. Creating FAISS index with 3 options:
   - IndexFlatL2 (exact search, <1M vectors)
   - IndexIVFFlat (fast approximate, >100k vectors)
   - IndexHNSWFlat (graph-based, very fast)
3. Saving index + JSON metadata separately
4. Search with metadata filtering (post-processing)
5. Loading saved index for reuse
6. Adding new documents dynamically

🔧 Files Changed

- src/skill_seekers/cli/adaptors/__init__.py
  - Added FAISSHelpers import
  - Registered 'faiss' in ADAPTORS dict

- src/skill_seekers/cli/package_skill.py
  - Added 'faiss' to --target choices

- src/skill_seekers/cli/main.py
  - Added 'faiss' to unified CLI --target choices

🧪 Testing

- Tested with ansible skill: skill-seekers-package output/ansible --target faiss
- Verified JSON structure with jq
- Output: ansible-faiss.json (9.5 KB, 1 document)
- Package size: 9,717 bytes (9.5 KB)

📊 Week 2 Progress: 3/9 tasks complete

Task #12 Complete 
- Weaviate (Task #10) 
- Chroma (Task #11) 
- FAISS (Task #12)  ← Just completed

Next: Task #13 (Qdrant adaptor)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
yusyus
2026-02-05 23:47:42 +03:00
parent 6fd8474e9f
commit ff4196897b
4 changed files with 407 additions and 2 deletions

View File

@@ -49,6 +49,11 @@ try:
except ImportError:
ChromaAdaptor = None
try:
from .faiss_helpers import FAISSHelpers
except ImportError:
FAISSHelpers = None
# Registry of available adaptors
ADAPTORS: dict[str, type[SkillAdaptor]] = {}
@@ -70,6 +75,8 @@ if WeaviateAdaptor:
ADAPTORS["weaviate"] = WeaviateAdaptor
if ChromaAdaptor:
ADAPTORS["chroma"] = ChromaAdaptor
if FAISSHelpers:
ADAPTORS["faiss"] = FAISSHelpers
def get_adaptor(platform: str, config: dict = None) -> SkillAdaptor:

View File

@@ -0,0 +1,398 @@
#!/usr/bin/env python3
"""
FAISS Helpers
Utilities for working with FAISS indexes for RAG pipelines.
Provides easy-to-use wrappers around FAISS with metadata management.
"""
import json
from pathlib import Path
from typing import Any
import hashlib
from .base import SkillAdaptor, SkillMetadata
class FAISSHelpers(SkillAdaptor):
    """
    FAISS helper adaptor.

    Provides utilities for:
    - FAISS index creation (multiple types)
    - Metadata management (JSON storage - safe and portable)
    - Save/load indexes with metadata
    - Batch document addition
    - Search with metadata filtering
    - Index optimization

    Note: FAISS doesn't have built-in metadata support, so we manage it separately.
    """

    # Registry key: used as the ADAPTORS dict key and the CLI --target value.
    PLATFORM = "faiss"
    # Human-readable label for this platform.
    PLATFORM_NAME = "FAISS (Similarity Search)"
    DEFAULT_API_ENDPOINT = None  # FAISS runs locally; there is no remote API endpoint
def _generate_id(self, content: str, metadata: dict) -> str:
    """
    Build a deterministic document ID from metadata plus a content prefix.

    The same (source, file, first-100-chars) triple always hashes to the
    same ID, so re-packaging a skill yields stable, reproducible IDs.

    Args:
        content: Document content.
        metadata: Document metadata (only 'source' and 'file' are consulted).

    Returns:
        Hex digest string. MD5 is used purely as a fingerprint here,
        not for any security purpose.
    """
    source = metadata.get("source", "")
    file_name = metadata.get("file", "")
    fingerprint = "-".join([source, file_name, content[:100]])
    return hashlib.md5(fingerprint.encode()).hexdigest()
def format_skill_md(self, skill_dir: Path, metadata: "SkillMetadata") -> str:
    """
    Format a skill directory as FAISS-ready JSON.

    The payload mirrors the Chroma-style triple (documents / metadatas /
    ids) plus a ``config`` section carrying FAISS index hints, since FAISS
    itself stores only vectors.

    Args:
        skill_dir: Path to skill directory.
        metadata: Skill metadata (name/version are embedded per document).

    Returns:
        JSON string containing FAISS-compatible data.
    """
    documents = []
    metadatas = []
    ids = []

    def _append(text: str, doc_meta: dict) -> None:
        # Record one document together with its metadata and stable ID.
        documents.append(text)
        metadatas.append(doc_meta)
        ids.append(self._generate_id(text, doc_meta))

    # Main documentation: SKILL.md, read via the shared adaptor helper.
    if (skill_dir / "SKILL.md").exists():
        content = self._read_existing_content(skill_dir)
        if content.strip():
            _append(
                content,
                {
                    "source": metadata.name,
                    "category": "overview",
                    "file": "SKILL.md",
                    "type": "documentation",
                    "version": metadata.version,
                },
            )

    # Reference files: references/*.md in sorted order, skipping hidden files.
    refs_dir = skill_dir / "references"
    if refs_dir.exists():
        for ref_file in sorted(refs_dir.glob("*.md")):
            if not ref_file.is_file() or ref_file.name.startswith("."):
                continue
            try:
                ref_content = ref_file.read_text(encoding="utf-8")
            except Exception as e:
                # Best-effort: an unreadable reference must not abort packaging.
                print(f"⚠️ Warning: Could not read {ref_file.name}: {e}")
                continue
            if not ref_content.strip():
                continue
            _append(
                ref_content,
                {
                    "source": metadata.name,
                    # e.g. "api_reference.md" -> category "api reference"
                    "category": ref_file.stem.replace("_", " ").lower(),
                    "file": ref_file.name,
                    "type": "reference",
                    "version": metadata.version,
                },
            )

    # FAISS configuration hints: embeddings are generated downstream by the user.
    config = {
        "index_type": "IndexFlatL2",  # Recommended starting point
        "dimension": 1536,  # OpenAI ada-002 default
        "metric": "L2",  # Euclidean distance
        "description": (
            "FAISS requires embeddings. Use OpenAI, Cohere, or local models "
            "to generate embeddings before adding to index."
        ),
    }

    return json.dumps(
        {
            "documents": documents,
            "metadatas": metadatas,
            "ids": ids,
            "config": config,
        },
        indent=2,
        ensure_ascii=False,
    )
def package(self, skill_dir: Path, output_path: Path) -> Path:
    """
    Package skill into JSON file for FAISS.

    Creates a JSON file containing documents, metadata, and FAISS config,
    then prints a short summary (document count, index hints, categories).

    Args:
        skill_dir: Path to skill directory.
        output_path: Output path/filename for JSON file. A directory (or a
            path ending in "/") yields "<skill>-faiss.json" inside it; a
            ".zip"/".tar.gz" name is rewritten to "<name>-faiss.json"; an
            explicit ".json" name is used as given.

    Returns:
        Path to created JSON file.
    """
    skill_dir = Path(skill_dir)

    # Normalize the output filename.
    if output_path.is_dir() or str(output_path).endswith("/"):
        output_path = Path(output_path) / f"{skill_dir.name}-faiss.json"
    elif not str(output_path).endswith(".json"):
        output_str = str(output_path)
        # Strip known archive suffixes anchored at the END of the path.
        # (A plain str.replace would also mangle directory names that
        # happen to contain ".zip" or ".tar.gz".)
        for archive_suffix in (".tar.gz", ".zip"):
            if output_str.endswith(archive_suffix):
                output_str = output_str[: -len(archive_suffix)] + ".json"
                break
        if output_str.endswith(".json") and not output_str.endswith("-faiss.json"):
            output_str = output_str[: -len(".json")] + "-faiss.json"
        if not output_str.endswith(".json"):
            output_str += ".json"
        output_path = Path(output_str)

    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Minimal metadata; version is fixed since no manifest is read here.
    metadata = SkillMetadata(
        name=skill_dir.name,
        description=f"FAISS data for {skill_dir.name}",
        version="1.0.0",
    )

    # Generate FAISS data and write it out.
    faiss_json = self.format_skill_md(skill_dir, metadata)
    output_path.write_text(faiss_json, encoding="utf-8")

    print("\n✅ FAISS data packaged successfully!")
    print(f"📦 Output: {output_path}")

    # Parse the JSON we just wrote to report accurate stats.
    data = json.loads(faiss_json)
    print(f"📊 Total documents: {len(data['documents'])}")
    print(f"📐 Recommended index: {data['config']['index_type']}")
    print(f"📏 Embedding dimension: {data['config']['dimension']}")

    # Category breakdown (counts per metadata category).
    categories = {}
    for meta in data["metadatas"]:
        cat = meta.get("category", "unknown")
        categories[cat] = categories.get(cat, 0) + 1
    print("📁 Categories:")
    for cat, count in sorted(categories.items()):
        print(f" - {cat}: {count}")

    return output_path
def upload(self, package_path: Path, _api_key: str, **_kwargs) -> dict[str, Any]:
    """
    FAISS format does not support direct upload.

    Users should import the JSON file and create a FAISS index locally.
    Metadata is stored as JSON (safe and portable).

    Args:
        package_path: Path to JSON file.
        _api_key: Not used (FAISS runs locally).
        **_kwargs: Not used.

    Returns:
        Result dict with success=False and a usage-instructions message
        embedding a complete example script.
    """
    # Runnable end-to-end recipe. Double braces {{ }} survive str.format(),
    # which substitutes only {path}.
    example_code = """
# Example: Create FAISS index with JSON metadata (safe & portable)
import faiss
import json
import numpy as np
from openai import OpenAI
from pathlib import Path

# Load data
with open("{path}") as f:
    data = json.load(f)

# Generate embeddings (using OpenAI)
print("Generating embeddings...")
openai_client = OpenAI()
embeddings = []
for i, doc in enumerate(data["documents"]):
    response = openai_client.embeddings.create(
        model="text-embedding-ada-002",
        input=doc
    )
    embeddings.append(response.data[0].embedding)
    if (i + 1) % 10 == 0:
        print(f"  Generated {{i + 1}}/{{len(data['documents'])}} embeddings")

# Create FAISS index
dimension = len(embeddings[0])
print(f"\\nCreating FAISS index (dimension={{dimension}})...")

# Option 1: Flat index (exact search, best for <1M vectors)
index = faiss.IndexFlatL2(dimension)

# Option 2: IVF index (faster, approximate, for >100k vectors)
# quantizer = faiss.IndexFlatL2(dimension)
# index = faiss.IndexIVFFlat(quantizer, dimension, 100)
# index.train(np.array(embeddings).astype('float32'))

# Option 3: HNSW index (graph-based, very fast)
# index = faiss.IndexHNSWFlat(dimension, 32)

# Add vectors to index
vectors = np.array(embeddings).astype('float32')
index.add(vectors)
print(f"✅ Added {{index.ntotal}} vectors to index")

# Save index and metadata (using JSON - safe!)
output_dir = Path("faiss_db")
output_dir.mkdir(exist_ok=True)
faiss.write_index(index, str(output_dir / "docs.index"))

# Save metadata as JSON (secure and portable)
with open(output_dir / "metadata.json", "w") as f:
    json.dump({{
        "documents": data["documents"],
        "metadatas": data["metadatas"],
        "ids": data["ids"]
    }}, f, indent=2)
print(f"✅ Saved index to: {{output_dir}}/")

# Search with metadata
def search(query_text: str, k: int = 5):
    # Generate query embedding
    response = openai_client.embeddings.create(
        model="text-embedding-ada-002",
        input=query_text
    )
    query_vector = np.array([response.data[0].embedding]).astype('float32')

    # Search index
    distances, indices = index.search(query_vector, k)

    # Load metadata from JSON
    with open(output_dir / "metadata.json") as f:
        metadata_store = json.load(f)

    # Return results
    results = []
    for i, (dist, idx) in enumerate(zip(distances[0], indices[0])):
        results.append({{
            "rank": i + 1,
            "distance": float(dist),
            "metadata": metadata_store["metadatas"][idx],
            "text": metadata_store["documents"][idx][:200] + "..."
        }})
    return results

# Test search
results = search("How do I get started?")
for result in results:
    print(f"\\nRank {{result['rank']}} (distance={{result['distance']:.4f}}):")
    print(f"  Category: {{result['metadata']['category']}}")
    print(f"  File: {{result['metadata']['file']}}")
    print(f"  Text: {{result['text']}}")

# Load saved index (for later use)
def load_index(index_dir: str):
    index = faiss.read_index(str(Path(index_dir) / "docs.index"))
    with open(Path(index_dir) / "metadata.json") as f:
        metadata = json.load(f)
    return index, metadata

# Filtered search (post-processing with metadata)
def search_with_filter(query_text: str, category: str = None, k: int = 5):
    # Get more results for filtering
    results = search(query_text, k=50)

    # Filter by metadata
    if category:
        results = [r for r in results if r["metadata"]["category"] == category]
    return results[:k]

# Add new documents
def add_documents(new_docs: list, new_metadatas: list):
    # Generate embeddings
    new_embeddings = []
    for doc in new_docs:
        response = openai_client.embeddings.create(
            model="text-embedding-ada-002",
            input=doc
        )
        new_embeddings.append(response.data[0].embedding)

    # Add to index
    vectors = np.array(new_embeddings).astype('float32')
    index.add(vectors)

    # Update metadata (JSON)
    with open(output_dir / "metadata.json") as f:
        metadata = json.load(f)
    metadata["documents"].extend(new_docs)
    metadata["metadatas"].extend(new_metadatas)
    with open(output_dir / "metadata.json", "w") as f:
        json.dump(metadata, f, indent=2)

    # Save updated index
    faiss.write_index(index, str(output_dir / "docs.index"))
    print(f"✅ Added {{len(new_docs)}} documents")

# Index statistics
print(f"\\nIndex stats:")
print(f"  Total vectors: {{index.ntotal}}")
print(f"  Dimension: {{dimension}}")
print(f"  Type: {{type(index).__name__}}")
""".format(
        path=package_path.name
    )

    # success=False signals to the CLI that no remote upload occurred;
    # the message carries the full local-indexing recipe instead.
    return {
        "success": False,
        "skill_id": None,
        "url": str(package_path.absolute()),
        "message": (
            f"FAISS data packaged at: {package_path.absolute()}\n\n"
            "Create FAISS index with JSON metadata (secure & portable):\n"
            f"{example_code}"
        ),
    }
def validate_api_key(self, _api_key: str) -> bool:
    """FAISS is a local library; there is no API key to validate."""
    return False

def get_env_var_name(self) -> str:
    """No API-key environment variable applies to FAISS."""
    return ""

def supports_enhancement(self) -> bool:
    """AI enhancement is not available for the FAISS target."""
    return False

def enhance(self, _skill_dir: Path, _api_key: str) -> bool:
    """Always refuses enhancement and tells the user to enhance first."""
    for line in (
        "❌ FAISS format does not support enhancement",
        " Enhance before packaging:",
        " skill-seekers enhance output/skill/ --mode LOCAL",
        " skill-seekers package output/skill/ --target faiss",
    ):
        print(line)
    return False

View File

@@ -215,7 +215,7 @@ For more information: https://github.com/yusufkaraaslan/Skill_Seekers
package_parser.add_argument("--upload", action="store_true", help="Auto-upload after packaging")
package_parser.add_argument(
"--target",
choices=["claude", "gemini", "openai", "markdown", "langchain", "llama-index", "weaviate", "chroma"],
choices=["claude", "gemini", "openai", "markdown", "langchain", "llama-index", "weaviate", "chroma", "faiss"],
default="claude",
help="Target LLM platform (default: claude)",
)

View File

@@ -155,7 +155,7 @@ Examples:
parser.add_argument(
"--target",
choices=["claude", "gemini", "openai", "markdown", "langchain", "llama-index", "weaviate", "chroma"],
choices=["claude", "gemini", "openai", "markdown", "langchain", "llama-index", "weaviate", "chroma", "faiss"],
default="claude",
help="Target LLM platform (default: claude)",
)