#!/usr/bin/env python3
"""
FAISS Helpers

Utilities for working with FAISS indexes for RAG pipelines.
Provides easy-to-use wrappers around FAISS with metadata management.
"""

import json
from pathlib import Path
from typing import Any

from .base import SkillAdaptor, SkillMetadata
from skill_seekers.cli.arguments.common import DEFAULT_CHUNK_TOKENS, DEFAULT_CHUNK_OVERLAP_TOKENS


class FAISSHelpers(SkillAdaptor):
    """
    FAISS helper adaptor.

    Provides utilities for:
    - FAISS index creation (multiple types)
    - Metadata management (JSON storage - safe and portable)
    - Save/load indexes with metadata
    - Batch document addition
    - Search with metadata filtering
    - Index optimization

    Note: FAISS doesn't have built-in metadata support, so we manage it separately.
    """

    PLATFORM = "faiss"
    PLATFORM_NAME = "FAISS (Similarity Search)"
    DEFAULT_API_ENDPOINT = None  # FAISS runs locally

    def _generate_id(self, content: str, metadata: dict) -> str:
        """
        Generate deterministic ID from content and metadata.

        Args:
            content: Document content
            metadata: Document metadata

        Returns:
            ID string (hex digest)
        """
        return self._generate_deterministic_id(content, metadata, format="hex")

    def format_skill_md(
        self, skill_dir: Path, metadata: SkillMetadata, enable_chunking: bool = False, **kwargs
    ) -> str:
        """
        Format skill as JSON for FAISS ingestion.

        Creates a package with:
        - documents: Array of document strings
        - metadatas: Array of metadata dicts
        - ids: Array of IDs
        - config: FAISS configuration hints

        Args:
            skill_dir: Path to skill directory
            metadata: Skill metadata
            enable_chunking: Enable intelligent chunking for large documents
            **kwargs: Additional chunking parameters

        Returns:
            JSON string containing FAISS-compatible data
        """
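        # Illustrative shape of the returned JSON (values below are made-up examples,
        # not produced verbatim by this method):
        # {
        #   "documents": ["# My Skill\n..."],
        #   "metadatas": [{"source": "my-skill", "category": "overview", "file": "SKILL.md", ...}],
        #   "ids": ["a1b2c3..."],
        #   "config": {"index_type": "IndexFlatL2", "dimension": 1536, "metric": "L2", ...}
        # }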
        documents = []
        metadatas = []
        ids = []

        # Convert SKILL.md (main documentation)
        skill_md_path = skill_dir / "SKILL.md"
        if skill_md_path.exists():
            content = self._read_existing_content(skill_dir)
            if content.strip():
                doc_metadata = {
                    "source": metadata.name,
                    "category": "overview",
                    "file": "SKILL.md",
                    "type": "documentation",
                    "version": metadata.version,
                    "doc_version": metadata.doc_version,
                }

                # Chunk if enabled
                chunks = self._maybe_chunk_content(
                    content,
                    doc_metadata,
                    enable_chunking=enable_chunking,
                    chunk_max_tokens=kwargs.get("chunk_max_tokens", DEFAULT_CHUNK_TOKENS),
                    preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
                    source_file="SKILL.md",
                    chunk_overlap_tokens=kwargs.get(
                        "chunk_overlap_tokens", DEFAULT_CHUNK_OVERLAP_TOKENS
                    ),
                )

                # Add all chunks to parallel arrays
                for chunk_text, chunk_meta in chunks:
                    documents.append(chunk_text)
                    metadatas.append(chunk_meta)
                    ids.append(self._generate_id(chunk_text, chunk_meta))

        # Convert all reference files using base helper method
        for ref_file, ref_content in self._iterate_references(skill_dir):
            if ref_content.strip():
                category = ref_file.stem.replace("_", " ").lower()

                doc_metadata = {
                    "source": metadata.name,
                    "category": category,
                    "file": ref_file.name,
                    "type": "reference",
                    "version": metadata.version,
                    "doc_version": metadata.doc_version,
                }

                # Chunk if enabled
                chunks = self._maybe_chunk_content(
                    ref_content,
                    doc_metadata,
                    enable_chunking=enable_chunking,
                    chunk_max_tokens=kwargs.get("chunk_max_tokens", DEFAULT_CHUNK_TOKENS),
                    preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
                    source_file=ref_file.name,
                    chunk_overlap_tokens=kwargs.get(
                        "chunk_overlap_tokens", DEFAULT_CHUNK_OVERLAP_TOKENS
                    ),
                )

                # Add all chunks to parallel arrays
                for chunk_text, chunk_meta in chunks:
                    documents.append(chunk_text)
                    metadatas.append(chunk_meta)
                    ids.append(self._generate_id(chunk_text, chunk_meta))

        # FAISS configuration hints
        config = {
            "index_type": "IndexFlatL2",  # Recommended starting point
            "dimension": 1536,  # OpenAI ada-002 default
            "metric": "L2",  # Euclidean distance
            "description": (
                "FAISS requires embeddings. Use OpenAI, Cohere, or local models "
                "to generate embeddings before adding to index."
            ),
        }

        return json.dumps(
            {
                "documents": documents,
                "metadatas": metadatas,
                "ids": ids,
                "config": config,
            },
            indent=2,
            ensure_ascii=False,
        )

    def package(
        self,
        skill_dir: Path,
        output_path: Path,
        enable_chunking: bool = False,
        chunk_max_tokens: int = DEFAULT_CHUNK_TOKENS,
        preserve_code_blocks: bool = True,
        chunk_overlap_tokens: int = DEFAULT_CHUNK_OVERLAP_TOKENS,
    ) -> Path:
        """
        Package skill into JSON file for FAISS.

        Creates a JSON file containing documents, metadata, and FAISS config.

        Args:
            skill_dir: Path to skill directory
            output_path: Output path/filename for JSON file
            enable_chunking: Enable intelligent chunking for large documents
            chunk_max_tokens: Maximum tokens per chunk
            preserve_code_blocks: Keep code blocks intact when chunking
            chunk_overlap_tokens: Token overlap between adjacent chunks

        Returns:
            Path to created JSON file
        """
        skill_dir = Path(skill_dir)

        # Determine output filename using base helper method
        output_path = self._format_output_path(skill_dir, Path(output_path), "-faiss.json")
        output_path.parent.mkdir(parents=True, exist_ok=True)

        # Read metadata from SKILL.md frontmatter
        metadata = self._build_skill_metadata(skill_dir)

        # Generate FAISS data
        faiss_json = self.format_skill_md(
            skill_dir,
            metadata,
            enable_chunking=enable_chunking,
            chunk_max_tokens=chunk_max_tokens,
            preserve_code_blocks=preserve_code_blocks,
            chunk_overlap_tokens=chunk_overlap_tokens,
        )

        # Write to file
        output_path.write_text(faiss_json, encoding="utf-8")

        print("\n✅ FAISS data packaged successfully!")
        print(f"📦 Output: {output_path}")

        # Parse and show stats
        data = json.loads(faiss_json)

        print(f"📊 Total documents: {len(data['documents'])}")
        print(f"📐 Recommended index: {data['config']['index_type']}")
        print(f"📏 Embedding dimension: {data['config']['dimension']}")

        # Show category breakdown
        categories = {}
        for meta in data["metadatas"]:
            cat = meta.get("category", "unknown")
            categories[cat] = categories.get(cat, 0) + 1

        print("📁 Categories:")
        for cat, count in sorted(categories.items()):
            print(f"   - {cat}: {count}")

        return output_path

    def upload(self, package_path: Path, _api_key: str, **_kwargs) -> dict[str, Any]:
        """
        FAISS format does not support direct upload.

        Users should import the JSON file and create a FAISS index.
        Metadata is stored as JSON (safe and portable).

        Args:
            package_path: Path to JSON file
            _api_key: Not used
            **_kwargs: Not used

        Returns:
            Result with usage instructions
        """
        example_code = f"""
# Example: Create FAISS index with JSON metadata (safe & portable)

import faiss
import json
import numpy as np
from openai import OpenAI
from pathlib import Path

# Load data
with open("{package_path.name}") as f:
    data = json.load(f)

# Generate embeddings (using OpenAI)
print("Generating embeddings...")
openai_client = OpenAI()
embeddings = []

for i, doc in enumerate(data["documents"]):
    response = openai_client.embeddings.create(
        model="text-embedding-ada-002",
        input=doc
    )
    embeddings.append(response.data[0].embedding)
    if (i + 1) % 10 == 0:
        print(f"  Generated {{i + 1}}/{{len(data['documents'])}} embeddings")

# Create FAISS index
dimension = len(embeddings[0])
print(f"\\nCreating FAISS index (dimension={{dimension}})...")

# Option 1: Flat index (exact search, best for <1M vectors)
index = faiss.IndexFlatL2(dimension)

# Option 2: IVF index (faster, approximate, for >100k vectors)
# quantizer = faiss.IndexFlatL2(dimension)
# index = faiss.IndexIVFFlat(quantizer, dimension, 100)
# index.train(np.array(embeddings).astype('float32'))

# Option 3: HNSW index (graph-based, very fast)
# index = faiss.IndexHNSWFlat(dimension, 32)

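# Note (added guidance): IVF indexes search with nprobe=1 by default, which favors
# speed over recall; after training you can raise it, e.g. index.nprobe = 16.
# IndexFlatL2 and IndexHNSWFlat need no training step.
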
# Add vectors to index
vectors = np.array(embeddings).astype('float32')
index.add(vectors)
print(f"✅ Added {{index.ntotal}} vectors to index")

# Save index and metadata (using JSON - safe!)
output_dir = Path("faiss_db")
output_dir.mkdir(exist_ok=True)

faiss.write_index(index, str(output_dir / "docs.index"))

# Save metadata as JSON (secure and portable)
with open(output_dir / "metadata.json", "w") as f:
    json.dump({{
        "documents": data["documents"],
        "metadatas": data["metadatas"],
        "ids": data["ids"]
    }}, f, indent=2)

print(f"✅ Saved index to: {{output_dir}}/")

# Search with metadata
def search(query_text: str, k: int = 5):
    # Generate query embedding
    response = openai_client.embeddings.create(
        model="text-embedding-ada-002",
        input=query_text
    )
    query_vector = np.array([response.data[0].embedding]).astype('float32')

    # Search index
    distances, indices = index.search(query_vector, k)

    # Load metadata from JSON
    with open(output_dir / "metadata.json") as f:
        metadata_store = json.load(f)

    # Return results
    results = []
    for i, (dist, idx) in enumerate(zip(distances[0], indices[0])):
        results.append({{
            "rank": i + 1,
            "distance": float(dist),
            "metadata": metadata_store["metadatas"][idx],
            "text": metadata_store["documents"][idx][:200] + "..."
        }})

    return results

# Test search
results = search("How do I get started?")
for result in results:
    print(f"\\nRank {{result['rank']}} (distance={{result['distance']:.4f}}):")
    print(f"  Category: {{result['metadata']['category']}}")
    print(f"  File: {{result['metadata']['file']}}")
    print(f"  Text: {{result['text']}}")

# Load saved index (for later use)
def load_index(index_dir: str):
    index = faiss.read_index(str(Path(index_dir) / "docs.index"))
    with open(Path(index_dir) / "metadata.json") as f:
        metadata = json.load(f)
    return index, metadata

# Filtered search (post-processing with metadata)
def search_with_filter(query_text: str, category: str = None, k: int = 5):
    # Get more results for filtering
    results = search(query_text, k=50)

    # Filter by metadata
    if category:
        results = [r for r in results if r["metadata"]["category"] == category]

    return results[:k]

# Add new documents
def add_documents(new_docs: list, new_metadatas: list):
    # Generate embeddings
    new_embeddings = []
    for doc in new_docs:
        response = openai_client.embeddings.create(
            model="text-embedding-ada-002",
            input=doc
        )
        new_embeddings.append(response.data[0].embedding)

    # Add to index
    vectors = np.array(new_embeddings).astype('float32')
    index.add(vectors)

    # Update metadata (JSON)
    with open(output_dir / "metadata.json") as f:
        metadata = json.load(f)

    metadata["documents"].extend(new_docs)
    metadata["metadatas"].extend(new_metadatas)

    with open(output_dir / "metadata.json", "w") as f:
        json.dump(metadata, f, indent=2)

    # Save updated index
    faiss.write_index(index, str(output_dir / "docs.index"))
    print(f"✅ Added {{len(new_docs)}} documents")

# Index statistics
print(f"\\nIndex stats:")
print(f"  Total vectors: {{index.ntotal}}")
print(f"  Dimension: {{dimension}}")
print(f"  Type: {{type(index).__name__}}")
"""

        return {
            "success": False,
            "skill_id": None,
            "url": str(package_path.absolute()),
            "message": (
                f"FAISS data packaged at: {package_path.absolute()}\n\n"
                "Create FAISS index with JSON metadata (secure & portable):\n"
                f"{example_code}"
            ),
        }

    def validate_api_key(self, _api_key: str) -> bool:
        """FAISS doesn't use API keys."""
        return False

    def get_env_var_name(self) -> str:
        """FAISS doesn't use API keys."""
        return ""

    def supports_enhancement(self) -> bool:
        """FAISS format doesn't support AI enhancement."""
        return False

    def enhance(self, _skill_dir: Path, _api_key: str) -> bool:
        """FAISS format doesn't support enhancement."""
        print("❌ FAISS format does not support enhancement")
        print("   Enhance before packaging:")
        print("   skill-seekers enhance output/skill/ --mode LOCAL")
        print("   skill-seekers package output/skill/ --target faiss")
        return False
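

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only). It assumes FAISSHelpers can be
# constructed without arguments and that output/my-skill/ is a hypothetical
# skill directory containing a SKILL.md; adjust paths to your project.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    adaptor = FAISSHelpers()
    # package() writes the "-faiss.json" file; the exact filename is derived
    # by the base helper from the skill directory and output path.
    package_file = adaptor.package(
        Path("output/my-skill"),
        Path("output/my-skill-faiss.json"),
        enable_chunking=True,
    )
    # upload() does not push anywhere; it returns instructions for building the index.
    result = adaptor.upload(package_file, "")
    print(result["message"])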