#!/usr/bin/env python3
"""
FAISS Helpers

Utilities for working with FAISS indexes for RAG pipelines.
Provides easy-to-use wrappers around FAISS with metadata management.
"""
import hashlib
import json
from collections import Counter
from pathlib import Path
from typing import Any

from .base import SkillAdaptor, SkillMetadata


class FAISSHelpers(SkillAdaptor):
    """
    FAISS helper adaptor.

    Provides utilities for:
    - FAISS index creation (multiple types)
    - Metadata management (JSON storage - safe and portable)
    - Save/load indexes with metadata
    - Batch document addition
    - Search with metadata filtering
    - Index optimization

    Note: FAISS doesn't have built-in metadata support, so we manage it
    separately.
    """

    PLATFORM = "faiss"
    PLATFORM_NAME = "FAISS (Similarity Search)"
    DEFAULT_API_ENDPOINT = None  # FAISS runs locally

    def _generate_id(self, content: str, metadata: dict) -> str:
        """
        Generate a deterministic ID from content and metadata.

        Args:
            content: Document content (only the first 100 chars are hashed)
            metadata: Document metadata (``source`` and ``file`` keys are used)

        Returns:
            ID string (hex digest)
        """
        id_string = (
            f"{metadata.get('source', '')}-{metadata.get('file', '')}-{content[:100]}"
        )
        # MD5 is used purely as a fast, stable fingerprint — not for security.
        return hashlib.md5(id_string.encode()).hexdigest()

    def format_skill_md(self, skill_dir: Path, metadata: SkillMetadata) -> str:
        """
        Format skill as JSON for FAISS ingestion.

        Creates a package with:
        - documents: Array of document strings
        - metadatas: Array of metadata dicts
        - ids: Array of IDs
        - config: FAISS configuration hints

        Args:
            skill_dir: Path to skill directory
            metadata: Skill metadata

        Returns:
            JSON string containing FAISS-compatible data
        """
        documents: list[str] = []
        metadatas: list[dict] = []
        ids: list[str] = []

        # Convert SKILL.md (main documentation)
        skill_md_path = skill_dir / "SKILL.md"
        if skill_md_path.exists():
            content = self._read_existing_content(skill_dir)
            if content.strip():
                doc_metadata = {
                    "source": metadata.name,
                    "category": "overview",
                    "file": "SKILL.md",
                    "type": "documentation",
                    "version": metadata.version,
                }
                documents.append(content)
                metadatas.append(doc_metadata)
                ids.append(self._generate_id(content, doc_metadata))

        # Convert all reference files
        refs_dir = skill_dir / "references"
        if refs_dir.exists():
            for ref_file in sorted(refs_dir.glob("*.md")):
                if ref_file.is_file() and not ref_file.name.startswith("."):
                    try:
                        ref_content = ref_file.read_text(encoding="utf-8")
                        if ref_content.strip():
                            # Derive a human-readable category from the filename.
                            category = ref_file.stem.replace("_", " ").lower()
                            doc_metadata = {
                                "source": metadata.name,
                                "category": category,
                                "file": ref_file.name,
                                "type": "reference",
                                "version": metadata.version,
                            }
                            documents.append(ref_content)
                            metadatas.append(doc_metadata)
                            ids.append(self._generate_id(ref_content, doc_metadata))
                    except Exception as e:
                        # Best-effort: skip unreadable files, keep packaging.
                        print(f"āš ļø  Warning: Could not read {ref_file.name}: {e}")
                        continue

        # FAISS configuration hints
        config = {
            "index_type": "IndexFlatL2",  # Recommended starting point
            "dimension": 1536,  # OpenAI ada-002 default
            "metric": "L2",  # Euclidean distance
            "description": (
                "FAISS requires embeddings. Use OpenAI, Cohere, or local models "
                "to generate embeddings before adding to index."
            ),
        }

        return json.dumps(
            {
                "documents": documents,
                "metadatas": metadatas,
                "ids": ids,
                "config": config,
            },
            indent=2,
            ensure_ascii=False,
        )

    @staticmethod
    def _resolve_output_path(skill_dir: Path, output_path: Path) -> Path:
        """
        Normalize *output_path* to a ``*-faiss.json`` file path.

        Rules:
        - A directory (or a path ending in "/") gets "<skill>-faiss.json"
          appended.
        - A path already ending in ".json" is used verbatim.
        - Any other path has known archive suffixes (.zip, .tar.gz) stripped
          and is given the "-faiss.json" suffix.

        Args:
            skill_dir: Skill directory (its name seeds the default filename)
            output_path: User-supplied output path or directory

        Returns:
            Resolved output file path
        """
        raw = str(output_path)
        if output_path.is_dir() or raw.endswith("/"):
            return Path(output_path) / f"{skill_dir.name}-faiss.json"
        if raw.endswith(".json"):
            return Path(output_path)
        # Strip archive extensions, then enforce the platform suffix.
        stem = raw.replace(".tar.gz", "").replace(".zip", "")
        if not stem.endswith("-faiss"):
            stem += "-faiss"
        return Path(stem + ".json")

    @staticmethod
    def _print_package_stats(data: dict) -> None:
        """Print document counts, index hints, and a per-category breakdown."""
        print(f"šŸ“Š Total documents: {len(data['documents'])}")
        print(f"šŸ“ Recommended index: {data['config']['index_type']}")
        print(f"šŸ“ Embedding dimension: {data['config']['dimension']}")

        categories = Counter(
            meta.get("category", "unknown") for meta in data["metadatas"]
        )
        print("šŸ“ Categories:")
        for cat, count in sorted(categories.items()):
            print(f"   - {cat}: {count}")

    def package(self, skill_dir: Path, output_path: Path) -> Path:
        """
        Package skill into JSON file for FAISS.

        Creates a JSON file containing documents, metadata, and FAISS config.

        Args:
            skill_dir: Path to skill directory
            output_path: Output path/filename for JSON file

        Returns:
            Path to created JSON file
        """
        skill_dir = Path(skill_dir)

        # Determine output filename (always a *-faiss.json file).
        output_path = self._resolve_output_path(skill_dir, output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        # Read metadata
        metadata = SkillMetadata(
            name=skill_dir.name,
            description=f"FAISS data for {skill_dir.name}",
            version="1.0.0",
        )

        # Generate FAISS data and write to file.
        faiss_json = self.format_skill_md(skill_dir, metadata)
        output_path.write_text(faiss_json, encoding="utf-8")

        print("\nāœ… FAISS data packaged successfully!")
        print(f"šŸ“¦ Output: {output_path}")

        # Parse and show stats
        self._print_package_stats(json.loads(faiss_json))

        return output_path

    def upload(self, package_path: Path, _api_key: str, **_kwargs) -> dict[str, Any]:
        """
        FAISS format does not support direct upload.

        Users should import the JSON file and create a FAISS index.
        Metadata is stored as JSON (safe and portable).

        Args:
            package_path: Path to JSON file
            _api_key: Not used
            **_kwargs: Not used

        Returns:
            Result with usage instructions
        """
        # Doubled braces ({{ }}) survive .format() as literal braces; only
        # {path} is substituted.
        example_code = """
# Example: Create FAISS index with JSON metadata (safe & portable)
import faiss
import json
import numpy as np
from openai import OpenAI
from pathlib import Path

# Load data
with open("{path}") as f:
    data = json.load(f)

# Generate embeddings (using OpenAI)
print("Generating embeddings...")
openai_client = OpenAI()

embeddings = []
for i, doc in enumerate(data["documents"]):
    response = openai_client.embeddings.create(
        model="text-embedding-ada-002",
        input=doc
    )
    embeddings.append(response.data[0].embedding)
    if (i + 1) % 10 == 0:
        print(f"  Generated {{i + 1}}/{{len(data['documents'])}} embeddings")

# Create FAISS index
dimension = len(embeddings[0])
print(f"\\nCreating FAISS index (dimension={{dimension}})...")

# Option 1: Flat index (exact search, best for <1M vectors)
index = faiss.IndexFlatL2(dimension)

# Option 2: IVF index (faster, approximate, for >100k vectors)
# quantizer = faiss.IndexFlatL2(dimension)
# index = faiss.IndexIVFFlat(quantizer, dimension, 100)
# index.train(np.array(embeddings).astype('float32'))

# Option 3: HNSW index (graph-based, very fast)
# index = faiss.IndexHNSWFlat(dimension, 32)

# Add vectors to index
vectors = np.array(embeddings).astype('float32')
index.add(vectors)
print(f"āœ… Added {{index.ntotal}} vectors to index")

# Save index and metadata (using JSON - safe!)
output_dir = Path("faiss_db")
output_dir.mkdir(exist_ok=True)
faiss.write_index(index, str(output_dir / "docs.index"))

# Save metadata as JSON (secure and portable)
with open(output_dir / "metadata.json", "w") as f:
    json.dump({{
        "documents": data["documents"],
        "metadatas": data["metadatas"],
        "ids": data["ids"]
    }}, f, indent=2)

print(f"āœ… Saved index to: {{output_dir}}/")

# Search with metadata
def search(query_text: str, k: int = 5):
    # Generate query embedding
    response = openai_client.embeddings.create(
        model="text-embedding-ada-002",
        input=query_text
    )
    query_vector = np.array([response.data[0].embedding]).astype('float32')

    # Search index
    distances, indices = index.search(query_vector, k)

    # Load metadata from JSON
    with open(output_dir / "metadata.json") as f:
        metadata_store = json.load(f)

    # Return results
    results = []
    for i, (dist, idx) in enumerate(zip(distances[0], indices[0])):
        results.append({{
            "rank": i + 1,
            "distance": float(dist),
            "metadata": metadata_store["metadatas"][idx],
            "text": metadata_store["documents"][idx][:200] + "..."
        }})
    return results

# Test search
results = search("How do I get started?")
for result in results:
    print(f"\\nRank {{result['rank']}} (distance={{result['distance']:.4f}}):")
    print(f"  Category: {{result['metadata']['category']}}")
    print(f"  File: {{result['metadata']['file']}}")
    print(f"  Text: {{result['text']}}")

# Load saved index (for later use)
def load_index(index_dir: str):
    index = faiss.read_index(str(Path(index_dir) / "docs.index"))
    with open(Path(index_dir) / "metadata.json") as f:
        metadata = json.load(f)
    return index, metadata

# Filtered search (post-processing with metadata)
def search_with_filter(query_text: str, category: str = None, k: int = 5):
    # Get more results for filtering
    results = search(query_text, k=50)

    # Filter by metadata
    if category:
        results = [r for r in results if r["metadata"]["category"] == category]

    return results[:k]

# Add new documents
def add_documents(new_docs: list, new_metadatas: list):
    # Generate embeddings
    new_embeddings = []
    for doc in new_docs:
        response = openai_client.embeddings.create(
            model="text-embedding-ada-002",
            input=doc
        )
        new_embeddings.append(response.data[0].embedding)

    # Add to index
    vectors = np.array(new_embeddings).astype('float32')
    index.add(vectors)

    # Update metadata (JSON)
    with open(output_dir / "metadata.json") as f:
        metadata = json.load(f)
    metadata["documents"].extend(new_docs)
    metadata["metadatas"].extend(new_metadatas)
    with open(output_dir / "metadata.json", "w") as f:
        json.dump(metadata, f, indent=2)

    # Save updated index
    faiss.write_index(index, str(output_dir / "docs.index"))
    print(f"āœ… Added {{len(new_docs)}} documents")

# Index statistics
print(f"\\nIndex stats:")
print(f"  Total vectors: {{index.ntotal}}")
print(f"  Dimension: {{dimension}}")
print(f"  Type: {{type(index).__name__}}")
""".format(
            path=package_path.name
        )

        return {
            "success": False,
            "skill_id": None,
            "url": str(package_path.absolute()),
            "message": (
                f"FAISS data packaged at: {package_path.absolute()}\n\n"
                "Create FAISS index with JSON metadata (secure & portable):\n"
                f"{example_code}"
            ),
        }

    def validate_api_key(self, _api_key: str) -> bool:
        """FAISS doesn't use API keys."""
        return False

    def get_env_var_name(self) -> str:
        """FAISS doesn't use API keys."""
        return ""

    def supports_enhancement(self) -> bool:
        """FAISS format doesn't support AI enhancement."""
        return False

    def enhance(self, _skill_dir: Path, _api_key: str) -> bool:
        """FAISS format doesn't support enhancement."""
        print("āŒ FAISS format does not support enhancement")
        print("   Enhance before packaging:")
        print("   skill-seekers enhance output/skill/ --mode LOCAL")
        print("   skill-seekers package output/skill/ --target faiss")
        return False