#!/usr/bin/env python3
"""
Chroma Adaptor

Implements Chroma vector database format for RAG pipelines.
Converts Skill Seekers documentation into Chroma-compatible format.
"""

import json
from collections import Counter
from collections.abc import Iterator
from pathlib import Path
from typing import Any

from .base import SkillAdaptor, SkillMetadata


class ChromaAdaptor(SkillAdaptor):
    """
    Chroma vector database adaptor.

    Handles:
    - Chroma-compatible document format
    - ID generation for documents
    - Metadata structure
    - Collection configuration hints
    - Persistent collection support
    """

    PLATFORM = "chroma"
    PLATFORM_NAME = "Chroma (Vector Database)"
    DEFAULT_API_ENDPOINT = None  # Chroma runs locally or self-hosted

    def _generate_id(self, content: str, metadata: dict) -> str:
        """
        Generate deterministic ID from content and metadata.

        Delegates to the base-class ``_generate_deterministic_id`` helper,
        requesting the ``"hex"`` format.

        Args:
            content: Document content
            metadata: Document metadata

        Returns:
            ID string (hex digest)
        """
        return self._generate_deterministic_id(content, metadata, format="hex")

    def _iter_source_documents(
        self, skill_dir: Path, metadata: SkillMetadata
    ) -> Iterator[tuple[str, dict[str, Any], str]]:
        """
        Yield ``(content, base_metadata, source_file)`` for each non-blank source.

        SKILL.md (when present) is yielded first with category ``"overview"``,
        followed by every reference file returned by ``_iterate_references``.
        Sources whose content is blank after stripping are skipped.

        Args:
            skill_dir: Path to skill directory
            metadata: Skill metadata (``name`` and ``version`` are copied into
                each document's metadata)
        """
        # Main documentation
        if (skill_dir / "SKILL.md").exists():
            content = self._read_existing_content(skill_dir)
            if content.strip():
                yield (
                    content,
                    {
                        "source": metadata.name,
                        "category": "overview",
                        "file": "SKILL.md",
                        "type": "documentation",
                        "version": metadata.version,
                    },
                    "SKILL.md",
                )

        # Reference files, via the base helper
        for ref_file, ref_content in self._iterate_references(skill_dir):
            if not ref_content.strip():
                continue
            yield (
                ref_content,
                {
                    "source": metadata.name,
                    # Derive category from filename
                    "category": ref_file.stem.replace("_", " ").lower(),
                    "file": ref_file.name,
                    "type": "reference",
                    "version": metadata.version,
                },
                ref_file.name,
            )

    def format_skill_md(
        self,
        skill_dir: Path,
        metadata: SkillMetadata,
        enable_chunking: bool = False,
        **kwargs,
    ) -> str:
        """
        Format skill as JSON for Chroma ingestion.

        Converts SKILL.md and all references/*.md into Chroma-compatible format:
        {
            "documents": [...],
            "metadatas": [...],
            "ids": [...],
            "collection_name": "..."
        }

        Args:
            skill_dir: Path to skill directory
            metadata: Skill metadata
            enable_chunking: Enable intelligent chunking for large documents
            **kwargs: Additional chunking parameters
                (chunk_max_tokens, default 512; preserve_code_blocks, default True)

        Returns:
            JSON string containing Chroma-compatible data
        """
        chunk_max_tokens = kwargs.get("chunk_max_tokens", 512)
        preserve_code_blocks = kwargs.get("preserve_code_blocks", True)

        # Chroma expects three parallel arrays of equal length.
        documents: list[str] = []
        metadatas: list[dict[str, Any]] = []
        ids: list[str] = []

        for content, doc_metadata, source_file in self._iter_source_documents(
            skill_dir, metadata
        ):
            # Chunk if enabled
            chunks = self._maybe_chunk_content(
                content,
                doc_metadata,
                enable_chunking=enable_chunking,
                chunk_max_tokens=chunk_max_tokens,
                preserve_code_blocks=preserve_code_blocks,
                source_file=source_file,
            )

            # Add all chunks to parallel arrays
            for chunk_text, chunk_meta in chunks:
                documents.append(chunk_text)
                metadatas.append(chunk_meta)
                ids.append(self._generate_id(chunk_text, chunk_meta))

        # Return Chroma-compatible format
        return json.dumps(
            {
                "documents": documents,
                "metadatas": metadatas,
                "ids": ids,
                "collection_name": metadata.name.replace("_", "-"),  # Chroma prefers hyphens
            },
            indent=2,
            ensure_ascii=False,
        )

    def package(
        self,
        skill_dir: Path,
        output_path: Path,
        enable_chunking: bool = False,
        chunk_max_tokens: int = 512,
        preserve_code_blocks: bool = True,
    ) -> Path:
        """
        Package skill into JSON file for Chroma.

        Creates a JSON file containing documents, metadatas, and ids
        ready for Chroma collection import, then prints a short summary.

        Args:
            skill_dir: Path to skill directory
            output_path: Output path/filename for JSON file
            enable_chunking: Enable chunking of large documents
            chunk_max_tokens: Chunk size limit forwarded to the chunker
            preserve_code_blocks: Forwarded to the chunker

        Returns:
            Path to created JSON file
        """
        skill_dir = Path(skill_dir)

        # Determine output filename using base helper method
        output_path = self._format_output_path(skill_dir, Path(output_path), "-chroma.json")
        output_path.parent.mkdir(parents=True, exist_ok=True)

        # Metadata is derived from the directory name; the version is fixed.
        metadata = SkillMetadata(
            name=skill_dir.name,
            description=f"Chroma collection data for {skill_dir.name}",
            version="1.0.0",
        )

        # Generate Chroma data
        chroma_json = self.format_skill_md(
            skill_dir,
            metadata,
            enable_chunking=enable_chunking,
            chunk_max_tokens=chunk_max_tokens,
            preserve_code_blocks=preserve_code_blocks,
        )

        # Write to file
        output_path.write_text(chroma_json, encoding="utf-8")

        print("\n✅ Chroma data packaged successfully!")
        print(f"📦 Output: {output_path}")

        # Parse and show stats
        data = json.loads(chroma_json)
        print(f"📊 Total documents: {len(data['documents'])}")
        print(f"📂 Collection name: {data['collection_name']}")

        # Show category breakdown
        categories = Counter(meta.get("category", "unknown") for meta in data["metadatas"])
        print("📁 Categories:")
        for cat, count in sorted(categories.items()):
            print(f" - {cat}: {count}")

        return output_path

    def upload(self, package_path: Path, _api_key: str, **_kwargs) -> dict[str, Any]:
        """
        Chroma format does not support direct upload.

        Users should import the JSON file into their Chroma instance:

        ```python
        import chromadb
        import json

        # Create client (persistent)
        client = chromadb.PersistentClient(path="./chroma_db")

        # Load data
        with open("skill-chroma.json") as f:
            data = json.load(f)

        # Create or get collection
        collection = client.get_or_create_collection(
            name=data["collection_name"]
        )

        # Add documents (Chroma generates embeddings automatically)
        collection.add(
            documents=data["documents"],
            metadatas=data["metadatas"],
            ids=data["ids"]
        )
        ```

        Args:
            package_path: Path to JSON file
            _api_key: Not used
            **_kwargs: Not used

        Returns:
            Dict with ``success`` (always False), ``skill_id`` (None), ``url``
            (absolute package path) and ``message`` (import instructions)
        """
        # Accept plain strings as well, mirroring how package() coerces skill_dir.
        package_path = Path(package_path)

        # Doubled braces are literal braces; only {path} is substituted.
        example_code = """
# Example: Import into Chroma

import chromadb
import json
from openai import OpenAI

# Load data
with open("{path}") as f:
    data = json.load(f)

# Option 1: Persistent client (recommended)
client = chromadb.PersistentClient(path="./chroma_db")

# Option 2: In-memory client (for testing)
# client = chromadb.Client()

# Create or get collection
collection = client.get_or_create_collection(
    name=data["collection_name"],
    metadata={{"description": "Documentation from Skill Seekers"}}
)

# Option A: Let Chroma generate embeddings (default)
collection.add(
    documents=data["documents"],
    metadatas=data["metadatas"],
    ids=data["ids"]
)

# Option B: Use custom embeddings (OpenAI)
openai_client = OpenAI()
embeddings = []
for doc in data["documents"]:
    response = openai_client.embeddings.create(
        model="text-embedding-ada-002",
        input=doc
    )
    embeddings.append(response.data[0].embedding)

collection.add(
    documents=data["documents"],
    embeddings=embeddings,
    metadatas=data["metadatas"],
    ids=data["ids"]
)

print(f"✅ Added {{len(data['documents'])}} documents to collection")
print(f"📊 Total documents in collection: {{collection.count()}}")

# Query example (semantic search)
results = collection.query(
    query_texts=["your search query"],
    n_results=3
)

# Query with metadata filter
results = collection.query(
    query_texts=["search query"],
    n_results=5,
    where={{"category": "api"}}  # Filter by category
)

# Query with multiple filters (AND)
results = collection.query(
    query_texts=["search query"],
    n_results=5,
    where={{
        "$and": [
            {{"category": "api"}},
            {{"type": "reference"}}
        ]
    }}
)

# Get documents by ID
docs = collection.get(ids=[data["ids"][0]])

# Update collection (re-add with same IDs)
collection.update(
    ids=[data["ids"][0]],
    documents=["updated content"],
    metadatas=[data["metadatas"][0]]
)

# Delete documents
collection.delete(ids=[data["ids"][0]])

# Persist collection (if using PersistentClient, automatic on exit)
# Collection is automatically persisted to disk
""".format(
            path=package_path.name
        )

        return {
            "success": False,
            "skill_id": None,
            "url": str(package_path.absolute()),
            "message": (
                f"Chroma data packaged at: {package_path.absolute()}\n\n"
                "Import into Chroma:\n"
                f"{example_code}"
            ),
        }

    def validate_api_key(self, _api_key: str) -> bool:
        """
        Chroma format doesn't use API keys for packaging.

        Args:
            _api_key: Not used

        Returns:
            Always False (no API needed for packaging)
        """
        return False

    def get_env_var_name(self) -> str:
        """
        No API key needed for Chroma packaging.

        Returns:
            Empty string
        """
        return ""

    def supports_enhancement(self) -> bool:
        """
        Chroma format doesn't support AI enhancement.

        Enhancement should be done before conversion using:
        skill-seekers enhance output/skill/ --mode LOCAL

        Returns:
            False
        """
        return False

    def enhance(self, _skill_dir: Path, _api_key: str) -> bool:
        """
        Chroma format doesn't support enhancement.

        Prints guidance on enhancing before packaging and does nothing else.

        Args:
            _skill_dir: Not used
            _api_key: Not used

        Returns:
            False
        """
        print("❌ Chroma format does not support enhancement")
        print(" Enhance before packaging:")
        print(" skill-seekers enhance output/skill/ --mode LOCAL")
        print(" skill-seekers package output/skill/ --target chroma")
        return False