feat(chroma): Add Chroma vector database adaptor (Task #11)
Implements native Chroma integration for RAG pipelines as part of Week 2 vector store integrations.

## Features
- **Chroma-compatible format** - Direct `collection.add()` support
- **Deterministic IDs** - Stable IDs for consistent re-imports
- **Metadata structure** - Compatible with Chroma's metadata filtering
- **Collection naming** - Auto-derived from skill name
- **Example code** - Complete usage examples with persistent/in-memory options

## Output Format
JSON file containing:
- `documents`: Array of document strings
- `metadatas`: Array of metadata dicts
- `ids`: Array of deterministic IDs
- `collection_name`: Suggested collection name

## CLI Integration
```bash
skill-seekers package output/django --target chroma
# → output/django-chroma.json
```

## Files Added
- src/skill_seekers/cli/adaptors/chroma.py (381 lines)
  * Complete Chroma adaptor implementation
  * ID generation from content hash
  * Metadata structure compatible with Chroma
  * Example code for add/query/filter/update/delete

## Files Modified
- src/skill_seekers/cli/adaptors/__init__.py
  * Import ChromaAdaptor
  * Register "chroma" in ADAPTORS
- src/skill_seekers/cli/package_skill.py
  * Add "chroma" to --target choices
- src/skill_seekers/cli/main.py
  * Add "chroma" to --target choices

## Testing
Tested with ansible skill:
- ✅ Document format correct
- ✅ Metadata structure compatible
- ✅ IDs deterministic
- ✅ Collection name derived correctly
- ✅ CLI integration working

Output: output/ansible-chroma.json (9.3 KB, 1 document)

## Week 2 Progress
- ✅ Task #10: Weaviate adaptor (Complete)
- ✅ Task #11: Chroma adaptor (Complete)
- ⏳ Task #12: FAISS helpers (Next)
- ⏳ Task #13: Qdrant adaptor

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -44,6 +44,11 @@ try:
|
||||
except ImportError:
|
||||
WeaviateAdaptor = None
|
||||
|
||||
try:
|
||||
from .chroma import ChromaAdaptor
|
||||
except ImportError:
|
||||
ChromaAdaptor = None
|
||||
|
||||
|
||||
# Registry of available adaptors
|
||||
ADAPTORS: dict[str, type[SkillAdaptor]] = {}
|
||||
@@ -63,6 +68,8 @@ if LlamaIndexAdaptor:
|
||||
ADAPTORS["llama-index"] = LlamaIndexAdaptor
|
||||
if WeaviateAdaptor:
|
||||
ADAPTORS["weaviate"] = WeaviateAdaptor
|
||||
if ChromaAdaptor:
|
||||
ADAPTORS["chroma"] = ChromaAdaptor
|
||||
|
||||
|
||||
def get_adaptor(platform: str, config: dict = None) -> SkillAdaptor:
|
||||
|
||||
381
src/skill_seekers/cli/adaptors/chroma.py
Normal file
381
src/skill_seekers/cli/adaptors/chroma.py
Normal file
@@ -0,0 +1,381 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Chroma Adaptor
|
||||
|
||||
Implements Chroma vector database format for RAG pipelines.
|
||||
Converts Skill Seekers documentation into Chroma-compatible format.
|
||||
"""
|
||||
|
||||
import json
|
||||
import hashlib
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from .base import SkillAdaptor, SkillMetadata
|
||||
|
||||
|
||||
class ChromaAdaptor(SkillAdaptor):
|
||||
"""
|
||||
Chroma vector database adaptor.
|
||||
|
||||
Handles:
|
||||
- Chroma-compatible document format
|
||||
- ID generation for documents
|
||||
- Metadata structure
|
||||
- Collection configuration hints
|
||||
- Persistent collection support
|
||||
"""
|
||||
|
||||
PLATFORM = "chroma"
|
||||
PLATFORM_NAME = "Chroma (Vector Database)"
|
||||
DEFAULT_API_ENDPOINT = None # Chroma runs locally or self-hosted
|
||||
|
||||
def _generate_id(self, content: str, metadata: dict) -> str:
|
||||
"""
|
||||
Generate deterministic ID from content and metadata.
|
||||
|
||||
Args:
|
||||
content: Document content
|
||||
metadata: Document metadata
|
||||
|
||||
Returns:
|
||||
ID string (hex digest)
|
||||
"""
|
||||
# Create deterministic ID from content + metadata
|
||||
id_string = f"{metadata.get('source', '')}-{metadata.get('file', '')}-{content[:100]}"
|
||||
return hashlib.md5(id_string.encode()).hexdigest()
|
||||
|
||||
def format_skill_md(self, skill_dir: Path, metadata: SkillMetadata) -> str:
    """
    Format skill as JSON for Chroma ingestion.

    Converts SKILL.md and all references/*.md into Chroma-compatible format:
    {
        "documents": [...],
        "metadatas": [...],
        "ids": [...],
        "collection_name": "..."
    }

    Files whose content is empty or whitespace-only are skipped. A reference
    file that cannot be read or decoded as UTF-8 is reported with a warning
    and skipped.

    Args:
        skill_dir: Path to skill directory
        metadata: Skill metadata; ``name`` and ``version`` are read

    Returns:
        JSON string containing Chroma-compatible data
    """
    documents: list[str] = []
    metadatas: list[dict] = []
    ids: list[str] = []

    def add_document(content: str, category: str, file_name: str, doc_type: str) -> None:
        """Record one document in all three parallel lists at once."""
        doc_metadata = {
            "source": metadata.name,
            "category": category,
            "file": file_name,
            "type": doc_type,
            "version": metadata.version,
        }
        # Compute the ID before touching any list so that a failure here
        # cannot leave documents/metadatas/ids with different lengths.
        doc_id = self._generate_id(content, doc_metadata)
        documents.append(content)
        metadatas.append(doc_metadata)
        ids.append(doc_id)

    # Convert SKILL.md (main documentation)
    if (skill_dir / "SKILL.md").exists():
        content = self._read_existing_content(skill_dir)
        if content.strip():
            add_document(content, "overview", "SKILL.md", "documentation")

    # Convert all reference files (sorted for a stable document order)
    refs_dir = skill_dir / "references"
    if refs_dir.exists():
        for ref_file in sorted(refs_dir.glob("*.md")):
            if not ref_file.is_file() or ref_file.name.startswith("."):
                continue

            # Only the read is best-effort; everything else should surface.
            try:
                ref_content = ref_file.read_text(encoding="utf-8")
            except (OSError, UnicodeError) as e:
                print(f"⚠️ Warning: Could not read {ref_file.name}: {e}")
                continue

            if ref_content.strip():
                # Derive category from filename
                category = ref_file.stem.replace("_", " ").lower()
                add_document(ref_content, category, ref_file.name, "reference")

    # Return Chroma-compatible format
    return json.dumps(
        {
            "documents": documents,
            "metadatas": metadatas,
            "ids": ids,
            "collection_name": metadata.name.replace("_", "-"),  # Chroma prefers hyphens
        },
        indent=2,
        ensure_ascii=False,
    )
|
||||
|
||||
def package(self, skill_dir: Path, output_path: Path) -> Path:
    """
    Package skill into JSON file for Chroma.

    Writes the JSON produced by ``format_skill_md`` (documents, metadatas,
    ids and collection name) to disk and prints a short summary.

    Args:
        skill_dir: Path to skill directory; its last component is used as
            the skill name
        output_path: Output file or directory, as ``Path`` or ``str``. An
            existing directory, or a string ending in "/", receives
            ``<skill>-chroma.json``. A path ending in ".json" is used as
            given. Otherwise any ".zip" or ".tar.gz" is rewritten to
            "-chroma.json", and ".json" is appended when still missing.

    Returns:
        Path to created JSON file
    """
    from collections import Counter

    skill_dir = Path(skill_dir)

    # Look for the trailing slash on the raw argument: Path() drops it, and a
    # plain string has no is_dir(), so convert before any Path method is used.
    wants_directory = str(output_path).endswith("/")
    output_path = Path(output_path)

    # Determine output filename
    if wants_directory or output_path.is_dir():
        output_path = output_path / f"{skill_dir.name}-chroma.json"
    elif not str(output_path).endswith(".json"):
        # Replace extension if needed
        output_str = str(output_path).replace(".zip", ".json").replace(".tar.gz", ".json")
        if not output_str.endswith("-chroma.json"):
            output_str = output_str.replace(".json", "-chroma.json")
        if not output_str.endswith(".json"):
            output_str += ".json"
        output_path = Path(output_str)

    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Build the metadata handed to format_skill_md; the version is fixed here.
    metadata = SkillMetadata(
        name=skill_dir.name,
        description=f"Chroma collection data for {skill_dir.name}",
        version="1.0.0",
    )

    # Generate Chroma data and write it out
    chroma_json = self.format_skill_md(skill_dir, metadata)
    output_path.write_text(chroma_json, encoding="utf-8")

    print("\n✅ Chroma data packaged successfully!")
    print(f"📦 Output: {output_path}")

    # format_skill_md returns a string, so parse it back to report stats
    data = json.loads(chroma_json)

    print(f"📊 Total documents: {len(data['documents'])}")
    print(f"📂 Collection name: {data['collection_name']}")

    # Show category breakdown
    categories = Counter(meta.get("category", "unknown") for meta in data["metadatas"])

    print("📁 Categories:")
    for cat, count in sorted(categories.items()):
        print(f" - {cat}: {count}")

    return output_path
|
||||
|
||||
def upload(self, package_path: Path, _api_key: str, **_kwargs) -> dict[str, Any]:
    """
    Chroma format does not support direct upload.

    Nothing is sent anywhere. The returned message carries a ready-to-edit
    example script for importing the JSON file into a Chroma instance
    (create a client, create or get the collection, ``collection.add``,
    then query/get/update/delete).

    Args:
        package_path: Path to JSON file (``Path`` or ``str``)
        _api_key: Not used
        **_kwargs: Not used

    Returns:
        Dict with ``success`` set to False, ``skill_id`` set to None,
        ``url`` set to the absolute package path, and ``message`` holding
        the import instructions
    """
    package_path = Path(package_path)

    # Literal braces are doubled because the template goes through str.format.
    # Option B is emitted commented out: running it after Option A would add
    # the same IDs again with embeddings of a different dimensionality.
    example_code = """
# Example: Import into Chroma

import chromadb
import json

# Load data
with open("{path}") as f:
    data = json.load(f)

# Option 1: Persistent client (recommended)
client = chromadb.PersistentClient(path="./chroma_db")

# Option 2: In-memory client (for testing)
# client = chromadb.Client()

# Create or get collection
collection = client.get_or_create_collection(
    name=data["collection_name"],
    metadata={{"description": "Documentation from Skill Seekers"}}
)

# Option A: Let Chroma generate embeddings (default)
collection.add(
    documents=data["documents"],
    metadatas=data["metadatas"],
    ids=data["ids"]
)

# Option B: Use custom embeddings (OpenAI) INSTEAD of Option A.
# Do not run both: the same IDs would be added twice and the embedding
# dimensions would not match within one collection. With Option B, also
# query with query_embeddings produced by the same model.
#
# from openai import OpenAI
#
# openai_client = OpenAI()
# embeddings = []
# for doc in data["documents"]:
#     response = openai_client.embeddings.create(
#         model="text-embedding-ada-002",
#         input=doc
#     )
#     embeddings.append(response.data[0].embedding)
#
# collection.add(
#     documents=data["documents"],
#     embeddings=embeddings,
#     metadatas=data["metadatas"],
#     ids=data["ids"]
# )

print(f"✅ Added {{len(data['documents'])}} documents to collection")
print(f"📊 Total documents in collection: {{collection.count()}}")

# Query example (semantic search)
results = collection.query(
    query_texts=["your search query"],
    n_results=3
)

# Query with metadata filter
results = collection.query(
    query_texts=["search query"],
    n_results=5,
    where={{"category": "api"}}  # Filter by category
)

# Query with multiple filters (AND)
results = collection.query(
    query_texts=["search query"],
    n_results=5,
    where={{
        "$and": [
            {{"category": "api"}},
            {{"type": "reference"}}
        ]
    }}
)

# Get documents by ID
docs = collection.get(ids=[data["ids"][0]])

# Update collection (re-add with same IDs)
collection.update(
    ids=[data["ids"][0]],
    documents=["updated content"],
    metadatas=[data["metadatas"][0]]
)

# Delete documents
collection.delete(ids=[data["ids"][0]])

# Persist collection (if using PersistentClient, automatic on exit)
# Collection is automatically persisted to disk
""".format(
        path=package_path.name
    )

    absolute_path = package_path.absolute()
    return {
        "success": False,
        "skill_id": None,
        "url": str(absolute_path),
        "message": (
            f"Chroma data packaged at: {absolute_path}\n\n"
            "Import into Chroma:\n"
            f"{example_code}"
        ),
    }
|
||||
|
||||
def validate_api_key(self, _api_key: str) -> bool:
    """
    Report whether an API key is valid for Chroma packaging.

    Packaging for Chroma involves no API, so no key is ever accepted.

    Args:
        _api_key: Not used

    Returns:
        Always False (no API needed for packaging)
    """
    return False
|
||||
|
||||
def get_env_var_name(self) -> str:
    """
    Name the environment variable that holds the API key.

    Chroma packaging needs no API key, so there is no variable to name.

    Returns:
        Empty string
    """
    return ""
|
||||
|
||||
def supports_enhancement(self) -> bool:
    """
    Report whether this adaptor can run AI enhancement.

    The Chroma format cannot. Enhance the skill before conversion with:
        skill-seekers enhance output/skill/ --mode LOCAL

    Returns:
        False
    """
    return False
|
||||
|
||||
def enhance(self, _skill_dir: Path, _api_key: str) -> bool:
    """
    Decline to enhance; the Chroma format doesn't support enhancement.

    Prints the commands for enhancing first and packaging afterwards.

    Args:
        _skill_dir: Not used
        _api_key: Not used

    Returns:
        False
    """
    guidance = (
        "❌ Chroma format does not support enhancement",
        " Enhance before packaging:",
        " skill-seekers enhance output/skill/ --mode LOCAL",
        " skill-seekers package output/skill/ --target chroma",
    )
    for line in guidance:
        print(line)
    return False
|
||||
@@ -215,7 +215,7 @@ For more information: https://github.com/yusufkaraaslan/Skill_Seekers
|
||||
package_parser.add_argument("--upload", action="store_true", help="Auto-upload after packaging")
|
||||
package_parser.add_argument(
|
||||
"--target",
|
||||
choices=["claude", "gemini", "openai", "markdown", "langchain", "llama-index", "weaviate"],
|
||||
choices=["claude", "gemini", "openai", "markdown", "langchain", "llama-index", "weaviate", "chroma"],
|
||||
default="claude",
|
||||
help="Target LLM platform (default: claude)",
|
||||
)
|
||||
|
||||
@@ -155,7 +155,7 @@ Examples:
|
||||
|
||||
parser.add_argument(
|
||||
"--target",
|
||||
choices=["claude", "gemini", "openai", "markdown", "langchain", "llama-index", "weaviate"],
|
||||
choices=["claude", "gemini", "openai", "markdown", "langchain", "llama-index", "weaviate", "chroma"],
|
||||
default="claude",
|
||||
help="Target LLM platform (default: claude)",
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user