feat: Add Haystack RAG framework adaptor (Task 2.2)

Implements complete Haystack 2.x integration for RAG pipelines:

**Haystack Adaptor (src/skill_seekers/cli/adaptors/haystack.py):**
- Document format: {content: str, meta: dict}
- JSON packaging for Haystack pipelines
- Compatible with InMemoryDocumentStore, BM25Retriever
- Registered in adaptor factory as 'haystack'

**Example Pipeline (examples/haystack-pipeline/):**
- README.md with comprehensive guide and troubleshooting
- quickstart.py demonstrating BM25 retrieval
- requirements.txt (haystack-ai>=2.0.0)
- Shows document loading, indexing, and querying

**Tests (tests/test_adaptors/test_haystack_adaptor.py):**
- 11 tests covering all adaptor functionality
- Format validation, packaging, upload messages
- Edge cases: empty dirs, references-only skills
- All 93 adaptor tests passing (100% suite pass rate)

**Features:**
- No upload endpoint (local use only like LangChain/LlamaIndex)
- No AI enhancement (enhance before packaging)
- Same packaging pattern as other RAG frameworks
- InMemoryDocumentStore + BM25Retriever example

Test: pytest tests/test_adaptors/test_haystack_adaptor.py -v
This commit is contained in:
yusyus
2026-02-07 21:01:49 +03:00
parent 8b3f31409e
commit 1c888e7817
6 changed files with 910 additions and 0 deletions

View File

@@ -59,6 +59,11 @@ try:
except ImportError:
QdrantAdaptor = None
try:
from .haystack import HaystackAdaptor
except ImportError:
HaystackAdaptor = None
# Registry of available adaptors
ADAPTORS: dict[str, type[SkillAdaptor]] = {}
@@ -84,6 +89,8 @@ if FAISSHelpers:
ADAPTORS["faiss"] = FAISSHelpers
if QdrantAdaptor:
ADAPTORS["qdrant"] = QdrantAdaptor
if HaystackAdaptor:
ADAPTORS["haystack"] = HaystackAdaptor
def get_adaptor(platform: str, config: dict = None) -> SkillAdaptor:

View File

@@ -0,0 +1,294 @@
#!/usr/bin/env python3
"""
Haystack Adaptor
Implements Haystack Document format for RAG pipelines.
Converts Skill Seekers documentation into Haystack-compatible Document objects.
"""
import json
from pathlib import Path
from typing import Any
from .base import SkillAdaptor, SkillMetadata
class HaystackAdaptor(SkillAdaptor):
"""
Haystack platform adaptor.
Handles:
- Haystack Document format (content + meta)
- JSON packaging with array of documents
- No upload (users import directly into code)
- Optimized for Haystack 2.x pipelines
"""
PLATFORM = "haystack"
PLATFORM_NAME = "Haystack (RAG Framework)"
DEFAULT_API_ENDPOINT = None # No upload endpoint
def format_skill_md(self, skill_dir: Path, metadata: SkillMetadata) -> str:
"""
Format skill as JSON array of Haystack Documents.
Converts SKILL.md and all references/*.md into Haystack Document format:
{
"content": "...",
"meta": {"source": "...", "category": "...", ...}
}
Args:
skill_dir: Path to skill directory
metadata: Skill metadata
Returns:
JSON string containing array of Haystack Documents
"""
documents = []
# Convert SKILL.md (main documentation)
skill_md_path = skill_dir / "SKILL.md"
if skill_md_path.exists():
content = self._read_existing_content(skill_dir)
if content.strip():
documents.append(
{
"content": content,
"meta": {
"source": metadata.name,
"category": "overview",
"file": "SKILL.md",
"type": "documentation",
"version": metadata.version,
},
}
)
# Convert all reference files
refs_dir = skill_dir / "references"
if refs_dir.exists():
for ref_file in sorted(refs_dir.glob("*.md")):
if ref_file.is_file() and not ref_file.name.startswith("."):
try:
ref_content = ref_file.read_text(encoding="utf-8")
if ref_content.strip():
# Derive category from filename
category = ref_file.stem.replace("_", " ").lower()
documents.append(
{
"content": ref_content,
"meta": {
"source": metadata.name,
"category": category,
"file": ref_file.name,
"type": "reference",
"version": metadata.version,
},
}
)
except Exception as e:
print(f"⚠️ Warning: Could not read {ref_file.name}: {e}")
continue
# Return as formatted JSON
return json.dumps(documents, indent=2, ensure_ascii=False)
def package(self, skill_dir: Path, output_path: Path) -> Path:
"""
Package skill into JSON file for Haystack.
Creates a JSON file containing an array of Haystack Documents ready
for ingestion into Haystack 2.x pipelines and document stores.
Args:
skill_dir: Path to skill directory
output_path: Output path/filename for JSON file
Returns:
Path to created JSON file
"""
skill_dir = Path(skill_dir)
# Determine output filename
if output_path.is_dir() or str(output_path).endswith("/"):
output_path = Path(output_path) / f"{skill_dir.name}-haystack.json"
elif not str(output_path).endswith(".json"):
# Replace extension if needed
output_str = str(output_path).replace(".zip", ".json").replace(".tar.gz", ".json")
if not output_str.endswith("-haystack.json"):
output_str = output_str.replace(".json", "-haystack.json")
if not output_str.endswith(".json"):
output_str += ".json"
output_path = Path(output_str)
output_path = Path(output_path)
output_path.parent.mkdir(parents=True, exist_ok=True)
# Read metadata
metadata = SkillMetadata(
name=skill_dir.name,
description=f"Haystack documents for {skill_dir.name}",
version="1.0.0",
)
# Generate Haystack documents
documents_json = self.format_skill_md(skill_dir, metadata)
# Write to file
output_path.write_text(documents_json, encoding="utf-8")
print(f"\n✅ Haystack documents packaged successfully!")
print(f"📦 Output: {output_path}")
# Parse and show stats
documents = json.loads(documents_json)
print(f"📊 Total documents: {len(documents)}")
# Show category breakdown
categories = {}
for doc in documents:
cat = doc["meta"].get("category", "unknown")
categories[cat] = categories.get(cat, 0) + 1
print("📁 Categories:")
for cat, count in sorted(categories.items()):
print(f" - {cat}: {count}")
return output_path
def upload(self, package_path: Path, _api_key: str, **_kwargs) -> dict[str, Any]:
"""
Haystack format does not support direct upload.
Users should import the JSON file into their Haystack code:
```python
from haystack import Document
import json
# Load documents
with open("skill-haystack.json") as f:
docs_data = json.load(f)
# Convert to Haystack Documents
documents = [
Document(content=doc["content"], meta=doc["meta"])
for doc in docs_data
]
# Use with document store
from haystack.document_stores.in_memory import InMemoryDocumentStore
document_store = InMemoryDocumentStore()
document_store.write_documents(documents)
# Create pipeline
from haystack.components.retrievers.in_memory import InMemoryBM25Retriever
retriever = InMemoryBM25Retriever(document_store=document_store)
results = retriever.run(query="your query here")
```
Args:
package_path: Path to JSON file
api_key: Not used
**kwargs: Not used
Returns:
Result indicating no upload capability
"""
example_code = """
# Example: Load into Haystack 2.x
from haystack import Document
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.retrievers.in_memory import InMemoryBM25Retriever
import json
# Load documents
with open("{path}") as f:
docs_data = json.load(f)
# Convert to Haystack Documents
documents = [
Document(content=doc["content"], meta=doc["meta"])
for doc in docs_data
]
# Create document store
document_store = InMemoryDocumentStore()
document_store.write_documents(documents)
# Create retriever
retriever = InMemoryBM25Retriever(document_store=document_store)
# Query
results = retriever.run(query="your question here")
for doc in results["documents"]:
print(doc.content)
""".format(
path=package_path.name
)
return {
"success": False,
"skill_id": None,
"url": str(package_path.absolute()),
"message": (
f"Haystack documents packaged at: {package_path.absolute()}\n\n"
"Load into your code:\n"
f"{example_code}"
),
}
def validate_api_key(self, _api_key: str) -> bool:
"""
Haystack format doesn't use API keys for packaging.
Args:
api_key: Not used
Returns:
Always False (no API needed for packaging)
"""
return False
def get_env_var_name(self) -> str:
"""
No API key needed for Haystack packaging.
Returns:
Empty string
"""
return ""
def supports_enhancement(self) -> bool:
"""
Haystack format doesn't support AI enhancement.
Enhancement should be done before conversion using:
skill-seekers enhance output/skill/ --mode LOCAL
Returns:
False
"""
return False
def enhance(self, _skill_dir: Path, _api_key: str) -> bool:
"""
Haystack format doesn't support enhancement.
Args:
skill_dir: Not used
api_key: Not used
Returns:
False
"""
print("❌ Haystack format does not support enhancement")
print(" Enhance before packaging:")
print(" skill-seekers enhance output/skill/ --mode LOCAL")
print(" skill-seekers package output/skill/ --target haystack")
return False