Files
yusyus 1c888e7817 feat: Add Haystack RAG framework adaptor (Task 2.2)
Implements complete Haystack 2.x integration for RAG pipelines:

**Haystack Adaptor (src/skill_seekers/cli/adaptors/haystack.py):**
- Document format: {content: str, meta: dict}
- JSON packaging for Haystack pipelines
- Compatible with InMemoryDocumentStore, BM25Retriever
- Registered in adaptor factory as 'haystack'

**Example Pipeline (examples/haystack-pipeline/):**
- README.md with comprehensive guide and troubleshooting
- quickstart.py demonstrating BM25 retrieval
- requirements.txt (haystack-ai>=2.0.0)
- Shows document loading, indexing, and querying

**Tests (tests/test_adaptors/test_haystack_adaptor.py):**
- 11 tests covering all adaptor functionality
- Format validation, packaging, upload messages
- Edge cases: empty dirs, references-only skills
- All 93 adaptor tests passing (100% suite pass rate)

**Features:**
- No upload endpoint (local use only like LangChain/LlamaIndex)
- No AI enhancement (enhance before packaging)
- Same packaging pattern as other RAG frameworks
- InMemoryDocumentStore + BM25Retriever example

Test: pytest tests/test_adaptors/test_haystack_adaptor.py -v
2026-02-07 21:01:49 +03:00

129 lines
3.9 KiB
Python

#!/usr/bin/env python3
"""
Haystack Pipeline Example
Demonstrates how to use Skill Seekers documentation with Haystack 2.x
for building RAG pipelines.
"""
import json
import sys
from pathlib import Path
def main():
"""Run Haystack pipeline example."""
print("=" * 60)
print("Haystack Pipeline Example")
print("=" * 60)
# Check if Haystack is installed
try:
from haystack import Document
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.retrievers.in_memory import InMemoryBM25Retriever
except ImportError:
print("❌ Error: Haystack not installed")
print(" Install with: pip install haystack-ai")
sys.exit(1)
# Find the Haystack documents file
docs_path = Path("../../output/react-haystack.json")
if not docs_path.exists():
print(f"❌ Error: Documents not found at {docs_path}")
print("\n📝 Generate documents first:")
print(" skill-seekers scrape --config configs/react.json --max-pages 100")
print(" skill-seekers package output/react --target haystack")
sys.exit(1)
# Step 1: Load documents
print("\n📚 Step 1: Loading documents...")
with open(docs_path) as f:
docs_data = json.load(f)
documents = [
Document(content=doc["content"], meta=doc["meta"]) for doc in docs_data
]
print(f"✅ Loaded {len(documents)} documents")
# Show document breakdown
categories = {}
for doc in documents:
cat = doc.meta.get("category", "unknown")
categories[cat] = categories.get(cat, 0) + 1
print("\n📁 Categories:")
for cat, count in sorted(categories.items()):
print(f" - {cat}: {count}")
# Step 2: Create document store
print("\n💾 Step 2: Creating document store...")
document_store = InMemoryDocumentStore()
document_store.write_documents(documents)
indexed_count = document_store.count_documents()
print(f"✅ Indexed {indexed_count} documents")
# Step 3: Create retriever
print("\n🔍 Step 3: Creating BM25 retriever...")
retriever = InMemoryBM25Retriever(document_store=document_store)
print("✅ Retriever ready")
# Step 4: Query examples
print("\n🎯 Step 4: Running queries...\n")
queries = [
"How do I use useState hook?",
"What are React components?",
"How to handle events in React?",
]
for i, query in enumerate(queries, 1):
print(f"\n{'=' * 60}")
print(f"Query {i}: {query}")
print("=" * 60)
# Run query
results = retriever.run(query=query, top_k=3)
if not results["documents"]:
print(" No results found")
continue
# Display results
for j, doc in enumerate(results["documents"], 1):
print(f"\n📖 Result {j}:")
print(f" Source: {doc.meta.get('file', 'unknown')}")
print(f" Category: {doc.meta.get('category', 'unknown')}")
# Show preview (first 200 chars)
preview = doc.content[:200].replace("\n", " ")
print(f" Preview: {preview}...")
# Summary
print("\n" + "=" * 60)
print("✅ Example complete!")
print("=" * 60)
print("\n📊 Summary:")
print(f" • Documents loaded: {len(documents)}")
print(f" • Documents indexed: {indexed_count}")
print(f" • Queries executed: {len(queries)}")
print("\n💡 Next steps:")
print(" • Try different queries")
print(" • Experiment with top_k parameter")
print(" • Build RAG pipeline with LLM generation")
print(" • Use vector embeddings for semantic search")
if __name__ == "__main__":
try:
main()
except KeyboardInterrupt:
print("\n\n⚠️ Interrupted by user")
sys.exit(0)
except Exception as e:
print(f"\n❌ Error: {e}")
sys.exit(1)