Implements complete Haystack 2.x integration for RAG pipelines:
**Haystack Adaptor (src/skill_seekers/cli/adaptors/haystack.py):**
- Document format: {content: str, meta: dict}
- JSON packaging for Haystack pipelines
- Compatible with InMemoryDocumentStore and InMemoryBM25Retriever
- Registered in adaptor factory as 'haystack'
**Example Pipeline (examples/haystack-pipeline/):**
- README.md with comprehensive guide and troubleshooting
- quickstart.py demonstrating BM25 retrieval
- requirements.txt (haystack-ai>=2.0.0)
- Shows document loading, indexing, and querying
**Tests (tests/test_adaptors/test_haystack_adaptor.py):**
- 11 tests covering all adaptor functionality
- Format validation, packaging, upload messages
- Edge cases: empty dirs, references-only skills
- All 93 adaptor tests passing (100% suite pass rate)
**Features:**
- No upload endpoint (local use only, like LangChain/LlamaIndex)
- No AI enhancement (enhance before packaging)
- Same packaging pattern as other RAG frameworks
- InMemoryDocumentStore + InMemoryBM25Retriever example
Test: pytest tests/test_adaptors/test_haystack_adaptor.py -v
129 lines
3.9 KiB
Python
129 lines
3.9 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Haystack Pipeline Example
|
|
|
|
Demonstrates how to use Skill Seekers documentation with Haystack 2.x
|
|
for building RAG pipelines.
|
|
"""
|
|
|
|
import json
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
|
|
def main():
    """Run the Haystack BM25 retrieval example end to end.

    Loads the packaged documents from ``../../output/react-haystack.json``
    (resolved against the current working directory), writes them to an
    ``InMemoryDocumentStore``, runs a few sample queries through an
    ``InMemoryBM25Retriever`` and prints the top hits for each query,
    followed by a short summary.

    Exits with status 1 (via ``sys.exit``) when Haystack cannot be imported
    or when the documents file does not exist.
    """
    # Imported locally so this function does not depend on the module's
    # top-level import list.
    from collections import Counter

    banner = "=" * 60
    preview_limit = 200  # maximum number of characters shown per result

    print(banner)
    print("Haystack Pipeline Example")
    print(banner)

    # Check if Haystack is installed
    try:
        from haystack import Document
        from haystack.document_stores.in_memory import InMemoryDocumentStore
        from haystack.components.retrievers.in_memory import InMemoryBM25Retriever
    except ImportError:
        print("❌ Error: Haystack not installed")
        print(" Install with: pip install haystack-ai")
        sys.exit(1)

    # Find the Haystack documents file
    docs_path = Path("../../output/react-haystack.json")

    if not docs_path.exists():
        print(f"❌ Error: Documents not found at {docs_path}")
        print("\n📝 Generate documents first:")
        print(" skill-seekers scrape --config configs/react.json --max-pages 100")
        print(" skill-seekers package output/react --target haystack")
        sys.exit(1)

    # Step 1: Load documents
    print("\n📚 Step 1: Loading documents...")
    # Decode explicitly as UTF-8: open() otherwise uses the platform's
    # locale encoding, which breaks on non-ASCII text on some systems.
    # NOTE(review): presumes the packager writes UTF-8 JSON — confirm.
    with open(docs_path, encoding="utf-8") as f:
        docs_data = json.load(f)

    # A missing or null "meta" entry becomes an empty dict rather than a
    # KeyError, so the lookups on doc.meta below keep working.
    documents = [
        Document(content=doc["content"], meta=doc.get("meta") or {})
        for doc in docs_data
    ]

    print(f"✅ Loaded {len(documents)} documents")

    # Show document breakdown
    categories = Counter(doc.meta.get("category", "unknown") for doc in documents)

    print("\n📁 Categories:")
    for cat, count in sorted(categories.items()):
        print(f" - {cat}: {count}")

    # Step 2: Create document store
    print("\n💾 Step 2: Creating document store...")
    document_store = InMemoryDocumentStore()
    document_store.write_documents(documents)

    indexed_count = document_store.count_documents()
    print(f"✅ Indexed {indexed_count} documents")

    # Step 3: Create retriever
    print("\n🔍 Step 3: Creating BM25 retriever...")
    retriever = InMemoryBM25Retriever(document_store=document_store)
    print("✅ Retriever ready")

    # Step 4: Query examples
    print("\n🎯 Step 4: Running queries...\n")

    queries = [
        "How do I use useState hook?",
        "What are React components?",
        "How to handle events in React?",
    ]

    for i, query in enumerate(queries, 1):
        print(f"\n{banner}")
        print(f"Query {i}: {query}")
        print(banner)

        # Run query
        results = retriever.run(query=query, top_k=3)

        if not results["documents"]:
            print(" No results found")
            continue

        # Display results
        for j, doc in enumerate(results["documents"], 1):
            print(f"\n📖 Result {j}:")
            print(f" Source: {doc.meta.get('file', 'unknown')}")
            print(f" Category: {doc.meta.get('category', 'unknown')}")

            # Show a one-line preview; the ellipsis is added only when the
            # content was actually cut, and None content is shown as empty.
            text = doc.content or ""
            preview = text[:preview_limit].replace("\n", " ")
            suffix = "..." if len(text) > preview_limit else ""
            print(f" Preview: {preview}{suffix}")

    # Summary
    print("\n" + banner)
    print("✅ Example complete!")
    print(banner)
    print("\n📊 Summary:")
    print(f" • Documents loaded: {len(documents)}")
    print(f" • Documents indexed: {indexed_count}")
    print(f" • Queries executed: {len(queries)}")
    print("\n💡 Next steps:")
    print(" • Try different queries")
    print(" • Experiment with top_k parameter")
    print(" • Build RAG pipeline with LLM generation")
    print(" • Use vector embeddings for semantic search")
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run the example and translate failures into a
    # process exit status.
    try:
        main()
    except KeyboardInterrupt:
        # Ctrl-C is reported and treated as a normal exit (status 0).
        print("\n\n⚠️ Interrupted by user")
        raise SystemExit(0)
    except Exception as exc:
        # Any other failure is reported as a single line and exits with
        # status 1.
        print(f"\n❌ Error: {exc}")
        raise SystemExit(1)
|