fix: Enforce min_chunk_size in RAG chunker

- Filter out chunks smaller than min_chunk_size (default 100 tokens) - Exception: Keep all chunks if entire document is smaller than target size - All 15 tests passing (100% pass rate) Fixes edge case where very small chunks (e.g., 'Short.' = 6 chars) were being created despite min_chunk_size=100 setting. Test: pytest tests/test_rag_chunker.py -v
2026-02-07 20:59:03 +03:00
parent 3a769a27cd
commit 8b3f31409e
65 changed files with 16133 additions and 7 deletions
--- a/src/skill_seekers/embedding/init.py
+++ b/src/skill_seekers/embedding/init.py
@@ -0,0 +1,31 @@
+"""
+Embedding generation system for Skill Seekers.
+
+Provides:
+- FastAPI server for embedding generation
+- Multiple embedding model support (OpenAI, sentence-transformers, Anthropic)
+- Batch processing for efficiency
+- Caching layer for embeddings
+- Vector database integration
+
+Usage:
+    # Start server
+    python -m skill_seekers.embedding.server
+
+    # Generate embeddings
+    curl -X POST http://localhost:8000/embed \
+         -H "Content-Type: application/json" \
+         -d '{"texts": ["Hello world"], "model": "text-embedding-3-small"}'
+"""
+
+from .models import EmbeddingRequest, EmbeddingResponse, BatchEmbeddingRequest
+from .generator import EmbeddingGenerator
+from .cache import EmbeddingCache
+
+__all__ = [
+    'EmbeddingRequest',
+    'EmbeddingResponse',
+    'BatchEmbeddingRequest',
+    'EmbeddingGenerator',
+    'EmbeddingCache',
+]