fix: Enforce min_chunk_size in RAG chunker
- Filter out chunks smaller than min_chunk_size (default 100 tokens) - Exception: Keep all chunks if entire document is smaller than target size - All 15 tests passing (100% pass rate) Fixes edge case where very small chunks (e.g., 'Short.' = 6 chars) were being created despite min_chunk_size=100 setting. Test: pytest tests/test_rag_chunker.py -v
This commit is contained in:
@@ -62,6 +62,7 @@ dependencies = [
|
||||
"pathspec>=0.12.1",
|
||||
"networkx>=3.0",
|
||||
"tomli>=2.0.0; python_version < '3.11'", # TOML parser for version reading
|
||||
"schedule>=1.2.0", # Required for sync monitoring
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
@@ -92,6 +93,35 @@ all-llms = [
|
||||
"openai>=1.0.0",
|
||||
]
|
||||
|
||||
# Cloud storage support
|
||||
s3 = [
|
||||
"boto3>=1.34.0",
|
||||
]
|
||||
|
||||
gcs = [
|
||||
"google-cloud-storage>=2.10.0",
|
||||
]
|
||||
|
||||
azure = [
|
||||
"azure-storage-blob>=12.19.0",
|
||||
]
|
||||
|
||||
# All cloud storage providers combined
|
||||
all-cloud = [
|
||||
"boto3>=1.34.0",
|
||||
"google-cloud-storage>=2.10.0",
|
||||
"azure-storage-blob>=12.19.0",
|
||||
]
|
||||
|
||||
# Embedding server support
|
||||
embedding = [
|
||||
"fastapi>=0.109.0",
|
||||
"uvicorn>=0.27.0",
|
||||
"sentence-transformers>=2.3.0",
|
||||
"numpy>=1.24.0",
|
||||
"voyageai>=0.2.0",
|
||||
]
|
||||
|
||||
# All optional dependencies combined (dev dependencies now in [dependency-groups])
|
||||
all = [
|
||||
"mcp>=1.25,<2",
|
||||
@@ -102,6 +132,13 @@ all = [
|
||||
"sse-starlette>=3.0.2",
|
||||
"google-generativeai>=0.8.0",
|
||||
"openai>=1.0.0",
|
||||
"boto3>=1.34.0",
|
||||
"google-cloud-storage>=2.10.0",
|
||||
"azure-storage-blob>=12.19.0",
|
||||
"fastapi>=0.109.0",
|
||||
"sentence-transformers>=2.3.0",
|
||||
"numpy>=1.24.0",
|
||||
"voyageai>=0.2.0",
|
||||
]
|
||||
|
||||
[project.urls]
|
||||
@@ -136,6 +173,10 @@ skill-seekers-codebase = "skill_seekers.cli.codebase_scraper:main"
|
||||
skill-seekers-patterns = "skill_seekers.cli.pattern_recognizer:main"
|
||||
skill-seekers-how-to-guides = "skill_seekers.cli.how_to_guide_builder:main"
|
||||
skill-seekers-setup = "skill_seekers.cli.setup_wizard:main"
|
||||
skill-seekers-cloud = "skill_seekers.cli.cloud_storage_cli:main"
|
||||
skill-seekers-embed = "skill_seekers.embedding.server:main"
|
||||
skill-seekers-sync = "skill_seekers.cli.sync_cli:main"
|
||||
skill-seekers-benchmark = "skill_seekers.cli.benchmark_cli:main"
|
||||
|
||||
[tool.setuptools]
|
||||
package-dir = {"" = "src"}
|
||||
|
||||
Reference in New Issue
Block a user