fix: Enforce min_chunk_size in RAG chunker

- Filter out chunks smaller than min_chunk_size (default 100 tokens)
- Exception: Keep all chunks if entire document is smaller than target size
- All 15 tests passing (100% pass rate)

Fixes edge case where very small chunks (e.g., 'Short.' = 6 chars) were
being created despite min_chunk_size=100 setting.

Test: pytest tests/test_rag_chunker.py -v
This commit is contained in:
yusyus
2026-02-07 20:59:03 +03:00
parent 3a769a27cd
commit 8b3f31409e
65 changed files with 16133 additions and 7 deletions

View File

@@ -0,0 +1,489 @@
"""
Vector Database Tools for MCP Server.
Provides MCP tools for exporting skills to 4 vector databases:
- Weaviate (hybrid search, 450K+ users)
- Chroma (local-first, 800K+ developers)
- FAISS (billion-scale, GPU-accelerated)
- Qdrant (native filtering, 100K+ users)
Each tool provides a direct interface to its respective vector database adaptor.
"""
import sys
from pathlib import Path
from typing import List
try:
    from mcp.types import TextContent
except ImportError:
    # Graceful degradation for testing: provide a minimal stand-in that
    # mirrors the two fields (type, text) the export tools construct.
    class TextContent:
        """Fallback TextContent for when MCP is not installed"""
        def __init__(self, type: str, text: str):
            self.type = type
            self.text = text

# Path to CLI adaptors
# NOTE(review): mutates sys.path at import time so the sibling "cli"
# package is importable without installation; re-importing this module
# prepends a duplicate entry — presumably harmless, but verify.
CLI_DIR = Path(__file__).parent.parent.parent / "cli"
sys.path.insert(0, str(CLI_DIR))
try:
    from adaptors import get_adaptor
except ImportError:
    # Each export tool below checks for None and returns a friendly error
    # instead of crashing the MCP server.
    get_adaptor = None  # Will handle gracefully below
async def export_to_weaviate_impl(args: dict) -> List[TextContent]:
    """
    Export a scraped skill to the Weaviate vector database format.

    Weaviate is a cloud-native vector database with hybrid search
    (vector similarity combined with BM25 keyword ranking), suited to
    production RAG applications.

    Args:
        args: Dictionary with:
            - skill_dir (str): Path to skill directory (e.g., output/react/)
            - output_dir (str, optional): Output directory
              (default: parent of skill_dir)

    Returns:
        List of TextContent describing the export result (success with
        usage instructions, or an error message).
    """

    def _reply(text: str) -> List[TextContent]:
        # Every code path returns exactly one TextContent in a list.
        return [TextContent(type="text", text=text)]

    # The adaptors package may have failed to import at module load time.
    if get_adaptor is None:
        return _reply(
            "❌ Error: Could not import adaptors module. Please ensure skill-seekers is properly installed."
        )

    skill_dir = Path(args["skill_dir"])
    output_dir = Path(args.get("output_dir", skill_dir.parent))

    # Guard: the skill must have been scraped before export.
    if not skill_dir.exists():
        return _reply(
            f"❌ Error: Skill directory not found: {skill_dir}\n\nPlease scrape documentation first using scrape_docs."
        )

    try:
        # Package the skill via the Weaviate adaptor, then report next steps.
        package_path = get_adaptor("weaviate").package(skill_dir, output_dir)
        return _reply(f"""✅ Weaviate Export Complete!
📦 Package: {package_path.name}
📁 Location: {package_path.parent}
📊 Size: {package_path.stat().st_size:,} bytes
🔧 Next Steps:
1. Upload to Weaviate:
```python
import weaviate
import json
client = weaviate.Client("http://localhost:8080")
data = json.load(open("{package_path}"))
# Create schema
client.schema.create_class(data["schema"])
# Batch upload objects
with client.batch as batch:
for obj in data["objects"]:
batch.add_data_object(obj["properties"], data["class_name"])
```
2. Query with hybrid search:
```python
result = client.query.get(data["class_name"], ["content", "source"]) \\
.with_hybrid("React hooks usage") \\
.with_limit(5) \\
.do()
```
📚 Resources:
- Weaviate Docs: https://weaviate.io/developers/weaviate
- Hybrid Search: https://weaviate.io/developers/weaviate/search/hybrid
""")
    except Exception as e:
        # Broad catch is intentional at this MCP tool boundary: surface
        # the failure as a user-readable message instead of crashing.
        return _reply(
            f"❌ Error exporting to Weaviate: {str(e)}\n\nPlease check that the skill directory contains valid documentation."
        )
async def export_to_chroma_impl(args: dict) -> List[TextContent]:
    """
    Export a scraped skill to the Chroma vector database format.

    Chroma is an open-source embedding database designed for local-first
    development — a good fit for RAG prototyping.

    Args:
        args: Dictionary with:
            - skill_dir (str): Path to skill directory (e.g., output/react/)
            - output_dir (str, optional): Output directory
              (default: parent of skill_dir)

    Returns:
        List of TextContent describing the export result (success with
        usage instructions, or an error message).
    """

    def _reply(text: str) -> List[TextContent]:
        # Every code path returns exactly one TextContent in a list.
        return [TextContent(type="text", text=text)]

    # The adaptors package may have failed to import at module load time.
    if get_adaptor is None:
        return _reply("❌ Error: Could not import adaptors module.")

    skill_dir = Path(args["skill_dir"])
    output_dir = Path(args.get("output_dir", skill_dir.parent))

    # Guard: the skill must have been scraped before export.
    if not skill_dir.exists():
        return _reply(f"❌ Error: Skill directory not found: {skill_dir}")

    try:
        # Package the skill via the Chroma adaptor, then report next steps.
        package_path = get_adaptor("chroma").package(skill_dir, output_dir)
        return _reply(f"""✅ Chroma Export Complete!
📦 Package: {package_path.name}
📁 Location: {package_path.parent}
📊 Size: {package_path.stat().st_size:,} bytes
🔧 Next Steps:
1. Load into Chroma:
```python
import chromadb
import json
client = chromadb.Client()
data = json.load(open("{package_path}"))
# Create collection
collection = client.create_collection(
name=data["collection_name"],
metadata={{"source": "skill-seekers"}}
)
# Add documents
collection.add(
documents=data["documents"],
metadatas=data["metadatas"],
ids=data["ids"]
)
```
2. Query the collection:
```python
results = collection.query(
query_texts=["How to use React hooks?"],
n_results=5
)
```
📚 Resources:
- Chroma Docs: https://docs.trychroma.com/
- Getting Started: https://docs.trychroma.com/getting-started
""")
    except Exception as e:
        # Broad catch is intentional at this MCP tool boundary.
        return _reply(f"❌ Error exporting to Chroma: {str(e)}")
async def export_to_faiss_impl(args: dict) -> List[TextContent]:
    """
    Export a scraped skill to the FAISS vector index format.

    FAISS (Facebook AI Similarity Search) is a library for efficient
    similarity search at billion-scale, with optional GPU acceleration.

    Args:
        args: Dictionary with:
            - skill_dir (str): Path to skill directory (e.g., output/react/)
            - output_dir (str, optional): Output directory
              (default: parent of skill_dir)
            - index_type (str, optional): FAISS index type
              ('Flat', 'IVF', 'HNSW').
              NOTE(review): index_type is documented but not currently
              forwarded to the adaptor by this function — confirm whether
              the adaptor reads it elsewhere.

    Returns:
        List of TextContent describing the export result (success with
        usage instructions, or an error message).
    """

    def _reply(text: str) -> List[TextContent]:
        # Every code path returns exactly one TextContent in a list.
        return [TextContent(type="text", text=text)]

    # The adaptors package may have failed to import at module load time.
    if get_adaptor is None:
        return _reply("❌ Error: Could not import adaptors module.")

    skill_dir = Path(args["skill_dir"])
    output_dir = Path(args.get("output_dir", skill_dir.parent))

    # Guard: the skill must have been scraped before export.
    if not skill_dir.exists():
        return _reply(f"❌ Error: Skill directory not found: {skill_dir}")

    try:
        # Package the skill via the FAISS adaptor, then report next steps.
        package_path = get_adaptor("faiss").package(skill_dir, output_dir)
        return _reply(f"""✅ FAISS Export Complete!
📦 Package: {package_path.name}
📁 Location: {package_path.parent}
📊 Size: {package_path.stat().st_size:,} bytes
🔧 Next Steps:
1. Build FAISS index:
```python
import faiss
import json
import numpy as np
data = json.load(open("{package_path}"))
embeddings = np.array(data["embeddings"], dtype="float32")
# Create index (choose based on scale)
dimension = embeddings.shape[1]
# Option 1: Flat (exact search, small datasets)
index = faiss.IndexFlatL2(dimension)
# Option 2: IVF (fast approximation, medium datasets)
# quantizer = faiss.IndexFlatL2(dimension)
# index = faiss.IndexIVFFlat(quantizer, dimension, 100)
# index.train(embeddings)
# Option 3: HNSW (best quality approximation, large datasets)
# index = faiss.IndexHNSWFlat(dimension, 32)
# Add vectors
index.add(embeddings)
```
2. Search:
```python
# Search for similar docs
query = np.array([your_query_embedding], dtype="float32")
distances, indices = index.search(query, k=5)
# Get metadata for results
for i in indices[0]:
print(data["metadata"][i])
```
3. Save index:
```python
faiss.write_index(index, "react_docs.index")
```
📚 Resources:
- FAISS Wiki: https://github.com/facebookresearch/faiss/wiki
- GPU Support: https://github.com/facebookresearch/faiss/wiki/Faiss-on-the-GPU
""")
    except Exception as e:
        # Broad catch is intentional at this MCP tool boundary.
        return _reply(f"❌ Error exporting to FAISS: {str(e)}")
async def export_to_qdrant_impl(args: dict) -> List[TextContent]:
    """
    Export a scraped skill to the Qdrant vector database format.

    Qdrant is a modern vector database with native payload filtering
    and high-performance search, suited to production RAG.

    Args:
        args: Dictionary with:
            - skill_dir (str): Path to skill directory (e.g., output/react/)
            - output_dir (str, optional): Output directory
              (default: parent of skill_dir)

    Returns:
        List of TextContent describing the export result (success with
        usage instructions, or an error message).
    """

    def _reply(text: str) -> List[TextContent]:
        # Every code path returns exactly one TextContent in a list.
        return [TextContent(type="text", text=text)]

    # The adaptors package may have failed to import at module load time.
    if get_adaptor is None:
        return _reply("❌ Error: Could not import adaptors module.")

    skill_dir = Path(args["skill_dir"])
    output_dir = Path(args.get("output_dir", skill_dir.parent))

    # Guard: the skill must have been scraped before export.
    if not skill_dir.exists():
        return _reply(f"❌ Error: Skill directory not found: {skill_dir}")

    try:
        # Package the skill via the Qdrant adaptor, then report next steps.
        package_path = get_adaptor("qdrant").package(skill_dir, output_dir)
        return _reply(f"""✅ Qdrant Export Complete!
📦 Package: {package_path.name}
📁 Location: {package_path.parent}
📊 Size: {package_path.stat().st_size:,} bytes
🔧 Next Steps:
1. Upload to Qdrant:
```python
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams
import json
client = QdrantClient("localhost", port=6333)
data = json.load(open("{package_path}"))
# Create collection
client.create_collection(
collection_name=data["collection_name"],
vectors_config=VectorParams(
size=data["config"]["vector_size"],
distance=Distance.COSINE
)
)
# Upload points
client.upsert(
collection_name=data["collection_name"],
points=data["points"]
)
```
2. Search with filters:
```python
from qdrant_client.models import Filter, FieldCondition, MatchValue
results = client.search(
collection_name=data["collection_name"],
query_vector=your_query_vector,
query_filter=Filter(
must=[
FieldCondition(
key="category",
match=MatchValue(value="getting_started")
)
]
),
limit=5
)
```
📚 Resources:
- Qdrant Docs: https://qdrant.tech/documentation/
- Filtering: https://qdrant.tech/documentation/concepts/filtering/
""")
    except Exception as e:
        # Broad catch is intentional at this MCP tool boundary.
        return _reply(f"❌ Error exporting to Qdrant: {str(e)}")
# Export all implementations
# Public API of this module: the four async MCP tool implementations,
# one per supported vector database backend.
__all__ = [
    "export_to_weaviate_impl",
    "export_to_chroma_impl",
    "export_to_faiss_impl",
    "export_to_qdrant_impl",
]