fix: Enforce min_chunk_size in RAG chunker

- Filter out chunks smaller than min_chunk_size (default 100 tokens)
- Exception: Keep all chunks if entire document is smaller than target size
- All 15 tests passing

Fixes an edge case where very small chunks (e.g., 'Short.' = 6 chars) were
created despite the min_chunk_size=100 setting.
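
A minimal sketch of the rule, with hypothetical names and defaults (the
actual chunker API may differ):

```python
# Hypothetical sketch of the min_chunk_size rule; the function name,
# defaults, and whitespace token counter are assumptions, not the
# actual chunker implementation.
def enforce_min_chunk_size(
    chunks: list[str],
    min_chunk_size: int = 100,
    target_size: int = 512,
) -> list[str]:
    def count_tokens(text: str) -> int:
        return len(text.split())  # stand-in tokenizer

    # Exception: if the whole document is smaller than the target chunk
    # size, keep everything rather than dropping its only content.
    if sum(count_tokens(c) for c in chunks) < target_size:
        return chunks

    # Otherwise filter out fragments below the minimum (e.g., 'Short.').
    return [c for c in chunks if count_tokens(c) >= min_chunk_size]
```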

Test: pytest tests/test_rag_chunker.py -v
yusyus
2026-02-07 20:59:03 +03:00
parent 3a769a27cd
commit 8b3f31409e
65 changed files with 16133 additions and 7 deletions


@@ -3,19 +3,20 @@
Skill Seeker MCP Server (FastMCP Implementation)
Modern, decorator-based MCP server using FastMCP for simplified tool registration.
-Provides 21 tools for generating Claude AI skills from documentation.
+Provides 25 tools for generating Claude AI skills from documentation.
This is a streamlined alternative to server.py (2200 lines → 708 lines, 68% reduction).
All tool implementations are delegated to modular tool files in tools/ directory.
**Architecture:**
- FastMCP server with decorator-based tool registration
-- 21 tools organized into 5 categories:
+- 25 tools organized into 6 categories:
* Config tools (3): generate_config, list_configs, validate_config
* Scraping tools (8): estimate_pages, scrape_docs, scrape_github, scrape_pdf, scrape_codebase, detect_patterns, extract_test_examples, build_how_to_guides, extract_config_patterns
* Packaging tools (4): package_skill, upload_skill, enhance_skill, install_skill
* Splitting tools (2): split_config, generate_router
* Source tools (4): fetch_config, submit_config, add_config_source, list_config_sources, remove_config_source
* Vector Database tools (4): export_to_weaviate, export_to_chroma, export_to_faiss, export_to_qdrant
**Usage:**
# Stdio transport (default, backward compatible)
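
The usage section is truncated by the hunk above. For context, decorator-based
registration with FastMCP and the default stdio transport look roughly like
this (an illustrative sketch; the server name, tool body, and import path are
placeholders, not this file's actual code):

```python
# Illustrative FastMCP registration pattern; import path assumes the
# standalone fastmcp package (the official SDK exposes
# mcp.server.fastmcp.FastMCP instead).
from fastmcp import FastMCP

mcp = FastMCP("skill-seeker")

@mcp.tool(description="Example tool registered via decorator.")
async def example_tool(skill_dir: str) -> str:
    return f"would process {skill_dir}"

if __name__ == "__main__":
    mcp.run()  # stdio transport by default
```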
@@ -75,6 +76,11 @@ try:
enhance_skill_impl,
# Scraping tools
estimate_pages_impl,
# Vector database tools
export_to_chroma_impl,
export_to_faiss_impl,
export_to_qdrant_impl,
export_to_weaviate_impl,
extract_config_patterns_impl,
extract_test_examples_impl,
# Source tools
@@ -109,6 +115,10 @@ except ImportError:
detect_patterns_impl,
enhance_skill_impl,
estimate_pages_impl,
export_to_chroma_impl,
export_to_faiss_impl,
export_to_qdrant_impl,
export_to_weaviate_impl,
extract_config_patterns_impl,
extract_test_examples_impl,
fetch_config_impl,
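
The two near-identical import lists above sit in a try/except ImportError
fallback. The exact module paths are truncated by the hunks, but the shape of
the pattern is roughly:

```python
# Assumed shape of the dual import block; the real module paths are not
# visible in the truncated hunks above.
try:
    # Package-relative imports (running as part of an installed package)
    from .tools import export_to_weaviate_impl
except ImportError:
    # Absolute-import fallback (running the server as a plain script)
    from tools import export_to_weaviate_impl
```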
@@ -1055,6 +1065,119 @@ async def remove_config_source(name: str) -> str:
return str(result)


# ============================================================================
# VECTOR DATABASE TOOLS (4 tools)
# ============================================================================

@safe_tool_decorator(
description="Export skill to Weaviate vector database format. Weaviate supports hybrid search (vector + BM25 keyword) with 450K+ users. Ideal for production RAG applications."
)
async def export_to_weaviate(
skill_dir: str,
output_dir: str | None = None,
) -> str:
"""
Export skill to Weaviate vector database format.
Args:
skill_dir: Path to skill directory (e.g., output/react/)
output_dir: Output directory (default: same as skill_dir parent)
Returns:
Export results with package path and usage instructions.
"""
args = {"skill_dir": skill_dir}
if output_dir:
args["output_dir"] = output_dir
result = await export_to_weaviate_impl(args)
if isinstance(result, list) and result:
return result[0].text if hasattr(result[0], "text") else str(result[0])
return str(result)


@safe_tool_decorator(
description="Export skill to Chroma vector database format. Chroma is a popular open-source embedding database designed for local-first development with 800K+ developers."
)
async def export_to_chroma(
skill_dir: str,
output_dir: str | None = None,
) -> str:
"""
Export skill to Chroma vector database format.
Args:
skill_dir: Path to skill directory (e.g., output/react/)
output_dir: Output directory (default: same as skill_dir parent)
Returns:
Export results with package path and usage instructions.
"""
args = {"skill_dir": skill_dir}
if output_dir:
args["output_dir"] = output_dir
result = await export_to_chroma_impl(args)
if isinstance(result, list) and result:
return result[0].text if hasattr(result[0], "text") else str(result[0])
return str(result)


@safe_tool_decorator(
description="Export skill to FAISS vector index format. FAISS (Facebook AI Similarity Search) supports billion-scale vector search with GPU acceleration."
)
async def export_to_faiss(
skill_dir: str,
output_dir: str | None = None,
) -> str:
"""
Export skill to FAISS vector index format.
Args:
skill_dir: Path to skill directory (e.g., output/react/)
output_dir: Output directory (default: same as skill_dir parent)
Returns:
Export results with package path and usage instructions.
"""
args = {"skill_dir": skill_dir}
if output_dir:
args["output_dir"] = output_dir
result = await export_to_faiss_impl(args)
if isinstance(result, list) and result:
return result[0].text if hasattr(result[0], "text") else str(result[0])
return str(result)


@safe_tool_decorator(
description="Export skill to Qdrant vector database format. Qdrant is a modern vector database with native payload filtering and high-performance search, serving 100K+ users."
)
async def export_to_qdrant(
skill_dir: str,
output_dir: str | None = None,
) -> str:
"""
Export skill to Qdrant vector database format.
Args:
skill_dir: Path to skill directory (e.g., output/react/)
output_dir: Output directory (default: same as skill_dir parent)
Returns:
Export results with package path and usage instructions.
"""
args = {"skill_dir": skill_dir}
if output_dir:
args["output_dir"] = output_dir
result = await export_to_qdrant_impl(args)
if isinstance(result, list) and result:
return result[0].text if hasattr(result[0], "text") else str(result[0])
return str(result)


# ============================================================================
# MAIN ENTRY POINT
# ============================================================================
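
Every tool in this file goes through `safe_tool_decorator`, which is not
defined in this diff. A plausible shape, assuming it wraps FastMCP
registration with error trapping, might be (purely hypothetical):

```python
# Purely hypothetical sketch: safe_tool_decorator's real definition lives
# elsewhere in the repo and may differ.
import functools

from fastmcp import FastMCP  # import path assumed

mcp = FastMCP("skill-seeker")  # placeholder instance for the sketch

def safe_tool_decorator(description: str):
    def wrap(fn):
        @mcp.tool(description=description)
        @functools.wraps(fn)
        async def inner(*args, **kwargs):
            try:
                return await fn(*args, **kwargs)
            except Exception as e:
                # Surface failures as tool output instead of crashing the server.
                return f"❌ {fn.__name__} failed: {e}"
        return inner
    return wrap
```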


@@ -9,6 +9,7 @@ Tools are organized by functionality:
- packaging_tools: Skill packaging and upload
- splitting_tools: Config splitting and router generation
- source_tools: Config source management (fetch, submit, add/remove sources)
- vector_db_tools: Vector database export (Weaviate, Chroma, FAISS, Qdrant)
"""
# Import centralized version
@@ -83,6 +84,18 @@ from .splitting_tools import (
from .splitting_tools import (
split_config as split_config_impl,
)
from .vector_db_tools import (
    export_to_chroma_impl,
    export_to_faiss_impl,
    export_to_qdrant_impl,
    export_to_weaviate_impl,
)

__all__ = [
"__version__",
@@ -114,4 +127,9 @@ __all__ = [
"add_config_source_impl",
"list_config_sources_impl",
"remove_config_source_impl",
# Vector database tools
"export_to_weaviate_impl",
"export_to_chroma_impl",
"export_to_faiss_impl",
"export_to_qdrant_impl",
]
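
With these re-exports in place, callers can pull every vector database
implementation from the package root rather than reaching into the submodule,
e.g. (package path assumed from the layout above):

```python
from tools import (
    export_to_chroma_impl,
    export_to_faiss_impl,
    export_to_qdrant_impl,
    export_to_weaviate_impl,
)
```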


@@ -0,0 +1,489 @@
"""
Vector Database Tools for MCP Server.
Provides MCP tools for exporting skills to 4 vector databases:
- Weaviate (hybrid search, 450K+ users)
- Chroma (local-first, 800K+ developers)
- FAISS (billion-scale, GPU-accelerated)
- Qdrant (native filtering, 100K+ users)
Each tool provides a direct interface to its respective vector database adaptor.
"""
import sys
from pathlib import Path
from typing import List

try:
from mcp.types import TextContent
except ImportError:
# Graceful degradation for testing
class TextContent:
"""Fallback TextContent for when MCP is not installed"""
def __init__(self, type: str, text: str):
self.type = type
self.text = text

# Path to CLI adaptors
CLI_DIR = Path(__file__).parent.parent.parent / "cli"
sys.path.insert(0, str(CLI_DIR))

try:
from adaptors import get_adaptor
except ImportError:
get_adaptor = None # Will handle gracefully below


async def export_to_weaviate_impl(args: dict) -> List[TextContent]:
"""
Export skill to Weaviate vector database format.
Weaviate is a popular cloud-native vector database with hybrid search
(combining vector similarity + BM25 keyword search). Ideal for
production RAG applications with 450K+ users.
Args:
args: Dictionary with:
- skill_dir (str): Path to skill directory (e.g., output/react/)
- output_dir (str, optional): Output directory (default: parent of skill_dir)
Returns:
List of TextContent with export results
Example:
{
"skill_dir": "output/react",
"output_dir": "output"
}
Output Format:
JSON file with Weaviate schema:
- class_name: Weaviate class name
- schema: Property definitions
- objects: Document objects with vectors and metadata
- config: Distance metric configuration
"""
if get_adaptor is None:
return [
TextContent(
type="text",
text="❌ Error: Could not import adaptors module. Please ensure skill-seekers is properly installed.",
)
]
skill_dir = Path(args["skill_dir"])
output_dir = Path(args.get("output_dir", skill_dir.parent))
if not skill_dir.exists():
return [
TextContent(
type="text",
text=f"❌ Error: Skill directory not found: {skill_dir}\n\nPlease scrape documentation first using scrape_docs.",
)
]
try:
# Get Weaviate adaptor
adaptor = get_adaptor("weaviate")
# Package skill
package_path = adaptor.package(skill_dir, output_dir)
# Success message
result_text = f"""✅ Weaviate Export Complete!
📦 Package: {package_path.name}
📁 Location: {package_path.parent}
📊 Size: {package_path.stat().st_size:,} bytes
🔧 Next Steps:
1. Upload to Weaviate:
```python
import weaviate
import json
client = weaviate.Client("http://localhost:8080")
data = json.load(open("{package_path}"))
# Create schema
client.schema.create_class(data["schema"])
# Batch upload objects
with client.batch as batch:
for obj in data["objects"]:
batch.add_data_object(obj["properties"], data["class_name"])
```
2. Query with hybrid search:
```python
result = client.query.get(data["class_name"], ["content", "source"]) \\
.with_hybrid("React hooks usage") \\
.with_limit(5) \\
.do()
```
📚 Resources:
- Weaviate Docs: https://weaviate.io/developers/weaviate
- Hybrid Search: https://weaviate.io/developers/weaviate/search/hybrid
"""
return [TextContent(type="text", text=result_text)]
except Exception as e:
return [
TextContent(
type="text",
text=f"❌ Error exporting to Weaviate: {str(e)}\n\nPlease check that the skill directory contains valid documentation.",
)
]


async def export_to_chroma_impl(args: dict) -> List[TextContent]:
"""
Export skill to Chroma vector database format.
Chroma is a popular open-source embedding database designed for
local-first development. Perfect for RAG prototyping with 800K+ developers.
Args:
args: Dictionary with:
- skill_dir (str): Path to skill directory (e.g., output/react/)
- output_dir (str, optional): Output directory (default: parent of skill_dir)
Returns:
List of TextContent with export results
Example:
{
"skill_dir": "output/react",
"output_dir": "output"
}
Output Format:
JSON file with Chroma collection data:
- collection_name: Collection identifier
- documents: List of document texts
- metadatas: List of metadata dicts
- ids: List of unique IDs
"""
if get_adaptor is None:
return [
TextContent(
type="text",
text="❌ Error: Could not import adaptors module.",
)
]
skill_dir = Path(args["skill_dir"])
output_dir = Path(args.get("output_dir", skill_dir.parent))
if not skill_dir.exists():
return [
TextContent(
type="text",
text=f"❌ Error: Skill directory not found: {skill_dir}",
)
]
try:
adaptor = get_adaptor("chroma")
package_path = adaptor.package(skill_dir, output_dir)
result_text = f"""✅ Chroma Export Complete!
📦 Package: {package_path.name}
📁 Location: {package_path.parent}
📊 Size: {package_path.stat().st_size:,} bytes
🔧 Next Steps:
1. Load into Chroma:
```python
import chromadb
import json
client = chromadb.Client()
data = json.load(open("{package_path}"))
# Create collection
collection = client.create_collection(
name=data["collection_name"],
metadata={{"source": "skill-seekers"}}
)
# Add documents
collection.add(
documents=data["documents"],
metadatas=data["metadatas"],
ids=data["ids"]
)
```
2. Query the collection:
```python
results = collection.query(
query_texts=["How to use React hooks?"],
n_results=5
)
```
📚 Resources:
- Chroma Docs: https://docs.trychroma.com/
- Getting Started: https://docs.trychroma.com/getting-started
"""
return [TextContent(type="text", text=result_text)]
except Exception as e:
return [
TextContent(
type="text",
text=f"❌ Error exporting to Chroma: {str(e)}",
)
]


async def export_to_faiss_impl(args: dict) -> List[TextContent]:
"""
Export skill to FAISS vector index format.
FAISS (Facebook AI Similarity Search) is a library for efficient similarity
search at billion-scale. Supports GPU acceleration for ultra-fast search.
Args:
args: Dictionary with:
- skill_dir (str): Path to skill directory (e.g., output/react/)
- output_dir (str, optional): Output directory (default: parent of skill_dir)
- index_type (str, optional): FAISS index type (default: 'Flat').
Options: 'Flat', 'IVF', 'HNSW'.
Note: documented for the adaptor, but this implementation currently
reads only skill_dir and output_dir.
Returns:
List of TextContent with export results
Example:
{
"skill_dir": "output/react",
"output_dir": "output",
"index_type": "HNSW"
}
Output Format:
JSON file with FAISS data:
- embeddings: List of embedding vectors
- metadata: List of document metadata
- index_config: FAISS index configuration
"""
if get_adaptor is None:
return [
TextContent(
type="text",
text="❌ Error: Could not import adaptors module.",
)
]
skill_dir = Path(args["skill_dir"])
output_dir = Path(args.get("output_dir", skill_dir.parent))
if not skill_dir.exists():
return [
TextContent(
type="text",
text=f"❌ Error: Skill directory not found: {skill_dir}",
)
]
try:
adaptor = get_adaptor("faiss")
package_path = adaptor.package(skill_dir, output_dir)
result_text = f"""✅ FAISS Export Complete!
📦 Package: {package_path.name}
📁 Location: {package_path.parent}
📊 Size: {package_path.stat().st_size:,} bytes
🔧 Next Steps:
1. Build FAISS index:
```python
import faiss
import json
import numpy as np
data = json.load(open("{package_path}"))
embeddings = np.array(data["embeddings"], dtype="float32")
# Create index (choose based on scale)
dimension = embeddings.shape[1]
# Option 1: Flat (exact search, small datasets)
index = faiss.IndexFlatL2(dimension)
# Option 2: IVF (fast approximation, medium datasets)
# quantizer = faiss.IndexFlatL2(dimension)
# index = faiss.IndexIVFFlat(quantizer, dimension, 100)
# index.train(embeddings)
# Option 3: HNSW (best quality approximation, large datasets)
# index = faiss.IndexHNSWFlat(dimension, 32)
# Add vectors
index.add(embeddings)
```
2. Search:
```python
# Search for similar docs
query = np.array([your_query_embedding], dtype="float32")
distances, indices = index.search(query, k=5)
# Get metadata for results
for i in indices[0]:
print(data["metadata"][i])
```
3. Save index:
```python
faiss.write_index(index, "react_docs.index")
```
📚 Resources:
- FAISS Wiki: https://github.com/facebookresearch/faiss/wiki
- GPU Support: https://github.com/facebookresearch/faiss/wiki/Faiss-on-the-GPU
"""
return [TextContent(type="text", text=result_text)]
except Exception as e:
return [
TextContent(
type="text",
text=f"❌ Error exporting to FAISS: {str(e)}",
)
]


async def export_to_qdrant_impl(args: dict) -> List[TextContent]:
"""
Export skill to Qdrant vector database format.
Qdrant is a modern vector database with native payload filtering and
high-performance search. Ideal for production RAG with 100K+ users.
Args:
args: Dictionary with:
- skill_dir (str): Path to skill directory (e.g., output/react/)
- output_dir (str, optional): Output directory (default: parent of skill_dir)
Returns:
List of TextContent with export results
Example:
{
"skill_dir": "output/react",
"output_dir": "output"
}
Output Format:
JSON file with Qdrant collection data:
- collection_name: Collection identifier
- points: List of points with id, vector, payload
- config: Vector configuration
"""
if get_adaptor is None:
return [
TextContent(
type="text",
text="❌ Error: Could not import adaptors module.",
)
]
skill_dir = Path(args["skill_dir"])
output_dir = Path(args.get("output_dir", skill_dir.parent))
if not skill_dir.exists():
return [
TextContent(
type="text",
text=f"❌ Error: Skill directory not found: {skill_dir}",
)
]
try:
adaptor = get_adaptor("qdrant")
package_path = adaptor.package(skill_dir, output_dir)
result_text = f"""✅ Qdrant Export Complete!
📦 Package: {package_path.name}
📁 Location: {package_path.parent}
📊 Size: {package_path.stat().st_size:,} bytes
🔧 Next Steps:
1. Upload to Qdrant:
```python
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams
import json
client = QdrantClient("localhost", port=6333)
data = json.load(open("{package_path}"))
# Create collection
client.create_collection(
collection_name=data["collection_name"],
vectors_config=VectorParams(
size=data["config"]["vector_size"],
distance=Distance.COSINE
)
)
# Upload points
client.upsert(
collection_name=data["collection_name"],
points=data["points"]
)
```
2. Search with filters:
```python
from qdrant_client.models import Filter, FieldCondition, MatchValue
results = client.search(
collection_name=data["collection_name"],
query_vector=your_query_vector,
query_filter=Filter(
must=[
FieldCondition(
key="category",
match=MatchValue(value="getting_started")
)
]
),
limit=5
)
```
📚 Resources:
- Qdrant Docs: https://qdrant.tech/documentation/
- Filtering: https://qdrant.tech/documentation/concepts/filtering/
"""
return [TextContent(type="text", text=result_text)]
except Exception as e:
return [
TextContent(
type="text",
text=f"❌ Error exporting to Qdrant: {str(e)}",
)
]


# Export all implementations
__all__ = [
"export_to_weaviate_impl",
"export_to_chroma_impl",
"export_to_faiss_impl",
"export_to_qdrant_impl",
]
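
Because each `*_impl` function takes a plain dict and returns a list of
TextContent, the exporters can be exercised directly, outside any MCP
transport. A minimal sketch, assuming the module path and that the skill
directory already exists:

```python
# Direct-invocation sketch; "output/react" is the example path from the
# docstrings above, and the import path assumes this module's location.
import asyncio

from vector_db_tools import export_to_chroma_impl

async def main() -> None:
    results = await export_to_chroma_impl({"skill_dir": "output/react"})
    for item in results:
        print(item.text)

asyncio.run(main())
```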