fix: Enforce min_chunk_size in RAG chunker

- Filter out chunks smaller than min_chunk_size (default 100 tokens)
- Exception: Keep all chunks if entire document is smaller than target size
- All 15 tests passing

Fixes an edge case where very small chunks (e.g., 'Short.' = 6 chars) were
created despite the min_chunk_size=100 setting.
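
A minimal sketch of the rule, with hypothetical names and defaults (the
actual chunker API may differ):

```python
# Hypothetical sketch of the min_chunk_size rule; the function name,
# defaults, and whitespace token counter are assumptions, not the
# actual chunker implementation.
def enforce_min_chunk_size(
    chunks: list[str],
    min_chunk_size: int = 100,
    target_size: int = 512,
) -> list[str]:
    def count_tokens(text: str) -> int:
        return len(text.split())  # stand-in tokenizer

    # Exception: if the whole document is smaller than the target chunk
    # size, keep everything rather than dropping its only content.
    if sum(count_tokens(c) for c in chunks) < target_size:
        return chunks

    # Otherwise filter out fragments below the minimum (e.g., 'Short.').
    return [c for c in chunks if count_tokens(c) >= min_chunk_size]
```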

Test: pytest tests/test_rag_chunker.py -v
yusyus
2026-02-07 20:59:03 +03:00
parent 3a769a27cd
commit 8b3f31409e
65 changed files with 16133 additions and 7 deletions


@@ -3,19 +3,20 @@
Skill Seeker MCP Server (FastMCP Implementation)
Modern, decorator-based MCP server using FastMCP for simplified tool registration.
-Provides 21 tools for generating Claude AI skills from documentation.
+Provides 25 tools for generating Claude AI skills from documentation.
This is a streamlined alternative to server.py (2200 lines → 708 lines, 68% reduction).
All tool implementations are delegated to modular tool files in tools/ directory.
**Architecture:**
- FastMCP server with decorator-based tool registration
-- 21 tools organized into 5 categories:
+- 25 tools organized into 6 categories:
* Config tools (3): generate_config, list_configs, validate_config
* Scraping tools (8): estimate_pages, scrape_docs, scrape_github, scrape_pdf, scrape_codebase, detect_patterns, extract_test_examples, build_how_to_guides, extract_config_patterns
* Packaging tools (4): package_skill, upload_skill, enhance_skill, install_skill
* Splitting tools (2): split_config, generate_router
* Source tools (4): fetch_config, submit_config, add_config_source, list_config_sources, remove_config_source
* Vector Database tools (4): export_to_weaviate, export_to_chroma, export_to_faiss, export_to_qdrant
**Usage:**
# Stdio transport (default, backward compatible)
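
The usage section is truncated by the hunk above. For context, decorator-based
registration with FastMCP and the default stdio transport look roughly like
this (an illustrative sketch; the server name, tool body, and import path are
placeholders, not this file's actual code):

```python
# Illustrative FastMCP registration pattern; import path assumes the
# standalone fastmcp package (the official SDK exposes
# mcp.server.fastmcp.FastMCP instead).
from fastmcp import FastMCP

mcp = FastMCP("skill-seeker")

@mcp.tool(description="Example tool registered via decorator.")
async def example_tool(skill_dir: str) -> str:
    return f"would process {skill_dir}"

if __name__ == "__main__":
    mcp.run()  # stdio transport by default
```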
@@ -75,6 +76,11 @@ try:
enhance_skill_impl,
# Scraping tools
estimate_pages_impl,
# Vector database tools
export_to_chroma_impl,
export_to_faiss_impl,
export_to_qdrant_impl,
export_to_weaviate_impl,
extract_config_patterns_impl,
extract_test_examples_impl,
# Source tools
@@ -109,6 +115,10 @@ except ImportError:
detect_patterns_impl,
enhance_skill_impl,
estimate_pages_impl,
export_to_chroma_impl,
export_to_faiss_impl,
export_to_qdrant_impl,
export_to_weaviate_impl,
extract_config_patterns_impl,
extract_test_examples_impl,
fetch_config_impl,
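
The two near-identical import lists above sit in a try/except ImportError
fallback. The exact module paths are truncated by the hunks, but the shape of
the pattern is roughly:

```python
# Assumed shape of the dual import block; the real module paths are not
# visible in the truncated hunks above.
try:
    # Package-relative imports (running as part of an installed package)
    from .tools import export_to_weaviate_impl
except ImportError:
    # Absolute-import fallback (running the server as a plain script)
    from tools import export_to_weaviate_impl
```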
@@ -1055,6 +1065,119 @@ async def remove_config_source(name: str) -> str:
return str(result)


# ============================================================================
# VECTOR DATABASE TOOLS (4 tools)
# ============================================================================

@safe_tool_decorator(
description="Export skill to Weaviate vector database format. Weaviate supports hybrid search (vector + BM25 keyword) with 450K+ users. Ideal for production RAG applications."
)
async def export_to_weaviate(
skill_dir: str,
output_dir: str | None = None,
) -> str:
"""
Export skill to Weaviate vector database format.
Args:
skill_dir: Path to skill directory (e.g., output/react/)
output_dir: Output directory (default: same as skill_dir parent)
Returns:
Export results with package path and usage instructions.
"""
args = {"skill_dir": skill_dir}
if output_dir:
args["output_dir"] = output_dir
result = await export_to_weaviate_impl(args)
if isinstance(result, list) and result:
return result[0].text if hasattr(result[0], "text") else str(result[0])
return str(result)


@safe_tool_decorator(
description="Export skill to Chroma vector database format. Chroma is a popular open-source embedding database designed for local-first development with 800K+ developers."
)
async def export_to_chroma(
skill_dir: str,
output_dir: str | None = None,
) -> str:
"""
Export skill to Chroma vector database format.
Args:
skill_dir: Path to skill directory (e.g., output/react/)
output_dir: Output directory (default: same as skill_dir parent)
Returns:
Export results with package path and usage instructions.
"""
args = {"skill_dir": skill_dir}
if output_dir:
args["output_dir"] = output_dir
result = await export_to_chroma_impl(args)
if isinstance(result, list) and result:
return result[0].text if hasattr(result[0], "text") else str(result[0])
return str(result)


@safe_tool_decorator(
description="Export skill to FAISS vector index format. FAISS (Facebook AI Similarity Search) supports billion-scale vector search with GPU acceleration."
)
async def export_to_faiss(
skill_dir: str,
output_dir: str | None = None,
) -> str:
"""
Export skill to FAISS vector index format.
Args:
skill_dir: Path to skill directory (e.g., output/react/)
output_dir: Output directory (default: same as skill_dir parent)
Returns:
Export results with package path and usage instructions.
"""
args = {"skill_dir": skill_dir}
if output_dir:
args["output_dir"] = output_dir
result = await export_to_faiss_impl(args)
if isinstance(result, list) and result:
return result[0].text if hasattr(result[0], "text") else str(result[0])
return str(result)


@safe_tool_decorator(
description="Export skill to Qdrant vector database format. Qdrant is a modern vector database with native payload filtering and high-performance search, serving 100K+ users."
)
async def export_to_qdrant(
skill_dir: str,
output_dir: str | None = None,
) -> str:
"""
Export skill to Qdrant vector database format.
Args:
skill_dir: Path to skill directory (e.g., output/react/)
output_dir: Output directory (default: same as skill_dir parent)
Returns:
Export results with package path and usage instructions.
"""
args = {"skill_dir": skill_dir}
if output_dir:
args["output_dir"] = output_dir
result = await export_to_qdrant_impl(args)
if isinstance(result, list) and result:
return result[0].text if hasattr(result[0], "text") else str(result[0])
return str(result)


# ============================================================================
# MAIN ENTRY POINT
# ============================================================================
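
Every tool in this file goes through `safe_tool_decorator`, which is not
defined in this diff. A plausible shape, assuming it wraps FastMCP
registration with error trapping, might be (purely hypothetical):

```python
# Purely hypothetical sketch: safe_tool_decorator's real definition lives
# elsewhere in the repo and may differ.
import functools

from fastmcp import FastMCP  # import path assumed

mcp = FastMCP("skill-seeker")  # placeholder instance for the sketch

def safe_tool_decorator(description: str):
    def wrap(fn):
        @mcp.tool(description=description)
        @functools.wraps(fn)
        async def inner(*args, **kwargs):
            try:
                return await fn(*args, **kwargs)
            except Exception as e:
                # Surface failures as tool output instead of crashing the server.
                return f"❌ {fn.__name__} failed: {e}"
        return inner
    return wrap
```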


@@ -9,6 +9,7 @@ Tools are organized by functionality:
- packaging_tools: Skill packaging and upload
- splitting_tools: Config splitting and router generation
- source_tools: Config source management (fetch, submit, add/remove sources)
- vector_db_tools: Vector database export (Weaviate, Chroma, FAISS, Qdrant)
"""
# Import centralized version
@@ -83,6 +84,18 @@ from .splitting_tools import (
from .splitting_tools import (
split_config as split_config_impl,
)
from .vector_db_tools import (
    export_to_chroma_impl,
    export_to_faiss_impl,
    export_to_qdrant_impl,
    export_to_weaviate_impl,
)

__all__ = [
"__version__",
@@ -114,4 +127,9 @@ __all__ = [
"add_config_source_impl",
"list_config_sources_impl",
"remove_config_source_impl",
# Vector database tools
"export_to_weaviate_impl",
"export_to_chroma_impl",
"export_to_faiss_impl",
"export_to_qdrant_impl",
]
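
With these re-exports in place, callers can pull every vector database
implementation from the package root rather than reaching into the submodule,
e.g. (package path assumed from the layout above):

```python
from tools import (
    export_to_chroma_impl,
    export_to_faiss_impl,
    export_to_qdrant_impl,
    export_to_weaviate_impl,
)
```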


@@ -0,0 +1,489 @@
"""
Vector Database Tools for MCP Server.
Provides MCP tools for exporting skills to 4 vector databases:
- Weaviate (hybrid search, 450K+ users)
- Chroma (local-first, 800K+ developers)
- FAISS (billion-scale, GPU-accelerated)
- Qdrant (native filtering, 100K+ users)
Each tool provides a direct interface to its respective vector database adaptor.
"""
import sys
from pathlib import Path
from typing import List

try:
from mcp.types import TextContent
except ImportError:
# Graceful degradation for testing
class TextContent:
"""Fallback TextContent for when MCP is not installed"""
def __init__(self, type: str, text: str):
self.type = type
self.text = text

# Path to CLI adaptors
CLI_DIR = Path(__file__).parent.parent.parent / "cli"
sys.path.insert(0, str(CLI_DIR))

try:
from adaptors import get_adaptor
except ImportError:
get_adaptor = None # Will handle gracefully below


async def export_to_weaviate_impl(args: dict) -> List[TextContent]:
"""
Export skill to Weaviate vector database format.
Weaviate is a popular cloud-native vector database with hybrid search
(combining vector similarity + BM25 keyword search). Ideal for
production RAG applications with 450K+ users.
Args:
args: Dictionary with:
- skill_dir (str): Path to skill directory (e.g., output/react/)
- output_dir (str, optional): Output directory (default: parent of skill_dir)
Returns:
List of TextContent with export results
Example:
{
"skill_dir": "output/react",
"output_dir": "output"
}
Output Format:
JSON file with Weaviate schema:
- class_name: Weaviate class name
- schema: Property definitions
- objects: Document objects with vectors and metadata
- config: Distance metric configuration
"""
if get_adaptor is None:
return [
TextContent(
type="text",
text="❌ Error: Could not import adaptors module. Please ensure skill-seekers is properly installed.",
)
]
skill_dir = Path(args["skill_dir"])
output_dir = Path(args.get("output_dir", skill_dir.parent))
if not skill_dir.exists():
return [
TextContent(
type="text",
text=f"❌ Error: Skill directory not found: {skill_dir}\n\nPlease scrape documentation first using scrape_docs.",
)
]
try:
# Get Weaviate adaptor
adaptor = get_adaptor("weaviate")
# Package skill
package_path = adaptor.package(skill_dir, output_dir)
# Success message
result_text = f"""✅ Weaviate Export Complete!
📦 Package: {package_path.name}
📁 Location: {package_path.parent}
📊 Size: {package_path.stat().st_size:,} bytes
🔧 Next Steps:
1. Upload to Weaviate:
```python
import weaviate
import json
client = weaviate.Client("http://localhost:8080")
data = json.load(open("{package_path}"))
# Create schema
client.schema.create_class(data["schema"])
# Batch upload objects
with client.batch as batch:
for obj in data["objects"]:
batch.add_data_object(obj["properties"], data["class_name"])
```
2. Query with hybrid search:
```python
result = client.query.get(data["class_name"], ["content", "source"]) \\
.with_hybrid("React hooks usage") \\
.with_limit(5) \\
.do()
```
📚 Resources:
- Weaviate Docs: https://weaviate.io/developers/weaviate
- Hybrid Search: https://weaviate.io/developers/weaviate/search/hybrid
"""
return [TextContent(type="text", text=result_text)]
except Exception as e:
return [
TextContent(
type="text",
text=f"❌ Error exporting to Weaviate: {str(e)}\n\nPlease check that the skill directory contains valid documentation.",
)
]


async def export_to_chroma_impl(args: dict) -> List[TextContent]:
"""
Export skill to Chroma vector database format.
Chroma is a popular open-source embedding database designed for
local-first development. Perfect for RAG prototyping with 800K+ developers.
Args:
args: Dictionary with:
- skill_dir (str): Path to skill directory (e.g., output/react/)
- output_dir (str, optional): Output directory (default: parent of skill_dir)
Returns:
List of TextContent with export results
Example:
{
"skill_dir": "output/react",
"output_dir": "output"
}
Output Format:
JSON file with Chroma collection data:
- collection_name: Collection identifier
- documents: List of document texts
- metadatas: List of metadata dicts
- ids: List of unique IDs
"""
if get_adaptor is None:
return [
TextContent(
type="text",
text="❌ Error: Could not import adaptors module.",
)
]
skill_dir = Path(args["skill_dir"])
output_dir = Path(args.get("output_dir", skill_dir.parent))
if not skill_dir.exists():
return [
TextContent(
type="text",
text=f"❌ Error: Skill directory not found: {skill_dir}",
)
]
try:
adaptor = get_adaptor("chroma")
package_path = adaptor.package(skill_dir, output_dir)
result_text = f"""✅ Chroma Export Complete!
📦 Package: {package_path.name}
📁 Location: {package_path.parent}
📊 Size: {package_path.stat().st_size:,} bytes
🔧 Next Steps:
1. Load into Chroma:
```python
import chromadb
import json
client = chromadb.Client()
data = json.load(open("{package_path}"))
# Create collection
collection = client.create_collection(
name=data["collection_name"],
metadata={{"source": "skill-seekers"}}
)
# Add documents
collection.add(
documents=data["documents"],
metadatas=data["metadatas"],
ids=data["ids"]
)
```
2. Query the collection:
```python
results = collection.query(
query_texts=["How to use React hooks?"],
n_results=5
)
```
📚 Resources:
- Chroma Docs: https://docs.trychroma.com/
- Getting Started: https://docs.trychroma.com/getting-started
"""
return [TextContent(type="text", text=result_text)]
except Exception as e:
return [
TextContent(
type="text",
text=f"❌ Error exporting to Chroma: {str(e)}",
)
]


async def export_to_faiss_impl(args: dict) -> List[TextContent]:
"""
Export skill to FAISS vector index format.
FAISS (Facebook AI Similarity Search) is a library for efficient similarity
search at billion-scale. Supports GPU acceleration for ultra-fast search.
Args:
args: Dictionary with:
- skill_dir (str): Path to skill directory (e.g., output/react/)
- output_dir (str, optional): Output directory (default: parent of skill_dir)
- index_type (str, optional): FAISS index type (default: 'Flat').
Options: 'Flat', 'IVF', 'HNSW'.
Note: documented for the adaptor, but this implementation currently
reads only skill_dir and output_dir.
Returns:
List of TextContent with export results
Example:
{
"skill_dir": "output/react",
"output_dir": "output",
"index_type": "HNSW"
}
Output Format:
JSON file with FAISS data:
- embeddings: List of embedding vectors
- metadata: List of document metadata
- index_config: FAISS index configuration
"""
if get_adaptor is None:
return [
TextContent(
type="text",
text="❌ Error: Could not import adaptors module.",
)
]
skill_dir = Path(args["skill_dir"])
output_dir = Path(args.get("output_dir", skill_dir.parent))
if not skill_dir.exists():
return [
TextContent(
type="text",
text=f"❌ Error: Skill directory not found: {skill_dir}",
)
]
try:
adaptor = get_adaptor("faiss")
package_path = adaptor.package(skill_dir, output_dir)
result_text = f"""✅ FAISS Export Complete!
📦 Package: {package_path.name}
📁 Location: {package_path.parent}
📊 Size: {package_path.stat().st_size:,} bytes
🔧 Next Steps:
1. Build FAISS index:
```python
import faiss
import json
import numpy as np
data = json.load(open("{package_path}"))
embeddings = np.array(data["embeddings"], dtype="float32")
# Create index (choose based on scale)
dimension = embeddings.shape[1]
# Option 1: Flat (exact search, small datasets)
index = faiss.IndexFlatL2(dimension)
# Option 2: IVF (fast approximation, medium datasets)
# quantizer = faiss.IndexFlatL2(dimension)
# index = faiss.IndexIVFFlat(quantizer, dimension, 100)
# index.train(embeddings)
# Option 3: HNSW (best quality approximation, large datasets)
# index = faiss.IndexHNSWFlat(dimension, 32)
# Add vectors
index.add(embeddings)
```
2. Search:
```python
# Search for similar docs
query = np.array([your_query_embedding], dtype="float32")
distances, indices = index.search(query, k=5)
# Get metadata for results
for i in indices[0]:
print(data["metadata"][i])
```
3. Save index:
```python
faiss.write_index(index, "react_docs.index")
```
📚 Resources:
- FAISS Wiki: https://github.com/facebookresearch/faiss/wiki
- GPU Support: https://github.com/facebookresearch/faiss/wiki/Faiss-on-the-GPU
"""
return [TextContent(type="text", text=result_text)]
except Exception as e:
return [
TextContent(
type="text",
text=f"❌ Error exporting to FAISS: {str(e)}",
)
]


async def export_to_qdrant_impl(args: dict) -> List[TextContent]:
"""
Export skill to Qdrant vector database format.
Qdrant is a modern vector database with native payload filtering and
high-performance search. Ideal for production RAG with 100K+ users.
Args:
args: Dictionary with:
- skill_dir (str): Path to skill directory (e.g., output/react/)
- output_dir (str, optional): Output directory (default: parent of skill_dir)
Returns:
List of TextContent with export results
Example:
{
"skill_dir": "output/react",
"output_dir": "output"
}
Output Format:
JSON file with Qdrant collection data:
- collection_name: Collection identifier
- points: List of points with id, vector, payload
- config: Vector configuration
"""
if get_adaptor is None:
return [
TextContent(
type="text",
text="❌ Error: Could not import adaptors module.",
)
]
skill_dir = Path(args["skill_dir"])
output_dir = Path(args.get("output_dir", skill_dir.parent))
if not skill_dir.exists():
return [
TextContent(
type="text",
text=f"❌ Error: Skill directory not found: {skill_dir}",
)
]
try:
adaptor = get_adaptor("qdrant")
package_path = adaptor.package(skill_dir, output_dir)
result_text = f"""✅ Qdrant Export Complete!
📦 Package: {package_path.name}
📁 Location: {package_path.parent}
📊 Size: {package_path.stat().st_size:,} bytes
🔧 Next Steps:
1. Upload to Qdrant:
```python
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams
import json
client = QdrantClient("localhost", port=6333)
data = json.load(open("{package_path}"))
# Create collection
client.create_collection(
collection_name=data["collection_name"],
vectors_config=VectorParams(
size=data["config"]["vector_size"],
distance=Distance.COSINE
)
)
# Upload points
client.upsert(
collection_name=data["collection_name"],
points=data["points"]
)
```
2. Search with filters:
```python
from qdrant_client.models import Filter, FieldCondition, MatchValue
results = client.search(
collection_name=data["collection_name"],
query_vector=your_query_vector,
query_filter=Filter(
must=[
FieldCondition(
key="category",
match=MatchValue(value="getting_started")
)
]
),
limit=5
)
```
📚 Resources:
- Qdrant Docs: https://qdrant.tech/documentation/
- Filtering: https://qdrant.tech/documentation/concepts/filtering/
"""
return [TextContent(type="text", text=result_text)]
except Exception as e:
return [
TextContent(
type="text",
text=f"❌ Error exporting to Qdrant: {str(e)}",
)
]


# Export all implementations
__all__ = [
"export_to_weaviate_impl",
"export_to_chroma_impl",
"export_to_faiss_impl",
"export_to_qdrant_impl",
]
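
Because each `*_impl` function takes a plain dict and returns a list of
TextContent, the exporters can be exercised directly, outside any MCP
transport. A minimal sketch, assuming the module path and that the skill
directory already exists:

```python
# Direct-invocation sketch; "output/react" is the example path from the
# docstrings above, and the import path assumes this module's location.
import asyncio

from vector_db_tools import export_to_chroma_impl

async def main() -> None:
    results = await export_to_chroma_impl({"skill_dir": "output/react"})
    for item in results:
        print(item.text)

asyncio.run(main())
```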