Files
skill-seekers-reference/src/skill_seekers/mcp/tools/vector_db_tools.py
yusyus 51787e57bc style: Fix 411 ruff lint issues (Kimi's issue #4)
Auto-fixed lint issues with ruff --fix and --unsafe-fixes:

Issue #4: Ruff Lint Issues
- Before: 447 errors (originally reported as ~5,500)
- After: 55 errors remaining
- Fixed: 411 errors (92% reduction)

Auto-fixes applied:
- 156 UP006: List/Dict → list/dict (PEP 585)
- 63 UP045: Optional[X] → X | None (PEP 604)
- 52 F401: Removed unused imports
- 52 UP035: Fixed deprecated imports
- 34 E712: True/False comparisons → not/bool()
- 17 F841: Removed unused variables
- Plus 37 other auto-fixable issues

Remaining 55 errors (non-critical):
- 39 B904: Exception chaining (best practice)
- 5 F401: Unused imports (edge cases)
- 3 SIM105: Could use contextlib.suppress
- 8 other minor style issues

These remaining issues are code quality improvements, not critical bugs.

Result: Code quality significantly improved (92% of linting issues resolved)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-02-08 12:46:38 +03:00

489 lines
13 KiB
Python

"""
Vector Database Tools for MCP Server.
Provides MCP tools for exporting skills to 4 vector databases:
- Weaviate (hybrid search, 450K+ users)
- Chroma (local-first, 800K+ developers)
- FAISS (billion-scale, GPU-accelerated)
- Qdrant (native filtering, 100K+ users)
Each tool provides a direct interface to its respective vector database adaptor.
"""
import sys
from pathlib import Path

try:
    from mcp.types import TextContent
except ImportError:
    # Graceful degradation for testing: allow this module to be imported
    # (e.g. in unit tests) even when the `mcp` package is not installed.
    class TextContent:
        """Fallback TextContent for when MCP is not installed"""

        def __init__(self, type: str, text: str):
            # Mirror only the attributes of mcp.types.TextContent that this
            # module actually uses: `type` and `text`.
            self.type = type
            self.text = text

# Path to CLI adaptors. The directory is prepended to sys.path so that the
# bare `adaptors` import below resolves regardless of how the MCP server
# process was launched.
CLI_DIR = Path(__file__).parent.parent.parent / "cli"
sys.path.insert(0, str(CLI_DIR))
try:
    from adaptors import get_adaptor
except ImportError:
    get_adaptor = None  # Will handle gracefully below: each tool checks for None
async def export_to_weaviate_impl(args: dict) -> list[TextContent]:
    """Package a scraped skill into Weaviate's JSON import format.

    Weaviate is a cloud-native vector database offering hybrid search
    (vector similarity combined with BM25 keyword ranking), well suited
    to production RAG deployments.

    Args:
        args: Dictionary with:
            - skill_dir (str): Path to the skill directory (e.g. output/react/)
            - output_dir (str, optional): Destination directory; defaults to
              the skill directory's parent.

    Returns:
        A single-element list of TextContent describing success or failure.
        The success message embeds a JSON layout summary (class_name, schema,
        objects, config) and upload/query snippets for the Weaviate client.
    """

    def _reply(text: str) -> list[TextContent]:
        # All code paths return exactly one TextContent wrapped in a list.
        return [TextContent(type="text", text=text)]

    # Guard: the adaptors module may have failed to import at module load.
    if get_adaptor is None:
        return _reply(
            "❌ Error: Could not import adaptors module. Please ensure skill-seekers is properly installed."
        )

    skill_path = Path(args["skill_dir"])
    target_dir = Path(args.get("output_dir", skill_path.parent))

    if not skill_path.exists():
        return _reply(
            f"❌ Error: Skill directory not found: {skill_path}\n\nPlease scrape documentation first using scrape_docs."
        )

    try:
        # Delegate packaging to the Weaviate adaptor, then describe the result.
        bundle = get_adaptor("weaviate").package(skill_path, target_dir)
        summary = f"""✅ Weaviate Export Complete!
📦 Package: {bundle.name}
📁 Location: {bundle.parent}
📊 Size: {bundle.stat().st_size:,} bytes
🔧 Next Steps:
1. Upload to Weaviate:
```python
import weaviate
import json
client = weaviate.Client("http://localhost:8080")
data = json.load(open("{bundle}"))
# Create schema
client.schema.create_class(data["schema"])
# Batch upload objects
with client.batch as batch:
for obj in data["objects"]:
batch.add_data_object(obj["properties"], data["class_name"])
```
2. Query with hybrid search:
```python
result = client.query.get(data["class_name"], ["content", "source"]) \\
.with_hybrid("React hooks usage") \\
.with_limit(5) \\
.do()
```
📚 Resources:
- Weaviate Docs: https://weaviate.io/developers/weaviate
- Hybrid Search: https://weaviate.io/developers/weaviate/search/hybrid
"""
    except Exception as err:
        # Broad catch is intentional: this is an MCP tool boundary, so any
        # adaptor failure is reported to the caller rather than raised.
        return _reply(
            f"❌ Error exporting to Weaviate: {err}\n\nPlease check that the skill directory contains valid documentation."
        )
    return _reply(summary)
async def export_to_chroma_impl(args: dict) -> list[TextContent]:
    """Package a scraped skill into Chroma's collection JSON format.

    Chroma is an open-source embedding database aimed at local-first
    development, making it a good fit for RAG prototyping.

    Args:
        args: Dictionary with:
            - skill_dir (str): Path to the skill directory (e.g. output/react/)
            - output_dir (str, optional): Destination directory; defaults to
              the skill directory's parent.

    Returns:
        A single-element list of TextContent describing success or failure.
        The success message documents the emitted JSON layout
        (collection_name, documents, metadatas, ids) and load/query snippets.
    """

    def _reply(text: str) -> list[TextContent]:
        # All code paths return exactly one TextContent wrapped in a list.
        return [TextContent(type="text", text=text)]

    # Guard: the adaptors module may have failed to import at module load.
    if get_adaptor is None:
        return _reply("❌ Error: Could not import adaptors module.")

    skill_path = Path(args["skill_dir"])
    target_dir = Path(args.get("output_dir", skill_path.parent))

    if not skill_path.exists():
        return _reply(f"❌ Error: Skill directory not found: {skill_path}")

    try:
        # Delegate packaging to the Chroma adaptor, then describe the result.
        bundle = get_adaptor("chroma").package(skill_path, target_dir)
        summary = f"""✅ Chroma Export Complete!
📦 Package: {bundle.name}
📁 Location: {bundle.parent}
📊 Size: {bundle.stat().st_size:,} bytes
🔧 Next Steps:
1. Load into Chroma:
```python
import chromadb
import json
client = chromadb.Client()
data = json.load(open("{bundle}"))
# Create collection
collection = client.create_collection(
name=data["collection_name"],
metadata={{"source": "skill-seekers"}}
)
# Add documents
collection.add(
documents=data["documents"],
metadatas=data["metadatas"],
ids=data["ids"]
)
```
2. Query the collection:
```python
results = collection.query(
query_texts=["How to use React hooks?"],
n_results=5
)
```
📚 Resources:
- Chroma Docs: https://docs.trychroma.com/
- Getting Started: https://docs.trychroma.com/getting-started
"""
    except Exception as err:
        # Broad catch is intentional at this MCP tool boundary.
        return _reply(f"❌ Error exporting to Chroma: {err}")
    return _reply(summary)
async def export_to_faiss_impl(args: dict) -> list[TextContent]:
    """Package a scraped skill into a FAISS-ready JSON format.

    FAISS (Facebook AI Similarity Search) is a library for efficient
    similarity search at very large scale, with optional GPU acceleration.

    Args:
        args: Dictionary with:
            - skill_dir (str): Path to the skill directory (e.g. output/react/)
            - output_dir (str, optional): Destination directory; defaults to
              the skill directory's parent.
            - index_type (str, optional): Advertised in the tool schema
              ('Flat', 'IVF', 'HNSW'), but NOTE: it is currently accepted and
              ignored — the adaptor is not passed this value. Index choice is
              left to the user, as shown in the emitted instructions.

    Returns:
        A single-element list of TextContent describing success or failure.
        The success message documents the emitted JSON layout (embeddings,
        metadata, index_config) plus index-building and search snippets.
    """

    def _reply(text: str) -> list[TextContent]:
        # All code paths return exactly one TextContent wrapped in a list.
        return [TextContent(type="text", text=text)]

    # Guard: the adaptors module may have failed to import at module load.
    if get_adaptor is None:
        return _reply("❌ Error: Could not import adaptors module.")

    skill_path = Path(args["skill_dir"])
    target_dir = Path(args.get("output_dir", skill_path.parent))

    if not skill_path.exists():
        return _reply(f"❌ Error: Skill directory not found: {skill_path}")

    try:
        # Delegate packaging to the FAISS adaptor, then describe the result.
        bundle = get_adaptor("faiss").package(skill_path, target_dir)
        summary = f"""✅ FAISS Export Complete!
📦 Package: {bundle.name}
📁 Location: {bundle.parent}
📊 Size: {bundle.stat().st_size:,} bytes
🔧 Next Steps:
1. Build FAISS index:
```python
import faiss
import json
import numpy as np
data = json.load(open("{bundle}"))
embeddings = np.array(data["embeddings"], dtype="float32")
# Create index (choose based on scale)
dimension = embeddings.shape[1]
# Option 1: Flat (exact search, small datasets)
index = faiss.IndexFlatL2(dimension)
# Option 2: IVF (fast approximation, medium datasets)
# quantizer = faiss.IndexFlatL2(dimension)
# index = faiss.IndexIVFFlat(quantizer, dimension, 100)
# index.train(embeddings)
# Option 3: HNSW (best quality approximation, large datasets)
# index = faiss.IndexHNSWFlat(dimension, 32)
# Add vectors
index.add(embeddings)
```
2. Search:
```python
# Search for similar docs
query = np.array([your_query_embedding], dtype="float32")
distances, indices = index.search(query, k=5)
# Get metadata for results
for i in indices[0]:
print(data["metadata"][i])
```
3. Save index:
```python
faiss.write_index(index, "react_docs.index")
```
📚 Resources:
- FAISS Wiki: https://github.com/facebookresearch/faiss/wiki
- GPU Support: https://github.com/facebookresearch/faiss/wiki/Faiss-on-the-GPU
"""
    except Exception as err:
        # Broad catch is intentional at this MCP tool boundary.
        return _reply(f"❌ Error exporting to FAISS: {err}")
    return _reply(summary)
async def export_to_qdrant_impl(args: dict) -> list[TextContent]:
    """Package a scraped skill into Qdrant's collection JSON format.

    Qdrant is a modern vector database with native payload filtering and
    high-performance search, aimed at production RAG workloads.

    Args:
        args: Dictionary with:
            - skill_dir (str): Path to the skill directory (e.g. output/react/)
            - output_dir (str, optional): Destination directory; defaults to
              the skill directory's parent.

    Returns:
        A single-element list of TextContent describing success or failure.
        The success message documents the emitted JSON layout
        (collection_name, points, config) plus upload and filtered-search
        snippets for the Qdrant client.
    """

    def _reply(text: str) -> list[TextContent]:
        # All code paths return exactly one TextContent wrapped in a list.
        return [TextContent(type="text", text=text)]

    # Guard: the adaptors module may have failed to import at module load.
    if get_adaptor is None:
        return _reply("❌ Error: Could not import adaptors module.")

    skill_path = Path(args["skill_dir"])
    target_dir = Path(args.get("output_dir", skill_path.parent))

    if not skill_path.exists():
        return _reply(f"❌ Error: Skill directory not found: {skill_path}")

    try:
        # Delegate packaging to the Qdrant adaptor, then describe the result.
        bundle = get_adaptor("qdrant").package(skill_path, target_dir)
        summary = f"""✅ Qdrant Export Complete!
📦 Package: {bundle.name}
📁 Location: {bundle.parent}
📊 Size: {bundle.stat().st_size:,} bytes
🔧 Next Steps:
1. Upload to Qdrant:
```python
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams
import json
client = QdrantClient("localhost", port=6333)
data = json.load(open("{bundle}"))
# Create collection
client.create_collection(
collection_name=data["collection_name"],
vectors_config=VectorParams(
size=data["config"]["vector_size"],
distance=Distance.COSINE
)
)
# Upload points
client.upsert(
collection_name=data["collection_name"],
points=data["points"]
)
```
2. Search with filters:
```python
from qdrant_client.models import Filter, FieldCondition, MatchValue
results = client.search(
collection_name=data["collection_name"],
query_vector=your_query_vector,
query_filter=Filter(
must=[
FieldCondition(
key="category",
match=MatchValue(value="getting_started")
)
]
),
limit=5
)
```
📚 Resources:
- Qdrant Docs: https://qdrant.tech/documentation/
- Filtering: https://qdrant.tech/documentation/concepts/filtering/
"""
    except Exception as err:
        # Broad catch is intentional at this MCP tool boundary.
        return _reply(f"❌ Error exporting to Qdrant: {err}")
    return _reply(summary)
# Public API: one export implementation per supported vector database.
__all__ = [
    "export_to_weaviate_impl",
    "export_to_chroma_impl",
    "export_to_faiss_impl",
    "export_to_qdrant_impl",
]