feat(weaviate): Add Weaviate vector database adaptor (Task #10)

Implements native Weaviate integration for RAG pipelines as part of
Week 2 vector store integrations.

## Features

- **Auto-generated schema** - Creates Weaviate class definition from metadata
- **Deterministic UUIDs** - Stable IDs for consistent re-imports
- **Rich metadata** - All properties indexed for filtering
- **Batch-ready format** - Optimized for batch import
- **Example code** - Complete usage examples in upload()

## Output Format

JSON file containing:
- `schema`: Weaviate class definition with properties
- `objects`: Array of objects ready for batch import
- `class_name`: Derived from skill name

## Properties

- content (text, searchable)
- source (filterable, searchable)
- category (filterable, searchable)
- file (filterable)
- type (filterable)
- version (filterable)

## CLI Integration

```bash
skill-seekers package output/django --target weaviate
# → output/django-weaviate.json
```

## Files Added

- src/skill_seekers/cli/adaptors/weaviate.py (445 lines)
  * Complete Weaviate adaptor implementation
  * Schema auto-generation
  * UUID generation from content hash
  * Example code for import/query

## Files Modified

- src/skill_seekers/cli/adaptors/__init__.py
  * Import WeaviateAdaptor
  * Register "weaviate" in ADAPTORS

- src/skill_seekers/cli/package_skill.py
  * Add "weaviate" to --target choices

- src/skill_seekers/cli/main.py
  * Add "weaviate" to --target choices

## Testing

Tested with ansible skill:
- [x] Schema generation works
- [x] Object format correct
- [x] UUID generation deterministic
- [x] Metadata preserved
- [x] CLI integration working

Output: output/ansible-weaviate.json (10.7 KB, 1 object)

## Week 2 Progress

- [x] Task #10: Weaviate adaptor (Complete)
- [ ] Task #11: Chroma adaptor (Next)
- [ ] Task #12: FAISS helpers
- [ ] Task #13: Qdrant adaptor

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
yusyus
2026-02-05 23:38:12 +03:00
parent 1552e1212d
commit baccbf9d81
4 changed files with 454 additions and 2 deletions

View File

@@ -39,6 +39,11 @@ try:
except ImportError:
LlamaIndexAdaptor = None
try:
from .weaviate import WeaviateAdaptor
except ImportError:
WeaviateAdaptor = None
# Registry of available adaptors
ADAPTORS: dict[str, type[SkillAdaptor]] = {}
@@ -56,6 +61,8 @@ if LangChainAdaptor:
ADAPTORS["langchain"] = LangChainAdaptor
if LlamaIndexAdaptor:
ADAPTORS["llama-index"] = LlamaIndexAdaptor
if WeaviateAdaptor:
ADAPTORS["weaviate"] = WeaviateAdaptor
def get_adaptor(platform: str, config: dict = None) -> SkillAdaptor:

View File

@@ -0,0 +1,445 @@
#!/usr/bin/env python3
"""
Weaviate Adaptor
Implements Weaviate vector database format for RAG pipelines.
Converts Skill Seekers documentation into Weaviate-compatible objects with schema.
"""
import json
import hashlib
from pathlib import Path
from typing import Any
from .base import SkillAdaptor, SkillMetadata
class WeaviateAdaptor(SkillAdaptor):
    """
    Weaviate vector database adaptor.

    Converts a skill directory (SKILL.md plus references/*.md) into a JSON
    package holding a Weaviate class schema and import-ready objects. The
    package is written to disk; this class never talks to a Weaviate server.

    Handles:
    - Weaviate object format with properties
    - Auto-generated schema definition
    - UUID generation for objects
    - Cross-reference support
    - Metadata as properties for filtering
    - Hybrid search optimization (vector + keyword)

    NOTE(review): the generated schema contains only ``text`` properties and
    no reference-typed property, so "cross-reference support" is not backed
    by any code in this class -- confirm whether it is planned or should be
    dropped from the list.
    """
    # Short platform identifier; same string as the "weaviate" registry key
    # and the "weaviate" --target choice.
    PLATFORM = "weaviate"
    # Human-readable platform label.
    PLATFORM_NAME = "Weaviate (Vector Database)"
    DEFAULT_API_ENDPOINT = None  # User provides their own Weaviate instance
def _generate_uuid(self, content: str, metadata: dict) -> str:
"""
Generate deterministic UUID from content and metadata.
Args:
content: Document content
metadata: Document metadata
Returns:
UUID string (RFC 4122 format)
"""
# Create deterministic ID from content + metadata
id_string = f"{metadata.get('source', '')}-{metadata.get('file', '')}-{content[:100]}"
hash_obj = hashlib.md5(id_string.encode())
hash_hex = hash_obj.hexdigest()
# Format as UUID (8-4-4-4-12)
return f"{hash_hex[:8]}-{hash_hex[8:12]}-{hash_hex[12:16]}-{hash_hex[16:20]}-{hash_hex[20:32]}"
def _generate_schema(self, class_name: str) -> dict:
"""
Generate Weaviate schema for documentation class.
Args:
class_name: Name of the Weaviate class (e.g., "DocumentationChunk")
Returns:
Schema dictionary
"""
return {
"class": class_name,
"description": "Documentation chunks from Skill Seekers",
"vectorizer": "none", # User provides vectors
"properties": [
{
"name": "content",
"dataType": ["text"],
"description": "Full document content",
"indexFilterable": False,
"indexSearchable": True,
},
{
"name": "source",
"dataType": ["text"],
"description": "Source framework/project name",
"indexFilterable": True,
"indexSearchable": True,
},
{
"name": "category",
"dataType": ["text"],
"description": "Content category",
"indexFilterable": True,
"indexSearchable": True,
},
{
"name": "file",
"dataType": ["text"],
"description": "Source file name",
"indexFilterable": True,
"indexSearchable": False,
},
{
"name": "type",
"dataType": ["text"],
"description": "Document type (documentation/reference/code)",
"indexFilterable": True,
"indexSearchable": False,
},
{
"name": "version",
"dataType": ["text"],
"description": "Documentation version",
"indexFilterable": True,
"indexSearchable": False,
},
],
}
def format_skill_md(self, skill_dir: Path, metadata: SkillMetadata) -> str:
"""
Format skill as JSON for Weaviate ingestion.
Converts SKILL.md and all references/*.md into Weaviate objects:
{
"objects": [...],
"schema": {...}
}
Args:
skill_dir: Path to skill directory
metadata: Skill metadata
Returns:
JSON string containing Weaviate objects and schema
"""
objects = []
# Convert SKILL.md (main documentation)
skill_md_path = skill_dir / "SKILL.md"
if skill_md_path.exists():
content = self._read_existing_content(skill_dir)
if content.strip():
obj_metadata = {
"source": metadata.name,
"category": "overview",
"file": "SKILL.md",
"type": "documentation",
"version": metadata.version,
}
objects.append(
{
"id": self._generate_uuid(content, obj_metadata),
"properties": {
"content": content,
"source": obj_metadata["source"],
"category": obj_metadata["category"],
"file": obj_metadata["file"],
"type": obj_metadata["type"],
"version": obj_metadata["version"],
},
}
)
# Convert all reference files
refs_dir = skill_dir / "references"
if refs_dir.exists():
for ref_file in sorted(refs_dir.glob("*.md")):
if ref_file.is_file() and not ref_file.name.startswith("."):
try:
ref_content = ref_file.read_text(encoding="utf-8")
if ref_content.strip():
# Derive category from filename
category = ref_file.stem.replace("_", " ").lower()
obj_metadata = {
"source": metadata.name,
"category": category,
"file": ref_file.name,
"type": "reference",
"version": metadata.version,
}
objects.append(
{
"id": self._generate_uuid(ref_content, obj_metadata),
"properties": {
"content": ref_content,
"source": obj_metadata["source"],
"category": obj_metadata["category"],
"file": obj_metadata["file"],
"type": obj_metadata["type"],
"version": obj_metadata["version"],
},
}
)
except Exception as e:
print(f"⚠️ Warning: Could not read {ref_file.name}: {e}")
continue
# Generate schema
class_name = "".join(word.capitalize() for word in metadata.name.split("_"))
schema = self._generate_schema(class_name)
# Return complete package
return json.dumps(
{"schema": schema, "objects": objects, "class_name": class_name},
indent=2,
ensure_ascii=False,
)
def package(self, skill_dir: Path, output_path: Path) -> Path:
    """
    Package skill into JSON file for Weaviate.

    Creates a JSON file containing:
    - Schema definition
    - Objects ready for batch import
    - Helper metadata

    Output name resolution:
    - an existing directory, or a path written with a trailing "/":
      ``<skill_dir name>-weaviate.json`` inside that directory;
    - a path already ending in ``.json``: used unchanged;
    - anything else: a trailing ``.zip`` / ``.tar.gz`` is dropped and the
      file is named ``<stem>-weaviate.json``.

    A summary (object count, class name, property count, per-category
    counts) is printed to stdout.

    Args:
        skill_dir: Path to skill directory
        output_path: Output path/filename for JSON file (Path or str)

    Returns:
        Path to created JSON file
    """
    skill_dir = Path(skill_dir)

    # Path() drops a trailing separator, so remember the caller's intent
    # ("this is a directory") before normalising. Coercing first also
    # lets plain strings through, which used to fail on .is_dir().
    wants_directory = str(output_path).endswith("/")
    output_path = Path(output_path)

    # Determine output filename
    if wants_directory or output_path.is_dir():
        output_path = output_path / f"{skill_dir.name}-weaviate.json"
    elif not output_path.name.endswith(".json"):
        # Work on the file name only, so directory components that happen
        # to contain ".zip" or ".json" are never rewritten.
        stem = output_path.name
        for archive_suffix in (".tar.gz", ".zip"):
            if stem.endswith(archive_suffix):
                stem = stem[: -len(archive_suffix)]
                break
        if not stem.endswith("-weaviate"):
            stem += "-weaviate"
        output_path = output_path.with_name(f"{stem}.json")

    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Metadata is synthesised from the directory name; the version is a
    # fixed placeholder.
    metadata = SkillMetadata(
        name=skill_dir.name,
        description=f"Weaviate objects for {skill_dir.name}",
        version="1.0.0",
    )

    # Generate Weaviate objects
    weaviate_json = self.format_skill_md(skill_dir, metadata)

    # Write to file
    output_path.write_text(weaviate_json, encoding="utf-8")
    print("\n✅ Weaviate objects packaged successfully!")
    print(f"📦 Output: {output_path}")

    # Parse and show stats
    data = json.loads(weaviate_json)
    objects = data["objects"]
    schema = data["schema"]
    print(f"📊 Total objects: {len(objects)}")
    print(f"📐 Schema class: {data['class_name']}")
    print(f"📋 Properties: {len(schema['properties'])}")

    # Show category breakdown
    categories = {}
    for obj in objects:
        cat = obj["properties"].get("category", "unknown")
        categories[cat] = categories.get(cat, 0) + 1
    print("📁 Categories:")
    for cat, count in sorted(categories.items()):
        print(f" - {cat}: {count}")

    return output_path
def upload(self, package_path: Path, _api_key: str, **_kwargs) -> dict[str, Any]:
"""
Weaviate format does not support direct upload.
Users should import the JSON file into their Weaviate instance:
```python
import weaviate
import json
# Connect to Weaviate
client = weaviate.Client("http://localhost:8080")
# Load data
with open("skill-weaviate.json") as f:
data = json.load(f)
# Create schema
client.schema.create_class(data["schema"])
# Batch import objects
with client.batch as batch:
for obj in data["objects"]:
batch.add_data_object(
data_object=obj["properties"],
class_name=data["class_name"],
uuid=obj["id"]
)
```
Args:
package_path: Path to JSON file
api_key: Not used
**kwargs: Not used
Returns:
Result indicating no upload capability
"""
example_code = """
# Example: Import into Weaviate
import weaviate
import json
from openai import OpenAI
# Connect to Weaviate
client = weaviate.Client("http://localhost:8080")
# Load data
with open("{path}") as f:
data = json.load(f)
# Create schema (first time only)
try:
client.schema.create_class(data["schema"])
print(f"✅ Created class: {{data['class_name']}}")
except Exception as e:
print(f"Schema already exists or error: {{e}}")
# Generate embeddings and batch import
openai_client = OpenAI()
with client.batch as batch:
batch.batch_size = 100
for obj in data["objects"]:
# Generate embedding
response = openai_client.embeddings.create(
model="text-embedding-ada-002",
input=obj["properties"]["content"]
)
vector = response.data[0].embedding
# Add to Weaviate with vector
batch.add_data_object(
data_object=obj["properties"],
class_name=data["class_name"],
uuid=obj["id"],
vector=vector
)
print(f"✅ Imported {{len(data['objects'])}} objects")
# Query example (semantic search)
result = client.query.get(
data["class_name"],
["content", "category", "source"]
).with_near_text({{"concepts": ["your search query"]}}).with_limit(3).do()
# Query with filter (category = "api")
result = client.query.get(
data["class_name"],
["content", "category"]
).with_where({{
"path": ["category"],
"operator": "Equal",
"valueText": "api"
}}).with_near_text({{"concepts": ["search query"]}}).do()
# Hybrid search (vector + keyword)
result = client.query.get(
data["class_name"],
["content", "source"]
).with_hybrid(
query="search query",
alpha=0.5 # 0=keyword only, 1=vector only
).do()
""".format(
path=package_path.name
)
return {
"success": False,
"skill_id": None,
"url": str(package_path.absolute()),
"message": (
f"Weaviate objects packaged at: {package_path.absolute()}\n\n"
"Import into Weaviate:\n"
f"{example_code}"
),
}
def validate_api_key(self, _api_key: str) -> bool:
"""
Weaviate format doesn't use API keys for packaging.
Args:
api_key: Not used
Returns:
Always False (no API needed for packaging)
"""
return False
def get_env_var_name(self) -> str:
"""
No API key needed for Weaviate packaging.
Returns:
Empty string
"""
return ""
def supports_enhancement(self) -> bool:
"""
Weaviate format doesn't support AI enhancement.
Enhancement should be done before conversion using:
skill-seekers enhance output/skill/ --mode LOCAL
Returns:
False
"""
return False
def enhance(self, _skill_dir: Path, _api_key: str) -> bool:
"""
Weaviate format doesn't support enhancement.
Args:
skill_dir: Not used
api_key: Not used
Returns:
False
"""
print("❌ Weaviate format does not support enhancement")
print(" Enhance before packaging:")
print(" skill-seekers enhance output/skill/ --mode LOCAL")
print(" skill-seekers package output/skill/ --target weaviate")
return False

View File

@@ -215,7 +215,7 @@ For more information: https://github.com/yusufkaraaslan/Skill_Seekers
package_parser.add_argument("--upload", action="store_true", help="Auto-upload after packaging")
package_parser.add_argument(
"--target",
choices=["claude", "gemini", "openai", "markdown", "langchain", "llama-index"],
choices=["claude", "gemini", "openai", "markdown", "langchain", "llama-index", "weaviate"],
default="claude",
help="Target LLM platform (default: claude)",
)

View File

@@ -155,7 +155,7 @@ Examples:
parser.add_argument(
"--target",
choices=["claude", "gemini", "openai", "markdown", "langchain", "llama-index"],
choices=["claude", "gemini", "openai", "markdown", "langchain", "llama-index", "weaviate"],
default="claude",
help="Target LLM platform (default: claude)",
)