style: Format all Python files with ruff
- Formatted 103 files to comply with ruff format requirements
- No code logic changes, only formatting/whitespace
- Fixes CI formatting check failures
This commit is contained in:
@@ -74,7 +74,7 @@ class SkillAdaptor(ABC):
|
||||
output_path: Path,
|
||||
enable_chunking: bool = False,
|
||||
chunk_max_tokens: int = 512,
|
||||
preserve_code_blocks: bool = True
|
||||
preserve_code_blocks: bool = True,
|
||||
) -> Path:
|
||||
"""
|
||||
Package skill for platform (ZIP, tar.gz, etc.).
|
||||
@@ -282,7 +282,7 @@ class SkillAdaptor(ABC):
|
||||
enable_chunking: bool = False,
|
||||
chunk_max_tokens: int = 512,
|
||||
preserve_code_blocks: bool = True,
|
||||
source_file: str = None
|
||||
source_file: str = None,
|
||||
) -> list[tuple[str, dict]]:
|
||||
"""
|
||||
Optionally chunk content for RAG platforms.
|
||||
@@ -326,33 +326,31 @@ class SkillAdaptor(ABC):
|
||||
chunk_overlap=max(50, chunk_max_tokens // 10), # 10% overlap
|
||||
preserve_code_blocks=preserve_code_blocks,
|
||||
preserve_paragraphs=True,
|
||||
min_chunk_size=100 # 100 tokens minimum
|
||||
min_chunk_size=100, # 100 tokens minimum
|
||||
)
|
||||
|
||||
# Chunk the document
|
||||
chunks = chunker.chunk_document(
|
||||
text=content,
|
||||
metadata=metadata,
|
||||
source_file=source_file or metadata.get('file', 'unknown')
|
||||
source_file=source_file or metadata.get("file", "unknown"),
|
||||
)
|
||||
|
||||
# Convert RAGChunker output format to (text, metadata) tuples
|
||||
result = []
|
||||
for chunk_dict in chunks:
|
||||
chunk_text = chunk_dict['page_content']
|
||||
chunk_text = chunk_dict["page_content"]
|
||||
chunk_meta = {
|
||||
**metadata, # Base metadata
|
||||
**chunk_dict['metadata'], # RAGChunker metadata (chunk_index, etc.)
|
||||
'is_chunked': True,
|
||||
'chunk_id': chunk_dict['chunk_id']
|
||||
**chunk_dict["metadata"], # RAGChunker metadata (chunk_index, etc.)
|
||||
"is_chunked": True,
|
||||
"chunk_id": chunk_dict["chunk_id"],
|
||||
}
|
||||
result.append((chunk_text, chunk_meta))
|
||||
|
||||
return result
|
||||
|
||||
def _format_output_path(
|
||||
self, skill_dir: Path, output_path: Path, suffix: str
|
||||
) -> Path:
|
||||
def _format_output_path(self, skill_dir: Path, output_path: Path, suffix: str) -> Path:
|
||||
"""
|
||||
Generate standardized output path with intelligent format handling.
|
||||
|
||||
@@ -379,11 +377,13 @@ class SkillAdaptor(ABC):
|
||||
output_str = str(output_path)
|
||||
|
||||
# Extract the file extension from suffix (e.g., ".json" from "-langchain.json")
|
||||
correct_ext = suffix.split('.')[-1] if '.' in suffix else ''
|
||||
correct_ext = suffix.split(".")[-1] if "." in suffix else ""
|
||||
|
||||
if correct_ext and not output_str.endswith(f".{correct_ext}"):
|
||||
# Replace common incorrect extensions
|
||||
output_str = output_str.replace(".zip", f".{correct_ext}").replace(".tar.gz", f".{correct_ext}")
|
||||
output_str = output_str.replace(".zip", f".{correct_ext}").replace(
|
||||
".tar.gz", f".{correct_ext}"
|
||||
)
|
||||
|
||||
# Ensure platform suffix is present
|
||||
if not output_str.endswith(suffix):
|
||||
@@ -395,9 +395,7 @@ class SkillAdaptor(ABC):
|
||||
|
||||
return Path(output_str)
|
||||
|
||||
def _generate_deterministic_id(
|
||||
self, content: str, metadata: dict, format: str = "hex"
|
||||
) -> str:
|
||||
def _generate_deterministic_id(self, content: str, metadata: dict, format: str = "hex") -> str:
|
||||
"""
|
||||
Generate deterministic ID from content and metadata.
|
||||
|
||||
|
||||
@@ -43,11 +43,7 @@ class ChromaAdaptor(SkillAdaptor):
|
||||
return self._generate_deterministic_id(content, metadata, format="hex")
|
||||
|
||||
def format_skill_md(
|
||||
self,
|
||||
skill_dir: Path,
|
||||
metadata: SkillMetadata,
|
||||
enable_chunking: bool = False,
|
||||
**kwargs
|
||||
self, skill_dir: Path, metadata: SkillMetadata, enable_chunking: bool = False, **kwargs
|
||||
) -> str:
|
||||
"""
|
||||
Format skill as JSON for Chroma ingestion.
|
||||
@@ -90,9 +86,9 @@ class ChromaAdaptor(SkillAdaptor):
|
||||
content,
|
||||
doc_metadata,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
|
||||
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
|
||||
source_file="SKILL.md"
|
||||
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
|
||||
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
|
||||
source_file="SKILL.md",
|
||||
)
|
||||
|
||||
# Add all chunks to parallel arrays
|
||||
@@ -120,9 +116,9 @@ class ChromaAdaptor(SkillAdaptor):
|
||||
ref_content,
|
||||
doc_metadata,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
|
||||
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
|
||||
source_file=ref_file.name
|
||||
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
|
||||
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
|
||||
source_file=ref_file.name,
|
||||
)
|
||||
|
||||
# Add all chunks to parallel arrays
|
||||
@@ -149,7 +145,7 @@ class ChromaAdaptor(SkillAdaptor):
|
||||
output_path: Path,
|
||||
enable_chunking: bool = False,
|
||||
chunk_max_tokens: int = 512,
|
||||
preserve_code_blocks: bool = True
|
||||
preserve_code_blocks: bool = True,
|
||||
) -> Path:
|
||||
"""
|
||||
Package skill into JSON file for Chroma.
|
||||
@@ -183,7 +179,7 @@ class ChromaAdaptor(SkillAdaptor):
|
||||
metadata,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=chunk_max_tokens,
|
||||
preserve_code_blocks=preserve_code_blocks
|
||||
preserve_code_blocks=preserve_code_blocks,
|
||||
)
|
||||
|
||||
# Write to file
|
||||
@@ -233,7 +229,7 @@ class ChromaAdaptor(SkillAdaptor):
|
||||
except ImportError:
|
||||
return {
|
||||
"success": False,
|
||||
"message": "chromadb not installed. Run: pip install chromadb"
|
||||
"message": "chromadb not installed. Run: pip install chromadb",
|
||||
}
|
||||
|
||||
# Load package
|
||||
@@ -241,8 +237,8 @@ class ChromaAdaptor(SkillAdaptor):
|
||||
data = json.load(f)
|
||||
|
||||
# Determine client type and configuration
|
||||
persist_directory = kwargs.get('persist_directory')
|
||||
chroma_url = kwargs.get('chroma_url')
|
||||
persist_directory = kwargs.get("persist_directory")
|
||||
chroma_url = kwargs.get("chroma_url")
|
||||
|
||||
try:
|
||||
if persist_directory:
|
||||
@@ -253,15 +249,15 @@ class ChromaAdaptor(SkillAdaptor):
|
||||
# Remote HTTP client
|
||||
print(f"🌐 Connecting to ChromaDB at: {chroma_url}")
|
||||
# Parse URL
|
||||
if '://' in chroma_url:
|
||||
parts = chroma_url.split('://')
|
||||
if "://" in chroma_url:
|
||||
parts = chroma_url.split("://")
|
||||
parts[0]
|
||||
host_port = parts[1]
|
||||
else:
|
||||
host_port = chroma_url
|
||||
|
||||
if ':' in host_port:
|
||||
host, port = host_port.rsplit(':', 1)
|
||||
if ":" in host_port:
|
||||
host, port = host_port.rsplit(":", 1)
|
||||
port = int(port)
|
||||
else:
|
||||
host = host_port
|
||||
@@ -276,12 +272,12 @@ class ChromaAdaptor(SkillAdaptor):
|
||||
except Exception as e:
|
||||
return {
|
||||
"success": False,
|
||||
"message": f"Failed to connect to ChromaDB: {e}\n\nTry:\n pip install chromadb\n chroma run # Start local server"
|
||||
"message": f"Failed to connect to ChromaDB: {e}\n\nTry:\n pip install chromadb\n chroma run # Start local server",
|
||||
}
|
||||
|
||||
# Get or create collection
|
||||
collection_name = kwargs.get('collection_name', data.get('collection_name', 'skill_docs'))
|
||||
distance_function = kwargs.get('distance_function', 'cosine')
|
||||
collection_name = kwargs.get("collection_name", data.get("collection_name", "skill_docs"))
|
||||
distance_function = kwargs.get("distance_function", "cosine")
|
||||
|
||||
try:
|
||||
# Try to get existing collection
|
||||
@@ -291,62 +287,57 @@ class ChromaAdaptor(SkillAdaptor):
|
||||
try:
|
||||
# Create new collection
|
||||
metadata = {"hnsw:space": distance_function}
|
||||
collection = client.create_collection(
|
||||
name=collection_name,
|
||||
metadata=metadata
|
||||
)
|
||||
collection = client.create_collection(name=collection_name, metadata=metadata)
|
||||
print(f"✅ Created collection: {collection_name} (distance: {distance_function})")
|
||||
except Exception as e:
|
||||
return {
|
||||
"success": False,
|
||||
"message": f"Failed to create collection '{collection_name}': {e}"
|
||||
"message": f"Failed to create collection '{collection_name}': {e}",
|
||||
}
|
||||
|
||||
# Handle embeddings
|
||||
embedding_function = kwargs.get('embedding_function')
|
||||
embedding_function = kwargs.get("embedding_function")
|
||||
|
||||
try:
|
||||
if embedding_function == 'openai':
|
||||
if embedding_function == "openai":
|
||||
# Generate embeddings with OpenAI
|
||||
print("🔄 Generating OpenAI embeddings...")
|
||||
embeddings = self._generate_openai_embeddings(
|
||||
data['documents'],
|
||||
api_key=kwargs.get('openai_api_key')
|
||||
data["documents"], api_key=kwargs.get("openai_api_key")
|
||||
)
|
||||
collection.add(
|
||||
documents=data['documents'],
|
||||
metadatas=data['metadatas'],
|
||||
ids=data['ids'],
|
||||
embeddings=embeddings
|
||||
documents=data["documents"],
|
||||
metadatas=data["metadatas"],
|
||||
ids=data["ids"],
|
||||
embeddings=embeddings,
|
||||
)
|
||||
elif embedding_function == 'sentence-transformers':
|
||||
elif embedding_function == "sentence-transformers":
|
||||
# Use sentence-transformers
|
||||
print("🔄 Generating sentence-transformer embeddings...")
|
||||
try:
|
||||
from chromadb.utils import embedding_functions
|
||||
|
||||
ef = embedding_functions.SentenceTransformerEmbeddingFunction()
|
||||
embeddings = [ef([doc])[0] for doc in data['documents']]
|
||||
embeddings = [ef([doc])[0] for doc in data["documents"]]
|
||||
collection.add(
|
||||
documents=data['documents'],
|
||||
metadatas=data['metadatas'],
|
||||
ids=data['ids'],
|
||||
embeddings=embeddings
|
||||
documents=data["documents"],
|
||||
metadatas=data["metadatas"],
|
||||
ids=data["ids"],
|
||||
embeddings=embeddings,
|
||||
)
|
||||
except ImportError:
|
||||
return {
|
||||
"success": False,
|
||||
"message": "sentence-transformers not installed. Run: pip install sentence-transformers"
|
||||
"message": "sentence-transformers not installed. Run: pip install sentence-transformers",
|
||||
}
|
||||
else:
|
||||
# No embeddings - Chroma will auto-generate
|
||||
print("🔄 Using Chroma's default embedding function...")
|
||||
collection.add(
|
||||
documents=data['documents'],
|
||||
metadatas=data['metadatas'],
|
||||
ids=data['ids']
|
||||
documents=data["documents"], metadatas=data["metadatas"], ids=data["ids"]
|
||||
)
|
||||
|
||||
count = len(data['documents'])
|
||||
count = len(data["documents"])
|
||||
print(f"✅ Uploaded {count} documents to ChromaDB")
|
||||
print(f"📊 Collection '{collection_name}' now has {collection.count()} total documents")
|
||||
|
||||
@@ -355,19 +346,14 @@ class ChromaAdaptor(SkillAdaptor):
|
||||
"message": f"Uploaded {count} documents to ChromaDB collection '{collection_name}'",
|
||||
"collection": collection_name,
|
||||
"count": count,
|
||||
"url": f"{chroma_url}/collections/{collection_name}" if chroma_url else None
|
||||
"url": f"{chroma_url}/collections/{collection_name}" if chroma_url else None,
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
return {
|
||||
"success": False,
|
||||
"message": f"Upload failed: {e}"
|
||||
}
|
||||
return {"success": False, "message": f"Upload failed: {e}"}
|
||||
|
||||
def _generate_openai_embeddings(
|
||||
self,
|
||||
documents: list[str],
|
||||
api_key: str = None
|
||||
self, documents: list[str], api_key: str = None
|
||||
) -> list[list[float]]:
|
||||
"""
|
||||
Generate embeddings using OpenAI API.
|
||||
@@ -380,12 +366,13 @@ class ChromaAdaptor(SkillAdaptor):
|
||||
List of embedding vectors
|
||||
"""
|
||||
import os
|
||||
|
||||
try:
|
||||
from openai import OpenAI
|
||||
except ImportError:
|
||||
raise ImportError("openai not installed. Run: pip install openai") from None
|
||||
|
||||
api_key = api_key or os.getenv('OPENAI_API_KEY')
|
||||
api_key = api_key or os.getenv("OPENAI_API_KEY")
|
||||
if not api_key:
|
||||
raise ValueError("OPENAI_API_KEY not set. Set via env var or --openai-api-key")
|
||||
|
||||
@@ -398,14 +385,14 @@ class ChromaAdaptor(SkillAdaptor):
|
||||
print(f" Generating embeddings for {len(documents)} documents...")
|
||||
|
||||
for i in range(0, len(documents), batch_size):
|
||||
batch = documents[i:i+batch_size]
|
||||
batch = documents[i : i + batch_size]
|
||||
try:
|
||||
response = client.embeddings.create(
|
||||
input=batch,
|
||||
model="text-embedding-3-small" # Cheapest, fastest
|
||||
model="text-embedding-3-small", # Cheapest, fastest
|
||||
)
|
||||
embeddings.extend([item.embedding for item in response.data])
|
||||
print(f" ✓ Processed {min(i+batch_size, len(documents))}/{len(documents)}")
|
||||
print(f" ✓ Processed {min(i + batch_size, len(documents))}/{len(documents)}")
|
||||
except Exception as e:
|
||||
raise Exception(f"OpenAI embedding generation failed: {e}") from e
|
||||
|
||||
|
||||
@@ -81,7 +81,14 @@ version: {metadata.version}
|
||||
{content_body}
|
||||
"""
|
||||
|
||||
def package(self, skill_dir: Path, output_path: Path, enable_chunking: bool = False, chunk_max_tokens: int = 512, preserve_code_blocks: bool = True) -> Path:
|
||||
def package(
|
||||
self,
|
||||
skill_dir: Path,
|
||||
output_path: Path,
|
||||
enable_chunking: bool = False,
|
||||
chunk_max_tokens: int = 512,
|
||||
preserve_code_blocks: bool = True,
|
||||
) -> Path:
|
||||
"""
|
||||
Package skill into ZIP file for Claude.
|
||||
|
||||
|
||||
@@ -46,11 +46,7 @@ class FAISSHelpers(SkillAdaptor):
|
||||
return self._generate_deterministic_id(content, metadata, format="hex")
|
||||
|
||||
def format_skill_md(
|
||||
self,
|
||||
skill_dir: Path,
|
||||
metadata: SkillMetadata,
|
||||
enable_chunking: bool = False,
|
||||
**kwargs
|
||||
self, skill_dir: Path, metadata: SkillMetadata, enable_chunking: bool = False, **kwargs
|
||||
) -> str:
|
||||
"""
|
||||
Format skill as JSON for FAISS ingestion.
|
||||
@@ -92,9 +88,9 @@ class FAISSHelpers(SkillAdaptor):
|
||||
content,
|
||||
doc_metadata,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
|
||||
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
|
||||
source_file="SKILL.md"
|
||||
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
|
||||
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
|
||||
source_file="SKILL.md",
|
||||
)
|
||||
|
||||
# Add all chunks to parallel arrays
|
||||
@@ -121,9 +117,9 @@ class FAISSHelpers(SkillAdaptor):
|
||||
ref_content,
|
||||
doc_metadata,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
|
||||
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
|
||||
source_file=ref_file.name
|
||||
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
|
||||
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
|
||||
source_file=ref_file.name,
|
||||
)
|
||||
|
||||
# Add all chunks to parallel arrays
|
||||
@@ -160,7 +156,7 @@ class FAISSHelpers(SkillAdaptor):
|
||||
output_path: Path,
|
||||
enable_chunking: bool = False,
|
||||
chunk_max_tokens: int = 512,
|
||||
preserve_code_blocks: bool = True
|
||||
preserve_code_blocks: bool = True,
|
||||
) -> Path:
|
||||
"""
|
||||
Package skill into JSON file for FAISS.
|
||||
@@ -193,7 +189,7 @@ class FAISSHelpers(SkillAdaptor):
|
||||
metadata,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=chunk_max_tokens,
|
||||
preserve_code_blocks=preserve_code_blocks
|
||||
preserve_code_blocks=preserve_code_blocks,
|
||||
)
|
||||
|
||||
# Write to file
|
||||
|
||||
@@ -86,7 +86,14 @@ See the references directory for complete documentation with examples and best p
|
||||
# Return plain markdown (NO frontmatter)
|
||||
return content_body
|
||||
|
||||
def package(self, skill_dir: Path, output_path: Path, enable_chunking: bool = False, chunk_max_tokens: int = 512, preserve_code_blocks: bool = True) -> Path:
|
||||
def package(
|
||||
self,
|
||||
skill_dir: Path,
|
||||
output_path: Path,
|
||||
enable_chunking: bool = False,
|
||||
chunk_max_tokens: int = 512,
|
||||
preserve_code_blocks: bool = True,
|
||||
) -> Path:
|
||||
"""
|
||||
Package skill into tar.gz file for Gemini.
|
||||
|
||||
|
||||
@@ -29,11 +29,7 @@ class HaystackAdaptor(SkillAdaptor):
|
||||
DEFAULT_API_ENDPOINT = None # No upload endpoint
|
||||
|
||||
def format_skill_md(
|
||||
self,
|
||||
skill_dir: Path,
|
||||
metadata: SkillMetadata,
|
||||
enable_chunking: bool = False,
|
||||
**kwargs
|
||||
self, skill_dir: Path, metadata: SkillMetadata, enable_chunking: bool = False, **kwargs
|
||||
) -> str:
|
||||
"""
|
||||
Format skill as JSON array of Haystack Documents.
|
||||
@@ -73,17 +69,19 @@ class HaystackAdaptor(SkillAdaptor):
|
||||
content,
|
||||
doc_meta,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
|
||||
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
|
||||
source_file="SKILL.md"
|
||||
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
|
||||
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
|
||||
source_file="SKILL.md",
|
||||
)
|
||||
|
||||
# Add all chunks as documents
|
||||
for chunk_text, chunk_meta in chunks:
|
||||
documents.append({
|
||||
"content": chunk_text,
|
||||
"meta": chunk_meta,
|
||||
})
|
||||
documents.append(
|
||||
{
|
||||
"content": chunk_text,
|
||||
"meta": chunk_meta,
|
||||
}
|
||||
)
|
||||
|
||||
# Convert all reference files using base helper method
|
||||
for ref_file, ref_content in self._iterate_references(skill_dir):
|
||||
@@ -104,17 +102,19 @@ class HaystackAdaptor(SkillAdaptor):
|
||||
ref_content,
|
||||
doc_meta,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
|
||||
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
|
||||
source_file=ref_file.name
|
||||
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
|
||||
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
|
||||
source_file=ref_file.name,
|
||||
)
|
||||
|
||||
# Add all chunks as documents
|
||||
for chunk_text, chunk_meta in chunks:
|
||||
documents.append({
|
||||
"content": chunk_text,
|
||||
"meta": chunk_meta,
|
||||
})
|
||||
documents.append(
|
||||
{
|
||||
"content": chunk_text,
|
||||
"meta": chunk_meta,
|
||||
}
|
||||
)
|
||||
|
||||
# Return as formatted JSON
|
||||
return json.dumps(documents, indent=2, ensure_ascii=False)
|
||||
@@ -125,7 +125,7 @@ class HaystackAdaptor(SkillAdaptor):
|
||||
output_path: Path,
|
||||
enable_chunking: bool = False,
|
||||
chunk_max_tokens: int = 512,
|
||||
preserve_code_blocks: bool = True
|
||||
preserve_code_blocks: bool = True,
|
||||
) -> Path:
|
||||
"""
|
||||
Package skill into JSON file for Haystack.
|
||||
@@ -159,7 +159,7 @@ class HaystackAdaptor(SkillAdaptor):
|
||||
metadata,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=chunk_max_tokens,
|
||||
preserve_code_blocks=preserve_code_blocks
|
||||
preserve_code_blocks=preserve_code_blocks,
|
||||
)
|
||||
|
||||
# Write to file
|
||||
|
||||
@@ -29,11 +29,7 @@ class LangChainAdaptor(SkillAdaptor):
|
||||
DEFAULT_API_ENDPOINT = None # No upload endpoint
|
||||
|
||||
def format_skill_md(
|
||||
self,
|
||||
skill_dir: Path,
|
||||
metadata: SkillMetadata,
|
||||
enable_chunking: bool = False,
|
||||
**kwargs
|
||||
self, skill_dir: Path, metadata: SkillMetadata, enable_chunking: bool = False, **kwargs
|
||||
) -> str:
|
||||
"""
|
||||
Format skill as JSON array of LangChain Documents.
|
||||
@@ -73,17 +69,14 @@ class LangChainAdaptor(SkillAdaptor):
|
||||
content,
|
||||
doc_metadata,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
|
||||
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
|
||||
source_file="SKILL.md"
|
||||
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
|
||||
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
|
||||
source_file="SKILL.md",
|
||||
)
|
||||
|
||||
# Add all chunks to documents
|
||||
for chunk_text, chunk_meta in chunks:
|
||||
documents.append({
|
||||
"page_content": chunk_text,
|
||||
"metadata": chunk_meta
|
||||
})
|
||||
documents.append({"page_content": chunk_text, "metadata": chunk_meta})
|
||||
|
||||
# Convert all reference files using base helper method
|
||||
for ref_file, ref_content in self._iterate_references(skill_dir):
|
||||
@@ -104,17 +97,14 @@ class LangChainAdaptor(SkillAdaptor):
|
||||
ref_content,
|
||||
doc_metadata,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
|
||||
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
|
||||
source_file=ref_file.name
|
||||
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
|
||||
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
|
||||
source_file=ref_file.name,
|
||||
)
|
||||
|
||||
# Add all chunks to documents
|
||||
for chunk_text, chunk_meta in chunks:
|
||||
documents.append({
|
||||
"page_content": chunk_text,
|
||||
"metadata": chunk_meta
|
||||
})
|
||||
documents.append({"page_content": chunk_text, "metadata": chunk_meta})
|
||||
|
||||
# Return as formatted JSON
|
||||
return json.dumps(documents, indent=2, ensure_ascii=False)
|
||||
@@ -125,7 +115,7 @@ class LangChainAdaptor(SkillAdaptor):
|
||||
output_path: Path,
|
||||
enable_chunking: bool = False,
|
||||
chunk_max_tokens: int = 512,
|
||||
preserve_code_blocks: bool = True
|
||||
preserve_code_blocks: bool = True,
|
||||
) -> Path:
|
||||
"""
|
||||
Package skill into JSON file for LangChain.
|
||||
@@ -162,7 +152,7 @@ class LangChainAdaptor(SkillAdaptor):
|
||||
metadata,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=chunk_max_tokens,
|
||||
preserve_code_blocks=preserve_code_blocks
|
||||
preserve_code_blocks=preserve_code_blocks,
|
||||
)
|
||||
|
||||
# Write to file
|
||||
|
||||
@@ -42,11 +42,7 @@ class LlamaIndexAdaptor(SkillAdaptor):
|
||||
return self._generate_deterministic_id(content, metadata, format="hex")
|
||||
|
||||
def format_skill_md(
|
||||
self,
|
||||
skill_dir: Path,
|
||||
metadata: SkillMetadata,
|
||||
enable_chunking: bool = False,
|
||||
**kwargs
|
||||
self, skill_dir: Path, metadata: SkillMetadata, enable_chunking: bool = False, **kwargs
|
||||
) -> str:
|
||||
"""
|
||||
Format skill as JSON array of LlamaIndex Nodes.
|
||||
@@ -88,19 +84,21 @@ class LlamaIndexAdaptor(SkillAdaptor):
|
||||
content,
|
||||
node_metadata,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
|
||||
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
|
||||
source_file="SKILL.md"
|
||||
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
|
||||
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
|
||||
source_file="SKILL.md",
|
||||
)
|
||||
|
||||
# Add all chunks as nodes
|
||||
for chunk_text, chunk_meta in chunks:
|
||||
nodes.append({
|
||||
"text": chunk_text,
|
||||
"metadata": chunk_meta,
|
||||
"id_": self._generate_node_id(chunk_text, chunk_meta),
|
||||
"embedding": None,
|
||||
})
|
||||
nodes.append(
|
||||
{
|
||||
"text": chunk_text,
|
||||
"metadata": chunk_meta,
|
||||
"id_": self._generate_node_id(chunk_text, chunk_meta),
|
||||
"embedding": None,
|
||||
}
|
||||
)
|
||||
|
||||
# Convert all reference files using base helper method
|
||||
for ref_file, ref_content in self._iterate_references(skill_dir):
|
||||
@@ -121,19 +119,21 @@ class LlamaIndexAdaptor(SkillAdaptor):
|
||||
ref_content,
|
||||
node_metadata,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
|
||||
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
|
||||
source_file=ref_file.name
|
||||
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
|
||||
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
|
||||
source_file=ref_file.name,
|
||||
)
|
||||
|
||||
# Add all chunks as nodes
|
||||
for chunk_text, chunk_meta in chunks:
|
||||
nodes.append({
|
||||
"text": chunk_text,
|
||||
"metadata": chunk_meta,
|
||||
"id_": self._generate_node_id(chunk_text, chunk_meta),
|
||||
"embedding": None,
|
||||
})
|
||||
nodes.append(
|
||||
{
|
||||
"text": chunk_text,
|
||||
"metadata": chunk_meta,
|
||||
"id_": self._generate_node_id(chunk_text, chunk_meta),
|
||||
"embedding": None,
|
||||
}
|
||||
)
|
||||
|
||||
# Return as formatted JSON
|
||||
return json.dumps(nodes, indent=2, ensure_ascii=False)
|
||||
@@ -144,7 +144,7 @@ class LlamaIndexAdaptor(SkillAdaptor):
|
||||
output_path: Path,
|
||||
enable_chunking: bool = False,
|
||||
chunk_max_tokens: int = 512,
|
||||
preserve_code_blocks: bool = True
|
||||
preserve_code_blocks: bool = True,
|
||||
) -> Path:
|
||||
"""
|
||||
Package skill into JSON file for LlamaIndex.
|
||||
@@ -178,7 +178,7 @@ class LlamaIndexAdaptor(SkillAdaptor):
|
||||
metadata,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=chunk_max_tokens,
|
||||
preserve_code_blocks=preserve_code_blocks
|
||||
preserve_code_blocks=preserve_code_blocks,
|
||||
)
|
||||
|
||||
# Write to file
|
||||
|
||||
@@ -81,7 +81,14 @@ Browse the reference files for detailed information on each topic. All files are
|
||||
# Return pure markdown (no frontmatter, no special formatting)
|
||||
return content_body
|
||||
|
||||
def package(self, skill_dir: Path, output_path: Path, enable_chunking: bool = False, chunk_max_tokens: int = 512, preserve_code_blocks: bool = True) -> Path:
|
||||
def package(
|
||||
self,
|
||||
skill_dir: Path,
|
||||
output_path: Path,
|
||||
enable_chunking: bool = False,
|
||||
chunk_max_tokens: int = 512,
|
||||
preserve_code_blocks: bool = True,
|
||||
) -> Path:
|
||||
"""
|
||||
Package skill into ZIP file with markdown documentation.
|
||||
|
||||
|
||||
@@ -103,7 +103,14 @@ Always prioritize accuracy by consulting the attached documentation files before
|
||||
# Return plain text instructions (NO frontmatter)
|
||||
return content_body
|
||||
|
||||
def package(self, skill_dir: Path, output_path: Path, enable_chunking: bool = False, chunk_max_tokens: int = 512, preserve_code_blocks: bool = True) -> Path:
|
||||
def package(
|
||||
self,
|
||||
skill_dir: Path,
|
||||
output_path: Path,
|
||||
enable_chunking: bool = False,
|
||||
chunk_max_tokens: int = 512,
|
||||
preserve_code_blocks: bool = True,
|
||||
) -> Path:
|
||||
"""
|
||||
Package skill into ZIP file for OpenAI Assistants.
|
||||
|
||||
|
||||
@@ -44,11 +44,7 @@ class QdrantAdaptor(SkillAdaptor):
|
||||
return self._generate_deterministic_id(content, metadata, format="uuid5")
|
||||
|
||||
def format_skill_md(
|
||||
self,
|
||||
skill_dir: Path,
|
||||
metadata: SkillMetadata,
|
||||
enable_chunking: bool = False,
|
||||
**kwargs
|
||||
self, skill_dir: Path, metadata: SkillMetadata, enable_chunking: bool = False, **kwargs
|
||||
) -> str:
|
||||
"""
|
||||
Format skill as Qdrant collection JSON.
|
||||
@@ -87,30 +83,35 @@ class QdrantAdaptor(SkillAdaptor):
|
||||
content,
|
||||
payload_meta,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
|
||||
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
|
||||
source_file="SKILL.md"
|
||||
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
|
||||
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
|
||||
source_file="SKILL.md",
|
||||
)
|
||||
|
||||
# Add all chunks as points
|
||||
for chunk_text, chunk_meta in chunks:
|
||||
point_id = self._generate_point_id(chunk_text, {
|
||||
"source": chunk_meta.get("source", metadata.name),
|
||||
"file": chunk_meta.get("file", "SKILL.md")
|
||||
})
|
||||
|
||||
points.append({
|
||||
"id": point_id,
|
||||
"vector": None, # User will generate embeddings
|
||||
"payload": {
|
||||
"content": chunk_text,
|
||||
point_id = self._generate_point_id(
|
||||
chunk_text,
|
||||
{
|
||||
"source": chunk_meta.get("source", metadata.name),
|
||||
"category": chunk_meta.get("category", "overview"),
|
||||
"file": chunk_meta.get("file", "SKILL.md"),
|
||||
"type": chunk_meta.get("type", "documentation"),
|
||||
"version": chunk_meta.get("version", metadata.version),
|
||||
},
|
||||
)
|
||||
|
||||
points.append(
|
||||
{
|
||||
"id": point_id,
|
||||
"vector": None, # User will generate embeddings
|
||||
"payload": {
|
||||
"content": chunk_text,
|
||||
"source": chunk_meta.get("source", metadata.name),
|
||||
"category": chunk_meta.get("category", "overview"),
|
||||
"file": chunk_meta.get("file", "SKILL.md"),
|
||||
"type": chunk_meta.get("type", "documentation"),
|
||||
"version": chunk_meta.get("version", metadata.version),
|
||||
},
|
||||
}
|
||||
})
|
||||
)
|
||||
|
||||
# Convert all reference files using base helper method
|
||||
for ref_file, ref_content in self._iterate_references(skill_dir):
|
||||
@@ -130,30 +131,35 @@ class QdrantAdaptor(SkillAdaptor):
|
||||
ref_content,
|
||||
payload_meta,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
|
||||
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
|
||||
source_file=ref_file.name
|
||||
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
|
||||
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
|
||||
source_file=ref_file.name,
|
||||
)
|
||||
|
||||
# Add all chunks as points
|
||||
for chunk_text, chunk_meta in chunks:
|
||||
point_id = self._generate_point_id(chunk_text, {
|
||||
"source": chunk_meta.get("source", metadata.name),
|
||||
"file": chunk_meta.get("file", ref_file.name)
|
||||
})
|
||||
|
||||
points.append({
|
||||
"id": point_id,
|
||||
"vector": None, # User will generate embeddings
|
||||
"payload": {
|
||||
"content": chunk_text,
|
||||
point_id = self._generate_point_id(
|
||||
chunk_text,
|
||||
{
|
||||
"source": chunk_meta.get("source", metadata.name),
|
||||
"category": chunk_meta.get("category", category),
|
||||
"file": chunk_meta.get("file", ref_file.name),
|
||||
"type": chunk_meta.get("type", "reference"),
|
||||
"version": chunk_meta.get("version", metadata.version),
|
||||
},
|
||||
)
|
||||
|
||||
points.append(
|
||||
{
|
||||
"id": point_id,
|
||||
"vector": None, # User will generate embeddings
|
||||
"payload": {
|
||||
"content": chunk_text,
|
||||
"source": chunk_meta.get("source", metadata.name),
|
||||
"category": chunk_meta.get("category", category),
|
||||
"file": chunk_meta.get("file", ref_file.name),
|
||||
"type": chunk_meta.get("type", "reference"),
|
||||
"version": chunk_meta.get("version", metadata.version),
|
||||
},
|
||||
}
|
||||
})
|
||||
)
|
||||
|
||||
# Qdrant configuration
|
||||
config = {
|
||||
@@ -184,7 +190,7 @@ class QdrantAdaptor(SkillAdaptor):
|
||||
output_path: Path,
|
||||
enable_chunking: bool = False,
|
||||
chunk_max_tokens: int = 512,
|
||||
preserve_code_blocks: bool = True
|
||||
preserve_code_blocks: bool = True,
|
||||
) -> Path:
|
||||
"""
|
||||
Package skill into JSON file for Qdrant.
|
||||
@@ -217,7 +223,7 @@ class QdrantAdaptor(SkillAdaptor):
|
||||
metadata,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=chunk_max_tokens,
|
||||
preserve_code_blocks=preserve_code_blocks
|
||||
preserve_code_blocks=preserve_code_blocks,
|
||||
)
|
||||
|
||||
# Write to file
|
||||
|
||||
@@ -36,7 +36,7 @@ class StreamingAdaptorMixin:
|
||||
chunk_size: int = 4000,
|
||||
chunk_overlap: int = 200,
|
||||
batch_size: int = 100,
|
||||
progress_callback: callable | None = None
|
||||
progress_callback: callable | None = None,
|
||||
) -> Path:
|
||||
"""
|
||||
Package skill using streaming ingestion.
|
||||
@@ -60,9 +60,7 @@ class StreamingAdaptorMixin:
|
||||
|
||||
# Initialize streaming ingester
|
||||
ingester = StreamingIngester(
|
||||
chunk_size=chunk_size,
|
||||
chunk_overlap=chunk_overlap,
|
||||
batch_size=batch_size
|
||||
chunk_size=chunk_size, chunk_overlap=chunk_overlap, batch_size=batch_size
|
||||
)
|
||||
|
||||
print(f"\n📊 Streaming ingestion starting...")
|
||||
@@ -77,9 +75,11 @@ class StreamingAdaptorMixin:
|
||||
nonlocal last_update
|
||||
# Update every 10 chunks
|
||||
if progress.processed_chunks - last_update >= 10:
|
||||
print(f" {progress.progress_percent:.1f}% - "
|
||||
f"{progress.processed_chunks}/{progress.total_chunks} chunks "
|
||||
f"({progress.chunks_per_second:.1f} chunks/sec)")
|
||||
print(
|
||||
f" {progress.progress_percent:.1f}% - "
|
||||
f"{progress.processed_chunks}/{progress.total_chunks} chunks "
|
||||
f"({progress.chunks_per_second:.1f} chunks/sec)"
|
||||
)
|
||||
last_update = progress.processed_chunks
|
||||
|
||||
if progress_callback:
|
||||
@@ -97,10 +97,7 @@ class StreamingAdaptorMixin:
|
||||
|
||||
# Convert chunks to platform format
|
||||
print(f"\n📦 Converting to {self.PLATFORM_NAME} format...")
|
||||
package_data = self._convert_chunks_to_platform_format(
|
||||
all_chunks,
|
||||
skill_dir.name
|
||||
)
|
||||
package_data = self._convert_chunks_to_platform_format(all_chunks, skill_dir.name)
|
||||
|
||||
# Determine output filename
|
||||
if output_path.is_dir() or str(output_path).endswith("/"):
|
||||
@@ -114,8 +111,7 @@ class StreamingAdaptorMixin:
|
||||
# Write output
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
output_path.write_text(
|
||||
json.dumps(package_data, indent=2, ensure_ascii=False),
|
||||
encoding="utf-8"
|
||||
json.dumps(package_data, indent=2, ensure_ascii=False), encoding="utf-8"
|
||||
)
|
||||
|
||||
print(f"✅ Package created: {output_path}")
|
||||
@@ -124,9 +120,7 @@ class StreamingAdaptorMixin:
|
||||
return output_path
|
||||
|
||||
def _convert_chunks_to_platform_format(
|
||||
self,
|
||||
chunks: list[tuple[str, dict]],
|
||||
skill_name: str
|
||||
self, chunks: list[tuple[str, dict]], skill_name: str
|
||||
) -> dict:
|
||||
"""
|
||||
Convert chunks to platform-specific format.
|
||||
@@ -156,14 +150,11 @@ class StreamingAdaptorMixin:
|
||||
"metadatas": metadatas,
|
||||
"ids": ids,
|
||||
"total_chunks": len(chunks),
|
||||
"streaming": True
|
||||
"streaming": True,
|
||||
}
|
||||
|
||||
def estimate_chunks(
|
||||
self,
|
||||
skill_dir: Path,
|
||||
chunk_size: int = 4000,
|
||||
chunk_overlap: int = 200
|
||||
self, skill_dir: Path, chunk_size: int = 4000, chunk_overlap: int = 200
|
||||
) -> dict[str, Any]:
|
||||
"""
|
||||
Estimate chunking for a skill directory.
|
||||
@@ -179,10 +170,7 @@ class StreamingAdaptorMixin:
|
||||
Estimation statistics
|
||||
"""
|
||||
skill_dir = Path(skill_dir)
|
||||
StreamingIngester(
|
||||
chunk_size=chunk_size,
|
||||
chunk_overlap=chunk_overlap
|
||||
)
|
||||
StreamingIngester(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
|
||||
|
||||
# Count files and estimate chunks
|
||||
total_docs = 0
|
||||
@@ -201,11 +189,9 @@ class StreamingAdaptorMixin:
|
||||
total_chars += char_count
|
||||
estimated_chunks += chunk_count
|
||||
|
||||
file_stats.append({
|
||||
"file": "SKILL.md",
|
||||
"chars": char_count,
|
||||
"estimated_chunks": chunk_count
|
||||
})
|
||||
file_stats.append(
|
||||
{"file": "SKILL.md", "chars": char_count, "estimated_chunks": chunk_count}
|
||||
)
|
||||
|
||||
# Reference files
|
||||
refs_dir = skill_dir / "references"
|
||||
@@ -214,17 +200,21 @@ class StreamingAdaptorMixin:
|
||||
if ref_file.is_file() and not ref_file.name.startswith("."):
|
||||
content = ref_file.read_text(encoding="utf-8")
|
||||
char_count = len(content)
|
||||
chunk_count = max(1, (char_count - chunk_overlap) // (chunk_size - chunk_overlap) + 1)
|
||||
chunk_count = max(
|
||||
1, (char_count - chunk_overlap) // (chunk_size - chunk_overlap) + 1
|
||||
)
|
||||
|
||||
total_docs += 1
|
||||
total_chars += char_count
|
||||
estimated_chunks += chunk_count
|
||||
|
||||
file_stats.append({
|
||||
"file": ref_file.name,
|
||||
"chars": char_count,
|
||||
"estimated_chunks": chunk_count
|
||||
})
|
||||
file_stats.append(
|
||||
{
|
||||
"file": ref_file.name,
|
||||
"chars": char_count,
|
||||
"estimated_chunks": chunk_count,
|
||||
}
|
||||
)
|
||||
|
||||
return {
|
||||
"skill_name": skill_dir.name,
|
||||
@@ -235,7 +225,7 @@ class StreamingAdaptorMixin:
|
||||
"chunk_overlap": chunk_overlap,
|
||||
"file_stats": file_stats,
|
||||
"estimated_memory_mb": (total_chars * 2) / (1024 * 1024), # UTF-8 estimate
|
||||
"recommended_streaming": total_chars > 1_000_000 or total_docs > 100
|
||||
"recommended_streaming": total_chars > 1_000_000 or total_docs > 100,
|
||||
}
|
||||
|
||||
|
||||
@@ -251,25 +241,27 @@ class StreamingLangChainAdaptor(StreamingAdaptorMixin):
|
||||
documents = []
|
||||
|
||||
for chunk_text, chunk_meta in chunks:
|
||||
documents.append({
|
||||
"page_content": chunk_text,
|
||||
"metadata": {
|
||||
"source": chunk_meta["source"],
|
||||
"category": chunk_meta["category"],
|
||||
"file": chunk_meta["file"],
|
||||
"chunk_id": chunk_meta["chunk_id"],
|
||||
"chunk_index": chunk_meta["chunk_index"],
|
||||
"total_chunks": chunk_meta["total_chunks"],
|
||||
"type": chunk_meta.get("type", "documentation"),
|
||||
"version": chunk_meta.get("version", "1.0.0"),
|
||||
documents.append(
|
||||
{
|
||||
"page_content": chunk_text,
|
||||
"metadata": {
|
||||
"source": chunk_meta["source"],
|
||||
"category": chunk_meta["category"],
|
||||
"file": chunk_meta["file"],
|
||||
"chunk_id": chunk_meta["chunk_id"],
|
||||
"chunk_index": chunk_meta["chunk_index"],
|
||||
"total_chunks": chunk_meta["total_chunks"],
|
||||
"type": chunk_meta.get("type", "documentation"),
|
||||
"version": chunk_meta.get("version", "1.0.0"),
|
||||
},
|
||||
}
|
||||
})
|
||||
)
|
||||
|
||||
return {
|
||||
"documents": documents,
|
||||
"total_chunks": len(chunks),
|
||||
"streaming": True,
|
||||
"format": "LangChain Document"
|
||||
"format": "LangChain Document",
|
||||
}
|
||||
|
||||
|
||||
@@ -287,14 +279,16 @@ class StreamingChromaAdaptor(StreamingAdaptorMixin):
|
||||
|
||||
for chunk_text, chunk_meta in chunks:
|
||||
documents.append(chunk_text)
|
||||
metadatas.append({
|
||||
"source": chunk_meta["source"],
|
||||
"category": chunk_meta["category"],
|
||||
"file": chunk_meta["file"],
|
||||
"chunk_index": chunk_meta["chunk_index"],
|
||||
"total_chunks": chunk_meta["total_chunks"],
|
||||
"type": chunk_meta.get("type", "documentation"),
|
||||
})
|
||||
metadatas.append(
|
||||
{
|
||||
"source": chunk_meta["source"],
|
||||
"category": chunk_meta["category"],
|
||||
"file": chunk_meta["file"],
|
||||
"chunk_index": chunk_meta["chunk_index"],
|
||||
"total_chunks": chunk_meta["total_chunks"],
|
||||
"type": chunk_meta.get("type", "documentation"),
|
||||
}
|
||||
)
|
||||
ids.append(chunk_meta["chunk_id"])
|
||||
|
||||
return {
|
||||
@@ -303,7 +297,7 @@ class StreamingChromaAdaptor(StreamingAdaptorMixin):
|
||||
"ids": ids,
|
||||
"collection_name": skill_name.replace("_", "-"),
|
||||
"total_chunks": len(chunks),
|
||||
"streaming": True
|
||||
"streaming": True,
|
||||
}
|
||||
|
||||
|
||||
@@ -339,11 +333,7 @@ def demo_streaming():
|
||||
print("=" * 60)
|
||||
|
||||
output = adaptor.package_streaming(
|
||||
skill_dir,
|
||||
Path("output"),
|
||||
chunk_size=2000,
|
||||
chunk_overlap=100,
|
||||
batch_size=50
|
||||
skill_dir, Path("output"), chunk_size=2000, chunk_overlap=100, batch_size=50
|
||||
)
|
||||
|
||||
print(f"\n✅ Complete! Output: {output}")
|
||||
|
||||
@@ -104,11 +104,7 @@ class WeaviateAdaptor(SkillAdaptor):
|
||||
}
|
||||
|
||||
def format_skill_md(
|
||||
self,
|
||||
skill_dir: Path,
|
||||
metadata: SkillMetadata,
|
||||
enable_chunking: bool = False,
|
||||
**kwargs
|
||||
self, skill_dir: Path, metadata: SkillMetadata, enable_chunking: bool = False, **kwargs
|
||||
) -> str:
|
||||
"""
|
||||
Format skill as JSON for Weaviate ingestion.
|
||||
@@ -148,24 +144,26 @@ class WeaviateAdaptor(SkillAdaptor):
|
||||
content,
|
||||
obj_metadata,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
|
||||
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
|
||||
source_file="SKILL.md"
|
||||
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
|
||||
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
|
||||
source_file="SKILL.md",
|
||||
)
|
||||
|
||||
# Add all chunks as objects
|
||||
for chunk_text, chunk_meta in chunks:
|
||||
objects.append({
|
||||
"id": self._generate_uuid(chunk_text, chunk_meta),
|
||||
"properties": {
|
||||
"content": chunk_text,
|
||||
"source": chunk_meta.get("source", metadata.name),
|
||||
"category": chunk_meta.get("category", "overview"),
|
||||
"file": chunk_meta.get("file", "SKILL.md"),
|
||||
"type": chunk_meta.get("type", "documentation"),
|
||||
"version": chunk_meta.get("version", metadata.version),
|
||||
},
|
||||
})
|
||||
objects.append(
|
||||
{
|
||||
"id": self._generate_uuid(chunk_text, chunk_meta),
|
||||
"properties": {
|
||||
"content": chunk_text,
|
||||
"source": chunk_meta.get("source", metadata.name),
|
||||
"category": chunk_meta.get("category", "overview"),
|
||||
"file": chunk_meta.get("file", "SKILL.md"),
|
||||
"type": chunk_meta.get("type", "documentation"),
|
||||
"version": chunk_meta.get("version", metadata.version),
|
||||
},
|
||||
}
|
||||
)
|
||||
|
||||
# Convert all reference files using base helper method
|
||||
for ref_file, ref_content in self._iterate_references(skill_dir):
|
||||
@@ -186,24 +184,26 @@ class WeaviateAdaptor(SkillAdaptor):
|
||||
ref_content,
|
||||
obj_metadata,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=kwargs.get('chunk_max_tokens', 512),
|
||||
preserve_code_blocks=kwargs.get('preserve_code_blocks', True),
|
||||
source_file=ref_file.name
|
||||
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
|
||||
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
|
||||
source_file=ref_file.name,
|
||||
)
|
||||
|
||||
# Add all chunks as objects
|
||||
for chunk_text, chunk_meta in chunks:
|
||||
objects.append({
|
||||
"id": self._generate_uuid(chunk_text, chunk_meta),
|
||||
"properties": {
|
||||
"content": chunk_text,
|
||||
"source": chunk_meta.get("source", metadata.name),
|
||||
"category": chunk_meta.get("category", category),
|
||||
"file": chunk_meta.get("file", ref_file.name),
|
||||
"type": chunk_meta.get("type", "reference"),
|
||||
"version": chunk_meta.get("version", metadata.version),
|
||||
},
|
||||
})
|
||||
objects.append(
|
||||
{
|
||||
"id": self._generate_uuid(chunk_text, chunk_meta),
|
||||
"properties": {
|
||||
"content": chunk_text,
|
||||
"source": chunk_meta.get("source", metadata.name),
|
||||
"category": chunk_meta.get("category", category),
|
||||
"file": chunk_meta.get("file", ref_file.name),
|
||||
"type": chunk_meta.get("type", "reference"),
|
||||
"version": chunk_meta.get("version", metadata.version),
|
||||
},
|
||||
}
|
||||
)
|
||||
|
||||
# Generate schema
|
||||
class_name = "".join(word.capitalize() for word in metadata.name.split("_"))
|
||||
@@ -222,7 +222,7 @@ class WeaviateAdaptor(SkillAdaptor):
|
||||
output_path: Path,
|
||||
enable_chunking: bool = False,
|
||||
chunk_max_tokens: int = 512,
|
||||
preserve_code_blocks: bool = True
|
||||
preserve_code_blocks: bool = True,
|
||||
) -> Path:
|
||||
"""
|
||||
Package skill into JSON file for Weaviate.
|
||||
@@ -258,7 +258,7 @@ class WeaviateAdaptor(SkillAdaptor):
|
||||
metadata,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=chunk_max_tokens,
|
||||
preserve_code_blocks=preserve_code_blocks
|
||||
preserve_code_blocks=preserve_code_blocks,
|
||||
)
|
||||
|
||||
# Write to file
|
||||
@@ -310,7 +310,7 @@ class WeaviateAdaptor(SkillAdaptor):
|
||||
except ImportError:
|
||||
return {
|
||||
"success": False,
|
||||
"message": "weaviate-client not installed. Run: pip install weaviate-client"
|
||||
"message": "weaviate-client not installed. Run: pip install weaviate-client",
|
||||
}
|
||||
|
||||
# Load package
|
||||
@@ -319,16 +319,16 @@ class WeaviateAdaptor(SkillAdaptor):
|
||||
|
||||
# Connect to Weaviate
|
||||
try:
|
||||
if kwargs.get('use_cloud') and api_key:
|
||||
if kwargs.get("use_cloud") and api_key:
|
||||
# Weaviate Cloud
|
||||
print(f"🌐 Connecting to Weaviate Cloud: {kwargs.get('cluster_url')}")
|
||||
client = weaviate.Client(
|
||||
url=kwargs.get('cluster_url'),
|
||||
auth_client_secret=weaviate.AuthApiKey(api_key=api_key)
|
||||
url=kwargs.get("cluster_url"),
|
||||
auth_client_secret=weaviate.AuthApiKey(api_key=api_key),
|
||||
)
|
||||
else:
|
||||
# Local Weaviate instance
|
||||
weaviate_url = kwargs.get('weaviate_url', 'http://localhost:8080')
|
||||
weaviate_url = kwargs.get("weaviate_url", "http://localhost:8080")
|
||||
print(f"🌐 Connecting to Weaviate at: {weaviate_url}")
|
||||
client = weaviate.Client(url=weaviate_url)
|
||||
|
||||
@@ -336,69 +336,67 @@ class WeaviateAdaptor(SkillAdaptor):
|
||||
if not client.is_ready():
|
||||
return {
|
||||
"success": False,
|
||||
"message": "Weaviate server not ready. Make sure Weaviate is running:\n docker run -p 8080:8080 semitechnologies/weaviate:latest"
|
||||
"message": "Weaviate server not ready. Make sure Weaviate is running:\n docker run -p 8080:8080 semitechnologies/weaviate:latest",
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
return {
|
||||
"success": False,
|
||||
"message": f"Failed to connect to Weaviate: {e}\n\nMake sure Weaviate is running or provide correct credentials."
|
||||
"message": f"Failed to connect to Weaviate: {e}\n\nMake sure Weaviate is running or provide correct credentials.",
|
||||
}
|
||||
|
||||
# Create schema
|
||||
try:
|
||||
client.schema.create_class(data['schema'])
|
||||
client.schema.create_class(data["schema"])
|
||||
print(f"✅ Created schema: {data['class_name']}")
|
||||
except Exception as e:
|
||||
if "already exists" in str(e).lower():
|
||||
print(f"ℹ️ Schema already exists: {data['class_name']}")
|
||||
else:
|
||||
return {
|
||||
"success": False,
|
||||
"message": f"Schema creation failed: {e}"
|
||||
}
|
||||
return {"success": False, "message": f"Schema creation failed: {e}"}
|
||||
|
||||
# Handle embeddings
|
||||
embedding_function = kwargs.get('embedding_function')
|
||||
embedding_function = kwargs.get("embedding_function")
|
||||
|
||||
try:
|
||||
with client.batch as batch:
|
||||
batch.batch_size = 100
|
||||
|
||||
if embedding_function == 'openai':
|
||||
if embedding_function == "openai":
|
||||
# Generate embeddings with OpenAI
|
||||
print("🔄 Generating OpenAI embeddings and uploading...")
|
||||
embeddings = self._generate_openai_embeddings(
|
||||
[obj['properties']['content'] for obj in data['objects']],
|
||||
api_key=kwargs.get('openai_api_key')
|
||||
[obj["properties"]["content"] for obj in data["objects"]],
|
||||
api_key=kwargs.get("openai_api_key"),
|
||||
)
|
||||
|
||||
for i, obj in enumerate(data['objects']):
|
||||
for i, obj in enumerate(data["objects"]):
|
||||
batch.add_data_object(
|
||||
data_object=obj['properties'],
|
||||
class_name=data['class_name'],
|
||||
uuid=obj['id'],
|
||||
vector=embeddings[i]
|
||||
data_object=obj["properties"],
|
||||
class_name=data["class_name"],
|
||||
uuid=obj["id"],
|
||||
vector=embeddings[i],
|
||||
)
|
||||
|
||||
if (i + 1) % 100 == 0:
|
||||
print(f" ✓ Uploaded {i + 1}/{len(data['objects'])} objects")
|
||||
|
||||
elif embedding_function == 'sentence-transformers':
|
||||
elif embedding_function == "sentence-transformers":
|
||||
# Use sentence-transformers
|
||||
print("🔄 Generating sentence-transformer embeddings and uploading...")
|
||||
try:
|
||||
from sentence_transformers import SentenceTransformer
|
||||
model = SentenceTransformer('all-MiniLM-L6-v2')
|
||||
contents = [obj['properties']['content'] for obj in data['objects']]
|
||||
|
||||
model = SentenceTransformer("all-MiniLM-L6-v2")
|
||||
contents = [obj["properties"]["content"] for obj in data["objects"]]
|
||||
embeddings = model.encode(contents, show_progress_bar=True).tolist()
|
||||
|
||||
for i, obj in enumerate(data['objects']):
|
||||
for i, obj in enumerate(data["objects"]):
|
||||
batch.add_data_object(
|
||||
data_object=obj['properties'],
|
||||
class_name=data['class_name'],
|
||||
uuid=obj['id'],
|
||||
vector=embeddings[i]
|
||||
data_object=obj["properties"],
|
||||
class_name=data["class_name"],
|
||||
uuid=obj["id"],
|
||||
vector=embeddings[i],
|
||||
)
|
||||
|
||||
if (i + 1) % 100 == 0:
|
||||
@@ -407,42 +405,37 @@ class WeaviateAdaptor(SkillAdaptor):
|
||||
except ImportError:
|
||||
return {
|
||||
"success": False,
|
||||
"message": "sentence-transformers not installed. Run: pip install sentence-transformers"
|
||||
"message": "sentence-transformers not installed. Run: pip install sentence-transformers",
|
||||
}
|
||||
|
||||
else:
|
||||
# No embeddings - Weaviate will use its configured vectorizer
|
||||
print("🔄 Uploading objects (Weaviate will generate embeddings)...")
|
||||
for i, obj in enumerate(data['objects']):
|
||||
for i, obj in enumerate(data["objects"]):
|
||||
batch.add_data_object(
|
||||
data_object=obj['properties'],
|
||||
class_name=data['class_name'],
|
||||
uuid=obj['id']
|
||||
data_object=obj["properties"],
|
||||
class_name=data["class_name"],
|
||||
uuid=obj["id"],
|
||||
)
|
||||
|
||||
if (i + 1) % 100 == 0:
|
||||
print(f" ✓ Uploaded {i + 1}/{len(data['objects'])} objects")
|
||||
|
||||
count = len(data['objects'])
|
||||
count = len(data["objects"])
|
||||
print(f"✅ Upload complete! {count} objects added to Weaviate")
|
||||
|
||||
return {
|
||||
"success": True,
|
||||
"message": f"Uploaded {count} objects to Weaviate class '{data['class_name']}'",
|
||||
"class_name": data['class_name'],
|
||||
"count": count
|
||||
"class_name": data["class_name"],
|
||||
"count": count,
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
return {
|
||||
"success": False,
|
||||
"message": f"Upload failed: {e}"
|
||||
}
|
||||
return {"success": False, "message": f"Upload failed: {e}"}
|
||||
|
||||
def _generate_openai_embeddings(
|
||||
self,
|
||||
documents: list[str],
|
||||
api_key: str = None
|
||||
self, documents: list[str], api_key: str = None
|
||||
) -> list[list[float]]:
|
||||
"""
|
||||
Generate embeddings using OpenAI API.
|
||||
@@ -455,12 +448,13 @@ class WeaviateAdaptor(SkillAdaptor):
|
||||
List of embedding vectors
|
||||
"""
|
||||
import os
|
||||
|
||||
try:
|
||||
from openai import OpenAI
|
||||
except ImportError:
|
||||
raise ImportError("openai not installed. Run: pip install openai") from None
|
||||
|
||||
api_key = api_key or os.getenv('OPENAI_API_KEY')
|
||||
api_key = api_key or os.getenv("OPENAI_API_KEY")
|
||||
if not api_key:
|
||||
raise ValueError("OPENAI_API_KEY not set. Set via env var or --openai-api-key")
|
||||
|
||||
@@ -473,14 +467,16 @@ class WeaviateAdaptor(SkillAdaptor):
|
||||
print(f" Generating embeddings for {len(documents)} documents...")
|
||||
|
||||
for i in range(0, len(documents), batch_size):
|
||||
batch = documents[i:i+batch_size]
|
||||
batch = documents[i : i + batch_size]
|
||||
try:
|
||||
response = client.embeddings.create(
|
||||
input=batch,
|
||||
model="text-embedding-3-small" # Cheapest, fastest
|
||||
model="text-embedding-3-small", # Cheapest, fastest
|
||||
)
|
||||
embeddings.extend([item.embedding for item in response.data])
|
||||
print(f" ✓ Generated {min(i+batch_size, len(documents))}/{len(documents)} embeddings")
|
||||
print(
|
||||
f" ✓ Generated {min(i + batch_size, len(documents))}/{len(documents)} embeddings"
|
||||
)
|
||||
except Exception as e:
|
||||
raise Exception(f"OpenAI embedding generation failed: {e}") from e
|
||||
|
||||
|
||||
Reference in New Issue
Block a user