Merge branch 'development' into feature/video-scraper-pipeline
Sync with latest development changes including ruff formatting, bug fixes, and pinecone adaptor additions. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -64,6 +64,11 @@ try:
|
||||
except ImportError:
|
||||
HaystackAdaptor = None
|
||||
|
||||
try:
|
||||
from .pinecone_adaptor import PineconeAdaptor
|
||||
except ImportError:
|
||||
PineconeAdaptor = None
|
||||
|
||||
|
||||
# Registry of available adaptors
|
||||
ADAPTORS: dict[str, type[SkillAdaptor]] = {}
|
||||
@@ -91,6 +96,8 @@ if QdrantAdaptor:
|
||||
ADAPTORS["qdrant"] = QdrantAdaptor
|
||||
if HaystackAdaptor:
|
||||
ADAPTORS["haystack"] = HaystackAdaptor
|
||||
if PineconeAdaptor:
|
||||
ADAPTORS["pinecone"] = PineconeAdaptor
|
||||
|
||||
|
||||
def get_adaptor(platform: str, config: dict = None) -> SkillAdaptor:
|
||||
|
||||
@@ -11,6 +11,8 @@ from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from skill_seekers.cli.arguments.common import DEFAULT_CHUNK_TOKENS, DEFAULT_CHUNK_OVERLAP_TOKENS
|
||||
|
||||
|
||||
@dataclass
|
||||
class SkillMetadata:
|
||||
@@ -19,6 +21,7 @@ class SkillMetadata:
|
||||
name: str
|
||||
description: str
|
||||
version: str = "1.0.0"
|
||||
doc_version: str = "" # Documentation version (e.g., "16.2") for RAG metadata filtering
|
||||
author: str | None = None
|
||||
tags: list[str] = field(default_factory=list)
|
||||
|
||||
@@ -73,8 +76,9 @@ class SkillAdaptor(ABC):
|
||||
skill_dir: Path,
|
||||
output_path: Path,
|
||||
enable_chunking: bool = False,
|
||||
chunk_max_tokens: int = 512,
|
||||
chunk_max_tokens: int = DEFAULT_CHUNK_TOKENS,
|
||||
preserve_code_blocks: bool = True,
|
||||
chunk_overlap_tokens: int = DEFAULT_CHUNK_OVERLAP_TOKENS,
|
||||
) -> Path:
|
||||
"""
|
||||
Package skill for platform (ZIP, tar.gz, etc.).
|
||||
@@ -228,6 +232,47 @@ class SkillAdaptor(ABC):
|
||||
|
||||
return skill_md_path.read_text(encoding="utf-8")
|
||||
|
||||
def _read_frontmatter(self, skill_dir: Path) -> dict[str, str]:
|
||||
"""Read YAML frontmatter from SKILL.md.
|
||||
|
||||
Args:
|
||||
skill_dir: Path to skill directory
|
||||
|
||||
Returns:
|
||||
Dict of key-value pairs from the frontmatter block.
|
||||
"""
|
||||
content = self._read_skill_md(skill_dir)
|
||||
if content.startswith("---"):
|
||||
parts = content.split("---", 2)
|
||||
if len(parts) >= 3:
|
||||
frontmatter: dict[str, str] = {}
|
||||
for line in parts[1].strip().splitlines():
|
||||
if ":" in line:
|
||||
key, _, value = line.partition(":")
|
||||
frontmatter[key.strip()] = value.strip()
|
||||
return frontmatter
|
||||
return {}
|
||||
|
||||
def _build_skill_metadata(self, skill_dir: Path) -> SkillMetadata:
|
||||
"""Build SkillMetadata from SKILL.md frontmatter.
|
||||
|
||||
Reads name, description, version, and doc_version from frontmatter
|
||||
instead of using hardcoded defaults.
|
||||
|
||||
Args:
|
||||
skill_dir: Path to skill directory
|
||||
|
||||
Returns:
|
||||
SkillMetadata populated from frontmatter values.
|
||||
"""
|
||||
fm = self._read_frontmatter(skill_dir)
|
||||
return SkillMetadata(
|
||||
name=skill_dir.name,
|
||||
description=fm.get("description", f"Documentation for {skill_dir.name}"),
|
||||
version=fm.get("version", "1.0.0"),
|
||||
doc_version=fm.get("doc_version", ""),
|
||||
)
|
||||
|
||||
def _iterate_references(self, skill_dir: Path):
|
||||
"""
|
||||
Iterate over all reference files in skill directory.
|
||||
@@ -266,6 +311,7 @@ class SkillAdaptor(ABC):
|
||||
base_meta = {
|
||||
"source": metadata.name,
|
||||
"version": metadata.version,
|
||||
"doc_version": metadata.doc_version,
|
||||
"description": metadata.description,
|
||||
}
|
||||
if metadata.author:
|
||||
@@ -280,9 +326,10 @@ class SkillAdaptor(ABC):
|
||||
content: str,
|
||||
metadata: dict,
|
||||
enable_chunking: bool = False,
|
||||
chunk_max_tokens: int = 512,
|
||||
chunk_max_tokens: int = DEFAULT_CHUNK_TOKENS,
|
||||
preserve_code_blocks: bool = True,
|
||||
source_file: str = None,
|
||||
chunk_overlap_tokens: int = DEFAULT_CHUNK_OVERLAP_TOKENS,
|
||||
) -> list[tuple[str, dict]]:
|
||||
"""
|
||||
Optionally chunk content for RAG platforms.
|
||||
@@ -321,9 +368,18 @@ class SkillAdaptor(ABC):
|
||||
return [(content, metadata)]
|
||||
|
||||
# RAGChunker uses TOKENS (it converts to chars internally)
|
||||
# If overlap is at the default value but chunk size was customized,
|
||||
# scale overlap proportionally (10% of chunk size, min DEFAULT_CHUNK_OVERLAP_TOKENS)
|
||||
effective_overlap = chunk_overlap_tokens
|
||||
if (
|
||||
chunk_overlap_tokens == DEFAULT_CHUNK_OVERLAP_TOKENS
|
||||
and chunk_max_tokens != DEFAULT_CHUNK_TOKENS
|
||||
):
|
||||
effective_overlap = max(DEFAULT_CHUNK_OVERLAP_TOKENS, chunk_max_tokens // 10)
|
||||
|
||||
chunker = RAGChunker(
|
||||
chunk_size=chunk_max_tokens,
|
||||
chunk_overlap=max(50, chunk_max_tokens // 10), # 10% overlap
|
||||
chunk_overlap=effective_overlap,
|
||||
preserve_code_blocks=preserve_code_blocks,
|
||||
preserve_paragraphs=True,
|
||||
min_chunk_size=100, # 100 tokens minimum
|
||||
@@ -433,6 +489,67 @@ class SkillAdaptor(ABC):
|
||||
# Plain hex digest
|
||||
return hash_hex
|
||||
|
||||
def _generate_openai_embeddings(
|
||||
self, documents: list[str], api_key: str | None = None
|
||||
) -> list[list[float]]:
|
||||
"""Generate embeddings using OpenAI text-embedding-3-small.
|
||||
|
||||
Args:
|
||||
documents: List of document texts
|
||||
api_key: OpenAI API key (or uses OPENAI_API_KEY env var)
|
||||
|
||||
Returns:
|
||||
List of embedding vectors
|
||||
"""
|
||||
import os
|
||||
|
||||
try:
|
||||
from openai import OpenAI
|
||||
except ImportError:
|
||||
raise ImportError("openai not installed. Run: pip install openai") from None
|
||||
|
||||
api_key = api_key or os.getenv("OPENAI_API_KEY")
|
||||
if not api_key:
|
||||
raise ValueError("OPENAI_API_KEY not set. Set via env var or --openai-api-key")
|
||||
|
||||
client = OpenAI(api_key=api_key)
|
||||
embeddings: list[list[float]] = []
|
||||
batch_size = 100
|
||||
|
||||
print(f" Generating OpenAI embeddings for {len(documents)} documents...")
|
||||
|
||||
for i in range(0, len(documents), batch_size):
|
||||
batch = documents[i : i + batch_size]
|
||||
try:
|
||||
response = client.embeddings.create(input=batch, model="text-embedding-3-small")
|
||||
embeddings.extend([item.embedding for item in response.data])
|
||||
print(f" ✓ Embedded {min(i + batch_size, len(documents))}/{len(documents)}")
|
||||
except Exception as e:
|
||||
raise Exception(f"OpenAI embedding generation failed: {e}") from e
|
||||
|
||||
return embeddings
|
||||
|
||||
def _generate_st_embeddings(self, documents: list[str]) -> list[list[float]]:
|
||||
"""Generate embeddings using sentence-transformers (all-MiniLM-L6-v2).
|
||||
|
||||
Args:
|
||||
documents: List of document texts
|
||||
|
||||
Returns:
|
||||
List of embedding vectors
|
||||
"""
|
||||
try:
|
||||
from sentence_transformers import SentenceTransformer
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"sentence-transformers not installed. Run: pip install sentence-transformers"
|
||||
) from None
|
||||
|
||||
print(f" Generating sentence-transformer embeddings for {len(documents)} documents...")
|
||||
model = SentenceTransformer("all-MiniLM-L6-v2")
|
||||
embeddings = model.encode(documents, show_progress_bar=True)
|
||||
return [emb.tolist() for emb in embeddings]
|
||||
|
||||
def _generate_toc(self, skill_dir: Path) -> str:
|
||||
"""
|
||||
Helper to generate table of contents from references.
|
||||
|
||||
@@ -11,6 +11,7 @@ from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from .base import SkillAdaptor, SkillMetadata
|
||||
from skill_seekers.cli.arguments.common import DEFAULT_CHUNK_TOKENS, DEFAULT_CHUNK_OVERLAP_TOKENS
|
||||
|
||||
|
||||
class ChromaAdaptor(SkillAdaptor):
|
||||
@@ -79,6 +80,7 @@ class ChromaAdaptor(SkillAdaptor):
|
||||
"file": "SKILL.md",
|
||||
"type": "documentation",
|
||||
"version": metadata.version,
|
||||
"doc_version": metadata.doc_version,
|
||||
}
|
||||
|
||||
# Chunk if enabled
|
||||
@@ -86,9 +88,12 @@ class ChromaAdaptor(SkillAdaptor):
|
||||
content,
|
||||
doc_metadata,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
|
||||
chunk_max_tokens=kwargs.get("chunk_max_tokens", DEFAULT_CHUNK_TOKENS),
|
||||
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
|
||||
source_file="SKILL.md",
|
||||
chunk_overlap_tokens=kwargs.get(
|
||||
"chunk_overlap_tokens", DEFAULT_CHUNK_OVERLAP_TOKENS
|
||||
),
|
||||
)
|
||||
|
||||
# Add all chunks to parallel arrays
|
||||
@@ -109,6 +114,7 @@ class ChromaAdaptor(SkillAdaptor):
|
||||
"file": ref_file.name,
|
||||
"type": "reference",
|
||||
"version": metadata.version,
|
||||
"doc_version": metadata.doc_version,
|
||||
}
|
||||
|
||||
# Chunk if enabled
|
||||
@@ -116,9 +122,12 @@ class ChromaAdaptor(SkillAdaptor):
|
||||
ref_content,
|
||||
doc_metadata,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
|
||||
chunk_max_tokens=kwargs.get("chunk_max_tokens", DEFAULT_CHUNK_TOKENS),
|
||||
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
|
||||
source_file=ref_file.name,
|
||||
chunk_overlap_tokens=kwargs.get(
|
||||
"chunk_overlap_tokens", DEFAULT_CHUNK_OVERLAP_TOKENS
|
||||
),
|
||||
)
|
||||
|
||||
# Add all chunks to parallel arrays
|
||||
@@ -144,8 +153,9 @@ class ChromaAdaptor(SkillAdaptor):
|
||||
skill_dir: Path,
|
||||
output_path: Path,
|
||||
enable_chunking: bool = False,
|
||||
chunk_max_tokens: int = 512,
|
||||
chunk_max_tokens: int = DEFAULT_CHUNK_TOKENS,
|
||||
preserve_code_blocks: bool = True,
|
||||
chunk_overlap_tokens: int = DEFAULT_CHUNK_OVERLAP_TOKENS,
|
||||
) -> Path:
|
||||
"""
|
||||
Package skill into JSON file for Chroma.
|
||||
@@ -166,12 +176,8 @@ class ChromaAdaptor(SkillAdaptor):
|
||||
output_path = self._format_output_path(skill_dir, Path(output_path), "-chroma.json")
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Read metadata
|
||||
metadata = SkillMetadata(
|
||||
name=skill_dir.name,
|
||||
description=f"Chroma collection data for {skill_dir.name}",
|
||||
version="1.0.0",
|
||||
)
|
||||
# Read metadata from SKILL.md frontmatter
|
||||
metadata = self._build_skill_metadata(skill_dir)
|
||||
|
||||
# Generate Chroma data
|
||||
chroma_json = self.format_skill_md(
|
||||
@@ -180,6 +186,7 @@ class ChromaAdaptor(SkillAdaptor):
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=chunk_max_tokens,
|
||||
preserve_code_blocks=preserve_code_blocks,
|
||||
chunk_overlap_tokens=chunk_overlap_tokens,
|
||||
)
|
||||
|
||||
# Write to file
|
||||
@@ -206,7 +213,7 @@ class ChromaAdaptor(SkillAdaptor):
|
||||
|
||||
return output_path
|
||||
|
||||
def upload(self, package_path: Path, api_key: str = None, **kwargs) -> dict[str, Any]:
|
||||
def upload(self, package_path: Path, api_key: str | None = None, **kwargs) -> dict[str, Any]:
|
||||
"""
|
||||
Upload packaged skill to ChromaDB.
|
||||
|
||||
@@ -250,9 +257,7 @@ class ChromaAdaptor(SkillAdaptor):
|
||||
print(f"🌐 Connecting to ChromaDB at: {chroma_url}")
|
||||
# Parse URL
|
||||
if "://" in chroma_url:
|
||||
parts = chroma_url.split("://")
|
||||
parts[0]
|
||||
host_port = parts[1]
|
||||
_scheme, host_port = chroma_url.split("://", 1)
|
||||
else:
|
||||
host_port = chroma_url
|
||||
|
||||
@@ -352,52 +357,6 @@ class ChromaAdaptor(SkillAdaptor):
|
||||
except Exception as e:
|
||||
return {"success": False, "message": f"Upload failed: {e}"}
|
||||
|
||||
def _generate_openai_embeddings(
|
||||
self, documents: list[str], api_key: str = None
|
||||
) -> list[list[float]]:
|
||||
"""
|
||||
Generate embeddings using OpenAI API.
|
||||
|
||||
Args:
|
||||
documents: List of document texts
|
||||
api_key: OpenAI API key (or uses OPENAI_API_KEY env var)
|
||||
|
||||
Returns:
|
||||
List of embedding vectors
|
||||
"""
|
||||
import os
|
||||
|
||||
try:
|
||||
from openai import OpenAI
|
||||
except ImportError:
|
||||
raise ImportError("openai not installed. Run: pip install openai") from None
|
||||
|
||||
api_key = api_key or os.getenv("OPENAI_API_KEY")
|
||||
if not api_key:
|
||||
raise ValueError("OPENAI_API_KEY not set. Set via env var or --openai-api-key")
|
||||
|
||||
client = OpenAI(api_key=api_key)
|
||||
|
||||
# Batch process (OpenAI allows up to 2048 inputs)
|
||||
embeddings = []
|
||||
batch_size = 100
|
||||
|
||||
print(f" Generating embeddings for {len(documents)} documents...")
|
||||
|
||||
for i in range(0, len(documents), batch_size):
|
||||
batch = documents[i : i + batch_size]
|
||||
try:
|
||||
response = client.embeddings.create(
|
||||
input=batch,
|
||||
model="text-embedding-3-small", # Cheapest, fastest
|
||||
)
|
||||
embeddings.extend([item.embedding for item in response.data])
|
||||
print(f" ✓ Processed {min(i + batch_size, len(documents))}/{len(documents)}")
|
||||
except Exception as e:
|
||||
raise Exception(f"OpenAI embedding generation failed: {e}") from e
|
||||
|
||||
return embeddings
|
||||
|
||||
def validate_api_key(self, _api_key: str) -> bool:
|
||||
"""
|
||||
Chroma format doesn't use API keys for packaging.
|
||||
|
||||
@@ -12,6 +12,7 @@ from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from .base import SkillAdaptor, SkillMetadata
|
||||
from skill_seekers.cli.arguments.common import DEFAULT_CHUNK_TOKENS, DEFAULT_CHUNK_OVERLAP_TOKENS
|
||||
|
||||
|
||||
class ClaudeAdaptor(SkillAdaptor):
|
||||
@@ -86,8 +87,9 @@ version: {metadata.version}
|
||||
skill_dir: Path,
|
||||
output_path: Path,
|
||||
enable_chunking: bool = False,
|
||||
chunk_max_tokens: int = 512,
|
||||
chunk_max_tokens: int = DEFAULT_CHUNK_TOKENS,
|
||||
preserve_code_blocks: bool = True,
|
||||
chunk_overlap_tokens: int = DEFAULT_CHUNK_OVERLAP_TOKENS,
|
||||
) -> Path:
|
||||
"""
|
||||
Package skill into ZIP file for Claude.
|
||||
|
||||
@@ -11,6 +11,7 @@ from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from .base import SkillAdaptor, SkillMetadata
|
||||
from skill_seekers.cli.arguments.common import DEFAULT_CHUNK_TOKENS, DEFAULT_CHUNK_OVERLAP_TOKENS
|
||||
|
||||
|
||||
class FAISSHelpers(SkillAdaptor):
|
||||
@@ -81,6 +82,7 @@ class FAISSHelpers(SkillAdaptor):
|
||||
"file": "SKILL.md",
|
||||
"type": "documentation",
|
||||
"version": metadata.version,
|
||||
"doc_version": metadata.doc_version,
|
||||
}
|
||||
|
||||
# Chunk if enabled
|
||||
@@ -88,9 +90,12 @@ class FAISSHelpers(SkillAdaptor):
|
||||
content,
|
||||
doc_metadata,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
|
||||
chunk_max_tokens=kwargs.get("chunk_max_tokens", DEFAULT_CHUNK_TOKENS),
|
||||
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
|
||||
source_file="SKILL.md",
|
||||
chunk_overlap_tokens=kwargs.get(
|
||||
"chunk_overlap_tokens", DEFAULT_CHUNK_OVERLAP_TOKENS
|
||||
),
|
||||
)
|
||||
|
||||
# Add all chunks to parallel arrays
|
||||
@@ -110,6 +115,7 @@ class FAISSHelpers(SkillAdaptor):
|
||||
"file": ref_file.name,
|
||||
"type": "reference",
|
||||
"version": metadata.version,
|
||||
"doc_version": metadata.doc_version,
|
||||
}
|
||||
|
||||
# Chunk if enabled
|
||||
@@ -117,9 +123,12 @@ class FAISSHelpers(SkillAdaptor):
|
||||
ref_content,
|
||||
doc_metadata,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
|
||||
chunk_max_tokens=kwargs.get("chunk_max_tokens", DEFAULT_CHUNK_TOKENS),
|
||||
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
|
||||
source_file=ref_file.name,
|
||||
chunk_overlap_tokens=kwargs.get(
|
||||
"chunk_overlap_tokens", DEFAULT_CHUNK_OVERLAP_TOKENS
|
||||
),
|
||||
)
|
||||
|
||||
# Add all chunks to parallel arrays
|
||||
@@ -155,8 +164,9 @@ class FAISSHelpers(SkillAdaptor):
|
||||
skill_dir: Path,
|
||||
output_path: Path,
|
||||
enable_chunking: bool = False,
|
||||
chunk_max_tokens: int = 512,
|
||||
chunk_max_tokens: int = DEFAULT_CHUNK_TOKENS,
|
||||
preserve_code_blocks: bool = True,
|
||||
chunk_overlap_tokens: int = DEFAULT_CHUNK_OVERLAP_TOKENS,
|
||||
) -> Path:
|
||||
"""
|
||||
Package skill into JSON file for FAISS.
|
||||
@@ -176,12 +186,8 @@ class FAISSHelpers(SkillAdaptor):
|
||||
output_path = self._format_output_path(skill_dir, Path(output_path), "-faiss.json")
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Read metadata
|
||||
metadata = SkillMetadata(
|
||||
name=skill_dir.name,
|
||||
description=f"FAISS data for {skill_dir.name}",
|
||||
version="1.0.0",
|
||||
)
|
||||
# Read metadata from SKILL.md frontmatter
|
||||
metadata = self._build_skill_metadata(skill_dir)
|
||||
|
||||
# Generate FAISS data
|
||||
faiss_json = self.format_skill_md(
|
||||
@@ -190,6 +196,7 @@ class FAISSHelpers(SkillAdaptor):
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=chunk_max_tokens,
|
||||
preserve_code_blocks=preserve_code_blocks,
|
||||
chunk_overlap_tokens=chunk_overlap_tokens,
|
||||
)
|
||||
|
||||
# Write to file
|
||||
|
||||
@@ -13,6 +13,7 @@ from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from .base import SkillAdaptor, SkillMetadata
|
||||
from skill_seekers.cli.arguments.common import DEFAULT_CHUNK_TOKENS, DEFAULT_CHUNK_OVERLAP_TOKENS
|
||||
|
||||
|
||||
class GeminiAdaptor(SkillAdaptor):
|
||||
@@ -91,8 +92,9 @@ See the references directory for complete documentation with examples and best p
|
||||
skill_dir: Path,
|
||||
output_path: Path,
|
||||
enable_chunking: bool = False,
|
||||
chunk_max_tokens: int = 512,
|
||||
chunk_max_tokens: int = DEFAULT_CHUNK_TOKENS,
|
||||
preserve_code_blocks: bool = True,
|
||||
chunk_overlap_tokens: int = DEFAULT_CHUNK_OVERLAP_TOKENS,
|
||||
) -> Path:
|
||||
"""
|
||||
Package skill into tar.gz file for Gemini.
|
||||
|
||||
@@ -11,6 +11,7 @@ from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from .base import SkillAdaptor, SkillMetadata
|
||||
from skill_seekers.cli.arguments.common import DEFAULT_CHUNK_TOKENS, DEFAULT_CHUNK_OVERLAP_TOKENS
|
||||
|
||||
|
||||
class HaystackAdaptor(SkillAdaptor):
|
||||
@@ -62,6 +63,7 @@ class HaystackAdaptor(SkillAdaptor):
|
||||
"file": "SKILL.md",
|
||||
"type": "documentation",
|
||||
"version": metadata.version,
|
||||
"doc_version": metadata.doc_version,
|
||||
}
|
||||
|
||||
# Chunk if enabled
|
||||
@@ -69,9 +71,12 @@ class HaystackAdaptor(SkillAdaptor):
|
||||
content,
|
||||
doc_meta,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
|
||||
chunk_max_tokens=kwargs.get("chunk_max_tokens", DEFAULT_CHUNK_TOKENS),
|
||||
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
|
||||
source_file="SKILL.md",
|
||||
chunk_overlap_tokens=kwargs.get(
|
||||
"chunk_overlap_tokens", DEFAULT_CHUNK_OVERLAP_TOKENS
|
||||
),
|
||||
)
|
||||
|
||||
# Add all chunks as documents
|
||||
@@ -95,6 +100,7 @@ class HaystackAdaptor(SkillAdaptor):
|
||||
"file": ref_file.name,
|
||||
"type": "reference",
|
||||
"version": metadata.version,
|
||||
"doc_version": metadata.doc_version,
|
||||
}
|
||||
|
||||
# Chunk if enabled
|
||||
@@ -102,9 +108,12 @@ class HaystackAdaptor(SkillAdaptor):
|
||||
ref_content,
|
||||
doc_meta,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
|
||||
chunk_max_tokens=kwargs.get("chunk_max_tokens", DEFAULT_CHUNK_TOKENS),
|
||||
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
|
||||
source_file=ref_file.name,
|
||||
chunk_overlap_tokens=kwargs.get(
|
||||
"chunk_overlap_tokens", DEFAULT_CHUNK_OVERLAP_TOKENS
|
||||
),
|
||||
)
|
||||
|
||||
# Add all chunks as documents
|
||||
@@ -124,8 +133,9 @@ class HaystackAdaptor(SkillAdaptor):
|
||||
skill_dir: Path,
|
||||
output_path: Path,
|
||||
enable_chunking: bool = False,
|
||||
chunk_max_tokens: int = 512,
|
||||
chunk_max_tokens: int = DEFAULT_CHUNK_TOKENS,
|
||||
preserve_code_blocks: bool = True,
|
||||
chunk_overlap_tokens: int = DEFAULT_CHUNK_OVERLAP_TOKENS,
|
||||
) -> Path:
|
||||
"""
|
||||
Package skill into JSON file for Haystack.
|
||||
@@ -147,11 +157,8 @@ class HaystackAdaptor(SkillAdaptor):
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Read metadata
|
||||
metadata = SkillMetadata(
|
||||
name=skill_dir.name,
|
||||
description=f"Haystack documents for {skill_dir.name}",
|
||||
version="1.0.0",
|
||||
)
|
||||
# Read metadata from SKILL.md frontmatter
|
||||
metadata = self._build_skill_metadata(skill_dir)
|
||||
|
||||
# Generate Haystack documents
|
||||
documents_json = self.format_skill_md(
|
||||
@@ -160,6 +167,7 @@ class HaystackAdaptor(SkillAdaptor):
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=chunk_max_tokens,
|
||||
preserve_code_blocks=preserve_code_blocks,
|
||||
chunk_overlap_tokens=chunk_overlap_tokens,
|
||||
)
|
||||
|
||||
# Write to file
|
||||
|
||||
@@ -11,6 +11,7 @@ from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from .base import SkillAdaptor, SkillMetadata
|
||||
from skill_seekers.cli.arguments.common import DEFAULT_CHUNK_TOKENS, DEFAULT_CHUNK_OVERLAP_TOKENS
|
||||
|
||||
|
||||
class LangChainAdaptor(SkillAdaptor):
|
||||
@@ -62,6 +63,7 @@ class LangChainAdaptor(SkillAdaptor):
|
||||
"file": "SKILL.md",
|
||||
"type": "documentation",
|
||||
"version": metadata.version,
|
||||
"doc_version": metadata.doc_version,
|
||||
}
|
||||
|
||||
# Chunk if enabled
|
||||
@@ -69,9 +71,12 @@ class LangChainAdaptor(SkillAdaptor):
|
||||
content,
|
||||
doc_metadata,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
|
||||
chunk_max_tokens=kwargs.get("chunk_max_tokens", DEFAULT_CHUNK_TOKENS),
|
||||
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
|
||||
source_file="SKILL.md",
|
||||
chunk_overlap_tokens=kwargs.get(
|
||||
"chunk_overlap_tokens", DEFAULT_CHUNK_OVERLAP_TOKENS
|
||||
),
|
||||
)
|
||||
|
||||
# Add all chunks to documents
|
||||
@@ -90,6 +95,7 @@ class LangChainAdaptor(SkillAdaptor):
|
||||
"file": ref_file.name,
|
||||
"type": "reference",
|
||||
"version": metadata.version,
|
||||
"doc_version": metadata.doc_version,
|
||||
}
|
||||
|
||||
# Chunk if enabled
|
||||
@@ -97,9 +103,12 @@ class LangChainAdaptor(SkillAdaptor):
|
||||
ref_content,
|
||||
doc_metadata,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
|
||||
chunk_max_tokens=kwargs.get("chunk_max_tokens", DEFAULT_CHUNK_TOKENS),
|
||||
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
|
||||
source_file=ref_file.name,
|
||||
chunk_overlap_tokens=kwargs.get(
|
||||
"chunk_overlap_tokens", DEFAULT_CHUNK_OVERLAP_TOKENS
|
||||
),
|
||||
)
|
||||
|
||||
# Add all chunks to documents
|
||||
@@ -114,8 +123,9 @@ class LangChainAdaptor(SkillAdaptor):
|
||||
skill_dir: Path,
|
||||
output_path: Path,
|
||||
enable_chunking: bool = False,
|
||||
chunk_max_tokens: int = 512,
|
||||
chunk_max_tokens: int = DEFAULT_CHUNK_TOKENS,
|
||||
preserve_code_blocks: bool = True,
|
||||
chunk_overlap_tokens: int = DEFAULT_CHUNK_OVERLAP_TOKENS,
|
||||
) -> Path:
|
||||
"""
|
||||
Package skill into JSON file for LangChain.
|
||||
@@ -139,12 +149,8 @@ class LangChainAdaptor(SkillAdaptor):
|
||||
output_path = self._format_output_path(skill_dir, Path(output_path), "-langchain.json")
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Read metadata
|
||||
metadata = SkillMetadata(
|
||||
name=skill_dir.name,
|
||||
description=f"LangChain documents for {skill_dir.name}",
|
||||
version="1.0.0",
|
||||
)
|
||||
# Read metadata from SKILL.md frontmatter
|
||||
metadata = self._build_skill_metadata(skill_dir)
|
||||
|
||||
# Generate LangChain documents with chunking
|
||||
documents_json = self.format_skill_md(
|
||||
@@ -153,6 +159,7 @@ class LangChainAdaptor(SkillAdaptor):
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=chunk_max_tokens,
|
||||
preserve_code_blocks=preserve_code_blocks,
|
||||
chunk_overlap_tokens=chunk_overlap_tokens,
|
||||
)
|
||||
|
||||
# Write to file
|
||||
|
||||
@@ -11,6 +11,7 @@ from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from .base import SkillAdaptor, SkillMetadata
|
||||
from skill_seekers.cli.arguments.common import DEFAULT_CHUNK_TOKENS, DEFAULT_CHUNK_OVERLAP_TOKENS
|
||||
|
||||
|
||||
class LlamaIndexAdaptor(SkillAdaptor):
|
||||
@@ -77,6 +78,7 @@ class LlamaIndexAdaptor(SkillAdaptor):
|
||||
"file": "SKILL.md",
|
||||
"type": "documentation",
|
||||
"version": metadata.version,
|
||||
"doc_version": metadata.doc_version,
|
||||
}
|
||||
|
||||
# Chunk if enabled
|
||||
@@ -84,9 +86,12 @@ class LlamaIndexAdaptor(SkillAdaptor):
|
||||
content,
|
||||
node_metadata,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
|
||||
chunk_max_tokens=kwargs.get("chunk_max_tokens", DEFAULT_CHUNK_TOKENS),
|
||||
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
|
||||
source_file="SKILL.md",
|
||||
chunk_overlap_tokens=kwargs.get(
|
||||
"chunk_overlap_tokens", DEFAULT_CHUNK_OVERLAP_TOKENS
|
||||
),
|
||||
)
|
||||
|
||||
# Add all chunks as nodes
|
||||
@@ -112,6 +117,7 @@ class LlamaIndexAdaptor(SkillAdaptor):
|
||||
"file": ref_file.name,
|
||||
"type": "reference",
|
||||
"version": metadata.version,
|
||||
"doc_version": metadata.doc_version,
|
||||
}
|
||||
|
||||
# Chunk if enabled
|
||||
@@ -119,9 +125,12 @@ class LlamaIndexAdaptor(SkillAdaptor):
|
||||
ref_content,
|
||||
node_metadata,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
|
||||
chunk_max_tokens=kwargs.get("chunk_max_tokens", DEFAULT_CHUNK_TOKENS),
|
||||
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
|
||||
source_file=ref_file.name,
|
||||
chunk_overlap_tokens=kwargs.get(
|
||||
"chunk_overlap_tokens", DEFAULT_CHUNK_OVERLAP_TOKENS
|
||||
),
|
||||
)
|
||||
|
||||
# Add all chunks as nodes
|
||||
@@ -143,8 +152,9 @@ class LlamaIndexAdaptor(SkillAdaptor):
|
||||
skill_dir: Path,
|
||||
output_path: Path,
|
||||
enable_chunking: bool = False,
|
||||
chunk_max_tokens: int = 512,
|
||||
chunk_max_tokens: int = DEFAULT_CHUNK_TOKENS,
|
||||
preserve_code_blocks: bool = True,
|
||||
chunk_overlap_tokens: int = DEFAULT_CHUNK_OVERLAP_TOKENS,
|
||||
) -> Path:
|
||||
"""
|
||||
Package skill into JSON file for LlamaIndex.
|
||||
@@ -166,11 +176,8 @@ class LlamaIndexAdaptor(SkillAdaptor):
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Read metadata
|
||||
metadata = SkillMetadata(
|
||||
name=skill_dir.name,
|
||||
description=f"LlamaIndex nodes for {skill_dir.name}",
|
||||
version="1.0.0",
|
||||
)
|
||||
# Read metadata from SKILL.md frontmatter
|
||||
metadata = self._build_skill_metadata(skill_dir)
|
||||
|
||||
# Generate LlamaIndex nodes
|
||||
nodes_json = self.format_skill_md(
|
||||
@@ -179,6 +186,7 @@ class LlamaIndexAdaptor(SkillAdaptor):
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=chunk_max_tokens,
|
||||
preserve_code_blocks=preserve_code_blocks,
|
||||
chunk_overlap_tokens=chunk_overlap_tokens,
|
||||
)
|
||||
|
||||
# Write to file
|
||||
|
||||
@@ -11,6 +11,7 @@ from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from .base import SkillAdaptor, SkillMetadata
|
||||
from skill_seekers.cli.arguments.common import DEFAULT_CHUNK_TOKENS, DEFAULT_CHUNK_OVERLAP_TOKENS
|
||||
|
||||
|
||||
class MarkdownAdaptor(SkillAdaptor):
|
||||
@@ -86,8 +87,9 @@ Browse the reference files for detailed information on each topic. All files are
|
||||
skill_dir: Path,
|
||||
output_path: Path,
|
||||
enable_chunking: bool = False,
|
||||
chunk_max_tokens: int = 512,
|
||||
chunk_max_tokens: int = DEFAULT_CHUNK_TOKENS,
|
||||
preserve_code_blocks: bool = True,
|
||||
chunk_overlap_tokens: int = DEFAULT_CHUNK_OVERLAP_TOKENS,
|
||||
) -> Path:
|
||||
"""
|
||||
Package skill into ZIP file with markdown documentation.
|
||||
|
||||
@@ -12,6 +12,7 @@ from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from .base import SkillAdaptor, SkillMetadata
|
||||
from skill_seekers.cli.arguments.common import DEFAULT_CHUNK_TOKENS, DEFAULT_CHUNK_OVERLAP_TOKENS
|
||||
|
||||
|
||||
class OpenAIAdaptor(SkillAdaptor):
|
||||
@@ -108,8 +109,9 @@ Always prioritize accuracy by consulting the attached documentation files before
|
||||
skill_dir: Path,
|
||||
output_path: Path,
|
||||
enable_chunking: bool = False,
|
||||
chunk_max_tokens: int = 512,
|
||||
chunk_max_tokens: int = DEFAULT_CHUNK_TOKENS,
|
||||
preserve_code_blocks: bool = True,
|
||||
chunk_overlap_tokens: int = DEFAULT_CHUNK_OVERLAP_TOKENS,
|
||||
) -> Path:
|
||||
"""
|
||||
Package skill into ZIP file for OpenAI Assistants.
|
||||
|
||||
405
src/skill_seekers/cli/adaptors/pinecone_adaptor.py
Normal file
405
src/skill_seekers/cli/adaptors/pinecone_adaptor.py
Normal file
@@ -0,0 +1,405 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Pinecone Adaptor
|
||||
|
||||
Implements Pinecone vector database format for RAG pipelines.
|
||||
Converts Skill Seekers documentation into Pinecone-compatible format
|
||||
with namespace support and batch upsert.
|
||||
"""
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from .base import SkillAdaptor, SkillMetadata
|
||||
from skill_seekers.cli.arguments.common import DEFAULT_CHUNK_TOKENS, DEFAULT_CHUNK_OVERLAP_TOKENS
|
||||
|
||||
# Pinecone metadata value limit: 40 KB per vector
|
||||
PINECONE_METADATA_BYTES_LIMIT = 40_000
|
||||
|
||||
|
||||
class PineconeAdaptor(SkillAdaptor):
    """
    Pinecone vector database adaptor.

    Handles:
    - Pinecone-compatible vector format with metadata
    - Namespace support for multi-tenant indexing
    - Batch upsert (100 vectors per batch)
    - OpenAI and sentence-transformers embedding generation
    - Metadata truncation to stay within Pinecone's 40KB limit
    """

    PLATFORM = "pinecone"
    PLATFORM_NAME = "Pinecone (Vector Database)"
    DEFAULT_API_ENDPOINT = None

    def _generate_id(self, content: str, metadata: dict) -> str:
        """Generate a deterministic hex ID from content and metadata."""
        return self._generate_deterministic_id(content, metadata, format="hex")

    def _truncate_text_for_metadata(
        self, text: str, max_bytes: int = PINECONE_METADATA_BYTES_LIMIT
    ) -> str:
        """Truncate text to fit within Pinecone's metadata byte limit.

        Pinecone limits metadata to 40KB per vector. This truncates
        the text field (largest metadata value) to stay within limits,
        reserving ~2KB of the budget for the other metadata fields.

        Args:
            text: Text content to potentially truncate
            max_bytes: Maximum bytes for the whole metadata payload

        Returns:
            Truncated text that fits within the byte limit
        """
        # Reserve ~2KB for other metadata fields (source, category, file, ...).
        available = max_bytes - 2000
        encoded = text.encode("utf-8")
        if len(encoded) <= available:
            return text
        # Truncate at a byte boundary; errors="ignore" silently drops any
        # partial multi-byte UTF-8 sequence left at the cut point.
        return encoded[:available].decode("utf-8", errors="ignore")

    def _append_chunk_vectors(
        self,
        vectors: list[dict[str, Any]],
        content: str,
        doc_metadata: dict[str, Any],
        source_file: str,
        enable_chunking: bool,
        **kwargs,
    ) -> None:
        """Chunk ``content`` and append Pinecone vector dicts to ``vectors``.

        Shared by the SKILL.md and reference-file paths of
        :meth:`format_skill_md` so chunking parameters and vector layout
        stay in one place.
        """
        chunks = self._maybe_chunk_content(
            content,
            doc_metadata,
            enable_chunking=enable_chunking,
            chunk_max_tokens=kwargs.get("chunk_max_tokens", DEFAULT_CHUNK_TOKENS),
            preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
            source_file=source_file,
            chunk_overlap_tokens=kwargs.get(
                "chunk_overlap_tokens", DEFAULT_CHUNK_OVERLAP_TOKENS
            ),
        )

        for chunk_text, chunk_meta in chunks:
            vectors.append(
                {
                    "id": self._generate_id(chunk_text, chunk_meta),
                    "metadata": {
                        **chunk_meta,
                        # Text must fit Pinecone's 40KB-per-vector metadata cap.
                        "text": self._truncate_text_for_metadata(chunk_text),
                    },
                }
            )

    def format_skill_md(
        self, skill_dir: Path, metadata: SkillMetadata, enable_chunking: bool = False, **kwargs
    ) -> str:
        """
        Format skill as JSON for Pinecone ingestion.

        Creates a package with vectors ready for upsert:
        {
            "index_name": "...",
            "namespace": "...",
            "dimension": 1536,
            "metric": "cosine",
            "vectors": [
                {
                    "id": "hex-id",
                    "metadata": {
                        "text": "content",
                        "source": "...",
                        "category": "...",
                        ...
                    }
                }
            ]
        }

        No ``values`` field — embeddings are added at upload time.

        Args:
            skill_dir: Path to skill directory
            metadata: Skill metadata
            enable_chunking: Enable intelligent chunking for large documents
            **kwargs: Additional chunking parameters (chunk_max_tokens,
                preserve_code_blocks, chunk_overlap_tokens)

        Returns:
            JSON string containing Pinecone-compatible data
        """
        vectors: list[dict[str, Any]] = []

        # Convert SKILL.md (main documentation)
        skill_md_path = skill_dir / "SKILL.md"
        if skill_md_path.exists():
            content = self._read_existing_content(skill_dir)
            if content.strip():
                doc_metadata = {
                    "source": metadata.name,
                    "category": "overview",
                    "file": "SKILL.md",
                    "type": "documentation",
                    "version": metadata.version,
                    "doc_version": metadata.doc_version,
                }
                self._append_chunk_vectors(
                    vectors,
                    content,
                    doc_metadata,
                    source_file="SKILL.md",
                    enable_chunking=enable_chunking,
                    **kwargs,
                )

        # Convert all reference files
        for ref_file, ref_content in self._iterate_references(skill_dir):
            if ref_content.strip():
                category = ref_file.stem.replace("_", " ").lower()

                doc_metadata = {
                    "source": metadata.name,
                    "category": category,
                    "file": ref_file.name,
                    "type": "reference",
                    "version": metadata.version,
                    "doc_version": metadata.doc_version,
                }
                self._append_chunk_vectors(
                    vectors,
                    ref_content,
                    doc_metadata,
                    source_file=ref_file.name,
                    enable_chunking=enable_chunking,
                    **kwargs,
                )

        # Pinecone index names must be lowercase alphanumeric with hyphens.
        index_name = metadata.name.replace("_", "-").lower()

        return json.dumps(
            {
                "index_name": index_name,
                "namespace": index_name,
                "dimension": 1536,
                "metric": "cosine",
                "vectors": vectors,
            },
            indent=2,
            ensure_ascii=False,
        )

    def package(
        self,
        skill_dir: Path,
        output_path: Path,
        enable_chunking: bool = False,
        chunk_max_tokens: int = DEFAULT_CHUNK_TOKENS,
        preserve_code_blocks: bool = True,
        chunk_overlap_tokens: int = DEFAULT_CHUNK_OVERLAP_TOKENS,
    ) -> Path:
        """
        Package skill into JSON file for Pinecone.

        Creates a JSON file containing vectors with metadata, ready for
        embedding generation and upsert to a Pinecone index.

        Args:
            skill_dir: Path to skill directory
            output_path: Output path/filename for JSON file
            enable_chunking: Enable intelligent chunking for large documents
            chunk_max_tokens: Maximum tokens per chunk (default: 512)
            preserve_code_blocks: Preserve code blocks during chunking
            chunk_overlap_tokens: Overlap between adjacent chunks in tokens

        Returns:
            Path to created JSON file
        """
        skill_dir = Path(skill_dir)

        output_path = self._format_output_path(skill_dir, Path(output_path), "-pinecone.json")
        output_path.parent.mkdir(parents=True, exist_ok=True)

        # Read metadata from SKILL.md frontmatter
        metadata = self._build_skill_metadata(skill_dir)

        pinecone_json = self.format_skill_md(
            skill_dir,
            metadata,
            enable_chunking=enable_chunking,
            chunk_max_tokens=chunk_max_tokens,
            preserve_code_blocks=preserve_code_blocks,
            chunk_overlap_tokens=chunk_overlap_tokens,
        )

        output_path.write_text(pinecone_json, encoding="utf-8")

        print("\n✅ Pinecone data packaged successfully!")
        print(f"📦 Output: {output_path}")

        data = json.loads(pinecone_json)
        print(f"📊 Total vectors: {len(data['vectors'])}")
        print(f"🗂️ Index name: {data['index_name']}")
        print(f"📁 Namespace: {data['namespace']}")
        print(f"📐 Default dimension: {data['dimension']} (auto-detected at upload time)")

        # Show category breakdown
        categories: dict[str, int] = {}
        for vec in data["vectors"]:
            cat = vec["metadata"].get("category", "unknown")
            categories[cat] = categories.get(cat, 0) + 1

        print("📁 Categories:")
        for cat, count in sorted(categories.items()):
            print(f"   - {cat}: {count}")

        return output_path

    def upload(self, package_path: Path, api_key: str | None = None, **kwargs) -> dict[str, Any]:
        """
        Upload packaged skill to Pinecone.

        Args:
            package_path: Path to packaged JSON
            api_key: Pinecone API key (or uses PINECONE_API_KEY env var)
            **kwargs:
                index_name: Override index name from JSON
                namespace: Override namespace from JSON
                dimension: Embedding dimension (default: 1536)
                metric: Distance metric (default: "cosine")
                embedding_function: "openai" or "sentence-transformers"
                cloud: Cloud provider (default: "aws")
                region: Cloud region (default: "us-east-1")

        Returns:
            {"success": bool, "index": str, "namespace": str, "count": int}
        """
        import os

        try:
            from pinecone import Pinecone, ServerlessSpec
        except ImportError:
            # Only a missing package should produce the "not installed" hint;
            # any other import-time failure propagates as a real error.
            return {
                "success": False,
                "message": "pinecone not installed. Run: pip install 'pinecone>=5.0.0'",
            }

        api_key = api_key or os.getenv("PINECONE_API_KEY")
        if not api_key:
            return {
                "success": False,
                "message": ("PINECONE_API_KEY not set. Set via env var or pass api_key parameter."),
            }

        # Load package
        with open(package_path) as f:
            data = json.load(f)

        index_name = kwargs.get("index_name", data.get("index_name", "skill-docs"))
        namespace = kwargs.get("namespace", data.get("namespace", ""))
        metric = kwargs.get("metric", data.get("metric", "cosine"))
        cloud = kwargs.get("cloud", "aws")
        region = kwargs.get("region", "us-east-1")

        # Auto-detect dimension from embedding model
        embedding_function = kwargs.get("embedding_function", "openai")
        EMBEDDING_DIMENSIONS = {
            "openai": 1536,  # text-embedding-3-small
            "sentence-transformers": 384,  # all-MiniLM-L6-v2
        }
        # Priority: explicit kwarg > model-based auto-detect > JSON file > fallback
        # Note: format_skill_md() hardcodes dimension=1536 in the JSON, so we must
        # give EMBEDDING_DIMENSIONS priority over the file to handle sentence-transformers (384).
        dimension = kwargs.get(
            "dimension",
            EMBEDDING_DIMENSIONS.get(embedding_function, data.get("dimension", 1536)),
        )

        try:
            # Generate embeddings FIRST — before creating the index.
            # This avoids leaving an empty Pinecone index behind when
            # embedding generation fails (e.g. missing API key).
            texts = [vec["metadata"]["text"] for vec in data["vectors"]]

            if embedding_function == "openai":
                embeddings = self._generate_openai_embeddings(texts)
            elif embedding_function == "sentence-transformers":
                embeddings = self._generate_st_embeddings(texts)
            else:
                return {
                    "success": False,
                    "message": f"Unknown embedding_function: {embedding_function}. Use 'openai' or 'sentence-transformers'.",
                }

            pc = Pinecone(api_key=api_key)

            # Create index if it doesn't exist
            existing_indexes = [idx.name for idx in pc.list_indexes()]
            if index_name not in existing_indexes:
                print(
                    f"🔧 Creating Pinecone index: {index_name} (dimension={dimension}, metric={metric})"
                )
                pc.create_index(
                    name=index_name,
                    dimension=dimension,
                    metric=metric,
                    spec=ServerlessSpec(cloud=cloud, region=region),
                )
                print(f"✅ Index '{index_name}' created")
            else:
                print(f"ℹ️  Using existing index: {index_name}")

            index = pc.Index(index_name)

            # Batch upsert (100 per batch — Pinecone recommendation)
            batch_size = 100
            vectors_to_upsert = []
            for i, vec in enumerate(data["vectors"]):
                vectors_to_upsert.append(
                    {
                        "id": vec["id"],
                        "values": embeddings[i],
                        "metadata": vec["metadata"],
                    }
                )

            total = len(vectors_to_upsert)
            print(f"🔄 Upserting {total} vectors to Pinecone...")

            for i in range(0, total, batch_size):
                batch = vectors_to_upsert[i : i + batch_size]
                index.upsert(vectors=batch, namespace=namespace)
                print(f"   ✓ Upserted {min(i + batch_size, total)}/{total}")

            print(f"✅ Uploaded {total} vectors to Pinecone index '{index_name}'")

            return {
                "success": True,
                "message": f"Uploaded {total} vectors to Pinecone index '{index_name}' (namespace: '{namespace}')",
                "url": None,
                "index": index_name,
                "namespace": namespace,
                "count": total,
            }

        except Exception as e:
            return {"success": False, "message": f"Pinecone upload failed: {e}"}

    def validate_api_key(self, _api_key: str) -> bool:
        """Pinecone doesn't need API key for packaging."""
        return False

    def get_env_var_name(self) -> str:
        """Return the expected env var for Pinecone API key."""
        return "PINECONE_API_KEY"

    def supports_enhancement(self) -> bool:
        """Pinecone format doesn't support AI enhancement."""
        return False

    def enhance(self, _skill_dir: Path, _api_key: str) -> bool:
        """Pinecone format doesn't support enhancement."""
        print("❌ Pinecone format does not support enhancement")
        print("   Enhance before packaging:")
        print("   skill-seekers enhance output/skill/ --mode LOCAL")
        print("   skill-seekers package output/skill/ --target pinecone")
        return False
|
||||
@@ -11,6 +11,7 @@ from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from .base import SkillAdaptor, SkillMetadata
|
||||
from skill_seekers.cli.arguments.common import DEFAULT_CHUNK_TOKENS, DEFAULT_CHUNK_OVERLAP_TOKENS
|
||||
|
||||
|
||||
class QdrantAdaptor(SkillAdaptor):
|
||||
@@ -76,6 +77,7 @@ class QdrantAdaptor(SkillAdaptor):
|
||||
"file": "SKILL.md",
|
||||
"type": "documentation",
|
||||
"version": metadata.version,
|
||||
"doc_version": metadata.doc_version,
|
||||
}
|
||||
|
||||
# Chunk if enabled
|
||||
@@ -83,9 +85,12 @@ class QdrantAdaptor(SkillAdaptor):
|
||||
content,
|
||||
payload_meta,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
|
||||
chunk_max_tokens=kwargs.get("chunk_max_tokens", DEFAULT_CHUNK_TOKENS),
|
||||
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
|
||||
source_file="SKILL.md",
|
||||
chunk_overlap_tokens=kwargs.get(
|
||||
"chunk_overlap_tokens", DEFAULT_CHUNK_OVERLAP_TOKENS
|
||||
),
|
||||
)
|
||||
|
||||
# Add all chunks as points
|
||||
@@ -109,6 +114,7 @@ class QdrantAdaptor(SkillAdaptor):
|
||||
"file": chunk_meta.get("file", "SKILL.md"),
|
||||
"type": chunk_meta.get("type", "documentation"),
|
||||
"version": chunk_meta.get("version", metadata.version),
|
||||
"doc_version": chunk_meta.get("doc_version", ""),
|
||||
},
|
||||
}
|
||||
)
|
||||
@@ -124,6 +130,7 @@ class QdrantAdaptor(SkillAdaptor):
|
||||
"file": ref_file.name,
|
||||
"type": "reference",
|
||||
"version": metadata.version,
|
||||
"doc_version": metadata.doc_version,
|
||||
}
|
||||
|
||||
# Chunk if enabled
|
||||
@@ -131,9 +138,12 @@ class QdrantAdaptor(SkillAdaptor):
|
||||
ref_content,
|
||||
payload_meta,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
|
||||
chunk_max_tokens=kwargs.get("chunk_max_tokens", DEFAULT_CHUNK_TOKENS),
|
||||
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
|
||||
source_file=ref_file.name,
|
||||
chunk_overlap_tokens=kwargs.get(
|
||||
"chunk_overlap_tokens", DEFAULT_CHUNK_OVERLAP_TOKENS
|
||||
),
|
||||
)
|
||||
|
||||
# Add all chunks as points
|
||||
@@ -157,6 +167,7 @@ class QdrantAdaptor(SkillAdaptor):
|
||||
"file": chunk_meta.get("file", ref_file.name),
|
||||
"type": chunk_meta.get("type", "reference"),
|
||||
"version": chunk_meta.get("version", metadata.version),
|
||||
"doc_version": chunk_meta.get("doc_version", ""),
|
||||
},
|
||||
}
|
||||
)
|
||||
@@ -189,8 +200,9 @@ class QdrantAdaptor(SkillAdaptor):
|
||||
skill_dir: Path,
|
||||
output_path: Path,
|
||||
enable_chunking: bool = False,
|
||||
chunk_max_tokens: int = 512,
|
||||
chunk_max_tokens: int = DEFAULT_CHUNK_TOKENS,
|
||||
preserve_code_blocks: bool = True,
|
||||
chunk_overlap_tokens: int = DEFAULT_CHUNK_OVERLAP_TOKENS,
|
||||
) -> Path:
|
||||
"""
|
||||
Package skill into JSON file for Qdrant.
|
||||
@@ -211,11 +223,8 @@ class QdrantAdaptor(SkillAdaptor):
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Read metadata
|
||||
metadata = SkillMetadata(
|
||||
name=skill_dir.name,
|
||||
description=f"Qdrant data for {skill_dir.name}",
|
||||
version="1.0.0",
|
||||
)
|
||||
# Read metadata from SKILL.md frontmatter
|
||||
metadata = self._build_skill_metadata(skill_dir)
|
||||
|
||||
# Generate Qdrant data
|
||||
qdrant_json = self.format_skill_md(
|
||||
@@ -224,6 +233,7 @@ class QdrantAdaptor(SkillAdaptor):
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=chunk_max_tokens,
|
||||
preserve_code_blocks=preserve_code_blocks,
|
||||
chunk_overlap_tokens=chunk_overlap_tokens,
|
||||
)
|
||||
|
||||
# Write to file
|
||||
|
||||
@@ -11,6 +11,7 @@ from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from .base import SkillAdaptor, SkillMetadata
|
||||
from skill_seekers.cli.arguments.common import DEFAULT_CHUNK_TOKENS, DEFAULT_CHUNK_OVERLAP_TOKENS
|
||||
|
||||
|
||||
class WeaviateAdaptor(SkillAdaptor):
|
||||
@@ -96,7 +97,14 @@ class WeaviateAdaptor(SkillAdaptor):
|
||||
{
|
||||
"name": "version",
|
||||
"dataType": ["text"],
|
||||
"description": "Documentation version",
|
||||
"description": "Skill package version",
|
||||
"indexFilterable": True,
|
||||
"indexSearchable": False,
|
||||
},
|
||||
{
|
||||
"name": "doc_version",
|
||||
"dataType": ["text"],
|
||||
"description": "Documentation version (e.g., 16.2)",
|
||||
"indexFilterable": True,
|
||||
"indexSearchable": False,
|
||||
},
|
||||
@@ -137,6 +145,7 @@ class WeaviateAdaptor(SkillAdaptor):
|
||||
"file": "SKILL.md",
|
||||
"type": "documentation",
|
||||
"version": metadata.version,
|
||||
"doc_version": metadata.doc_version,
|
||||
}
|
||||
|
||||
# Chunk if enabled
|
||||
@@ -144,9 +153,12 @@ class WeaviateAdaptor(SkillAdaptor):
|
||||
content,
|
||||
obj_metadata,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
|
||||
chunk_max_tokens=kwargs.get("chunk_max_tokens", DEFAULT_CHUNK_TOKENS),
|
||||
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
|
||||
source_file="SKILL.md",
|
||||
chunk_overlap_tokens=kwargs.get(
|
||||
"chunk_overlap_tokens", DEFAULT_CHUNK_OVERLAP_TOKENS
|
||||
),
|
||||
)
|
||||
|
||||
# Add all chunks as objects
|
||||
@@ -161,6 +173,7 @@ class WeaviateAdaptor(SkillAdaptor):
|
||||
"file": chunk_meta.get("file", "SKILL.md"),
|
||||
"type": chunk_meta.get("type", "documentation"),
|
||||
"version": chunk_meta.get("version", metadata.version),
|
||||
"doc_version": chunk_meta.get("doc_version", ""),
|
||||
},
|
||||
}
|
||||
)
|
||||
@@ -177,6 +190,7 @@ class WeaviateAdaptor(SkillAdaptor):
|
||||
"file": ref_file.name,
|
||||
"type": "reference",
|
||||
"version": metadata.version,
|
||||
"doc_version": metadata.doc_version,
|
||||
}
|
||||
|
||||
# Chunk if enabled
|
||||
@@ -184,9 +198,12 @@ class WeaviateAdaptor(SkillAdaptor):
|
||||
ref_content,
|
||||
obj_metadata,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
|
||||
chunk_max_tokens=kwargs.get("chunk_max_tokens", DEFAULT_CHUNK_TOKENS),
|
||||
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
|
||||
source_file=ref_file.name,
|
||||
chunk_overlap_tokens=kwargs.get(
|
||||
"chunk_overlap_tokens", DEFAULT_CHUNK_OVERLAP_TOKENS
|
||||
),
|
||||
)
|
||||
|
||||
# Add all chunks as objects
|
||||
@@ -201,6 +218,7 @@ class WeaviateAdaptor(SkillAdaptor):
|
||||
"file": chunk_meta.get("file", ref_file.name),
|
||||
"type": chunk_meta.get("type", "reference"),
|
||||
"version": chunk_meta.get("version", metadata.version),
|
||||
"doc_version": chunk_meta.get("doc_version", ""),
|
||||
},
|
||||
}
|
||||
)
|
||||
@@ -221,8 +239,9 @@ class WeaviateAdaptor(SkillAdaptor):
|
||||
skill_dir: Path,
|
||||
output_path: Path,
|
||||
enable_chunking: bool = False,
|
||||
chunk_max_tokens: int = 512,
|
||||
chunk_max_tokens: int = DEFAULT_CHUNK_TOKENS,
|
||||
preserve_code_blocks: bool = True,
|
||||
chunk_overlap_tokens: int = DEFAULT_CHUNK_OVERLAP_TOKENS,
|
||||
) -> Path:
|
||||
"""
|
||||
Package skill into JSON file for Weaviate.
|
||||
@@ -245,12 +264,8 @@ class WeaviateAdaptor(SkillAdaptor):
|
||||
output_path = self._format_output_path(skill_dir, Path(output_path), "-weaviate.json")
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Read metadata
|
||||
metadata = SkillMetadata(
|
||||
name=skill_dir.name,
|
||||
description=f"Weaviate objects for {skill_dir.name}",
|
||||
version="1.0.0",
|
||||
)
|
||||
# Read metadata from SKILL.md frontmatter
|
||||
metadata = self._build_skill_metadata(skill_dir)
|
||||
|
||||
# Generate Weaviate objects
|
||||
weaviate_json = self.format_skill_md(
|
||||
@@ -259,6 +274,7 @@ class WeaviateAdaptor(SkillAdaptor):
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=chunk_max_tokens,
|
||||
preserve_code_blocks=preserve_code_blocks,
|
||||
chunk_overlap_tokens=chunk_overlap_tokens,
|
||||
)
|
||||
|
||||
# Write to file
|
||||
@@ -288,7 +304,7 @@ class WeaviateAdaptor(SkillAdaptor):
|
||||
|
||||
return output_path
|
||||
|
||||
def upload(self, package_path: Path, api_key: str = None, **kwargs) -> dict[str, Any]:
|
||||
def upload(self, package_path: Path, api_key: str | None = None, **kwargs) -> dict[str, Any]:
|
||||
"""
|
||||
Upload packaged skill to Weaviate.
|
||||
|
||||
@@ -382,31 +398,20 @@ class WeaviateAdaptor(SkillAdaptor):
|
||||
print(f" ✓ Uploaded {i + 1}/{len(data['objects'])} objects")
|
||||
|
||||
elif embedding_function == "sentence-transformers":
|
||||
# Use sentence-transformers
|
||||
print("🔄 Generating sentence-transformer embeddings and uploading...")
|
||||
try:
|
||||
from sentence_transformers import SentenceTransformer
|
||||
# Use sentence-transformers (via shared base method)
|
||||
contents = [obj["properties"]["content"] for obj in data["objects"]]
|
||||
embeddings = self._generate_st_embeddings(contents)
|
||||
|
||||
model = SentenceTransformer("all-MiniLM-L6-v2")
|
||||
contents = [obj["properties"]["content"] for obj in data["objects"]]
|
||||
embeddings = model.encode(contents, show_progress_bar=True).tolist()
|
||||
for i, obj in enumerate(data["objects"]):
|
||||
batch.add_data_object(
|
||||
data_object=obj["properties"],
|
||||
class_name=data["class_name"],
|
||||
uuid=obj["id"],
|
||||
vector=embeddings[i],
|
||||
)
|
||||
|
||||
for i, obj in enumerate(data["objects"]):
|
||||
batch.add_data_object(
|
||||
data_object=obj["properties"],
|
||||
class_name=data["class_name"],
|
||||
uuid=obj["id"],
|
||||
vector=embeddings[i],
|
||||
)
|
||||
|
||||
if (i + 1) % 100 == 0:
|
||||
print(f" ✓ Uploaded {i + 1}/{len(data['objects'])} objects")
|
||||
|
||||
except ImportError:
|
||||
return {
|
||||
"success": False,
|
||||
"message": "sentence-transformers not installed. Run: pip install sentence-transformers",
|
||||
}
|
||||
if (i + 1) % 100 == 0:
|
||||
print(f" ✓ Uploaded {i + 1}/{len(data['objects'])} objects")
|
||||
|
||||
else:
|
||||
# No embeddings - Weaviate will use its configured vectorizer
|
||||
@@ -427,61 +432,16 @@ class WeaviateAdaptor(SkillAdaptor):
|
||||
return {
|
||||
"success": True,
|
||||
"message": f"Uploaded {count} objects to Weaviate class '{data['class_name']}'",
|
||||
"url": None,
|
||||
"class_name": data["class_name"],
|
||||
"count": count,
|
||||
}
|
||||
|
||||
except ImportError as e:
|
||||
return {"success": False, "message": str(e)}
|
||||
except Exception as e:
|
||||
return {"success": False, "message": f"Upload failed: {e}"}
|
||||
|
||||
def _generate_openai_embeddings(
|
||||
self, documents: list[str], api_key: str = None
|
||||
) -> list[list[float]]:
|
||||
"""
|
||||
Generate embeddings using OpenAI API.
|
||||
|
||||
Args:
|
||||
documents: List of document texts
|
||||
api_key: OpenAI API key (or uses OPENAI_API_KEY env var)
|
||||
|
||||
Returns:
|
||||
List of embedding vectors
|
||||
"""
|
||||
import os
|
||||
|
||||
try:
|
||||
from openai import OpenAI
|
||||
except ImportError:
|
||||
raise ImportError("openai not installed. Run: pip install openai") from None
|
||||
|
||||
api_key = api_key or os.getenv("OPENAI_API_KEY")
|
||||
if not api_key:
|
||||
raise ValueError("OPENAI_API_KEY not set. Set via env var or --openai-api-key")
|
||||
|
||||
client = OpenAI(api_key=api_key)
|
||||
|
||||
# Batch process (OpenAI allows up to 2048 inputs)
|
||||
embeddings = []
|
||||
batch_size = 100
|
||||
|
||||
print(f" Generating embeddings for {len(documents)} documents...")
|
||||
|
||||
for i in range(0, len(documents), batch_size):
|
||||
batch = documents[i : i + batch_size]
|
||||
try:
|
||||
response = client.embeddings.create(
|
||||
input=batch,
|
||||
model="text-embedding-3-small", # Cheapest, fastest
|
||||
)
|
||||
embeddings.extend([item.embedding for item in response.data])
|
||||
print(
|
||||
f" ✓ Generated {min(i + batch_size, len(documents))}/{len(documents)} embeddings"
|
||||
)
|
||||
except Exception as e:
|
||||
raise Exception(f"OpenAI embedding generation failed: {e}") from e
|
||||
|
||||
return embeddings
|
||||
|
||||
def validate_api_key(self, _api_key: str) -> bool:
|
||||
"""
|
||||
Weaviate format doesn't use API keys for packaging.
|
||||
|
||||
@@ -15,6 +15,10 @@ Hierarchy:
|
||||
import argparse
|
||||
from typing import Any
|
||||
|
||||
# Default chunking constants used by RAG and package arguments
|
||||
DEFAULT_CHUNK_TOKENS = 512
|
||||
DEFAULT_CHUNK_OVERLAP_TOKENS = 50
|
||||
|
||||
# Common argument definitions as data structure
|
||||
# These are arguments that appear in MULTIPLE commands
|
||||
COMMON_ARGUMENTS: dict[str, dict[str, Any]] = {
|
||||
@@ -64,6 +68,15 @@ COMMON_ARGUMENTS: dict[str, dict[str, Any]] = {
|
||||
"metavar": "KEY",
|
||||
},
|
||||
},
|
||||
"doc_version": {
|
||||
"flags": ("--doc-version",),
|
||||
"kwargs": {
|
||||
"type": str,
|
||||
"default": "",
|
||||
"help": "Documentation version tag for RAG metadata (e.g., '16.2')",
|
||||
"metavar": "VERSION",
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
# Behavior arguments — runtime flags shared by every scraper
|
||||
@@ -105,18 +118,18 @@ RAG_ARGUMENTS: dict[str, dict[str, Any]] = {
|
||||
"flags": ("--chunk-tokens",),
|
||||
"kwargs": {
|
||||
"type": int,
|
||||
"default": 512,
|
||||
"default": DEFAULT_CHUNK_TOKENS,
|
||||
"metavar": "TOKENS",
|
||||
"help": "Chunk size in tokens for RAG (default: 512)",
|
||||
"help": f"Chunk size in tokens for RAG (default: {DEFAULT_CHUNK_TOKENS})",
|
||||
},
|
||||
},
|
||||
"chunk_overlap_tokens": {
|
||||
"flags": ("--chunk-overlap-tokens",),
|
||||
"kwargs": {
|
||||
"type": int,
|
||||
"default": 50,
|
||||
"default": DEFAULT_CHUNK_OVERLAP_TOKENS,
|
||||
"metavar": "TOKENS",
|
||||
"help": "Overlap between chunks in tokens (default: 50)",
|
||||
"help": f"Overlap between chunks in tokens (default: {DEFAULT_CHUNK_OVERLAP_TOKENS})",
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
@@ -153,6 +153,15 @@ UNIVERSAL_ARGUMENTS: dict[str, dict[str, Any]] = {
|
||||
"metavar": "PATH",
|
||||
},
|
||||
},
|
||||
"doc_version": {
|
||||
"flags": ("--doc-version",),
|
||||
"kwargs": {
|
||||
"type": str,
|
||||
"default": "",
|
||||
"help": "Documentation version tag for RAG metadata (e.g., '16.2')",
|
||||
"metavar": "VERSION",
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
# Merge RAG arguments from common.py into universal arguments
|
||||
@@ -655,3 +664,11 @@ def add_create_arguments(parser: argparse.ArgumentParser, mode: str = "default")
|
||||
if mode in ["advanced", "all"]:
|
||||
for arg_name, arg_def in ADVANCED_ARGUMENTS.items():
|
||||
parser.add_argument(*arg_def["flags"], **arg_def["kwargs"])
|
||||
|
||||
# Deprecated alias for backward compatibility (removed in v4.0.0)
|
||||
parser.add_argument(
|
||||
"--no-preserve-code",
|
||||
dest="no_preserve_code_blocks",
|
||||
action="store_true",
|
||||
help=argparse.SUPPRESS,
|
||||
)
|
||||
|
||||
@@ -8,6 +8,8 @@ import and use these definitions.
|
||||
import argparse
|
||||
from typing import Any
|
||||
|
||||
from .common import DEFAULT_CHUNK_TOKENS, DEFAULT_CHUNK_OVERLAP_TOKENS
|
||||
|
||||
PACKAGE_ARGUMENTS: dict[str, dict[str, Any]] = {
|
||||
# Positional argument
|
||||
"skill_directory": {
|
||||
@@ -49,6 +51,7 @@ PACKAGE_ARGUMENTS: dict[str, dict[str, Any]] = {
|
||||
"chroma",
|
||||
"faiss",
|
||||
"qdrant",
|
||||
"pinecone",
|
||||
],
|
||||
"default": "claude",
|
||||
"help": "Target LLM platform (default: claude)",
|
||||
@@ -109,13 +112,22 @@ PACKAGE_ARGUMENTS: dict[str, dict[str, Any]] = {
|
||||
"flags": ("--chunk-tokens",),
|
||||
"kwargs": {
|
||||
"type": int,
|
||||
"default": 512,
|
||||
"help": "Maximum tokens per chunk (default: 512)",
|
||||
"default": DEFAULT_CHUNK_TOKENS,
|
||||
"help": f"Maximum tokens per chunk (default: {DEFAULT_CHUNK_TOKENS})",
|
||||
"metavar": "N",
|
||||
},
|
||||
},
|
||||
"no_preserve_code": {
|
||||
"flags": ("--no-preserve-code",),
|
||||
"chunk_overlap_tokens": {
|
||||
"flags": ("--chunk-overlap-tokens",),
|
||||
"kwargs": {
|
||||
"type": int,
|
||||
"default": DEFAULT_CHUNK_OVERLAP_TOKENS,
|
||||
"help": f"Overlap between chunks in tokens (default: {DEFAULT_CHUNK_OVERLAP_TOKENS})",
|
||||
"metavar": "N",
|
||||
},
|
||||
},
|
||||
"no_preserve_code_blocks": {
|
||||
"flags": ("--no-preserve-code-blocks",),
|
||||
"kwargs": {
|
||||
"action": "store_true",
|
||||
"help": "Allow code block splitting (default: code blocks preserved)",
|
||||
@@ -130,3 +142,11 @@ def add_package_arguments(parser: argparse.ArgumentParser) -> None:
|
||||
flags = arg_def["flags"]
|
||||
kwargs = arg_def["kwargs"]
|
||||
parser.add_argument(*flags, **kwargs)
|
||||
|
||||
# Deprecated alias for backward compatibility (removed in v4.0.0)
|
||||
parser.add_argument(
|
||||
"--no-preserve-code",
|
||||
dest="no_preserve_code_blocks",
|
||||
action="store_true",
|
||||
help=argparse.SUPPRESS,
|
||||
)
|
||||
|
||||
@@ -172,6 +172,14 @@ def add_scrape_arguments(parser: argparse.ArgumentParser) -> None:
|
||||
kwargs = arg_def["kwargs"]
|
||||
parser.add_argument(*flags, **kwargs)
|
||||
|
||||
# Deprecated alias for backward compatibility (removed in v4.0.0)
|
||||
parser.add_argument(
|
||||
"--no-preserve-code",
|
||||
dest="no_preserve_code_blocks",
|
||||
action="store_true",
|
||||
help=argparse.SUPPRESS,
|
||||
)
|
||||
|
||||
|
||||
def get_scrape_argument_names() -> set:
|
||||
"""Get the set of scrape argument destination names.
|
||||
|
||||
@@ -1057,6 +1057,7 @@ def analyze_codebase(
|
||||
enhance_level: int = 0,
|
||||
skill_name: str | None = None,
|
||||
skill_description: str | None = None,
|
||||
doc_version: str = "",
|
||||
) -> dict[str, Any]:
|
||||
"""
|
||||
Analyze local codebase and extract code knowledge.
|
||||
@@ -1603,6 +1604,7 @@ def analyze_codebase(
|
||||
docs_data=docs_data,
|
||||
skill_name=skill_name,
|
||||
skill_description=skill_description,
|
||||
doc_version=doc_version,
|
||||
)
|
||||
|
||||
return results
|
||||
@@ -1622,6 +1624,7 @@ def _generate_skill_md(
|
||||
docs_data: dict[str, Any] | None = None,
|
||||
skill_name: str | None = None,
|
||||
skill_description: str | None = None,
|
||||
doc_version: str = "",
|
||||
):
|
||||
"""
|
||||
Generate rich SKILL.md from codebase analysis results.
|
||||
@@ -1657,6 +1660,7 @@ def _generate_skill_md(
|
||||
skill_content = f"""---
|
||||
name: {skill_name}
|
||||
description: {description}
|
||||
doc_version: {doc_version}
|
||||
---
|
||||
|
||||
# {repo_name} Codebase
|
||||
@@ -2197,13 +2201,11 @@ def _generate_references(output_dir: Path):
|
||||
|
||||
if source_dir.exists() and source_dir.is_dir():
|
||||
# Copy directory to references/ (not symlink, for portability)
|
||||
if target_dir.exists():
|
||||
import shutil
|
||||
|
||||
shutil.rmtree(target_dir)
|
||||
|
||||
import shutil
|
||||
|
||||
if target_dir.exists():
|
||||
shutil.rmtree(target_dir)
|
||||
|
||||
shutil.copytree(source_dir, target_dir)
|
||||
logger.debug(f"Copied {source} → references/{target}")
|
||||
|
||||
@@ -2451,6 +2453,7 @@ Examples:
|
||||
enhance_level=args.enhance_level, # AI enhancement level (0-3)
|
||||
skill_name=getattr(args, "name", None),
|
||||
skill_description=getattr(args, "description", None),
|
||||
doc_version=getattr(args, "doc_version", ""),
|
||||
)
|
||||
|
||||
# ============================================================
|
||||
|
||||
@@ -13,6 +13,7 @@ from skill_seekers.cli.arguments.create import (
|
||||
get_compatible_arguments,
|
||||
get_universal_argument_names,
|
||||
)
|
||||
from skill_seekers.cli.arguments.common import DEFAULT_CHUNK_TOKENS, DEFAULT_CHUNK_OVERLAP_TOKENS
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -106,8 +107,8 @@ class CreateCommand:
|
||||
# Check against common defaults
|
||||
defaults = {
|
||||
"max_issues": 100,
|
||||
"chunk_tokens": 512,
|
||||
"chunk_overlap_tokens": 50,
|
||||
"chunk_tokens": DEFAULT_CHUNK_TOKENS,
|
||||
"chunk_overlap_tokens": DEFAULT_CHUNK_OVERLAP_TOKENS,
|
||||
"output": None,
|
||||
}
|
||||
|
||||
@@ -162,11 +163,14 @@ class CreateCommand:
|
||||
# RAG arguments (web scraper only)
|
||||
if getattr(self.args, "chunk_for_rag", False):
|
||||
argv.append("--chunk-for-rag")
|
||||
if getattr(self.args, "chunk_tokens", None) and self.args.chunk_tokens != 512:
|
||||
if (
|
||||
getattr(self.args, "chunk_tokens", None)
|
||||
and self.args.chunk_tokens != DEFAULT_CHUNK_TOKENS
|
||||
):
|
||||
argv.extend(["--chunk-tokens", str(self.args.chunk_tokens)])
|
||||
if (
|
||||
getattr(self.args, "chunk_overlap_tokens", None)
|
||||
and self.args.chunk_overlap_tokens != 50
|
||||
and self.args.chunk_overlap_tokens != DEFAULT_CHUNK_OVERLAP_TOKENS
|
||||
):
|
||||
argv.extend(["--chunk-overlap-tokens", str(self.args.chunk_overlap_tokens)])
|
||||
|
||||
@@ -479,6 +483,10 @@ class CreateCommand:
|
||||
if self.args.quiet:
|
||||
argv.append("--quiet")
|
||||
|
||||
# Documentation version metadata
|
||||
if getattr(self.args, "doc_version", ""):
|
||||
argv.extend(["--doc-version", self.args.doc_version])
|
||||
|
||||
# Enhancement Workflow arguments
|
||||
if getattr(self.args, "enhance_workflow", None):
|
||||
for wf in self.args.enhance_workflow:
|
||||
|
||||
@@ -1565,9 +1565,11 @@ class DocToSkillConverter:
|
||||
if len(example_codes) >= 10:
|
||||
break
|
||||
|
||||
doc_version = self.config.get("doc_version", "")
|
||||
content = f"""---
|
||||
name: {self.name}
|
||||
description: {description}
|
||||
doc_version: {doc_version}
|
||||
---
|
||||
|
||||
# {self.name.title()} Skill
|
||||
@@ -2103,6 +2105,11 @@ def get_configuration(args: argparse.Namespace) -> dict[str, Any]:
|
||||
"max_pages": DEFAULT_MAX_PAGES,
|
||||
}
|
||||
|
||||
# Apply CLI override for doc_version (works for all config modes)
|
||||
cli_doc_version = getattr(args, "doc_version", "")
|
||||
if cli_doc_version:
|
||||
config["doc_version"] = cli_doc_version
|
||||
|
||||
# Apply CLI overrides for rate limiting
|
||||
if args.no_rate_limit:
|
||||
config["rate_limit"] = 0
|
||||
|
||||
@@ -367,7 +367,7 @@ class LocalSkillEnhancer:
|
||||
if line.startswith("#"):
|
||||
# Found heading - keep it and next 3 lines
|
||||
chunk = lines[i : min(i + 4, len(lines))]
|
||||
chunk_chars = sum(len(l) for l in chunk)
|
||||
chunk_chars = sum(len(line_text) for line_text in chunk)
|
||||
if current_chars + chunk_chars > max_chars:
|
||||
break
|
||||
result.extend(chunk)
|
||||
|
||||
@@ -968,10 +968,13 @@ class GitHubToSkillConverter:
|
||||
# Truncate description to 1024 chars if needed
|
||||
desc = self.description[:1024] if len(self.description) > 1024 else self.description
|
||||
|
||||
doc_version = self.config.get("doc_version", "")
|
||||
|
||||
# Build skill content
|
||||
skill_content = f"""---
|
||||
name: {skill_name}
|
||||
description: {desc}
|
||||
doc_version: {doc_version}
|
||||
---
|
||||
|
||||
# {repo_info.get("name", self.name)}
|
||||
@@ -1003,10 +1006,11 @@ Use this skill when you need to:
|
||||
|
||||
# Repository info
|
||||
skill_content += "### Repository Info\n"
|
||||
skill_content += f"- **Homepage:** {repo_info.get('homepage', 'N/A')}\n"
|
||||
skill_content += f"- **Homepage:** {repo_info.get('homepage') or 'N/A'}\n"
|
||||
skill_content += f"- **Topics:** {', '.join(repo_info.get('topics', []))}\n"
|
||||
skill_content += f"- **Open Issues:** {repo_info.get('open_issues', 0)}\n"
|
||||
skill_content += f"- **Last Updated:** {repo_info.get('updated_at', 'N/A')[:10]}\n\n"
|
||||
updated_at = repo_info.get("updated_at") or "N/A"
|
||||
skill_content += f"- **Last Updated:** {updated_at[:10]}\n\n"
|
||||
|
||||
# Languages
|
||||
skill_content += "### Languages\n"
|
||||
@@ -1101,9 +1105,9 @@ Use this skill when you need to:
|
||||
|
||||
lines = []
|
||||
for release in releases[:3]:
|
||||
lines.append(
|
||||
f"- **{release['tag_name']}** ({release['published_at'][:10]}): {release['name']}"
|
||||
)
|
||||
published_at = release.get("published_at") or "N/A"
|
||||
release_name = release.get("name") or release["tag_name"]
|
||||
lines.append(f"- **{release['tag_name']}** ({published_at[:10]}): {release_name}")
|
||||
|
||||
return "\n".join(lines)
|
||||
|
||||
@@ -1298,15 +1302,17 @@ Use this skill when you need to:
|
||||
content += f"## Open Issues ({len(open_issues)})\n\n"
|
||||
for issue in open_issues:
|
||||
labels = ", ".join(issue["labels"]) if issue["labels"] else "No labels"
|
||||
created_at = issue.get("created_at") or "N/A"
|
||||
content += f"### #{issue['number']}: {issue['title']}\n"
|
||||
content += f"**Labels:** {labels} | **Created:** {issue['created_at'][:10]}\n"
|
||||
content += f"**Labels:** {labels} | **Created:** {created_at[:10]}\n"
|
||||
content += f"[View on GitHub]({issue['url']})\n\n"
|
||||
|
||||
content += f"\n## Recently Closed Issues ({len(closed_issues)})\n\n"
|
||||
for issue in closed_issues:
|
||||
labels = ", ".join(issue["labels"]) if issue["labels"] else "No labels"
|
||||
closed_at = issue.get("closed_at") or "N/A"
|
||||
content += f"### #{issue['number']}: {issue['title']}\n"
|
||||
content += f"**Labels:** {labels} | **Closed:** {issue['closed_at'][:10]}\n"
|
||||
content += f"**Labels:** {labels} | **Closed:** {closed_at[:10]}\n"
|
||||
content += f"[View on GitHub]({issue['url']})\n\n"
|
||||
|
||||
issues_path = f"{self.skill_dir}/references/issues.md"
|
||||
@@ -1323,11 +1329,14 @@ Use this skill when you need to:
|
||||
)
|
||||
|
||||
for release in releases:
|
||||
content += f"## {release['tag_name']}: {release['name']}\n"
|
||||
content += f"**Published:** {release['published_at'][:10]}\n"
|
||||
published_at = release.get("published_at") or "N/A"
|
||||
release_name = release.get("name") or release["tag_name"]
|
||||
release_body = release.get("body") or ""
|
||||
content += f"## {release['tag_name']}: {release_name}\n"
|
||||
content += f"**Published:** {published_at[:10]}\n"
|
||||
if release["prerelease"]:
|
||||
content += "**Pre-release**\n"
|
||||
content += f"\n{release['body']}\n\n"
|
||||
content += f"\n{release_body}\n\n"
|
||||
content += f"[View on GitHub]({release['url']})\n\n---\n\n"
|
||||
|
||||
releases_path = f"{self.skill_dir}/references/releases.md"
|
||||
|
||||
@@ -325,8 +325,8 @@ def _handle_analyze_command(args: argparse.Namespace) -> int:
|
||||
if getattr(args, "enhance_stage", None):
|
||||
for stage in args.enhance_stage:
|
||||
sys.argv.extend(["--enhance-stage", stage])
|
||||
if getattr(args, "workflow_var", None):
|
||||
for var in args.workflow_var:
|
||||
if getattr(args, "var", None):
|
||||
for var in args.var:
|
||||
sys.argv.extend(["--var", var])
|
||||
if getattr(args, "workflow_dry_run", False):
|
||||
sys.argv.append("--workflow-dry-run")
|
||||
|
||||
@@ -14,6 +14,8 @@ import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
from skill_seekers.cli.arguments.common import DEFAULT_CHUNK_TOKENS, DEFAULT_CHUNK_OVERLAP_TOKENS
|
||||
|
||||
# Import utilities
|
||||
try:
|
||||
from quality_checker import SkillQualityChecker, print_report
|
||||
@@ -45,8 +47,9 @@ def package_skill(
|
||||
chunk_overlap=200,
|
||||
batch_size=100,
|
||||
enable_chunking=False,
|
||||
chunk_max_tokens=512,
|
||||
chunk_max_tokens=DEFAULT_CHUNK_TOKENS,
|
||||
preserve_code_blocks=True,
|
||||
chunk_overlap_tokens=DEFAULT_CHUNK_OVERLAP_TOKENS,
|
||||
):
|
||||
"""
|
||||
Package a skill directory into platform-specific format
|
||||
@@ -121,6 +124,7 @@ def package_skill(
|
||||
"chroma",
|
||||
"faiss",
|
||||
"qdrant",
|
||||
"pinecone",
|
||||
]
|
||||
|
||||
if target in RAG_PLATFORMS and not enable_chunking:
|
||||
@@ -156,6 +160,7 @@ def package_skill(
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=chunk_max_tokens,
|
||||
preserve_code_blocks=preserve_code_blocks,
|
||||
chunk_overlap_tokens=chunk_overlap_tokens,
|
||||
)
|
||||
else:
|
||||
package_path = adaptor.package(
|
||||
@@ -164,6 +169,7 @@ def package_skill(
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=chunk_max_tokens,
|
||||
preserve_code_blocks=preserve_code_blocks,
|
||||
chunk_overlap_tokens=chunk_overlap_tokens,
|
||||
)
|
||||
|
||||
print(f" Output: {package_path}")
|
||||
@@ -226,7 +232,8 @@ Examples:
|
||||
batch_size=args.batch_size,
|
||||
enable_chunking=args.chunk_for_rag,
|
||||
chunk_max_tokens=args.chunk_tokens,
|
||||
preserve_code_blocks=not args.no_preserve_code,
|
||||
preserve_code_blocks=not args.no_preserve_code_blocks,
|
||||
chunk_overlap_tokens=args.chunk_overlap_tokens,
|
||||
)
|
||||
|
||||
if not success:
|
||||
|
||||
@@ -14,6 +14,8 @@ Usage:
|
||||
chunks = chunker.chunk_skill(Path("output/react"))
|
||||
"""
|
||||
|
||||
from skill_seekers.cli.arguments.common import DEFAULT_CHUNK_TOKENS, DEFAULT_CHUNK_OVERLAP_TOKENS
|
||||
|
||||
import re
|
||||
from pathlib import Path
|
||||
import json
|
||||
@@ -35,8 +37,8 @@ class RAGChunker:
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
chunk_size: int = 512,
|
||||
chunk_overlap: int = 50,
|
||||
chunk_size: int = DEFAULT_CHUNK_TOKENS,
|
||||
chunk_overlap: int = DEFAULT_CHUNK_OVERLAP_TOKENS,
|
||||
preserve_code_blocks: bool = True,
|
||||
preserve_paragraphs: bool = True,
|
||||
min_chunk_size: int = 100,
|
||||
@@ -383,9 +385,14 @@ def main():
|
||||
)
|
||||
parser.add_argument("skill_dir", type=Path, help="Path to skill directory")
|
||||
parser.add_argument("--output", "-o", type=Path, help="Output JSON file")
|
||||
parser.add_argument("--chunk-tokens", type=int, default=512, help="Target chunk size in tokens")
|
||||
parser.add_argument(
|
||||
"--chunk-overlap-tokens", type=int, default=50, help="Overlap size in tokens"
|
||||
"--chunk-tokens", type=int, default=DEFAULT_CHUNK_TOKENS, help="Target chunk size in tokens"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--chunk-overlap-tokens",
|
||||
type=int,
|
||||
default=DEFAULT_CHUNK_OVERLAP_TOKENS,
|
||||
help="Overlap size in tokens",
|
||||
)
|
||||
parser.add_argument("--no-code-blocks", action="store_true", help="Don't preserve code blocks")
|
||||
parser.add_argument("--no-paragraphs", action="store_true", help="Don't preserve paragraphs")
|
||||
|
||||
@@ -1296,7 +1296,9 @@ This skill combines knowledge from multiple sources:
|
||||
f.write(f"- **File**: `{ex.get('file_path', 'N/A')}`\n")
|
||||
if ex.get("code_snippet"):
|
||||
lang = ex.get("language", "text")
|
||||
f.write(f"\n```{lang}\n{ex['code_snippet']}\n```\n") # Full code, no truncation
|
||||
f.write(
|
||||
f"\n```{lang}\n{ex['code_snippet']}\n```\n"
|
||||
) # Full code, no truncation
|
||||
f.write("\n")
|
||||
|
||||
logger.info(f" ✓ Test examples: {total} total, {high_value} high-value")
|
||||
|
||||
@@ -79,7 +79,9 @@ class WordToSkillConverter:
|
||||
self.config = config
|
||||
self.name = config["name"]
|
||||
self.docx_path = config.get("docx_path", "")
|
||||
self.description = config.get("description") or f"Use when referencing {self.name} documentation"
|
||||
self.description = (
|
||||
config.get("description") or f"Use when referencing {self.name} documentation"
|
||||
)
|
||||
|
||||
# Paths
|
||||
self.skill_dir = f"output/{self.name}"
|
||||
@@ -109,6 +111,9 @@ class WordToSkillConverter:
|
||||
if not os.path.exists(self.docx_path):
|
||||
raise FileNotFoundError(f"Word document not found: {self.docx_path}")
|
||||
|
||||
if not self.docx_path.lower().endswith(".docx"):
|
||||
raise ValueError(f"Not a Word document (expected .docx): {self.docx_path}")
|
||||
|
||||
# --- Extract metadata via python-docx ---
|
||||
doc = python_docx.Document(self.docx_path)
|
||||
core_props = doc.core_properties
|
||||
@@ -728,12 +733,13 @@ class WordToSkillConverter:
|
||||
# HTML-to-sections helper (module-level for clarity)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _build_section(
|
||||
section_number: int,
|
||||
heading: str | None,
|
||||
heading_level: str | None,
|
||||
elements: list,
|
||||
doc,
|
||||
doc, # noqa: ARG001
|
||||
) -> dict:
|
||||
"""Build a section dict from a list of BeautifulSoup elements.
|
||||
|
||||
@@ -769,10 +775,7 @@ def _build_section(
|
||||
# Code blocks
|
||||
if tag == "pre" or (tag == "code" and elem.find_parent("pre") is None):
|
||||
code_elem = elem.find("code") if tag == "pre" else elem
|
||||
if code_elem:
|
||||
code_text = code_elem.get_text()
|
||||
else:
|
||||
code_text = elem.get_text()
|
||||
code_text = code_elem.get_text() if code_elem else elem.get_text()
|
||||
|
||||
code_text = code_text.strip()
|
||||
if code_text:
|
||||
@@ -825,8 +828,8 @@ def _build_section(
|
||||
raw_text = elem.get_text(separator="\n").strip()
|
||||
# Exclude bullet-point / prose lists (•, *, -)
|
||||
if raw_text and not re.search(r"^[•\-\*]\s", raw_text, re.MULTILINE):
|
||||
if _score_code_quality(raw_text) >= 5.5:
|
||||
quality_score = _score_code_quality(raw_text)
|
||||
quality_score = _score_code_quality(raw_text)
|
||||
if quality_score >= 5.5:
|
||||
code_samples.append(
|
||||
{"code": raw_text, "language": "", "quality_score": quality_score}
|
||||
)
|
||||
@@ -956,7 +959,8 @@ def main():
|
||||
name = Path(args.from_json).stem.replace("_extracted", "")
|
||||
config = {
|
||||
"name": getattr(args, "name", None) or name,
|
||||
"description": getattr(args, "description", None) or f"Use when referencing {name} documentation",
|
||||
"description": getattr(args, "description", None)
|
||||
or f"Use when referencing {name} documentation",
|
||||
}
|
||||
try:
|
||||
converter = WordToSkillConverter(config)
|
||||
@@ -1044,6 +1048,7 @@ def main():
|
||||
except Exception as e:
|
||||
print(f"\n❌ Unexpected error during Word processing: {e}", file=sys.stderr)
|
||||
import traceback
|
||||
|
||||
traceback.print_exc()
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user