Merge branch 'development' into feature/video-scraper-pipeline

Sync with latest development changes including ruff formatting,
bug fixes, and pinecone adaptor additions.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
yusyus
2026-03-01 11:38:45 +03:00
43 changed files with 1988 additions and 261 deletions

View File

@@ -64,6 +64,11 @@ try:
except ImportError:
HaystackAdaptor = None
try:
from .pinecone_adaptor import PineconeAdaptor
except ImportError:
PineconeAdaptor = None
# Registry of available adaptors
ADAPTORS: dict[str, type[SkillAdaptor]] = {}
@@ -91,6 +96,8 @@ if QdrantAdaptor:
ADAPTORS["qdrant"] = QdrantAdaptor
if HaystackAdaptor:
ADAPTORS["haystack"] = HaystackAdaptor
if PineconeAdaptor:
ADAPTORS["pinecone"] = PineconeAdaptor
def get_adaptor(platform: str, config: dict = None) -> SkillAdaptor:

View File

@@ -11,6 +11,8 @@ from dataclasses import dataclass, field
from pathlib import Path
from typing import Any
from skill_seekers.cli.arguments.common import DEFAULT_CHUNK_TOKENS, DEFAULT_CHUNK_OVERLAP_TOKENS
@dataclass
class SkillMetadata:
@@ -19,6 +21,7 @@ class SkillMetadata:
name: str
description: str
version: str = "1.0.0"
doc_version: str = "" # Documentation version (e.g., "16.2") for RAG metadata filtering
author: str | None = None
tags: list[str] = field(default_factory=list)
@@ -73,8 +76,9 @@ class SkillAdaptor(ABC):
skill_dir: Path,
output_path: Path,
enable_chunking: bool = False,
chunk_max_tokens: int = 512,
chunk_max_tokens: int = DEFAULT_CHUNK_TOKENS,
preserve_code_blocks: bool = True,
chunk_overlap_tokens: int = DEFAULT_CHUNK_OVERLAP_TOKENS,
) -> Path:
"""
Package skill for platform (ZIP, tar.gz, etc.).
@@ -228,6 +232,47 @@ class SkillAdaptor(ABC):
return skill_md_path.read_text(encoding="utf-8")
def _read_frontmatter(self, skill_dir: Path) -> dict[str, str]:
    """Read YAML frontmatter from SKILL.md.

    Only flat ``key: value`` pairs are supported (no nested YAML).
    Matching surrounding quotes are stripped from values so quoted
    YAML scalars (e.g. ``description: "foo"``) parse as ``foo``, and
    YAML comment lines (``# ...``) are skipped.

    Args:
        skill_dir: Path to skill directory

    Returns:
        Dict of key-value pairs from the frontmatter block, or an
        empty dict when no frontmatter is present.
    """
    content = self._read_skill_md(skill_dir)
    if content.startswith("---"):
        parts = content.split("---", 2)
        if len(parts) >= 3:
            frontmatter: dict[str, str] = {}
            for line in parts[1].strip().splitlines():
                stripped = line.strip()
                # Skip YAML comments and lines without a key-value separator
                if stripped.startswith("#") or ":" not in stripped:
                    continue
                key, _, value = stripped.partition(":")
                value = value.strip()
                # Strip one pair of matching surrounding quotes
                if len(value) >= 2 and value[0] == value[-1] and value[0] in ("'", '"'):
                    value = value[1:-1]
                frontmatter[key.strip()] = value
            return frontmatter
    return {}
def _build_skill_metadata(self, skill_dir: Path) -> SkillMetadata:
    """Build SkillMetadata from SKILL.md frontmatter.

    Populates description, version, and doc_version from the parsed
    frontmatter instead of hardcoded defaults; the skill name always
    comes from the directory name.

    Args:
        skill_dir: Path to skill directory

    Returns:
        SkillMetadata populated from frontmatter values (with fallbacks
        for any missing keys).
    """
    frontmatter = self._read_frontmatter(skill_dir)
    fallback_description = f"Documentation for {skill_dir.name}"
    return SkillMetadata(
        name=skill_dir.name,
        description=frontmatter.get("description", fallback_description),
        version=frontmatter.get("version", "1.0.0"),
        doc_version=frontmatter.get("doc_version", ""),
    )
def _iterate_references(self, skill_dir: Path):
"""
Iterate over all reference files in skill directory.
@@ -266,6 +311,7 @@ class SkillAdaptor(ABC):
base_meta = {
"source": metadata.name,
"version": metadata.version,
"doc_version": metadata.doc_version,
"description": metadata.description,
}
if metadata.author:
@@ -280,9 +326,10 @@ class SkillAdaptor(ABC):
content: str,
metadata: dict,
enable_chunking: bool = False,
chunk_max_tokens: int = 512,
chunk_max_tokens: int = DEFAULT_CHUNK_TOKENS,
preserve_code_blocks: bool = True,
source_file: str = None,
chunk_overlap_tokens: int = DEFAULT_CHUNK_OVERLAP_TOKENS,
) -> list[tuple[str, dict]]:
"""
Optionally chunk content for RAG platforms.
@@ -321,9 +368,18 @@ class SkillAdaptor(ABC):
return [(content, metadata)]
# RAGChunker uses TOKENS (it converts to chars internally)
# If overlap is at the default value but chunk size was customized,
# scale overlap proportionally (10% of chunk size, min DEFAULT_CHUNK_OVERLAP_TOKENS)
effective_overlap = chunk_overlap_tokens
if (
chunk_overlap_tokens == DEFAULT_CHUNK_OVERLAP_TOKENS
and chunk_max_tokens != DEFAULT_CHUNK_TOKENS
):
effective_overlap = max(DEFAULT_CHUNK_OVERLAP_TOKENS, chunk_max_tokens // 10)
chunker = RAGChunker(
chunk_size=chunk_max_tokens,
chunk_overlap=max(50, chunk_max_tokens // 10), # 10% overlap
chunk_overlap=effective_overlap,
preserve_code_blocks=preserve_code_blocks,
preserve_paragraphs=True,
min_chunk_size=100, # 100 tokens minimum
@@ -433,6 +489,67 @@ class SkillAdaptor(ABC):
# Plain hex digest
return hash_hex
def _generate_openai_embeddings(
    self, documents: list[str], api_key: str | None = None
) -> list[list[float]]:
    """Generate embeddings using OpenAI text-embedding-3-small.

    Args:
        documents: List of document texts
        api_key: OpenAI API key (or uses OPENAI_API_KEY env var)

    Returns:
        List of embedding vectors, one per input document

    Raises:
        ImportError: If the openai package is not installed.
        ValueError: If no API key is available.
        RuntimeError: If an embeddings API call fails.
    """
    import os

    try:
        from openai import OpenAI
    except ImportError:
        raise ImportError("openai not installed. Run: pip install openai") from None

    api_key = api_key or os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise ValueError("OPENAI_API_KEY not set. Set via env var or --openai-api-key")

    client = OpenAI(api_key=api_key)
    embeddings: list[list[float]] = []
    # Batch requests; OpenAI accepts up to 2048 inputs per call, 100 keeps
    # individual requests small and progress output frequent.
    batch_size = 100
    print(f"  Generating OpenAI embeddings for {len(documents)} documents...")
    for i in range(0, len(documents), batch_size):
        batch = documents[i : i + batch_size]
        try:
            response = client.embeddings.create(input=batch, model="text-embedding-3-small")
            embeddings.extend([item.embedding for item in response.data])
            print(f"  ✓ Embedded {min(i + batch_size, len(documents))}/{len(documents)}")
        except Exception as e:
            # RuntimeError instead of bare Exception: callers catching
            # Exception still work, but the type is now meaningful.
            raise RuntimeError(f"OpenAI embedding generation failed: {e}") from e
    return embeddings
def _generate_st_embeddings(self, documents: list[str]) -> list[list[float]]:
    """Generate embeddings using sentence-transformers (all-MiniLM-L6-v2).

    Args:
        documents: List of document texts

    Returns:
        List of embedding vectors, one per input document
    """
    try:
        from sentence_transformers import SentenceTransformer
    except ImportError:
        raise ImportError(
            "sentence-transformers not installed. Run: pip install sentence-transformers"
        ) from None

    print(f"  Generating sentence-transformer embeddings for {len(documents)} documents...")
    model = SentenceTransformer("all-MiniLM-L6-v2")
    vectors = model.encode(documents, show_progress_bar=True)
    # Convert each numpy vector to a plain Python list for JSON serialization
    return [vector.tolist() for vector in vectors]
def _generate_toc(self, skill_dir: Path) -> str:
"""
Helper to generate table of contents from references.

View File

@@ -11,6 +11,7 @@ from pathlib import Path
from typing import Any
from .base import SkillAdaptor, SkillMetadata
from skill_seekers.cli.arguments.common import DEFAULT_CHUNK_TOKENS, DEFAULT_CHUNK_OVERLAP_TOKENS
class ChromaAdaptor(SkillAdaptor):
@@ -79,6 +80,7 @@ class ChromaAdaptor(SkillAdaptor):
"file": "SKILL.md",
"type": "documentation",
"version": metadata.version,
"doc_version": metadata.doc_version,
}
# Chunk if enabled
@@ -86,9 +88,12 @@ class ChromaAdaptor(SkillAdaptor):
content,
doc_metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
chunk_max_tokens=kwargs.get("chunk_max_tokens", DEFAULT_CHUNK_TOKENS),
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
source_file="SKILL.md",
chunk_overlap_tokens=kwargs.get(
"chunk_overlap_tokens", DEFAULT_CHUNK_OVERLAP_TOKENS
),
)
# Add all chunks to parallel arrays
@@ -109,6 +114,7 @@ class ChromaAdaptor(SkillAdaptor):
"file": ref_file.name,
"type": "reference",
"version": metadata.version,
"doc_version": metadata.doc_version,
}
# Chunk if enabled
@@ -116,9 +122,12 @@ class ChromaAdaptor(SkillAdaptor):
ref_content,
doc_metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
chunk_max_tokens=kwargs.get("chunk_max_tokens", DEFAULT_CHUNK_TOKENS),
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
source_file=ref_file.name,
chunk_overlap_tokens=kwargs.get(
"chunk_overlap_tokens", DEFAULT_CHUNK_OVERLAP_TOKENS
),
)
# Add all chunks to parallel arrays
@@ -144,8 +153,9 @@ class ChromaAdaptor(SkillAdaptor):
skill_dir: Path,
output_path: Path,
enable_chunking: bool = False,
chunk_max_tokens: int = 512,
chunk_max_tokens: int = DEFAULT_CHUNK_TOKENS,
preserve_code_blocks: bool = True,
chunk_overlap_tokens: int = DEFAULT_CHUNK_OVERLAP_TOKENS,
) -> Path:
"""
Package skill into JSON file for Chroma.
@@ -166,12 +176,8 @@ class ChromaAdaptor(SkillAdaptor):
output_path = self._format_output_path(skill_dir, Path(output_path), "-chroma.json")
output_path.parent.mkdir(parents=True, exist_ok=True)
# Read metadata
metadata = SkillMetadata(
name=skill_dir.name,
description=f"Chroma collection data for {skill_dir.name}",
version="1.0.0",
)
# Read metadata from SKILL.md frontmatter
metadata = self._build_skill_metadata(skill_dir)
# Generate Chroma data
chroma_json = self.format_skill_md(
@@ -180,6 +186,7 @@ class ChromaAdaptor(SkillAdaptor):
enable_chunking=enable_chunking,
chunk_max_tokens=chunk_max_tokens,
preserve_code_blocks=preserve_code_blocks,
chunk_overlap_tokens=chunk_overlap_tokens,
)
# Write to file
@@ -206,7 +213,7 @@ class ChromaAdaptor(SkillAdaptor):
return output_path
def upload(self, package_path: Path, api_key: str = None, **kwargs) -> dict[str, Any]:
def upload(self, package_path: Path, api_key: str | None = None, **kwargs) -> dict[str, Any]:
"""
Upload packaged skill to ChromaDB.
@@ -250,9 +257,7 @@ class ChromaAdaptor(SkillAdaptor):
print(f"🌐 Connecting to ChromaDB at: {chroma_url}")
# Parse URL
if "://" in chroma_url:
parts = chroma_url.split("://")
parts[0]
host_port = parts[1]
_scheme, host_port = chroma_url.split("://", 1)
else:
host_port = chroma_url
@@ -352,52 +357,6 @@ class ChromaAdaptor(SkillAdaptor):
except Exception as e:
return {"success": False, "message": f"Upload failed: {e}"}
def _generate_openai_embeddings(
self, documents: list[str], api_key: str = None
) -> list[list[float]]:
"""
Generate embeddings using OpenAI API.
Args:
documents: List of document texts
api_key: OpenAI API key (or uses OPENAI_API_KEY env var)
Returns:
List of embedding vectors
"""
import os
try:
from openai import OpenAI
except ImportError:
raise ImportError("openai not installed. Run: pip install openai") from None
api_key = api_key or os.getenv("OPENAI_API_KEY")
if not api_key:
raise ValueError("OPENAI_API_KEY not set. Set via env var or --openai-api-key")
client = OpenAI(api_key=api_key)
# Batch process (OpenAI allows up to 2048 inputs)
embeddings = []
batch_size = 100
print(f" Generating embeddings for {len(documents)} documents...")
for i in range(0, len(documents), batch_size):
batch = documents[i : i + batch_size]
try:
response = client.embeddings.create(
input=batch,
model="text-embedding-3-small", # Cheapest, fastest
)
embeddings.extend([item.embedding for item in response.data])
print(f" ✓ Processed {min(i + batch_size, len(documents))}/{len(documents)}")
except Exception as e:
raise Exception(f"OpenAI embedding generation failed: {e}") from e
return embeddings
def validate_api_key(self, _api_key: str) -> bool:
"""
Chroma format doesn't use API keys for packaging.

View File

@@ -12,6 +12,7 @@ from pathlib import Path
from typing import Any
from .base import SkillAdaptor, SkillMetadata
from skill_seekers.cli.arguments.common import DEFAULT_CHUNK_TOKENS, DEFAULT_CHUNK_OVERLAP_TOKENS
class ClaudeAdaptor(SkillAdaptor):
@@ -86,8 +87,9 @@ version: {metadata.version}
skill_dir: Path,
output_path: Path,
enable_chunking: bool = False,
chunk_max_tokens: int = 512,
chunk_max_tokens: int = DEFAULT_CHUNK_TOKENS,
preserve_code_blocks: bool = True,
chunk_overlap_tokens: int = DEFAULT_CHUNK_OVERLAP_TOKENS,
) -> Path:
"""
Package skill into ZIP file for Claude.

View File

@@ -11,6 +11,7 @@ from pathlib import Path
from typing import Any
from .base import SkillAdaptor, SkillMetadata
from skill_seekers.cli.arguments.common import DEFAULT_CHUNK_TOKENS, DEFAULT_CHUNK_OVERLAP_TOKENS
class FAISSHelpers(SkillAdaptor):
@@ -81,6 +82,7 @@ class FAISSHelpers(SkillAdaptor):
"file": "SKILL.md",
"type": "documentation",
"version": metadata.version,
"doc_version": metadata.doc_version,
}
# Chunk if enabled
@@ -88,9 +90,12 @@ class FAISSHelpers(SkillAdaptor):
content,
doc_metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
chunk_max_tokens=kwargs.get("chunk_max_tokens", DEFAULT_CHUNK_TOKENS),
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
source_file="SKILL.md",
chunk_overlap_tokens=kwargs.get(
"chunk_overlap_tokens", DEFAULT_CHUNK_OVERLAP_TOKENS
),
)
# Add all chunks to parallel arrays
@@ -110,6 +115,7 @@ class FAISSHelpers(SkillAdaptor):
"file": ref_file.name,
"type": "reference",
"version": metadata.version,
"doc_version": metadata.doc_version,
}
# Chunk if enabled
@@ -117,9 +123,12 @@ class FAISSHelpers(SkillAdaptor):
ref_content,
doc_metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
chunk_max_tokens=kwargs.get("chunk_max_tokens", DEFAULT_CHUNK_TOKENS),
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
source_file=ref_file.name,
chunk_overlap_tokens=kwargs.get(
"chunk_overlap_tokens", DEFAULT_CHUNK_OVERLAP_TOKENS
),
)
# Add all chunks to parallel arrays
@@ -155,8 +164,9 @@ class FAISSHelpers(SkillAdaptor):
skill_dir: Path,
output_path: Path,
enable_chunking: bool = False,
chunk_max_tokens: int = 512,
chunk_max_tokens: int = DEFAULT_CHUNK_TOKENS,
preserve_code_blocks: bool = True,
chunk_overlap_tokens: int = DEFAULT_CHUNK_OVERLAP_TOKENS,
) -> Path:
"""
Package skill into JSON file for FAISS.
@@ -176,12 +186,8 @@ class FAISSHelpers(SkillAdaptor):
output_path = self._format_output_path(skill_dir, Path(output_path), "-faiss.json")
output_path.parent.mkdir(parents=True, exist_ok=True)
# Read metadata
metadata = SkillMetadata(
name=skill_dir.name,
description=f"FAISS data for {skill_dir.name}",
version="1.0.0",
)
# Read metadata from SKILL.md frontmatter
metadata = self._build_skill_metadata(skill_dir)
# Generate FAISS data
faiss_json = self.format_skill_md(
@@ -190,6 +196,7 @@ class FAISSHelpers(SkillAdaptor):
enable_chunking=enable_chunking,
chunk_max_tokens=chunk_max_tokens,
preserve_code_blocks=preserve_code_blocks,
chunk_overlap_tokens=chunk_overlap_tokens,
)
# Write to file

View File

@@ -13,6 +13,7 @@ from pathlib import Path
from typing import Any
from .base import SkillAdaptor, SkillMetadata
from skill_seekers.cli.arguments.common import DEFAULT_CHUNK_TOKENS, DEFAULT_CHUNK_OVERLAP_TOKENS
class GeminiAdaptor(SkillAdaptor):
@@ -91,8 +92,9 @@ See the references directory for complete documentation with examples and best p
skill_dir: Path,
output_path: Path,
enable_chunking: bool = False,
chunk_max_tokens: int = 512,
chunk_max_tokens: int = DEFAULT_CHUNK_TOKENS,
preserve_code_blocks: bool = True,
chunk_overlap_tokens: int = DEFAULT_CHUNK_OVERLAP_TOKENS,
) -> Path:
"""
Package skill into tar.gz file for Gemini.

View File

@@ -11,6 +11,7 @@ from pathlib import Path
from typing import Any
from .base import SkillAdaptor, SkillMetadata
from skill_seekers.cli.arguments.common import DEFAULT_CHUNK_TOKENS, DEFAULT_CHUNK_OVERLAP_TOKENS
class HaystackAdaptor(SkillAdaptor):
@@ -62,6 +63,7 @@ class HaystackAdaptor(SkillAdaptor):
"file": "SKILL.md",
"type": "documentation",
"version": metadata.version,
"doc_version": metadata.doc_version,
}
# Chunk if enabled
@@ -69,9 +71,12 @@ class HaystackAdaptor(SkillAdaptor):
content,
doc_meta,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
chunk_max_tokens=kwargs.get("chunk_max_tokens", DEFAULT_CHUNK_TOKENS),
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
source_file="SKILL.md",
chunk_overlap_tokens=kwargs.get(
"chunk_overlap_tokens", DEFAULT_CHUNK_OVERLAP_TOKENS
),
)
# Add all chunks as documents
@@ -95,6 +100,7 @@ class HaystackAdaptor(SkillAdaptor):
"file": ref_file.name,
"type": "reference",
"version": metadata.version,
"doc_version": metadata.doc_version,
}
# Chunk if enabled
@@ -102,9 +108,12 @@ class HaystackAdaptor(SkillAdaptor):
ref_content,
doc_meta,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
chunk_max_tokens=kwargs.get("chunk_max_tokens", DEFAULT_CHUNK_TOKENS),
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
source_file=ref_file.name,
chunk_overlap_tokens=kwargs.get(
"chunk_overlap_tokens", DEFAULT_CHUNK_OVERLAP_TOKENS
),
)
# Add all chunks as documents
@@ -124,8 +133,9 @@ class HaystackAdaptor(SkillAdaptor):
skill_dir: Path,
output_path: Path,
enable_chunking: bool = False,
chunk_max_tokens: int = 512,
chunk_max_tokens: int = DEFAULT_CHUNK_TOKENS,
preserve_code_blocks: bool = True,
chunk_overlap_tokens: int = DEFAULT_CHUNK_OVERLAP_TOKENS,
) -> Path:
"""
Package skill into JSON file for Haystack.
@@ -147,11 +157,8 @@ class HaystackAdaptor(SkillAdaptor):
output_path.parent.mkdir(parents=True, exist_ok=True)
# Read metadata
metadata = SkillMetadata(
name=skill_dir.name,
description=f"Haystack documents for {skill_dir.name}",
version="1.0.0",
)
# Read metadata from SKILL.md frontmatter
metadata = self._build_skill_metadata(skill_dir)
# Generate Haystack documents
documents_json = self.format_skill_md(
@@ -160,6 +167,7 @@ class HaystackAdaptor(SkillAdaptor):
enable_chunking=enable_chunking,
chunk_max_tokens=chunk_max_tokens,
preserve_code_blocks=preserve_code_blocks,
chunk_overlap_tokens=chunk_overlap_tokens,
)
# Write to file

View File

@@ -11,6 +11,7 @@ from pathlib import Path
from typing import Any
from .base import SkillAdaptor, SkillMetadata
from skill_seekers.cli.arguments.common import DEFAULT_CHUNK_TOKENS, DEFAULT_CHUNK_OVERLAP_TOKENS
class LangChainAdaptor(SkillAdaptor):
@@ -62,6 +63,7 @@ class LangChainAdaptor(SkillAdaptor):
"file": "SKILL.md",
"type": "documentation",
"version": metadata.version,
"doc_version": metadata.doc_version,
}
# Chunk if enabled
@@ -69,9 +71,12 @@ class LangChainAdaptor(SkillAdaptor):
content,
doc_metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
chunk_max_tokens=kwargs.get("chunk_max_tokens", DEFAULT_CHUNK_TOKENS),
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
source_file="SKILL.md",
chunk_overlap_tokens=kwargs.get(
"chunk_overlap_tokens", DEFAULT_CHUNK_OVERLAP_TOKENS
),
)
# Add all chunks to documents
@@ -90,6 +95,7 @@ class LangChainAdaptor(SkillAdaptor):
"file": ref_file.name,
"type": "reference",
"version": metadata.version,
"doc_version": metadata.doc_version,
}
# Chunk if enabled
@@ -97,9 +103,12 @@ class LangChainAdaptor(SkillAdaptor):
ref_content,
doc_metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
chunk_max_tokens=kwargs.get("chunk_max_tokens", DEFAULT_CHUNK_TOKENS),
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
source_file=ref_file.name,
chunk_overlap_tokens=kwargs.get(
"chunk_overlap_tokens", DEFAULT_CHUNK_OVERLAP_TOKENS
),
)
# Add all chunks to documents
@@ -114,8 +123,9 @@ class LangChainAdaptor(SkillAdaptor):
skill_dir: Path,
output_path: Path,
enable_chunking: bool = False,
chunk_max_tokens: int = 512,
chunk_max_tokens: int = DEFAULT_CHUNK_TOKENS,
preserve_code_blocks: bool = True,
chunk_overlap_tokens: int = DEFAULT_CHUNK_OVERLAP_TOKENS,
) -> Path:
"""
Package skill into JSON file for LangChain.
@@ -139,12 +149,8 @@ class LangChainAdaptor(SkillAdaptor):
output_path = self._format_output_path(skill_dir, Path(output_path), "-langchain.json")
output_path.parent.mkdir(parents=True, exist_ok=True)
# Read metadata
metadata = SkillMetadata(
name=skill_dir.name,
description=f"LangChain documents for {skill_dir.name}",
version="1.0.0",
)
# Read metadata from SKILL.md frontmatter
metadata = self._build_skill_metadata(skill_dir)
# Generate LangChain documents with chunking
documents_json = self.format_skill_md(
@@ -153,6 +159,7 @@ class LangChainAdaptor(SkillAdaptor):
enable_chunking=enable_chunking,
chunk_max_tokens=chunk_max_tokens,
preserve_code_blocks=preserve_code_blocks,
chunk_overlap_tokens=chunk_overlap_tokens,
)
# Write to file

View File

@@ -11,6 +11,7 @@ from pathlib import Path
from typing import Any
from .base import SkillAdaptor, SkillMetadata
from skill_seekers.cli.arguments.common import DEFAULT_CHUNK_TOKENS, DEFAULT_CHUNK_OVERLAP_TOKENS
class LlamaIndexAdaptor(SkillAdaptor):
@@ -77,6 +78,7 @@ class LlamaIndexAdaptor(SkillAdaptor):
"file": "SKILL.md",
"type": "documentation",
"version": metadata.version,
"doc_version": metadata.doc_version,
}
# Chunk if enabled
@@ -84,9 +86,12 @@ class LlamaIndexAdaptor(SkillAdaptor):
content,
node_metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
chunk_max_tokens=kwargs.get("chunk_max_tokens", DEFAULT_CHUNK_TOKENS),
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
source_file="SKILL.md",
chunk_overlap_tokens=kwargs.get(
"chunk_overlap_tokens", DEFAULT_CHUNK_OVERLAP_TOKENS
),
)
# Add all chunks as nodes
@@ -112,6 +117,7 @@ class LlamaIndexAdaptor(SkillAdaptor):
"file": ref_file.name,
"type": "reference",
"version": metadata.version,
"doc_version": metadata.doc_version,
}
# Chunk if enabled
@@ -119,9 +125,12 @@ class LlamaIndexAdaptor(SkillAdaptor):
ref_content,
node_metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
chunk_max_tokens=kwargs.get("chunk_max_tokens", DEFAULT_CHUNK_TOKENS),
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
source_file=ref_file.name,
chunk_overlap_tokens=kwargs.get(
"chunk_overlap_tokens", DEFAULT_CHUNK_OVERLAP_TOKENS
),
)
# Add all chunks as nodes
@@ -143,8 +152,9 @@ class LlamaIndexAdaptor(SkillAdaptor):
skill_dir: Path,
output_path: Path,
enable_chunking: bool = False,
chunk_max_tokens: int = 512,
chunk_max_tokens: int = DEFAULT_CHUNK_TOKENS,
preserve_code_blocks: bool = True,
chunk_overlap_tokens: int = DEFAULT_CHUNK_OVERLAP_TOKENS,
) -> Path:
"""
Package skill into JSON file for LlamaIndex.
@@ -166,11 +176,8 @@ class LlamaIndexAdaptor(SkillAdaptor):
output_path.parent.mkdir(parents=True, exist_ok=True)
# Read metadata
metadata = SkillMetadata(
name=skill_dir.name,
description=f"LlamaIndex nodes for {skill_dir.name}",
version="1.0.0",
)
# Read metadata from SKILL.md frontmatter
metadata = self._build_skill_metadata(skill_dir)
# Generate LlamaIndex nodes
nodes_json = self.format_skill_md(
@@ -179,6 +186,7 @@ class LlamaIndexAdaptor(SkillAdaptor):
enable_chunking=enable_chunking,
chunk_max_tokens=chunk_max_tokens,
preserve_code_blocks=preserve_code_blocks,
chunk_overlap_tokens=chunk_overlap_tokens,
)
# Write to file

View File

@@ -11,6 +11,7 @@ from pathlib import Path
from typing import Any
from .base import SkillAdaptor, SkillMetadata
from skill_seekers.cli.arguments.common import DEFAULT_CHUNK_TOKENS, DEFAULT_CHUNK_OVERLAP_TOKENS
class MarkdownAdaptor(SkillAdaptor):
@@ -86,8 +87,9 @@ Browse the reference files for detailed information on each topic. All files are
skill_dir: Path,
output_path: Path,
enable_chunking: bool = False,
chunk_max_tokens: int = 512,
chunk_max_tokens: int = DEFAULT_CHUNK_TOKENS,
preserve_code_blocks: bool = True,
chunk_overlap_tokens: int = DEFAULT_CHUNK_OVERLAP_TOKENS,
) -> Path:
"""
Package skill into ZIP file with markdown documentation.

View File

@@ -12,6 +12,7 @@ from pathlib import Path
from typing import Any
from .base import SkillAdaptor, SkillMetadata
from skill_seekers.cli.arguments.common import DEFAULT_CHUNK_TOKENS, DEFAULT_CHUNK_OVERLAP_TOKENS
class OpenAIAdaptor(SkillAdaptor):
@@ -108,8 +109,9 @@ Always prioritize accuracy by consulting the attached documentation files before
skill_dir: Path,
output_path: Path,
enable_chunking: bool = False,
chunk_max_tokens: int = 512,
chunk_max_tokens: int = DEFAULT_CHUNK_TOKENS,
preserve_code_blocks: bool = True,
chunk_overlap_tokens: int = DEFAULT_CHUNK_OVERLAP_TOKENS,
) -> Path:
"""
Package skill into ZIP file for OpenAI Assistants.

View File

@@ -0,0 +1,405 @@
#!/usr/bin/env python3
"""
Pinecone Adaptor
Implements Pinecone vector database format for RAG pipelines.
Converts Skill Seekers documentation into Pinecone-compatible format
with namespace support and batch upsert.
"""
import json
from pathlib import Path
from typing import Any
from .base import SkillAdaptor, SkillMetadata
from skill_seekers.cli.arguments.common import DEFAULT_CHUNK_TOKENS, DEFAULT_CHUNK_OVERLAP_TOKENS
# Pinecone metadata value limit: 40 KB per vector
PINECONE_METADATA_BYTES_LIMIT = 40_000
class PineconeAdaptor(SkillAdaptor):
"""
Pinecone vector database adaptor.
Handles:
- Pinecone-compatible vector format with metadata
- Namespace support for multi-tenant indexing
- Batch upsert (100 vectors per batch)
- OpenAI and sentence-transformers embedding generation
- Metadata truncation to stay within Pinecone's 40KB limit
"""
PLATFORM = "pinecone"
PLATFORM_NAME = "Pinecone (Vector Database)"
DEFAULT_API_ENDPOINT = None
def _generate_id(self, content: str, metadata: dict) -> str:
    """Generate deterministic ID from content and metadata.

    Delegates to the shared base-class helper with hex format, so
    re-packaging identical content yields the same vector ID.
    """
    return self._generate_deterministic_id(content, metadata, format="hex")
def _truncate_text_for_metadata(
self, text: str, max_bytes: int = PINECONE_METADATA_BYTES_LIMIT
) -> str:
"""Truncate text to fit within Pinecone's metadata byte limit.
Pinecone limits metadata to 40KB per vector. This truncates
the text field (largest metadata value) to stay within limits,
leaving room for other metadata fields (~1KB overhead).
Args:
text: Text content to potentially truncate
max_bytes: Maximum bytes for the text field
Returns:
Truncated text that fits within the byte limit
"""
# Reserve ~2KB for other metadata fields
available = max_bytes - 2000
encoded = text.encode("utf-8")
if len(encoded) <= available:
return text
# Truncate at byte boundary, decode safely
truncated = encoded[:available].decode("utf-8", errors="ignore")
return truncated
def _vector_entries(
    self,
    content: str,
    doc_metadata: dict,
    enable_chunking: bool,
    source_file: str,
    **kwargs,
) -> list[dict[str, Any]]:
    """Chunk one document and build Pinecone vector dicts (id + metadata).

    The ``text`` metadata field is truncated to respect Pinecone's
    per-vector metadata size limit. No ``values`` (embeddings) are
    included — they are generated at upload time.
    """
    chunks = self._maybe_chunk_content(
        content,
        doc_metadata,
        enable_chunking=enable_chunking,
        chunk_max_tokens=kwargs.get("chunk_max_tokens", DEFAULT_CHUNK_TOKENS),
        preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
        source_file=source_file,
        chunk_overlap_tokens=kwargs.get(
            "chunk_overlap_tokens", DEFAULT_CHUNK_OVERLAP_TOKENS
        ),
    )
    return [
        {
            "id": self._generate_id(chunk_text, chunk_meta),
            "metadata": {
                **chunk_meta,
                "text": self._truncate_text_for_metadata(chunk_text),
            },
        }
        for chunk_text, chunk_meta in chunks
    ]

def format_skill_md(
    self, skill_dir: Path, metadata: SkillMetadata, enable_chunking: bool = False, **kwargs
) -> str:
    """
    Format skill as JSON for Pinecone ingestion.

    Creates a package with vectors ready for upsert:

    {
        "index_name": "...",
        "namespace": "...",
        "dimension": 1536,
        "metric": "cosine",
        "vectors": [
            {
                "id": "hex-id",
                "metadata": {
                    "text": "content",
                    "source": "...",
                    "category": "...",
                    ...
                }
            }
        ]
    }

    No ``values`` field — embeddings are added at upload time.

    Args:
        skill_dir: Path to skill directory
        metadata: Skill metadata
        enable_chunking: Enable intelligent chunking for large documents
        **kwargs: Additional chunking parameters

    Returns:
        JSON string containing Pinecone-compatible data
    """
    vectors: list[dict[str, Any]] = []

    # Convert SKILL.md (main documentation)
    skill_md_path = skill_dir / "SKILL.md"
    if skill_md_path.exists():
        content = self._read_existing_content(skill_dir)
        if content.strip():
            doc_metadata = {
                "source": metadata.name,
                "category": "overview",
                "file": "SKILL.md",
                "type": "documentation",
                "version": metadata.version,
                "doc_version": metadata.doc_version,
            }
            vectors.extend(
                self._vector_entries(
                    content, doc_metadata, enable_chunking, "SKILL.md", **kwargs
                )
            )

    # Convert all reference files
    for ref_file, ref_content in self._iterate_references(skill_dir):
        if ref_content.strip():
            category = ref_file.stem.replace("_", " ").lower()
            doc_metadata = {
                "source": metadata.name,
                "category": category,
                "file": ref_file.name,
                "type": "reference",
                "version": metadata.version,
                "doc_version": metadata.doc_version,
            }
            vectors.extend(
                self._vector_entries(
                    ref_content, doc_metadata, enable_chunking, ref_file.name, **kwargs
                )
            )

    # Normalize name to lowercase-with-hyphens for index/namespace naming
    index_name = metadata.name.replace("_", "-").lower()
    return json.dumps(
        {
            "index_name": index_name,
            "namespace": index_name,
            "dimension": 1536,
            "metric": "cosine",
            "vectors": vectors,
        },
        indent=2,
        ensure_ascii=False,
    )
def package(
    self,
    skill_dir: Path,
    output_path: Path,
    enable_chunking: bool = False,
    chunk_max_tokens: int = DEFAULT_CHUNK_TOKENS,
    preserve_code_blocks: bool = True,
    chunk_overlap_tokens: int = DEFAULT_CHUNK_OVERLAP_TOKENS,
) -> Path:
    """
    Package skill into JSON file for Pinecone.

    Creates a JSON file containing vectors with metadata, ready for
    embedding generation and upsert to a Pinecone index.

    Args:
        skill_dir: Path to skill directory
        output_path: Output path/filename for JSON file
        enable_chunking: Enable intelligent chunking for large documents
        chunk_max_tokens: Maximum tokens per chunk
            (default: DEFAULT_CHUNK_TOKENS)
        preserve_code_blocks: Preserve code blocks during chunking
        chunk_overlap_tokens: Token overlap between adjacent chunks
            (default: DEFAULT_CHUNK_OVERLAP_TOKENS)

    Returns:
        Path to created JSON file
    """
    skill_dir = Path(skill_dir)
    output_path = self._format_output_path(skill_dir, Path(output_path), "-pinecone.json")
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Read metadata from SKILL.md frontmatter
    metadata = self._build_skill_metadata(skill_dir)

    pinecone_json = self.format_skill_md(
        skill_dir,
        metadata,
        enable_chunking=enable_chunking,
        chunk_max_tokens=chunk_max_tokens,
        preserve_code_blocks=preserve_code_blocks,
        chunk_overlap_tokens=chunk_overlap_tokens,
    )
    output_path.write_text(pinecone_json, encoding="utf-8")

    # Summary output (plain string — no placeholders, so no f-prefix)
    print("\n✅ Pinecone data packaged successfully!")
    print(f"📦 Output: {output_path}")
    data = json.loads(pinecone_json)
    print(f"📊 Total vectors: {len(data['vectors'])}")
    print(f"🗂️ Index name: {data['index_name']}")
    print(f"📁 Namespace: {data['namespace']}")
    print(f"📐 Default dimension: {data['dimension']} (auto-detected at upload time)")

    # Show category breakdown
    categories: dict[str, int] = {}
    for vec in data["vectors"]:
        cat = vec["metadata"].get("category", "unknown")
        categories[cat] = categories.get(cat, 0) + 1
    print("📁 Categories:")
    for cat, count in sorted(categories.items()):
        print(f"  - {cat}: {count}")

    return output_path
def upload(self, package_path: Path, api_key: str | None = None, **kwargs) -> dict[str, Any]:
"""
Upload packaged skill to Pinecone.
Args:
package_path: Path to packaged JSON
api_key: Pinecone API key (or uses PINECONE_API_KEY env var)
**kwargs:
index_name: Override index name from JSON
namespace: Override namespace from JSON
dimension: Embedding dimension (default: 1536)
metric: Distance metric (default: "cosine")
embedding_function: "openai" or "sentence-transformers"
cloud: Cloud provider (default: "aws")
region: Cloud region (default: "us-east-1")
Returns:
{"success": bool, "index": str, "namespace": str, "count": int}
"""
import os
try:
from pinecone import Pinecone, ServerlessSpec
except (ImportError, Exception):
return {
"success": False,
"message": "pinecone not installed. Run: pip install 'pinecone>=5.0.0'",
}
api_key = api_key or os.getenv("PINECONE_API_KEY")
if not api_key:
return {
"success": False,
"message": ("PINECONE_API_KEY not set. Set via env var or pass api_key parameter."),
}
# Load package
with open(package_path) as f:
data = json.load(f)
index_name = kwargs.get("index_name", data.get("index_name", "skill-docs"))
namespace = kwargs.get("namespace", data.get("namespace", ""))
metric = kwargs.get("metric", data.get("metric", "cosine"))
cloud = kwargs.get("cloud", "aws")
region = kwargs.get("region", "us-east-1")
# Auto-detect dimension from embedding model
embedding_function = kwargs.get("embedding_function", "openai")
EMBEDDING_DIMENSIONS = {
"openai": 1536, # text-embedding-3-small
"sentence-transformers": 384, # all-MiniLM-L6-v2
}
# Priority: explicit kwarg > model-based auto-detect > JSON file > fallback
# Note: format_skill_md() hardcodes dimension=1536 in the JSON, so we must
# give EMBEDDING_DIMENSIONS priority over the file to handle sentence-transformers (384).
dimension = kwargs.get(
"dimension",
EMBEDDING_DIMENSIONS.get(embedding_function, data.get("dimension", 1536)),
)
try:
# Generate embeddings FIRST — before creating the index.
# This avoids leaving an empty Pinecone index behind when
# embedding generation fails (e.g. missing API key).
texts = [vec["metadata"]["text"] for vec in data["vectors"]]
if embedding_function == "openai":
embeddings = self._generate_openai_embeddings(texts)
elif embedding_function == "sentence-transformers":
embeddings = self._generate_st_embeddings(texts)
else:
return {
"success": False,
"message": f"Unknown embedding_function: {embedding_function}. Use 'openai' or 'sentence-transformers'.",
}
pc = Pinecone(api_key=api_key)
# Create index if it doesn't exist
existing_indexes = [idx.name for idx in pc.list_indexes()]
if index_name not in existing_indexes:
print(
f"🔧 Creating Pinecone index: {index_name} (dimension={dimension}, metric={metric})"
)
pc.create_index(
name=index_name,
dimension=dimension,
metric=metric,
spec=ServerlessSpec(cloud=cloud, region=region),
)
print(f"✅ Index '{index_name}' created")
else:
print(f" Using existing index: {index_name}")
index = pc.Index(index_name)
# Batch upsert (100 per batch — Pinecone recommendation)
batch_size = 100
vectors_to_upsert = []
for i, vec in enumerate(data["vectors"]):
vectors_to_upsert.append(
{
"id": vec["id"],
"values": embeddings[i],
"metadata": vec["metadata"],
}
)
total = len(vectors_to_upsert)
print(f"🔄 Upserting {total} vectors to Pinecone...")
for i in range(0, total, batch_size):
batch = vectors_to_upsert[i : i + batch_size]
index.upsert(vectors=batch, namespace=namespace)
print(f" ✓ Upserted {min(i + batch_size, total)}/{total}")
print(f"✅ Uploaded {total} vectors to Pinecone index '{index_name}'")
return {
"success": True,
"message": f"Uploaded {total} vectors to Pinecone index '{index_name}' (namespace: '{namespace}')",
"url": None,
"index": index_name,
"namespace": namespace,
"count": total,
}
except Exception as e:
return {"success": False, "message": f"Pinecone upload failed: {e}"}
def validate_api_key(self, _api_key: str) -> bool:
    """Packaging for Pinecone never requires an API key, so always report False."""
    return False
def get_env_var_name(self) -> str:
    """Name of the environment variable that holds the Pinecone API key."""
    return "PINECONE_API_KEY"
def supports_enhancement(self) -> bool:
    """AI enhancement is not available for the Pinecone packaging target."""
    return False
def enhance(self, _skill_dir: Path, _api_key: str) -> bool:
    """Refuse enhancement for Pinecone and print the recommended workflow."""
    guidance = (
        "❌ Pinecone format does not support enhancement",
        " Enhance before packaging:",
        " skill-seekers enhance output/skill/ --mode LOCAL",
        " skill-seekers package output/skill/ --target pinecone",
    )
    for line in guidance:
        print(line)
    return False

View File

@@ -11,6 +11,7 @@ from pathlib import Path
from typing import Any
from .base import SkillAdaptor, SkillMetadata
from skill_seekers.cli.arguments.common import DEFAULT_CHUNK_TOKENS, DEFAULT_CHUNK_OVERLAP_TOKENS
class QdrantAdaptor(SkillAdaptor):
@@ -76,6 +77,7 @@ class QdrantAdaptor(SkillAdaptor):
"file": "SKILL.md",
"type": "documentation",
"version": metadata.version,
"doc_version": metadata.doc_version,
}
# Chunk if enabled
@@ -83,9 +85,12 @@ class QdrantAdaptor(SkillAdaptor):
content,
payload_meta,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
chunk_max_tokens=kwargs.get("chunk_max_tokens", DEFAULT_CHUNK_TOKENS),
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
source_file="SKILL.md",
chunk_overlap_tokens=kwargs.get(
"chunk_overlap_tokens", DEFAULT_CHUNK_OVERLAP_TOKENS
),
)
# Add all chunks as points
@@ -109,6 +114,7 @@ class QdrantAdaptor(SkillAdaptor):
"file": chunk_meta.get("file", "SKILL.md"),
"type": chunk_meta.get("type", "documentation"),
"version": chunk_meta.get("version", metadata.version),
"doc_version": chunk_meta.get("doc_version", ""),
},
}
)
@@ -124,6 +130,7 @@ class QdrantAdaptor(SkillAdaptor):
"file": ref_file.name,
"type": "reference",
"version": metadata.version,
"doc_version": metadata.doc_version,
}
# Chunk if enabled
@@ -131,9 +138,12 @@ class QdrantAdaptor(SkillAdaptor):
ref_content,
payload_meta,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
chunk_max_tokens=kwargs.get("chunk_max_tokens", DEFAULT_CHUNK_TOKENS),
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
source_file=ref_file.name,
chunk_overlap_tokens=kwargs.get(
"chunk_overlap_tokens", DEFAULT_CHUNK_OVERLAP_TOKENS
),
)
# Add all chunks as points
@@ -157,6 +167,7 @@ class QdrantAdaptor(SkillAdaptor):
"file": chunk_meta.get("file", ref_file.name),
"type": chunk_meta.get("type", "reference"),
"version": chunk_meta.get("version", metadata.version),
"doc_version": chunk_meta.get("doc_version", ""),
},
}
)
@@ -189,8 +200,9 @@ class QdrantAdaptor(SkillAdaptor):
skill_dir: Path,
output_path: Path,
enable_chunking: bool = False,
chunk_max_tokens: int = 512,
chunk_max_tokens: int = DEFAULT_CHUNK_TOKENS,
preserve_code_blocks: bool = True,
chunk_overlap_tokens: int = DEFAULT_CHUNK_OVERLAP_TOKENS,
) -> Path:
"""
Package skill into JSON file for Qdrant.
@@ -211,11 +223,8 @@ class QdrantAdaptor(SkillAdaptor):
output_path.parent.mkdir(parents=True, exist_ok=True)
# Read metadata
metadata = SkillMetadata(
name=skill_dir.name,
description=f"Qdrant data for {skill_dir.name}",
version="1.0.0",
)
# Read metadata from SKILL.md frontmatter
metadata = self._build_skill_metadata(skill_dir)
# Generate Qdrant data
qdrant_json = self.format_skill_md(
@@ -224,6 +233,7 @@ class QdrantAdaptor(SkillAdaptor):
enable_chunking=enable_chunking,
chunk_max_tokens=chunk_max_tokens,
preserve_code_blocks=preserve_code_blocks,
chunk_overlap_tokens=chunk_overlap_tokens,
)
# Write to file

View File

@@ -11,6 +11,7 @@ from pathlib import Path
from typing import Any
from .base import SkillAdaptor, SkillMetadata
from skill_seekers.cli.arguments.common import DEFAULT_CHUNK_TOKENS, DEFAULT_CHUNK_OVERLAP_TOKENS
class WeaviateAdaptor(SkillAdaptor):
@@ -96,7 +97,14 @@ class WeaviateAdaptor(SkillAdaptor):
{
"name": "version",
"dataType": ["text"],
"description": "Documentation version",
"description": "Skill package version",
"indexFilterable": True,
"indexSearchable": False,
},
{
"name": "doc_version",
"dataType": ["text"],
"description": "Documentation version (e.g., 16.2)",
"indexFilterable": True,
"indexSearchable": False,
},
@@ -137,6 +145,7 @@ class WeaviateAdaptor(SkillAdaptor):
"file": "SKILL.md",
"type": "documentation",
"version": metadata.version,
"doc_version": metadata.doc_version,
}
# Chunk if enabled
@@ -144,9 +153,12 @@ class WeaviateAdaptor(SkillAdaptor):
content,
obj_metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
chunk_max_tokens=kwargs.get("chunk_max_tokens", DEFAULT_CHUNK_TOKENS),
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
source_file="SKILL.md",
chunk_overlap_tokens=kwargs.get(
"chunk_overlap_tokens", DEFAULT_CHUNK_OVERLAP_TOKENS
),
)
# Add all chunks as objects
@@ -161,6 +173,7 @@ class WeaviateAdaptor(SkillAdaptor):
"file": chunk_meta.get("file", "SKILL.md"),
"type": chunk_meta.get("type", "documentation"),
"version": chunk_meta.get("version", metadata.version),
"doc_version": chunk_meta.get("doc_version", ""),
},
}
)
@@ -177,6 +190,7 @@ class WeaviateAdaptor(SkillAdaptor):
"file": ref_file.name,
"type": "reference",
"version": metadata.version,
"doc_version": metadata.doc_version,
}
# Chunk if enabled
@@ -184,9 +198,12 @@ class WeaviateAdaptor(SkillAdaptor):
ref_content,
obj_metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
chunk_max_tokens=kwargs.get("chunk_max_tokens", DEFAULT_CHUNK_TOKENS),
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
source_file=ref_file.name,
chunk_overlap_tokens=kwargs.get(
"chunk_overlap_tokens", DEFAULT_CHUNK_OVERLAP_TOKENS
),
)
# Add all chunks as objects
@@ -201,6 +218,7 @@ class WeaviateAdaptor(SkillAdaptor):
"file": chunk_meta.get("file", ref_file.name),
"type": chunk_meta.get("type", "reference"),
"version": chunk_meta.get("version", metadata.version),
"doc_version": chunk_meta.get("doc_version", ""),
},
}
)
@@ -221,8 +239,9 @@ class WeaviateAdaptor(SkillAdaptor):
skill_dir: Path,
output_path: Path,
enable_chunking: bool = False,
chunk_max_tokens: int = 512,
chunk_max_tokens: int = DEFAULT_CHUNK_TOKENS,
preserve_code_blocks: bool = True,
chunk_overlap_tokens: int = DEFAULT_CHUNK_OVERLAP_TOKENS,
) -> Path:
"""
Package skill into JSON file for Weaviate.
@@ -245,12 +264,8 @@ class WeaviateAdaptor(SkillAdaptor):
output_path = self._format_output_path(skill_dir, Path(output_path), "-weaviate.json")
output_path.parent.mkdir(parents=True, exist_ok=True)
# Read metadata
metadata = SkillMetadata(
name=skill_dir.name,
description=f"Weaviate objects for {skill_dir.name}",
version="1.0.0",
)
# Read metadata from SKILL.md frontmatter
metadata = self._build_skill_metadata(skill_dir)
# Generate Weaviate objects
weaviate_json = self.format_skill_md(
@@ -259,6 +274,7 @@ class WeaviateAdaptor(SkillAdaptor):
enable_chunking=enable_chunking,
chunk_max_tokens=chunk_max_tokens,
preserve_code_blocks=preserve_code_blocks,
chunk_overlap_tokens=chunk_overlap_tokens,
)
# Write to file
@@ -288,7 +304,7 @@ class WeaviateAdaptor(SkillAdaptor):
return output_path
def upload(self, package_path: Path, api_key: str = None, **kwargs) -> dict[str, Any]:
def upload(self, package_path: Path, api_key: str | None = None, **kwargs) -> dict[str, Any]:
"""
Upload packaged skill to Weaviate.
@@ -382,31 +398,20 @@ class WeaviateAdaptor(SkillAdaptor):
print(f" ✓ Uploaded {i + 1}/{len(data['objects'])} objects")
elif embedding_function == "sentence-transformers":
# Use sentence-transformers
print("🔄 Generating sentence-transformer embeddings and uploading...")
try:
from sentence_transformers import SentenceTransformer
# Use sentence-transformers (via shared base method)
contents = [obj["properties"]["content"] for obj in data["objects"]]
embeddings = self._generate_st_embeddings(contents)
model = SentenceTransformer("all-MiniLM-L6-v2")
contents = [obj["properties"]["content"] for obj in data["objects"]]
embeddings = model.encode(contents, show_progress_bar=True).tolist()
for i, obj in enumerate(data["objects"]):
batch.add_data_object(
data_object=obj["properties"],
class_name=data["class_name"],
uuid=obj["id"],
vector=embeddings[i],
)
for i, obj in enumerate(data["objects"]):
batch.add_data_object(
data_object=obj["properties"],
class_name=data["class_name"],
uuid=obj["id"],
vector=embeddings[i],
)
if (i + 1) % 100 == 0:
print(f" ✓ Uploaded {i + 1}/{len(data['objects'])} objects")
except ImportError:
return {
"success": False,
"message": "sentence-transformers not installed. Run: pip install sentence-transformers",
}
if (i + 1) % 100 == 0:
print(f" ✓ Uploaded {i + 1}/{len(data['objects'])} objects")
else:
# No embeddings - Weaviate will use its configured vectorizer
@@ -427,61 +432,16 @@ class WeaviateAdaptor(SkillAdaptor):
return {
"success": True,
"message": f"Uploaded {count} objects to Weaviate class '{data['class_name']}'",
"url": None,
"class_name": data["class_name"],
"count": count,
}
except ImportError as e:
return {"success": False, "message": str(e)}
except Exception as e:
return {"success": False, "message": f"Upload failed: {e}"}
def _generate_openai_embeddings(
self, documents: list[str], api_key: str = None
) -> list[list[float]]:
"""
Generate embeddings using OpenAI API.
Args:
documents: List of document texts
api_key: OpenAI API key (or uses OPENAI_API_KEY env var)
Returns:
List of embedding vectors
"""
import os
try:
from openai import OpenAI
except ImportError:
raise ImportError("openai not installed. Run: pip install openai") from None
api_key = api_key or os.getenv("OPENAI_API_KEY")
if not api_key:
raise ValueError("OPENAI_API_KEY not set. Set via env var or --openai-api-key")
client = OpenAI(api_key=api_key)
# Batch process (OpenAI allows up to 2048 inputs)
embeddings = []
batch_size = 100
print(f" Generating embeddings for {len(documents)} documents...")
for i in range(0, len(documents), batch_size):
batch = documents[i : i + batch_size]
try:
response = client.embeddings.create(
input=batch,
model="text-embedding-3-small", # Cheapest, fastest
)
embeddings.extend([item.embedding for item in response.data])
print(
f" ✓ Generated {min(i + batch_size, len(documents))}/{len(documents)} embeddings"
)
except Exception as e:
raise Exception(f"OpenAI embedding generation failed: {e}") from e
return embeddings
def validate_api_key(self, _api_key: str) -> bool:
"""
Weaviate format doesn't use API keys for packaging.

View File

@@ -15,6 +15,10 @@ Hierarchy:
import argparse
from typing import Any
# Default chunking constants used by RAG and package arguments
DEFAULT_CHUNK_TOKENS = 512
DEFAULT_CHUNK_OVERLAP_TOKENS = 50
# Common argument definitions as data structure
# These are arguments that appear in MULTIPLE commands
COMMON_ARGUMENTS: dict[str, dict[str, Any]] = {
@@ -64,6 +68,15 @@ COMMON_ARGUMENTS: dict[str, dict[str, Any]] = {
"metavar": "KEY",
},
},
"doc_version": {
"flags": ("--doc-version",),
"kwargs": {
"type": str,
"default": "",
"help": "Documentation version tag for RAG metadata (e.g., '16.2')",
"metavar": "VERSION",
},
},
}
# Behavior arguments — runtime flags shared by every scraper
@@ -105,18 +118,18 @@ RAG_ARGUMENTS: dict[str, dict[str, Any]] = {
"flags": ("--chunk-tokens",),
"kwargs": {
"type": int,
"default": 512,
"default": DEFAULT_CHUNK_TOKENS,
"metavar": "TOKENS",
"help": "Chunk size in tokens for RAG (default: 512)",
"help": f"Chunk size in tokens for RAG (default: {DEFAULT_CHUNK_TOKENS})",
},
},
"chunk_overlap_tokens": {
"flags": ("--chunk-overlap-tokens",),
"kwargs": {
"type": int,
"default": 50,
"default": DEFAULT_CHUNK_OVERLAP_TOKENS,
"metavar": "TOKENS",
"help": "Overlap between chunks in tokens (default: 50)",
"help": f"Overlap between chunks in tokens (default: {DEFAULT_CHUNK_OVERLAP_TOKENS})",
},
},
}

View File

@@ -153,6 +153,15 @@ UNIVERSAL_ARGUMENTS: dict[str, dict[str, Any]] = {
"metavar": "PATH",
},
},
"doc_version": {
"flags": ("--doc-version",),
"kwargs": {
"type": str,
"default": "",
"help": "Documentation version tag for RAG metadata (e.g., '16.2')",
"metavar": "VERSION",
},
},
}
# Merge RAG arguments from common.py into universal arguments
@@ -655,3 +664,11 @@ def add_create_arguments(parser: argparse.ArgumentParser, mode: str = "default")
if mode in ["advanced", "all"]:
for arg_name, arg_def in ADVANCED_ARGUMENTS.items():
parser.add_argument(*arg_def["flags"], **arg_def["kwargs"])
# Deprecated alias for backward compatibility (removed in v4.0.0)
parser.add_argument(
"--no-preserve-code",
dest="no_preserve_code_blocks",
action="store_true",
help=argparse.SUPPRESS,
)

View File

@@ -8,6 +8,8 @@ import and use these definitions.
import argparse
from typing import Any
from .common import DEFAULT_CHUNK_TOKENS, DEFAULT_CHUNK_OVERLAP_TOKENS
PACKAGE_ARGUMENTS: dict[str, dict[str, Any]] = {
# Positional argument
"skill_directory": {
@@ -49,6 +51,7 @@ PACKAGE_ARGUMENTS: dict[str, dict[str, Any]] = {
"chroma",
"faiss",
"qdrant",
"pinecone",
],
"default": "claude",
"help": "Target LLM platform (default: claude)",
@@ -109,13 +112,22 @@ PACKAGE_ARGUMENTS: dict[str, dict[str, Any]] = {
"flags": ("--chunk-tokens",),
"kwargs": {
"type": int,
"default": 512,
"help": "Maximum tokens per chunk (default: 512)",
"default": DEFAULT_CHUNK_TOKENS,
"help": f"Maximum tokens per chunk (default: {DEFAULT_CHUNK_TOKENS})",
"metavar": "N",
},
},
"no_preserve_code": {
"flags": ("--no-preserve-code",),
"chunk_overlap_tokens": {
"flags": ("--chunk-overlap-tokens",),
"kwargs": {
"type": int,
"default": DEFAULT_CHUNK_OVERLAP_TOKENS,
"help": f"Overlap between chunks in tokens (default: {DEFAULT_CHUNK_OVERLAP_TOKENS})",
"metavar": "N",
},
},
"no_preserve_code_blocks": {
"flags": ("--no-preserve-code-blocks",),
"kwargs": {
"action": "store_true",
"help": "Allow code block splitting (default: code blocks preserved)",
@@ -130,3 +142,11 @@ def add_package_arguments(parser: argparse.ArgumentParser) -> None:
flags = arg_def["flags"]
kwargs = arg_def["kwargs"]
parser.add_argument(*flags, **kwargs)
# Deprecated alias for backward compatibility (removed in v4.0.0)
parser.add_argument(
"--no-preserve-code",
dest="no_preserve_code_blocks",
action="store_true",
help=argparse.SUPPRESS,
)

View File

@@ -172,6 +172,14 @@ def add_scrape_arguments(parser: argparse.ArgumentParser) -> None:
kwargs = arg_def["kwargs"]
parser.add_argument(*flags, **kwargs)
# Deprecated alias for backward compatibility (removed in v4.0.0)
parser.add_argument(
"--no-preserve-code",
dest="no_preserve_code_blocks",
action="store_true",
help=argparse.SUPPRESS,
)
def get_scrape_argument_names() -> set:
"""Get the set of scrape argument destination names.

View File

@@ -1057,6 +1057,7 @@ def analyze_codebase(
enhance_level: int = 0,
skill_name: str | None = None,
skill_description: str | None = None,
doc_version: str = "",
) -> dict[str, Any]:
"""
Analyze local codebase and extract code knowledge.
@@ -1603,6 +1604,7 @@ def analyze_codebase(
docs_data=docs_data,
skill_name=skill_name,
skill_description=skill_description,
doc_version=doc_version,
)
return results
@@ -1622,6 +1624,7 @@ def _generate_skill_md(
docs_data: dict[str, Any] | None = None,
skill_name: str | None = None,
skill_description: str | None = None,
doc_version: str = "",
):
"""
Generate rich SKILL.md from codebase analysis results.
@@ -1657,6 +1660,7 @@ def _generate_skill_md(
skill_content = f"""---
name: {skill_name}
description: {description}
doc_version: {doc_version}
---
# {repo_name} Codebase
@@ -2197,13 +2201,11 @@ def _generate_references(output_dir: Path):
if source_dir.exists() and source_dir.is_dir():
# Copy directory to references/ (not symlink, for portability)
if target_dir.exists():
import shutil
shutil.rmtree(target_dir)
import shutil
if target_dir.exists():
shutil.rmtree(target_dir)
shutil.copytree(source_dir, target_dir)
logger.debug(f"Copied {source} → references/{target}")
@@ -2451,6 +2453,7 @@ Examples:
enhance_level=args.enhance_level, # AI enhancement level (0-3)
skill_name=getattr(args, "name", None),
skill_description=getattr(args, "description", None),
doc_version=getattr(args, "doc_version", ""),
)
# ============================================================

View File

@@ -13,6 +13,7 @@ from skill_seekers.cli.arguments.create import (
get_compatible_arguments,
get_universal_argument_names,
)
from skill_seekers.cli.arguments.common import DEFAULT_CHUNK_TOKENS, DEFAULT_CHUNK_OVERLAP_TOKENS
logger = logging.getLogger(__name__)
@@ -106,8 +107,8 @@ class CreateCommand:
# Check against common defaults
defaults = {
"max_issues": 100,
"chunk_tokens": 512,
"chunk_overlap_tokens": 50,
"chunk_tokens": DEFAULT_CHUNK_TOKENS,
"chunk_overlap_tokens": DEFAULT_CHUNK_OVERLAP_TOKENS,
"output": None,
}
@@ -162,11 +163,14 @@ class CreateCommand:
# RAG arguments (web scraper only)
if getattr(self.args, "chunk_for_rag", False):
argv.append("--chunk-for-rag")
if getattr(self.args, "chunk_tokens", None) and self.args.chunk_tokens != 512:
if (
getattr(self.args, "chunk_tokens", None)
and self.args.chunk_tokens != DEFAULT_CHUNK_TOKENS
):
argv.extend(["--chunk-tokens", str(self.args.chunk_tokens)])
if (
getattr(self.args, "chunk_overlap_tokens", None)
and self.args.chunk_overlap_tokens != 50
and self.args.chunk_overlap_tokens != DEFAULT_CHUNK_OVERLAP_TOKENS
):
argv.extend(["--chunk-overlap-tokens", str(self.args.chunk_overlap_tokens)])
@@ -479,6 +483,10 @@ class CreateCommand:
if self.args.quiet:
argv.append("--quiet")
# Documentation version metadata
if getattr(self.args, "doc_version", ""):
argv.extend(["--doc-version", self.args.doc_version])
# Enhancement Workflow arguments
if getattr(self.args, "enhance_workflow", None):
for wf in self.args.enhance_workflow:

View File

@@ -1565,9 +1565,11 @@ class DocToSkillConverter:
if len(example_codes) >= 10:
break
doc_version = self.config.get("doc_version", "")
content = f"""---
name: {self.name}
description: {description}
doc_version: {doc_version}
---
# {self.name.title()} Skill
@@ -2103,6 +2105,11 @@ def get_configuration(args: argparse.Namespace) -> dict[str, Any]:
"max_pages": DEFAULT_MAX_PAGES,
}
# Apply CLI override for doc_version (works for all config modes)
cli_doc_version = getattr(args, "doc_version", "")
if cli_doc_version:
config["doc_version"] = cli_doc_version
# Apply CLI overrides for rate limiting
if args.no_rate_limit:
config["rate_limit"] = 0

View File

@@ -367,7 +367,7 @@ class LocalSkillEnhancer:
if line.startswith("#"):
# Found heading - keep it and next 3 lines
chunk = lines[i : min(i + 4, len(lines))]
chunk_chars = sum(len(l) for l in chunk)
chunk_chars = sum(len(line_text) for line_text in chunk)
if current_chars + chunk_chars > max_chars:
break
result.extend(chunk)

View File

@@ -968,10 +968,13 @@ class GitHubToSkillConverter:
# Truncate description to 1024 chars if needed
desc = self.description[:1024] if len(self.description) > 1024 else self.description
doc_version = self.config.get("doc_version", "")
# Build skill content
skill_content = f"""---
name: {skill_name}
description: {desc}
doc_version: {doc_version}
---
# {repo_info.get("name", self.name)}
@@ -1003,10 +1006,11 @@ Use this skill when you need to:
# Repository info
skill_content += "### Repository Info\n"
skill_content += f"- **Homepage:** {repo_info.get('homepage', 'N/A')}\n"
skill_content += f"- **Homepage:** {repo_info.get('homepage') or 'N/A'}\n"
skill_content += f"- **Topics:** {', '.join(repo_info.get('topics', []))}\n"
skill_content += f"- **Open Issues:** {repo_info.get('open_issues', 0)}\n"
skill_content += f"- **Last Updated:** {repo_info.get('updated_at', 'N/A')[:10]}\n\n"
updated_at = repo_info.get("updated_at") or "N/A"
skill_content += f"- **Last Updated:** {updated_at[:10]}\n\n"
# Languages
skill_content += "### Languages\n"
@@ -1101,9 +1105,9 @@ Use this skill when you need to:
lines = []
for release in releases[:3]:
lines.append(
f"- **{release['tag_name']}** ({release['published_at'][:10]}): {release['name']}"
)
published_at = release.get("published_at") or "N/A"
release_name = release.get("name") or release["tag_name"]
lines.append(f"- **{release['tag_name']}** ({published_at[:10]}): {release_name}")
return "\n".join(lines)
@@ -1298,15 +1302,17 @@ Use this skill when you need to:
content += f"## Open Issues ({len(open_issues)})\n\n"
for issue in open_issues:
labels = ", ".join(issue["labels"]) if issue["labels"] else "No labels"
created_at = issue.get("created_at") or "N/A"
content += f"### #{issue['number']}: {issue['title']}\n"
content += f"**Labels:** {labels} | **Created:** {issue['created_at'][:10]}\n"
content += f"**Labels:** {labels} | **Created:** {created_at[:10]}\n"
content += f"[View on GitHub]({issue['url']})\n\n"
content += f"\n## Recently Closed Issues ({len(closed_issues)})\n\n"
for issue in closed_issues:
labels = ", ".join(issue["labels"]) if issue["labels"] else "No labels"
closed_at = issue.get("closed_at") or "N/A"
content += f"### #{issue['number']}: {issue['title']}\n"
content += f"**Labels:** {labels} | **Closed:** {issue['closed_at'][:10]}\n"
content += f"**Labels:** {labels} | **Closed:** {closed_at[:10]}\n"
content += f"[View on GitHub]({issue['url']})\n\n"
issues_path = f"{self.skill_dir}/references/issues.md"
@@ -1323,11 +1329,14 @@ Use this skill when you need to:
)
for release in releases:
content += f"## {release['tag_name']}: {release['name']}\n"
content += f"**Published:** {release['published_at'][:10]}\n"
published_at = release.get("published_at") or "N/A"
release_name = release.get("name") or release["tag_name"]
release_body = release.get("body") or ""
content += f"## {release['tag_name']}: {release_name}\n"
content += f"**Published:** {published_at[:10]}\n"
if release["prerelease"]:
content += "**Pre-release**\n"
content += f"\n{release['body']}\n\n"
content += f"\n{release_body}\n\n"
content += f"[View on GitHub]({release['url']})\n\n---\n\n"
releases_path = f"{self.skill_dir}/references/releases.md"

View File

@@ -325,8 +325,8 @@ def _handle_analyze_command(args: argparse.Namespace) -> int:
if getattr(args, "enhance_stage", None):
for stage in args.enhance_stage:
sys.argv.extend(["--enhance-stage", stage])
if getattr(args, "workflow_var", None):
for var in args.workflow_var:
if getattr(args, "var", None):
for var in args.var:
sys.argv.extend(["--var", var])
if getattr(args, "workflow_dry_run", False):
sys.argv.append("--workflow-dry-run")

View File

@@ -14,6 +14,8 @@ import os
import sys
from pathlib import Path
from skill_seekers.cli.arguments.common import DEFAULT_CHUNK_TOKENS, DEFAULT_CHUNK_OVERLAP_TOKENS
# Import utilities
try:
from quality_checker import SkillQualityChecker, print_report
@@ -45,8 +47,9 @@ def package_skill(
chunk_overlap=200,
batch_size=100,
enable_chunking=False,
chunk_max_tokens=512,
chunk_max_tokens=DEFAULT_CHUNK_TOKENS,
preserve_code_blocks=True,
chunk_overlap_tokens=DEFAULT_CHUNK_OVERLAP_TOKENS,
):
"""
Package a skill directory into platform-specific format
@@ -121,6 +124,7 @@ def package_skill(
"chroma",
"faiss",
"qdrant",
"pinecone",
]
if target in RAG_PLATFORMS and not enable_chunking:
@@ -156,6 +160,7 @@ def package_skill(
enable_chunking=enable_chunking,
chunk_max_tokens=chunk_max_tokens,
preserve_code_blocks=preserve_code_blocks,
chunk_overlap_tokens=chunk_overlap_tokens,
)
else:
package_path = adaptor.package(
@@ -164,6 +169,7 @@ def package_skill(
enable_chunking=enable_chunking,
chunk_max_tokens=chunk_max_tokens,
preserve_code_blocks=preserve_code_blocks,
chunk_overlap_tokens=chunk_overlap_tokens,
)
print(f" Output: {package_path}")
@@ -226,7 +232,8 @@ Examples:
batch_size=args.batch_size,
enable_chunking=args.chunk_for_rag,
chunk_max_tokens=args.chunk_tokens,
preserve_code_blocks=not args.no_preserve_code,
preserve_code_blocks=not args.no_preserve_code_blocks,
chunk_overlap_tokens=args.chunk_overlap_tokens,
)
if not success:

View File

@@ -14,6 +14,8 @@ Usage:
chunks = chunker.chunk_skill(Path("output/react"))
"""
from skill_seekers.cli.arguments.common import DEFAULT_CHUNK_TOKENS, DEFAULT_CHUNK_OVERLAP_TOKENS
import re
from pathlib import Path
import json
@@ -35,8 +37,8 @@ class RAGChunker:
def __init__(
self,
chunk_size: int = 512,
chunk_overlap: int = 50,
chunk_size: int = DEFAULT_CHUNK_TOKENS,
chunk_overlap: int = DEFAULT_CHUNK_OVERLAP_TOKENS,
preserve_code_blocks: bool = True,
preserve_paragraphs: bool = True,
min_chunk_size: int = 100,
@@ -383,9 +385,14 @@ def main():
)
parser.add_argument("skill_dir", type=Path, help="Path to skill directory")
parser.add_argument("--output", "-o", type=Path, help="Output JSON file")
parser.add_argument("--chunk-tokens", type=int, default=512, help="Target chunk size in tokens")
parser.add_argument(
"--chunk-overlap-tokens", type=int, default=50, help="Overlap size in tokens"
"--chunk-tokens", type=int, default=DEFAULT_CHUNK_TOKENS, help="Target chunk size in tokens"
)
parser.add_argument(
"--chunk-overlap-tokens",
type=int,
default=DEFAULT_CHUNK_OVERLAP_TOKENS,
help="Overlap size in tokens",
)
parser.add_argument("--no-code-blocks", action="store_true", help="Don't preserve code blocks")
parser.add_argument("--no-paragraphs", action="store_true", help="Don't preserve paragraphs")

View File

@@ -1296,7 +1296,9 @@ This skill combines knowledge from multiple sources:
f.write(f"- **File**: `{ex.get('file_path', 'N/A')}`\n")
if ex.get("code_snippet"):
lang = ex.get("language", "text")
f.write(f"\n```{lang}\n{ex['code_snippet']}\n```\n") # Full code, no truncation
f.write(
f"\n```{lang}\n{ex['code_snippet']}\n```\n"
) # Full code, no truncation
f.write("\n")
logger.info(f" ✓ Test examples: {total} total, {high_value} high-value")

View File

@@ -79,7 +79,9 @@ class WordToSkillConverter:
self.config = config
self.name = config["name"]
self.docx_path = config.get("docx_path", "")
self.description = config.get("description") or f"Use when referencing {self.name} documentation"
self.description = (
config.get("description") or f"Use when referencing {self.name} documentation"
)
# Paths
self.skill_dir = f"output/{self.name}"
@@ -109,6 +111,9 @@ class WordToSkillConverter:
if not os.path.exists(self.docx_path):
raise FileNotFoundError(f"Word document not found: {self.docx_path}")
if not self.docx_path.lower().endswith(".docx"):
raise ValueError(f"Not a Word document (expected .docx): {self.docx_path}")
# --- Extract metadata via python-docx ---
doc = python_docx.Document(self.docx_path)
core_props = doc.core_properties
@@ -728,12 +733,13 @@ class WordToSkillConverter:
# HTML-to-sections helper (module-level for clarity)
# ---------------------------------------------------------------------------
def _build_section(
section_number: int,
heading: str | None,
heading_level: str | None,
elements: list,
doc,
doc, # noqa: ARG001
) -> dict:
"""Build a section dict from a list of BeautifulSoup elements.
@@ -769,10 +775,7 @@ def _build_section(
# Code blocks
if tag == "pre" or (tag == "code" and elem.find_parent("pre") is None):
code_elem = elem.find("code") if tag == "pre" else elem
if code_elem:
code_text = code_elem.get_text()
else:
code_text = elem.get_text()
code_text = code_elem.get_text() if code_elem else elem.get_text()
code_text = code_text.strip()
if code_text:
@@ -825,8 +828,8 @@ def _build_section(
raw_text = elem.get_text(separator="\n").strip()
# Exclude bullet-point / prose lists (•, *, -)
if raw_text and not re.search(r"^[•\-\*]\s", raw_text, re.MULTILINE):
if _score_code_quality(raw_text) >= 5.5:
quality_score = _score_code_quality(raw_text)
quality_score = _score_code_quality(raw_text)
if quality_score >= 5.5:
code_samples.append(
{"code": raw_text, "language": "", "quality_score": quality_score}
)
@@ -956,7 +959,8 @@ def main():
name = Path(args.from_json).stem.replace("_extracted", "")
config = {
"name": getattr(args, "name", None) or name,
"description": getattr(args, "description", None) or f"Use when referencing {name} documentation",
"description": getattr(args, "description", None)
or f"Use when referencing {name} documentation",
}
try:
converter = WordToSkillConverter(config)
@@ -1044,6 +1048,7 @@ def main():
except Exception as e:
print(f"\n❌ Unexpected error during Word processing: {e}", file=sys.stderr)
import traceback
traceback.print_exc()
sys.exit(1)