Merge branch 'development' into feature/video-scraper-pipeline

Sync with latest development changes including ruff formatting,
bug fixes, and pinecone adaptor additions.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
yusyus
2026-03-01 11:38:45 +03:00
43 changed files with 1988 additions and 261 deletions

View File

@@ -64,6 +64,11 @@ try:
except ImportError:
HaystackAdaptor = None
try:
from .pinecone_adaptor import PineconeAdaptor
except ImportError:
PineconeAdaptor = None
# Registry of available adaptors
ADAPTORS: dict[str, type[SkillAdaptor]] = {}
@@ -91,6 +96,8 @@ if QdrantAdaptor:
ADAPTORS["qdrant"] = QdrantAdaptor
if HaystackAdaptor:
ADAPTORS["haystack"] = HaystackAdaptor
if PineconeAdaptor:
ADAPTORS["pinecone"] = PineconeAdaptor
def get_adaptor(platform: str, config: dict = None) -> SkillAdaptor:

View File

@@ -11,6 +11,8 @@ from dataclasses import dataclass, field
from pathlib import Path
from typing import Any
from skill_seekers.cli.arguments.common import DEFAULT_CHUNK_TOKENS, DEFAULT_CHUNK_OVERLAP_TOKENS
@dataclass
class SkillMetadata:
@@ -19,6 +21,7 @@ class SkillMetadata:
name: str
description: str
version: str = "1.0.0"
doc_version: str = "" # Documentation version (e.g., "16.2") for RAG metadata filtering
author: str | None = None
tags: list[str] = field(default_factory=list)
@@ -73,8 +76,9 @@ class SkillAdaptor(ABC):
skill_dir: Path,
output_path: Path,
enable_chunking: bool = False,
chunk_max_tokens: int = 512,
chunk_max_tokens: int = DEFAULT_CHUNK_TOKENS,
preserve_code_blocks: bool = True,
chunk_overlap_tokens: int = DEFAULT_CHUNK_OVERLAP_TOKENS,
) -> Path:
"""
Package skill for platform (ZIP, tar.gz, etc.).
@@ -228,6 +232,47 @@ class SkillAdaptor(ABC):
return skill_md_path.read_text(encoding="utf-8")
def _read_frontmatter(self, skill_dir: Path) -> dict[str, str]:
    """Read YAML frontmatter from SKILL.md.

    Only flat ``key: value`` pairs are supported (no nested YAML).
    Matching surrounding quotes are stripped from values so quoted
    YAML scalars (e.g. ``description: "foo"``) parse as ``foo``, and
    YAML comment lines (``# ...``) are skipped.

    Args:
        skill_dir: Path to skill directory

    Returns:
        Dict of key-value pairs from the frontmatter block, or an
        empty dict when no frontmatter is present.
    """
    content = self._read_skill_md(skill_dir)
    if content.startswith("---"):
        parts = content.split("---", 2)
        if len(parts) >= 3:
            frontmatter: dict[str, str] = {}
            for line in parts[1].strip().splitlines():
                stripped = line.strip()
                # Skip YAML comments and lines without a key-value separator
                if stripped.startswith("#") or ":" not in stripped:
                    continue
                key, _, value = stripped.partition(":")
                value = value.strip()
                # Strip one pair of matching surrounding quotes
                if len(value) >= 2 and value[0] == value[-1] and value[0] in ("'", '"'):
                    value = value[1:-1]
                frontmatter[key.strip()] = value
            return frontmatter
    return {}
def _build_skill_metadata(self, skill_dir: Path) -> SkillMetadata:
    """Build SkillMetadata from SKILL.md frontmatter.

    Populates description, version, and doc_version from the parsed
    frontmatter instead of hardcoded defaults; the skill name always
    comes from the directory name.

    Args:
        skill_dir: Path to skill directory

    Returns:
        SkillMetadata populated from frontmatter values (with fallbacks
        for any missing keys).
    """
    frontmatter = self._read_frontmatter(skill_dir)
    fallback_description = f"Documentation for {skill_dir.name}"
    return SkillMetadata(
        name=skill_dir.name,
        description=frontmatter.get("description", fallback_description),
        version=frontmatter.get("version", "1.0.0"),
        doc_version=frontmatter.get("doc_version", ""),
    )
def _iterate_references(self, skill_dir: Path):
"""
Iterate over all reference files in skill directory.
@@ -266,6 +311,7 @@ class SkillAdaptor(ABC):
base_meta = {
"source": metadata.name,
"version": metadata.version,
"doc_version": metadata.doc_version,
"description": metadata.description,
}
if metadata.author:
@@ -280,9 +326,10 @@ class SkillAdaptor(ABC):
content: str,
metadata: dict,
enable_chunking: bool = False,
chunk_max_tokens: int = 512,
chunk_max_tokens: int = DEFAULT_CHUNK_TOKENS,
preserve_code_blocks: bool = True,
source_file: str = None,
chunk_overlap_tokens: int = DEFAULT_CHUNK_OVERLAP_TOKENS,
) -> list[tuple[str, dict]]:
"""
Optionally chunk content for RAG platforms.
@@ -321,9 +368,18 @@ class SkillAdaptor(ABC):
return [(content, metadata)]
# RAGChunker uses TOKENS (it converts to chars internally)
# If overlap is at the default value but chunk size was customized,
# scale overlap proportionally (10% of chunk size, min DEFAULT_CHUNK_OVERLAP_TOKENS)
effective_overlap = chunk_overlap_tokens
if (
chunk_overlap_tokens == DEFAULT_CHUNK_OVERLAP_TOKENS
and chunk_max_tokens != DEFAULT_CHUNK_TOKENS
):
effective_overlap = max(DEFAULT_CHUNK_OVERLAP_TOKENS, chunk_max_tokens // 10)
chunker = RAGChunker(
chunk_size=chunk_max_tokens,
chunk_overlap=max(50, chunk_max_tokens // 10), # 10% overlap
chunk_overlap=effective_overlap,
preserve_code_blocks=preserve_code_blocks,
preserve_paragraphs=True,
min_chunk_size=100, # 100 tokens minimum
@@ -433,6 +489,67 @@ class SkillAdaptor(ABC):
# Plain hex digest
return hash_hex
def _generate_openai_embeddings(
    self, documents: list[str], api_key: str | None = None
) -> list[list[float]]:
    """Generate embeddings using OpenAI text-embedding-3-small.

    Args:
        documents: List of document texts
        api_key: OpenAI API key (or uses OPENAI_API_KEY env var)

    Returns:
        List of embedding vectors, one per input document

    Raises:
        ImportError: If the openai package is not installed.
        ValueError: If no API key is available.
        RuntimeError: If an embeddings API call fails.
    """
    import os

    try:
        from openai import OpenAI
    except ImportError:
        raise ImportError("openai not installed. Run: pip install openai") from None

    api_key = api_key or os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise ValueError("OPENAI_API_KEY not set. Set via env var or --openai-api-key")

    client = OpenAI(api_key=api_key)
    embeddings: list[list[float]] = []
    # Batch requests; OpenAI accepts up to 2048 inputs per call, 100 keeps
    # individual requests small and progress output frequent.
    batch_size = 100
    print(f"  Generating OpenAI embeddings for {len(documents)} documents...")
    for i in range(0, len(documents), batch_size):
        batch = documents[i : i + batch_size]
        try:
            response = client.embeddings.create(input=batch, model="text-embedding-3-small")
            embeddings.extend([item.embedding for item in response.data])
            print(f"  ✓ Embedded {min(i + batch_size, len(documents))}/{len(documents)}")
        except Exception as e:
            # RuntimeError instead of bare Exception: callers catching
            # Exception still work, but the type is now meaningful.
            raise RuntimeError(f"OpenAI embedding generation failed: {e}") from e
    return embeddings
def _generate_st_embeddings(self, documents: list[str]) -> list[list[float]]:
    """Generate embeddings using sentence-transformers (all-MiniLM-L6-v2).

    Args:
        documents: List of document texts

    Returns:
        List of embedding vectors, one per input document
    """
    try:
        from sentence_transformers import SentenceTransformer
    except ImportError:
        raise ImportError(
            "sentence-transformers not installed. Run: pip install sentence-transformers"
        ) from None

    print(f"  Generating sentence-transformer embeddings for {len(documents)} documents...")
    model = SentenceTransformer("all-MiniLM-L6-v2")
    vectors = model.encode(documents, show_progress_bar=True)
    # Convert each numpy vector to a plain Python list for JSON serialization
    return [vector.tolist() for vector in vectors]
def _generate_toc(self, skill_dir: Path) -> str:
"""
Helper to generate table of contents from references.

View File

@@ -11,6 +11,7 @@ from pathlib import Path
from typing import Any
from .base import SkillAdaptor, SkillMetadata
from skill_seekers.cli.arguments.common import DEFAULT_CHUNK_TOKENS, DEFAULT_CHUNK_OVERLAP_TOKENS
class ChromaAdaptor(SkillAdaptor):
@@ -79,6 +80,7 @@ class ChromaAdaptor(SkillAdaptor):
"file": "SKILL.md",
"type": "documentation",
"version": metadata.version,
"doc_version": metadata.doc_version,
}
# Chunk if enabled
@@ -86,9 +88,12 @@ class ChromaAdaptor(SkillAdaptor):
content,
doc_metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
chunk_max_tokens=kwargs.get("chunk_max_tokens", DEFAULT_CHUNK_TOKENS),
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
source_file="SKILL.md",
chunk_overlap_tokens=kwargs.get(
"chunk_overlap_tokens", DEFAULT_CHUNK_OVERLAP_TOKENS
),
)
# Add all chunks to parallel arrays
@@ -109,6 +114,7 @@ class ChromaAdaptor(SkillAdaptor):
"file": ref_file.name,
"type": "reference",
"version": metadata.version,
"doc_version": metadata.doc_version,
}
# Chunk if enabled
@@ -116,9 +122,12 @@ class ChromaAdaptor(SkillAdaptor):
ref_content,
doc_metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
chunk_max_tokens=kwargs.get("chunk_max_tokens", DEFAULT_CHUNK_TOKENS),
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
source_file=ref_file.name,
chunk_overlap_tokens=kwargs.get(
"chunk_overlap_tokens", DEFAULT_CHUNK_OVERLAP_TOKENS
),
)
# Add all chunks to parallel arrays
@@ -144,8 +153,9 @@ class ChromaAdaptor(SkillAdaptor):
skill_dir: Path,
output_path: Path,
enable_chunking: bool = False,
chunk_max_tokens: int = 512,
chunk_max_tokens: int = DEFAULT_CHUNK_TOKENS,
preserve_code_blocks: bool = True,
chunk_overlap_tokens: int = DEFAULT_CHUNK_OVERLAP_TOKENS,
) -> Path:
"""
Package skill into JSON file for Chroma.
@@ -166,12 +176,8 @@ class ChromaAdaptor(SkillAdaptor):
output_path = self._format_output_path(skill_dir, Path(output_path), "-chroma.json")
output_path.parent.mkdir(parents=True, exist_ok=True)
# Read metadata
metadata = SkillMetadata(
name=skill_dir.name,
description=f"Chroma collection data for {skill_dir.name}",
version="1.0.0",
)
# Read metadata from SKILL.md frontmatter
metadata = self._build_skill_metadata(skill_dir)
# Generate Chroma data
chroma_json = self.format_skill_md(
@@ -180,6 +186,7 @@ class ChromaAdaptor(SkillAdaptor):
enable_chunking=enable_chunking,
chunk_max_tokens=chunk_max_tokens,
preserve_code_blocks=preserve_code_blocks,
chunk_overlap_tokens=chunk_overlap_tokens,
)
# Write to file
@@ -206,7 +213,7 @@ class ChromaAdaptor(SkillAdaptor):
return output_path
def upload(self, package_path: Path, api_key: str = None, **kwargs) -> dict[str, Any]:
def upload(self, package_path: Path, api_key: str | None = None, **kwargs) -> dict[str, Any]:
"""
Upload packaged skill to ChromaDB.
@@ -250,9 +257,7 @@ class ChromaAdaptor(SkillAdaptor):
print(f"🌐 Connecting to ChromaDB at: {chroma_url}")
# Parse URL
if "://" in chroma_url:
parts = chroma_url.split("://")
parts[0]
host_port = parts[1]
_scheme, host_port = chroma_url.split("://", 1)
else:
host_port = chroma_url
@@ -352,52 +357,6 @@ class ChromaAdaptor(SkillAdaptor):
except Exception as e:
return {"success": False, "message": f"Upload failed: {e}"}
def _generate_openai_embeddings(
self, documents: list[str], api_key: str = None
) -> list[list[float]]:
"""
Generate embeddings using OpenAI API.
Args:
documents: List of document texts
api_key: OpenAI API key (or uses OPENAI_API_KEY env var)
Returns:
List of embedding vectors
"""
import os
try:
from openai import OpenAI
except ImportError:
raise ImportError("openai not installed. Run: pip install openai") from None
api_key = api_key or os.getenv("OPENAI_API_KEY")
if not api_key:
raise ValueError("OPENAI_API_KEY not set. Set via env var or --openai-api-key")
client = OpenAI(api_key=api_key)
# Batch process (OpenAI allows up to 2048 inputs)
embeddings = []
batch_size = 100
print(f" Generating embeddings for {len(documents)} documents...")
for i in range(0, len(documents), batch_size):
batch = documents[i : i + batch_size]
try:
response = client.embeddings.create(
input=batch,
model="text-embedding-3-small", # Cheapest, fastest
)
embeddings.extend([item.embedding for item in response.data])
print(f" ✓ Processed {min(i + batch_size, len(documents))}/{len(documents)}")
except Exception as e:
raise Exception(f"OpenAI embedding generation failed: {e}") from e
return embeddings
def validate_api_key(self, _api_key: str) -> bool:
"""
Chroma format doesn't use API keys for packaging.

View File

@@ -12,6 +12,7 @@ from pathlib import Path
from typing import Any
from .base import SkillAdaptor, SkillMetadata
from skill_seekers.cli.arguments.common import DEFAULT_CHUNK_TOKENS, DEFAULT_CHUNK_OVERLAP_TOKENS
class ClaudeAdaptor(SkillAdaptor):
@@ -86,8 +87,9 @@ version: {metadata.version}
skill_dir: Path,
output_path: Path,
enable_chunking: bool = False,
chunk_max_tokens: int = 512,
chunk_max_tokens: int = DEFAULT_CHUNK_TOKENS,
preserve_code_blocks: bool = True,
chunk_overlap_tokens: int = DEFAULT_CHUNK_OVERLAP_TOKENS,
) -> Path:
"""
Package skill into ZIP file for Claude.

View File

@@ -11,6 +11,7 @@ from pathlib import Path
from typing import Any
from .base import SkillAdaptor, SkillMetadata
from skill_seekers.cli.arguments.common import DEFAULT_CHUNK_TOKENS, DEFAULT_CHUNK_OVERLAP_TOKENS
class FAISSHelpers(SkillAdaptor):
@@ -81,6 +82,7 @@ class FAISSHelpers(SkillAdaptor):
"file": "SKILL.md",
"type": "documentation",
"version": metadata.version,
"doc_version": metadata.doc_version,
}
# Chunk if enabled
@@ -88,9 +90,12 @@ class FAISSHelpers(SkillAdaptor):
content,
doc_metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
chunk_max_tokens=kwargs.get("chunk_max_tokens", DEFAULT_CHUNK_TOKENS),
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
source_file="SKILL.md",
chunk_overlap_tokens=kwargs.get(
"chunk_overlap_tokens", DEFAULT_CHUNK_OVERLAP_TOKENS
),
)
# Add all chunks to parallel arrays
@@ -110,6 +115,7 @@ class FAISSHelpers(SkillAdaptor):
"file": ref_file.name,
"type": "reference",
"version": metadata.version,
"doc_version": metadata.doc_version,
}
# Chunk if enabled
@@ -117,9 +123,12 @@ class FAISSHelpers(SkillAdaptor):
ref_content,
doc_metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
chunk_max_tokens=kwargs.get("chunk_max_tokens", DEFAULT_CHUNK_TOKENS),
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
source_file=ref_file.name,
chunk_overlap_tokens=kwargs.get(
"chunk_overlap_tokens", DEFAULT_CHUNK_OVERLAP_TOKENS
),
)
# Add all chunks to parallel arrays
@@ -155,8 +164,9 @@ class FAISSHelpers(SkillAdaptor):
skill_dir: Path,
output_path: Path,
enable_chunking: bool = False,
chunk_max_tokens: int = 512,
chunk_max_tokens: int = DEFAULT_CHUNK_TOKENS,
preserve_code_blocks: bool = True,
chunk_overlap_tokens: int = DEFAULT_CHUNK_OVERLAP_TOKENS,
) -> Path:
"""
Package skill into JSON file for FAISS.
@@ -176,12 +186,8 @@ class FAISSHelpers(SkillAdaptor):
output_path = self._format_output_path(skill_dir, Path(output_path), "-faiss.json")
output_path.parent.mkdir(parents=True, exist_ok=True)
# Read metadata
metadata = SkillMetadata(
name=skill_dir.name,
description=f"FAISS data for {skill_dir.name}",
version="1.0.0",
)
# Read metadata from SKILL.md frontmatter
metadata = self._build_skill_metadata(skill_dir)
# Generate FAISS data
faiss_json = self.format_skill_md(
@@ -190,6 +196,7 @@ class FAISSHelpers(SkillAdaptor):
enable_chunking=enable_chunking,
chunk_max_tokens=chunk_max_tokens,
preserve_code_blocks=preserve_code_blocks,
chunk_overlap_tokens=chunk_overlap_tokens,
)
# Write to file

View File

@@ -13,6 +13,7 @@ from pathlib import Path
from typing import Any
from .base import SkillAdaptor, SkillMetadata
from skill_seekers.cli.arguments.common import DEFAULT_CHUNK_TOKENS, DEFAULT_CHUNK_OVERLAP_TOKENS
class GeminiAdaptor(SkillAdaptor):
@@ -91,8 +92,9 @@ See the references directory for complete documentation with examples and best p
skill_dir: Path,
output_path: Path,
enable_chunking: bool = False,
chunk_max_tokens: int = 512,
chunk_max_tokens: int = DEFAULT_CHUNK_TOKENS,
preserve_code_blocks: bool = True,
chunk_overlap_tokens: int = DEFAULT_CHUNK_OVERLAP_TOKENS,
) -> Path:
"""
Package skill into tar.gz file for Gemini.

View File

@@ -11,6 +11,7 @@ from pathlib import Path
from typing import Any
from .base import SkillAdaptor, SkillMetadata
from skill_seekers.cli.arguments.common import DEFAULT_CHUNK_TOKENS, DEFAULT_CHUNK_OVERLAP_TOKENS
class HaystackAdaptor(SkillAdaptor):
@@ -62,6 +63,7 @@ class HaystackAdaptor(SkillAdaptor):
"file": "SKILL.md",
"type": "documentation",
"version": metadata.version,
"doc_version": metadata.doc_version,
}
# Chunk if enabled
@@ -69,9 +71,12 @@ class HaystackAdaptor(SkillAdaptor):
content,
doc_meta,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
chunk_max_tokens=kwargs.get("chunk_max_tokens", DEFAULT_CHUNK_TOKENS),
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
source_file="SKILL.md",
chunk_overlap_tokens=kwargs.get(
"chunk_overlap_tokens", DEFAULT_CHUNK_OVERLAP_TOKENS
),
)
# Add all chunks as documents
@@ -95,6 +100,7 @@ class HaystackAdaptor(SkillAdaptor):
"file": ref_file.name,
"type": "reference",
"version": metadata.version,
"doc_version": metadata.doc_version,
}
# Chunk if enabled
@@ -102,9 +108,12 @@ class HaystackAdaptor(SkillAdaptor):
ref_content,
doc_meta,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
chunk_max_tokens=kwargs.get("chunk_max_tokens", DEFAULT_CHUNK_TOKENS),
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
source_file=ref_file.name,
chunk_overlap_tokens=kwargs.get(
"chunk_overlap_tokens", DEFAULT_CHUNK_OVERLAP_TOKENS
),
)
# Add all chunks as documents
@@ -124,8 +133,9 @@ class HaystackAdaptor(SkillAdaptor):
skill_dir: Path,
output_path: Path,
enable_chunking: bool = False,
chunk_max_tokens: int = 512,
chunk_max_tokens: int = DEFAULT_CHUNK_TOKENS,
preserve_code_blocks: bool = True,
chunk_overlap_tokens: int = DEFAULT_CHUNK_OVERLAP_TOKENS,
) -> Path:
"""
Package skill into JSON file for Haystack.
@@ -147,11 +157,8 @@ class HaystackAdaptor(SkillAdaptor):
output_path.parent.mkdir(parents=True, exist_ok=True)
# Read metadata
metadata = SkillMetadata(
name=skill_dir.name,
description=f"Haystack documents for {skill_dir.name}",
version="1.0.0",
)
# Read metadata from SKILL.md frontmatter
metadata = self._build_skill_metadata(skill_dir)
# Generate Haystack documents
documents_json = self.format_skill_md(
@@ -160,6 +167,7 @@ class HaystackAdaptor(SkillAdaptor):
enable_chunking=enable_chunking,
chunk_max_tokens=chunk_max_tokens,
preserve_code_blocks=preserve_code_blocks,
chunk_overlap_tokens=chunk_overlap_tokens,
)
# Write to file

View File

@@ -11,6 +11,7 @@ from pathlib import Path
from typing import Any
from .base import SkillAdaptor, SkillMetadata
from skill_seekers.cli.arguments.common import DEFAULT_CHUNK_TOKENS, DEFAULT_CHUNK_OVERLAP_TOKENS
class LangChainAdaptor(SkillAdaptor):
@@ -62,6 +63,7 @@ class LangChainAdaptor(SkillAdaptor):
"file": "SKILL.md",
"type": "documentation",
"version": metadata.version,
"doc_version": metadata.doc_version,
}
# Chunk if enabled
@@ -69,9 +71,12 @@ class LangChainAdaptor(SkillAdaptor):
content,
doc_metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
chunk_max_tokens=kwargs.get("chunk_max_tokens", DEFAULT_CHUNK_TOKENS),
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
source_file="SKILL.md",
chunk_overlap_tokens=kwargs.get(
"chunk_overlap_tokens", DEFAULT_CHUNK_OVERLAP_TOKENS
),
)
# Add all chunks to documents
@@ -90,6 +95,7 @@ class LangChainAdaptor(SkillAdaptor):
"file": ref_file.name,
"type": "reference",
"version": metadata.version,
"doc_version": metadata.doc_version,
}
# Chunk if enabled
@@ -97,9 +103,12 @@ class LangChainAdaptor(SkillAdaptor):
ref_content,
doc_metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
chunk_max_tokens=kwargs.get("chunk_max_tokens", DEFAULT_CHUNK_TOKENS),
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
source_file=ref_file.name,
chunk_overlap_tokens=kwargs.get(
"chunk_overlap_tokens", DEFAULT_CHUNK_OVERLAP_TOKENS
),
)
# Add all chunks to documents
@@ -114,8 +123,9 @@ class LangChainAdaptor(SkillAdaptor):
skill_dir: Path,
output_path: Path,
enable_chunking: bool = False,
chunk_max_tokens: int = 512,
chunk_max_tokens: int = DEFAULT_CHUNK_TOKENS,
preserve_code_blocks: bool = True,
chunk_overlap_tokens: int = DEFAULT_CHUNK_OVERLAP_TOKENS,
) -> Path:
"""
Package skill into JSON file for LangChain.
@@ -139,12 +149,8 @@ class LangChainAdaptor(SkillAdaptor):
output_path = self._format_output_path(skill_dir, Path(output_path), "-langchain.json")
output_path.parent.mkdir(parents=True, exist_ok=True)
# Read metadata
metadata = SkillMetadata(
name=skill_dir.name,
description=f"LangChain documents for {skill_dir.name}",
version="1.0.0",
)
# Read metadata from SKILL.md frontmatter
metadata = self._build_skill_metadata(skill_dir)
# Generate LangChain documents with chunking
documents_json = self.format_skill_md(
@@ -153,6 +159,7 @@ class LangChainAdaptor(SkillAdaptor):
enable_chunking=enable_chunking,
chunk_max_tokens=chunk_max_tokens,
preserve_code_blocks=preserve_code_blocks,
chunk_overlap_tokens=chunk_overlap_tokens,
)
# Write to file

View File

@@ -11,6 +11,7 @@ from pathlib import Path
from typing import Any
from .base import SkillAdaptor, SkillMetadata
from skill_seekers.cli.arguments.common import DEFAULT_CHUNK_TOKENS, DEFAULT_CHUNK_OVERLAP_TOKENS
class LlamaIndexAdaptor(SkillAdaptor):
@@ -77,6 +78,7 @@ class LlamaIndexAdaptor(SkillAdaptor):
"file": "SKILL.md",
"type": "documentation",
"version": metadata.version,
"doc_version": metadata.doc_version,
}
# Chunk if enabled
@@ -84,9 +86,12 @@ class LlamaIndexAdaptor(SkillAdaptor):
content,
node_metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
chunk_max_tokens=kwargs.get("chunk_max_tokens", DEFAULT_CHUNK_TOKENS),
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
source_file="SKILL.md",
chunk_overlap_tokens=kwargs.get(
"chunk_overlap_tokens", DEFAULT_CHUNK_OVERLAP_TOKENS
),
)
# Add all chunks as nodes
@@ -112,6 +117,7 @@ class LlamaIndexAdaptor(SkillAdaptor):
"file": ref_file.name,
"type": "reference",
"version": metadata.version,
"doc_version": metadata.doc_version,
}
# Chunk if enabled
@@ -119,9 +125,12 @@ class LlamaIndexAdaptor(SkillAdaptor):
ref_content,
node_metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
chunk_max_tokens=kwargs.get("chunk_max_tokens", DEFAULT_CHUNK_TOKENS),
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
source_file=ref_file.name,
chunk_overlap_tokens=kwargs.get(
"chunk_overlap_tokens", DEFAULT_CHUNK_OVERLAP_TOKENS
),
)
# Add all chunks as nodes
@@ -143,8 +152,9 @@ class LlamaIndexAdaptor(SkillAdaptor):
skill_dir: Path,
output_path: Path,
enable_chunking: bool = False,
chunk_max_tokens: int = 512,
chunk_max_tokens: int = DEFAULT_CHUNK_TOKENS,
preserve_code_blocks: bool = True,
chunk_overlap_tokens: int = DEFAULT_CHUNK_OVERLAP_TOKENS,
) -> Path:
"""
Package skill into JSON file for LlamaIndex.
@@ -166,11 +176,8 @@ class LlamaIndexAdaptor(SkillAdaptor):
output_path.parent.mkdir(parents=True, exist_ok=True)
# Read metadata
metadata = SkillMetadata(
name=skill_dir.name,
description=f"LlamaIndex nodes for {skill_dir.name}",
version="1.0.0",
)
# Read metadata from SKILL.md frontmatter
metadata = self._build_skill_metadata(skill_dir)
# Generate LlamaIndex nodes
nodes_json = self.format_skill_md(
@@ -179,6 +186,7 @@ class LlamaIndexAdaptor(SkillAdaptor):
enable_chunking=enable_chunking,
chunk_max_tokens=chunk_max_tokens,
preserve_code_blocks=preserve_code_blocks,
chunk_overlap_tokens=chunk_overlap_tokens,
)
# Write to file

View File

@@ -11,6 +11,7 @@ from pathlib import Path
from typing import Any
from .base import SkillAdaptor, SkillMetadata
from skill_seekers.cli.arguments.common import DEFAULT_CHUNK_TOKENS, DEFAULT_CHUNK_OVERLAP_TOKENS
class MarkdownAdaptor(SkillAdaptor):
@@ -86,8 +87,9 @@ Browse the reference files for detailed information on each topic. All files are
skill_dir: Path,
output_path: Path,
enable_chunking: bool = False,
chunk_max_tokens: int = 512,
chunk_max_tokens: int = DEFAULT_CHUNK_TOKENS,
preserve_code_blocks: bool = True,
chunk_overlap_tokens: int = DEFAULT_CHUNK_OVERLAP_TOKENS,
) -> Path:
"""
Package skill into ZIP file with markdown documentation.

View File

@@ -12,6 +12,7 @@ from pathlib import Path
from typing import Any
from .base import SkillAdaptor, SkillMetadata
from skill_seekers.cli.arguments.common import DEFAULT_CHUNK_TOKENS, DEFAULT_CHUNK_OVERLAP_TOKENS
class OpenAIAdaptor(SkillAdaptor):
@@ -108,8 +109,9 @@ Always prioritize accuracy by consulting the attached documentation files before
skill_dir: Path,
output_path: Path,
enable_chunking: bool = False,
chunk_max_tokens: int = 512,
chunk_max_tokens: int = DEFAULT_CHUNK_TOKENS,
preserve_code_blocks: bool = True,
chunk_overlap_tokens: int = DEFAULT_CHUNK_OVERLAP_TOKENS,
) -> Path:
"""
Package skill into ZIP file for OpenAI Assistants.

View File

@@ -0,0 +1,405 @@
#!/usr/bin/env python3
"""
Pinecone Adaptor
Implements Pinecone vector database format for RAG pipelines.
Converts Skill Seekers documentation into Pinecone-compatible format
with namespace support and batch upsert.
"""
import json
from pathlib import Path
from typing import Any
from .base import SkillAdaptor, SkillMetadata
from skill_seekers.cli.arguments.common import DEFAULT_CHUNK_TOKENS, DEFAULT_CHUNK_OVERLAP_TOKENS
# Pinecone metadata value limit: 40 KB per vector
PINECONE_METADATA_BYTES_LIMIT = 40_000
class PineconeAdaptor(SkillAdaptor):
"""
Pinecone vector database adaptor.
Handles:
- Pinecone-compatible vector format with metadata
- Namespace support for multi-tenant indexing
- Batch upsert (100 vectors per batch)
- OpenAI and sentence-transformers embedding generation
- Metadata truncation to stay within Pinecone's 40KB limit
"""
PLATFORM = "pinecone"
PLATFORM_NAME = "Pinecone (Vector Database)"
DEFAULT_API_ENDPOINT = None
def _generate_id(self, content: str, metadata: dict) -> str:
    """Generate deterministic ID from content and metadata.

    Delegates to the shared base-class helper with hex format, so
    re-packaging identical content yields the same vector ID.
    """
    return self._generate_deterministic_id(content, metadata, format="hex")
def _truncate_text_for_metadata(
self, text: str, max_bytes: int = PINECONE_METADATA_BYTES_LIMIT
) -> str:
"""Truncate text to fit within Pinecone's metadata byte limit.
Pinecone limits metadata to 40KB per vector. This truncates
the text field (largest metadata value) to stay within limits,
leaving room for other metadata fields (~1KB overhead).
Args:
text: Text content to potentially truncate
max_bytes: Maximum bytes for the text field
Returns:
Truncated text that fits within the byte limit
"""
# Reserve ~2KB for other metadata fields
available = max_bytes - 2000
encoded = text.encode("utf-8")
if len(encoded) <= available:
return text
# Truncate at byte boundary, decode safely
truncated = encoded[:available].decode("utf-8", errors="ignore")
return truncated
def _vector_entries(
    self,
    content: str,
    doc_metadata: dict,
    enable_chunking: bool,
    source_file: str,
    **kwargs,
) -> list[dict[str, Any]]:
    """Chunk one document and build Pinecone vector dicts (id + metadata).

    The ``text`` metadata field is truncated to respect Pinecone's
    per-vector metadata size limit. No ``values`` (embeddings) are
    included — they are generated at upload time.
    """
    chunks = self._maybe_chunk_content(
        content,
        doc_metadata,
        enable_chunking=enable_chunking,
        chunk_max_tokens=kwargs.get("chunk_max_tokens", DEFAULT_CHUNK_TOKENS),
        preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
        source_file=source_file,
        chunk_overlap_tokens=kwargs.get(
            "chunk_overlap_tokens", DEFAULT_CHUNK_OVERLAP_TOKENS
        ),
    )
    return [
        {
            "id": self._generate_id(chunk_text, chunk_meta),
            "metadata": {
                **chunk_meta,
                "text": self._truncate_text_for_metadata(chunk_text),
            },
        }
        for chunk_text, chunk_meta in chunks
    ]

def format_skill_md(
    self, skill_dir: Path, metadata: SkillMetadata, enable_chunking: bool = False, **kwargs
) -> str:
    """
    Format skill as JSON for Pinecone ingestion.

    Creates a package with vectors ready for upsert:

    {
        "index_name": "...",
        "namespace": "...",
        "dimension": 1536,
        "metric": "cosine",
        "vectors": [
            {
                "id": "hex-id",
                "metadata": {
                    "text": "content",
                    "source": "...",
                    "category": "...",
                    ...
                }
            }
        ]
    }

    No ``values`` field — embeddings are added at upload time.

    Args:
        skill_dir: Path to skill directory
        metadata: Skill metadata
        enable_chunking: Enable intelligent chunking for large documents
        **kwargs: Additional chunking parameters

    Returns:
        JSON string containing Pinecone-compatible data
    """
    vectors: list[dict[str, Any]] = []

    # Convert SKILL.md (main documentation)
    skill_md_path = skill_dir / "SKILL.md"
    if skill_md_path.exists():
        content = self._read_existing_content(skill_dir)
        if content.strip():
            doc_metadata = {
                "source": metadata.name,
                "category": "overview",
                "file": "SKILL.md",
                "type": "documentation",
                "version": metadata.version,
                "doc_version": metadata.doc_version,
            }
            vectors.extend(
                self._vector_entries(
                    content, doc_metadata, enable_chunking, "SKILL.md", **kwargs
                )
            )

    # Convert all reference files
    for ref_file, ref_content in self._iterate_references(skill_dir):
        if ref_content.strip():
            category = ref_file.stem.replace("_", " ").lower()
            doc_metadata = {
                "source": metadata.name,
                "category": category,
                "file": ref_file.name,
                "type": "reference",
                "version": metadata.version,
                "doc_version": metadata.doc_version,
            }
            vectors.extend(
                self._vector_entries(
                    ref_content, doc_metadata, enable_chunking, ref_file.name, **kwargs
                )
            )

    # Normalize name to lowercase-with-hyphens for index/namespace naming
    index_name = metadata.name.replace("_", "-").lower()
    return json.dumps(
        {
            "index_name": index_name,
            "namespace": index_name,
            "dimension": 1536,
            "metric": "cosine",
            "vectors": vectors,
        },
        indent=2,
        ensure_ascii=False,
    )
def package(
    self,
    skill_dir: Path,
    output_path: Path,
    enable_chunking: bool = False,
    chunk_max_tokens: int = DEFAULT_CHUNK_TOKENS,
    preserve_code_blocks: bool = True,
    chunk_overlap_tokens: int = DEFAULT_CHUNK_OVERLAP_TOKENS,
) -> Path:
    """
    Package skill into JSON file for Pinecone.

    Creates a JSON file containing vectors with metadata, ready for
    embedding generation and upsert to a Pinecone index.

    Args:
        skill_dir: Path to skill directory
        output_path: Output path/filename for JSON file
        enable_chunking: Enable intelligent chunking for large documents
        chunk_max_tokens: Maximum tokens per chunk
            (default: DEFAULT_CHUNK_TOKENS)
        preserve_code_blocks: Preserve code blocks during chunking
        chunk_overlap_tokens: Token overlap between adjacent chunks
            (default: DEFAULT_CHUNK_OVERLAP_TOKENS)

    Returns:
        Path to created JSON file
    """
    skill_dir = Path(skill_dir)
    output_path = self._format_output_path(skill_dir, Path(output_path), "-pinecone.json")
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Read metadata from SKILL.md frontmatter
    metadata = self._build_skill_metadata(skill_dir)

    pinecone_json = self.format_skill_md(
        skill_dir,
        metadata,
        enable_chunking=enable_chunking,
        chunk_max_tokens=chunk_max_tokens,
        preserve_code_blocks=preserve_code_blocks,
        chunk_overlap_tokens=chunk_overlap_tokens,
    )
    output_path.write_text(pinecone_json, encoding="utf-8")

    # Summary output (plain string — no placeholders, so no f-prefix)
    print("\n✅ Pinecone data packaged successfully!")
    print(f"📦 Output: {output_path}")
    data = json.loads(pinecone_json)
    print(f"📊 Total vectors: {len(data['vectors'])}")
    print(f"🗂️ Index name: {data['index_name']}")
    print(f"📁 Namespace: {data['namespace']}")
    print(f"📐 Default dimension: {data['dimension']} (auto-detected at upload time)")

    # Show category breakdown
    categories: dict[str, int] = {}
    for vec in data["vectors"]:
        cat = vec["metadata"].get("category", "unknown")
        categories[cat] = categories.get(cat, 0) + 1
    print("📁 Categories:")
    for cat, count in sorted(categories.items()):
        print(f"  - {cat}: {count}")

    return output_path
def upload(self, package_path: Path, api_key: str | None = None, **kwargs) -> dict[str, Any]:
"""
Upload packaged skill to Pinecone.
Args:
package_path: Path to packaged JSON
api_key: Pinecone API key (or uses PINECONE_API_KEY env var)
**kwargs:
index_name: Override index name from JSON
namespace: Override namespace from JSON
dimension: Embedding dimension (default: 1536)
metric: Distance metric (default: "cosine")
embedding_function: "openai" or "sentence-transformers"
cloud: Cloud provider (default: "aws")
region: Cloud region (default: "us-east-1")
Returns:
{"success": bool, "index": str, "namespace": str, "count": int}
"""
import os
try:
from pinecone import Pinecone, ServerlessSpec
except (ImportError, Exception):
return {
"success": False,
"message": "pinecone not installed. Run: pip install 'pinecone>=5.0.0'",
}
api_key = api_key or os.getenv("PINECONE_API_KEY")
if not api_key:
return {
"success": False,
"message": ("PINECONE_API_KEY not set. Set via env var or pass api_key parameter."),
}
# Load package
with open(package_path) as f:
data = json.load(f)
index_name = kwargs.get("index_name", data.get("index_name", "skill-docs"))
namespace = kwargs.get("namespace", data.get("namespace", ""))
metric = kwargs.get("metric", data.get("metric", "cosine"))
cloud = kwargs.get("cloud", "aws")
region = kwargs.get("region", "us-east-1")
# Auto-detect dimension from embedding model
embedding_function = kwargs.get("embedding_function", "openai")
EMBEDDING_DIMENSIONS = {
"openai": 1536, # text-embedding-3-small
"sentence-transformers": 384, # all-MiniLM-L6-v2
}
# Priority: explicit kwarg > model-based auto-detect > JSON file > fallback
# Note: format_skill_md() hardcodes dimension=1536 in the JSON, so we must
# give EMBEDDING_DIMENSIONS priority over the file to handle sentence-transformers (384).
dimension = kwargs.get(
"dimension",
EMBEDDING_DIMENSIONS.get(embedding_function, data.get("dimension", 1536)),
)
try:
# Generate embeddings FIRST — before creating the index.
# This avoids leaving an empty Pinecone index behind when
# embedding generation fails (e.g. missing API key).
texts = [vec["metadata"]["text"] for vec in data["vectors"]]
if embedding_function == "openai":
embeddings = self._generate_openai_embeddings(texts)
elif embedding_function == "sentence-transformers":
embeddings = self._generate_st_embeddings(texts)
else:
return {
"success": False,
"message": f"Unknown embedding_function: {embedding_function}. Use 'openai' or 'sentence-transformers'.",
}
pc = Pinecone(api_key=api_key)
# Create index if it doesn't exist
existing_indexes = [idx.name for idx in pc.list_indexes()]
if index_name not in existing_indexes:
print(
f"🔧 Creating Pinecone index: {index_name} (dimension={dimension}, metric={metric})"
)
pc.create_index(
name=index_name,
dimension=dimension,
metric=metric,
spec=ServerlessSpec(cloud=cloud, region=region),
)
print(f"✅ Index '{index_name}' created")
else:
print(f" Using existing index: {index_name}")
index = pc.Index(index_name)
# Batch upsert (100 per batch — Pinecone recommendation)
batch_size = 100
vectors_to_upsert = []
for i, vec in enumerate(data["vectors"]):
vectors_to_upsert.append(
{
"id": vec["id"],
"values": embeddings[i],
"metadata": vec["metadata"],
}
)
total = len(vectors_to_upsert)
print(f"🔄 Upserting {total} vectors to Pinecone...")
for i in range(0, total, batch_size):
batch = vectors_to_upsert[i : i + batch_size]
index.upsert(vectors=batch, namespace=namespace)
print(f" ✓ Upserted {min(i + batch_size, total)}/{total}")
print(f"✅ Uploaded {total} vectors to Pinecone index '{index_name}'")
return {
"success": True,
"message": f"Uploaded {total} vectors to Pinecone index '{index_name}' (namespace: '{namespace}')",
"url": None,
"index": index_name,
"namespace": namespace,
"count": total,
}
except Exception as e:
return {"success": False, "message": f"Pinecone upload failed: {e}"}
def validate_api_key(self, _api_key: str) -> bool:
    """Packaging for Pinecone never requires an API key, so always report False."""
    return False
def get_env_var_name(self) -> str:
    """Name of the environment variable that holds the Pinecone API key."""
    return "PINECONE_API_KEY"
def supports_enhancement(self) -> bool:
    """AI enhancement is not available for the Pinecone packaging target."""
    return False
def enhance(self, _skill_dir: Path, _api_key: str) -> bool:
    """Refuse enhancement for Pinecone and print the recommended workflow."""
    guidance = (
        "❌ Pinecone format does not support enhancement",
        " Enhance before packaging:",
        " skill-seekers enhance output/skill/ --mode LOCAL",
        " skill-seekers package output/skill/ --target pinecone",
    )
    for line in guidance:
        print(line)
    return False

View File

@@ -11,6 +11,7 @@ from pathlib import Path
from typing import Any
from .base import SkillAdaptor, SkillMetadata
from skill_seekers.cli.arguments.common import DEFAULT_CHUNK_TOKENS, DEFAULT_CHUNK_OVERLAP_TOKENS
class QdrantAdaptor(SkillAdaptor):
@@ -76,6 +77,7 @@ class QdrantAdaptor(SkillAdaptor):
"file": "SKILL.md",
"type": "documentation",
"version": metadata.version,
"doc_version": metadata.doc_version,
}
# Chunk if enabled
@@ -83,9 +85,12 @@ class QdrantAdaptor(SkillAdaptor):
content,
payload_meta,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
chunk_max_tokens=kwargs.get("chunk_max_tokens", DEFAULT_CHUNK_TOKENS),
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
source_file="SKILL.md",
chunk_overlap_tokens=kwargs.get(
"chunk_overlap_tokens", DEFAULT_CHUNK_OVERLAP_TOKENS
),
)
# Add all chunks as points
@@ -109,6 +114,7 @@ class QdrantAdaptor(SkillAdaptor):
"file": chunk_meta.get("file", "SKILL.md"),
"type": chunk_meta.get("type", "documentation"),
"version": chunk_meta.get("version", metadata.version),
"doc_version": chunk_meta.get("doc_version", ""),
},
}
)
@@ -124,6 +130,7 @@ class QdrantAdaptor(SkillAdaptor):
"file": ref_file.name,
"type": "reference",
"version": metadata.version,
"doc_version": metadata.doc_version,
}
# Chunk if enabled
@@ -131,9 +138,12 @@ class QdrantAdaptor(SkillAdaptor):
ref_content,
payload_meta,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
chunk_max_tokens=kwargs.get("chunk_max_tokens", DEFAULT_CHUNK_TOKENS),
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
source_file=ref_file.name,
chunk_overlap_tokens=kwargs.get(
"chunk_overlap_tokens", DEFAULT_CHUNK_OVERLAP_TOKENS
),
)
# Add all chunks as points
@@ -157,6 +167,7 @@ class QdrantAdaptor(SkillAdaptor):
"file": chunk_meta.get("file", ref_file.name),
"type": chunk_meta.get("type", "reference"),
"version": chunk_meta.get("version", metadata.version),
"doc_version": chunk_meta.get("doc_version", ""),
},
}
)
@@ -189,8 +200,9 @@ class QdrantAdaptor(SkillAdaptor):
skill_dir: Path,
output_path: Path,
enable_chunking: bool = False,
chunk_max_tokens: int = 512,
chunk_max_tokens: int = DEFAULT_CHUNK_TOKENS,
preserve_code_blocks: bool = True,
chunk_overlap_tokens: int = DEFAULT_CHUNK_OVERLAP_TOKENS,
) -> Path:
"""
Package skill into JSON file for Qdrant.
@@ -211,11 +223,8 @@ class QdrantAdaptor(SkillAdaptor):
output_path.parent.mkdir(parents=True, exist_ok=True)
# Read metadata
metadata = SkillMetadata(
name=skill_dir.name,
description=f"Qdrant data for {skill_dir.name}",
version="1.0.0",
)
# Read metadata from SKILL.md frontmatter
metadata = self._build_skill_metadata(skill_dir)
# Generate Qdrant data
qdrant_json = self.format_skill_md(
@@ -224,6 +233,7 @@ class QdrantAdaptor(SkillAdaptor):
enable_chunking=enable_chunking,
chunk_max_tokens=chunk_max_tokens,
preserve_code_blocks=preserve_code_blocks,
chunk_overlap_tokens=chunk_overlap_tokens,
)
# Write to file

View File

@@ -11,6 +11,7 @@ from pathlib import Path
from typing import Any
from .base import SkillAdaptor, SkillMetadata
from skill_seekers.cli.arguments.common import DEFAULT_CHUNK_TOKENS, DEFAULT_CHUNK_OVERLAP_TOKENS
class WeaviateAdaptor(SkillAdaptor):
@@ -96,7 +97,14 @@ class WeaviateAdaptor(SkillAdaptor):
{
"name": "version",
"dataType": ["text"],
"description": "Documentation version",
"description": "Skill package version",
"indexFilterable": True,
"indexSearchable": False,
},
{
"name": "doc_version",
"dataType": ["text"],
"description": "Documentation version (e.g., 16.2)",
"indexFilterable": True,
"indexSearchable": False,
},
@@ -137,6 +145,7 @@ class WeaviateAdaptor(SkillAdaptor):
"file": "SKILL.md",
"type": "documentation",
"version": metadata.version,
"doc_version": metadata.doc_version,
}
# Chunk if enabled
@@ -144,9 +153,12 @@ class WeaviateAdaptor(SkillAdaptor):
content,
obj_metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
chunk_max_tokens=kwargs.get("chunk_max_tokens", DEFAULT_CHUNK_TOKENS),
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
source_file="SKILL.md",
chunk_overlap_tokens=kwargs.get(
"chunk_overlap_tokens", DEFAULT_CHUNK_OVERLAP_TOKENS
),
)
# Add all chunks as objects
@@ -161,6 +173,7 @@ class WeaviateAdaptor(SkillAdaptor):
"file": chunk_meta.get("file", "SKILL.md"),
"type": chunk_meta.get("type", "documentation"),
"version": chunk_meta.get("version", metadata.version),
"doc_version": chunk_meta.get("doc_version", ""),
},
}
)
@@ -177,6 +190,7 @@ class WeaviateAdaptor(SkillAdaptor):
"file": ref_file.name,
"type": "reference",
"version": metadata.version,
"doc_version": metadata.doc_version,
}
# Chunk if enabled
@@ -184,9 +198,12 @@ class WeaviateAdaptor(SkillAdaptor):
ref_content,
obj_metadata,
enable_chunking=enable_chunking,
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
chunk_max_tokens=kwargs.get("chunk_max_tokens", DEFAULT_CHUNK_TOKENS),
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
source_file=ref_file.name,
chunk_overlap_tokens=kwargs.get(
"chunk_overlap_tokens", DEFAULT_CHUNK_OVERLAP_TOKENS
),
)
# Add all chunks as objects
@@ -201,6 +218,7 @@ class WeaviateAdaptor(SkillAdaptor):
"file": chunk_meta.get("file", ref_file.name),
"type": chunk_meta.get("type", "reference"),
"version": chunk_meta.get("version", metadata.version),
"doc_version": chunk_meta.get("doc_version", ""),
},
}
)
@@ -221,8 +239,9 @@ class WeaviateAdaptor(SkillAdaptor):
skill_dir: Path,
output_path: Path,
enable_chunking: bool = False,
chunk_max_tokens: int = 512,
chunk_max_tokens: int = DEFAULT_CHUNK_TOKENS,
preserve_code_blocks: bool = True,
chunk_overlap_tokens: int = DEFAULT_CHUNK_OVERLAP_TOKENS,
) -> Path:
"""
Package skill into JSON file for Weaviate.
@@ -245,12 +264,8 @@ class WeaviateAdaptor(SkillAdaptor):
output_path = self._format_output_path(skill_dir, Path(output_path), "-weaviate.json")
output_path.parent.mkdir(parents=True, exist_ok=True)
# Read metadata
metadata = SkillMetadata(
name=skill_dir.name,
description=f"Weaviate objects for {skill_dir.name}",
version="1.0.0",
)
# Read metadata from SKILL.md frontmatter
metadata = self._build_skill_metadata(skill_dir)
# Generate Weaviate objects
weaviate_json = self.format_skill_md(
@@ -259,6 +274,7 @@ class WeaviateAdaptor(SkillAdaptor):
enable_chunking=enable_chunking,
chunk_max_tokens=chunk_max_tokens,
preserve_code_blocks=preserve_code_blocks,
chunk_overlap_tokens=chunk_overlap_tokens,
)
# Write to file
@@ -288,7 +304,7 @@ class WeaviateAdaptor(SkillAdaptor):
return output_path
def upload(self, package_path: Path, api_key: str = None, **kwargs) -> dict[str, Any]:
def upload(self, package_path: Path, api_key: str | None = None, **kwargs) -> dict[str, Any]:
"""
Upload packaged skill to Weaviate.
@@ -382,31 +398,20 @@ class WeaviateAdaptor(SkillAdaptor):
print(f" ✓ Uploaded {i + 1}/{len(data['objects'])} objects")
elif embedding_function == "sentence-transformers":
# Use sentence-transformers
print("🔄 Generating sentence-transformer embeddings and uploading...")
try:
from sentence_transformers import SentenceTransformer
# Use sentence-transformers (via shared base method)
contents = [obj["properties"]["content"] for obj in data["objects"]]
embeddings = self._generate_st_embeddings(contents)
model = SentenceTransformer("all-MiniLM-L6-v2")
contents = [obj["properties"]["content"] for obj in data["objects"]]
embeddings = model.encode(contents, show_progress_bar=True).tolist()
for i, obj in enumerate(data["objects"]):
batch.add_data_object(
data_object=obj["properties"],
class_name=data["class_name"],
uuid=obj["id"],
vector=embeddings[i],
)
for i, obj in enumerate(data["objects"]):
batch.add_data_object(
data_object=obj["properties"],
class_name=data["class_name"],
uuid=obj["id"],
vector=embeddings[i],
)
if (i + 1) % 100 == 0:
print(f" ✓ Uploaded {i + 1}/{len(data['objects'])} objects")
except ImportError:
return {
"success": False,
"message": "sentence-transformers not installed. Run: pip install sentence-transformers",
}
if (i + 1) % 100 == 0:
print(f" ✓ Uploaded {i + 1}/{len(data['objects'])} objects")
else:
# No embeddings - Weaviate will use its configured vectorizer
@@ -427,61 +432,16 @@ class WeaviateAdaptor(SkillAdaptor):
return {
"success": True,
"message": f"Uploaded {count} objects to Weaviate class '{data['class_name']}'",
"url": None,
"class_name": data["class_name"],
"count": count,
}
except ImportError as e:
return {"success": False, "message": str(e)}
except Exception as e:
return {"success": False, "message": f"Upload failed: {e}"}
def _generate_openai_embeddings(
self, documents: list[str], api_key: str = None
) -> list[list[float]]:
"""
Generate embeddings using OpenAI API.
Args:
documents: List of document texts
api_key: OpenAI API key (or uses OPENAI_API_KEY env var)
Returns:
List of embedding vectors
"""
import os
try:
from openai import OpenAI
except ImportError:
raise ImportError("openai not installed. Run: pip install openai") from None
api_key = api_key or os.getenv("OPENAI_API_KEY")
if not api_key:
raise ValueError("OPENAI_API_KEY not set. Set via env var or --openai-api-key")
client = OpenAI(api_key=api_key)
# Batch process (OpenAI allows up to 2048 inputs)
embeddings = []
batch_size = 100
print(f" Generating embeddings for {len(documents)} documents...")
for i in range(0, len(documents), batch_size):
batch = documents[i : i + batch_size]
try:
response = client.embeddings.create(
input=batch,
model="text-embedding-3-small", # Cheapest, fastest
)
embeddings.extend([item.embedding for item in response.data])
print(
f" ✓ Generated {min(i + batch_size, len(documents))}/{len(documents)} embeddings"
)
except Exception as e:
raise Exception(f"OpenAI embedding generation failed: {e}") from e
return embeddings
def validate_api_key(self, _api_key: str) -> bool:
"""
Weaviate format doesn't use API keys for packaging.

View File

@@ -15,6 +15,10 @@ Hierarchy:
import argparse
from typing import Any
# Default chunking constants used by RAG and package arguments
DEFAULT_CHUNK_TOKENS = 512
DEFAULT_CHUNK_OVERLAP_TOKENS = 50
# Common argument definitions as data structure
# These are arguments that appear in MULTIPLE commands
COMMON_ARGUMENTS: dict[str, dict[str, Any]] = {
@@ -64,6 +68,15 @@ COMMON_ARGUMENTS: dict[str, dict[str, Any]] = {
"metavar": "KEY",
},
},
"doc_version": {
"flags": ("--doc-version",),
"kwargs": {
"type": str,
"default": "",
"help": "Documentation version tag for RAG metadata (e.g., '16.2')",
"metavar": "VERSION",
},
},
}
# Behavior arguments — runtime flags shared by every scraper
@@ -105,18 +118,18 @@ RAG_ARGUMENTS: dict[str, dict[str, Any]] = {
"flags": ("--chunk-tokens",),
"kwargs": {
"type": int,
"default": 512,
"default": DEFAULT_CHUNK_TOKENS,
"metavar": "TOKENS",
"help": "Chunk size in tokens for RAG (default: 512)",
"help": f"Chunk size in tokens for RAG (default: {DEFAULT_CHUNK_TOKENS})",
},
},
"chunk_overlap_tokens": {
"flags": ("--chunk-overlap-tokens",),
"kwargs": {
"type": int,
"default": 50,
"default": DEFAULT_CHUNK_OVERLAP_TOKENS,
"metavar": "TOKENS",
"help": "Overlap between chunks in tokens (default: 50)",
"help": f"Overlap between chunks in tokens (default: {DEFAULT_CHUNK_OVERLAP_TOKENS})",
},
},
}

View File

@@ -153,6 +153,15 @@ UNIVERSAL_ARGUMENTS: dict[str, dict[str, Any]] = {
"metavar": "PATH",
},
},
"doc_version": {
"flags": ("--doc-version",),
"kwargs": {
"type": str,
"default": "",
"help": "Documentation version tag for RAG metadata (e.g., '16.2')",
"metavar": "VERSION",
},
},
}
# Merge RAG arguments from common.py into universal arguments
@@ -655,3 +664,11 @@ def add_create_arguments(parser: argparse.ArgumentParser, mode: str = "default")
if mode in ["advanced", "all"]:
for arg_name, arg_def in ADVANCED_ARGUMENTS.items():
parser.add_argument(*arg_def["flags"], **arg_def["kwargs"])
# Deprecated alias for backward compatibility (removed in v4.0.0)
parser.add_argument(
"--no-preserve-code",
dest="no_preserve_code_blocks",
action="store_true",
help=argparse.SUPPRESS,
)

View File

@@ -8,6 +8,8 @@ import and use these definitions.
import argparse
from typing import Any
from .common import DEFAULT_CHUNK_TOKENS, DEFAULT_CHUNK_OVERLAP_TOKENS
PACKAGE_ARGUMENTS: dict[str, dict[str, Any]] = {
# Positional argument
"skill_directory": {
@@ -49,6 +51,7 @@ PACKAGE_ARGUMENTS: dict[str, dict[str, Any]] = {
"chroma",
"faiss",
"qdrant",
"pinecone",
],
"default": "claude",
"help": "Target LLM platform (default: claude)",
@@ -109,13 +112,22 @@ PACKAGE_ARGUMENTS: dict[str, dict[str, Any]] = {
"flags": ("--chunk-tokens",),
"kwargs": {
"type": int,
"default": 512,
"help": "Maximum tokens per chunk (default: 512)",
"default": DEFAULT_CHUNK_TOKENS,
"help": f"Maximum tokens per chunk (default: {DEFAULT_CHUNK_TOKENS})",
"metavar": "N",
},
},
"no_preserve_code": {
"flags": ("--no-preserve-code",),
"chunk_overlap_tokens": {
"flags": ("--chunk-overlap-tokens",),
"kwargs": {
"type": int,
"default": DEFAULT_CHUNK_OVERLAP_TOKENS,
"help": f"Overlap between chunks in tokens (default: {DEFAULT_CHUNK_OVERLAP_TOKENS})",
"metavar": "N",
},
},
"no_preserve_code_blocks": {
"flags": ("--no-preserve-code-blocks",),
"kwargs": {
"action": "store_true",
"help": "Allow code block splitting (default: code blocks preserved)",
@@ -130,3 +142,11 @@ def add_package_arguments(parser: argparse.ArgumentParser) -> None:
flags = arg_def["flags"]
kwargs = arg_def["kwargs"]
parser.add_argument(*flags, **kwargs)
# Deprecated alias for backward compatibility (removed in v4.0.0)
parser.add_argument(
"--no-preserve-code",
dest="no_preserve_code_blocks",
action="store_true",
help=argparse.SUPPRESS,
)

View File

@@ -172,6 +172,14 @@ def add_scrape_arguments(parser: argparse.ArgumentParser) -> None:
kwargs = arg_def["kwargs"]
parser.add_argument(*flags, **kwargs)
# Deprecated alias for backward compatibility (removed in v4.0.0)
parser.add_argument(
"--no-preserve-code",
dest="no_preserve_code_blocks",
action="store_true",
help=argparse.SUPPRESS,
)
def get_scrape_argument_names() -> set:
"""Get the set of scrape argument destination names.

View File

@@ -1057,6 +1057,7 @@ def analyze_codebase(
enhance_level: int = 0,
skill_name: str | None = None,
skill_description: str | None = None,
doc_version: str = "",
) -> dict[str, Any]:
"""
Analyze local codebase and extract code knowledge.
@@ -1603,6 +1604,7 @@ def analyze_codebase(
docs_data=docs_data,
skill_name=skill_name,
skill_description=skill_description,
doc_version=doc_version,
)
return results
@@ -1622,6 +1624,7 @@ def _generate_skill_md(
docs_data: dict[str, Any] | None = None,
skill_name: str | None = None,
skill_description: str | None = None,
doc_version: str = "",
):
"""
Generate rich SKILL.md from codebase analysis results.
@@ -1657,6 +1660,7 @@ def _generate_skill_md(
skill_content = f"""---
name: {skill_name}
description: {description}
doc_version: {doc_version}
---
# {repo_name} Codebase
@@ -2197,13 +2201,11 @@ def _generate_references(output_dir: Path):
if source_dir.exists() and source_dir.is_dir():
# Copy directory to references/ (not symlink, for portability)
if target_dir.exists():
import shutil
shutil.rmtree(target_dir)
import shutil
if target_dir.exists():
shutil.rmtree(target_dir)
shutil.copytree(source_dir, target_dir)
logger.debug(f"Copied {source} → references/{target}")
@@ -2451,6 +2453,7 @@ Examples:
enhance_level=args.enhance_level, # AI enhancement level (0-3)
skill_name=getattr(args, "name", None),
skill_description=getattr(args, "description", None),
doc_version=getattr(args, "doc_version", ""),
)
# ============================================================

View File

@@ -13,6 +13,7 @@ from skill_seekers.cli.arguments.create import (
get_compatible_arguments,
get_universal_argument_names,
)
from skill_seekers.cli.arguments.common import DEFAULT_CHUNK_TOKENS, DEFAULT_CHUNK_OVERLAP_TOKENS
logger = logging.getLogger(__name__)
@@ -106,8 +107,8 @@ class CreateCommand:
# Check against common defaults
defaults = {
"max_issues": 100,
"chunk_tokens": 512,
"chunk_overlap_tokens": 50,
"chunk_tokens": DEFAULT_CHUNK_TOKENS,
"chunk_overlap_tokens": DEFAULT_CHUNK_OVERLAP_TOKENS,
"output": None,
}
@@ -162,11 +163,14 @@ class CreateCommand:
# RAG arguments (web scraper only)
if getattr(self.args, "chunk_for_rag", False):
argv.append("--chunk-for-rag")
if getattr(self.args, "chunk_tokens", None) and self.args.chunk_tokens != 512:
if (
getattr(self.args, "chunk_tokens", None)
and self.args.chunk_tokens != DEFAULT_CHUNK_TOKENS
):
argv.extend(["--chunk-tokens", str(self.args.chunk_tokens)])
if (
getattr(self.args, "chunk_overlap_tokens", None)
and self.args.chunk_overlap_tokens != 50
and self.args.chunk_overlap_tokens != DEFAULT_CHUNK_OVERLAP_TOKENS
):
argv.extend(["--chunk-overlap-tokens", str(self.args.chunk_overlap_tokens)])
@@ -479,6 +483,10 @@ class CreateCommand:
if self.args.quiet:
argv.append("--quiet")
# Documentation version metadata
if getattr(self.args, "doc_version", ""):
argv.extend(["--doc-version", self.args.doc_version])
# Enhancement Workflow arguments
if getattr(self.args, "enhance_workflow", None):
for wf in self.args.enhance_workflow:

View File

@@ -1565,9 +1565,11 @@ class DocToSkillConverter:
if len(example_codes) >= 10:
break
doc_version = self.config.get("doc_version", "")
content = f"""---
name: {self.name}
description: {description}
doc_version: {doc_version}
---
# {self.name.title()} Skill
@@ -2103,6 +2105,11 @@ def get_configuration(args: argparse.Namespace) -> dict[str, Any]:
"max_pages": DEFAULT_MAX_PAGES,
}
# Apply CLI override for doc_version (works for all config modes)
cli_doc_version = getattr(args, "doc_version", "")
if cli_doc_version:
config["doc_version"] = cli_doc_version
# Apply CLI overrides for rate limiting
if args.no_rate_limit:
config["rate_limit"] = 0

View File

@@ -367,7 +367,7 @@ class LocalSkillEnhancer:
if line.startswith("#"):
# Found heading - keep it and next 3 lines
chunk = lines[i : min(i + 4, len(lines))]
chunk_chars = sum(len(l) for l in chunk)
chunk_chars = sum(len(line_text) for line_text in chunk)
if current_chars + chunk_chars > max_chars:
break
result.extend(chunk)

View File

@@ -968,10 +968,13 @@ class GitHubToSkillConverter:
# Truncate description to 1024 chars if needed
desc = self.description[:1024] if len(self.description) > 1024 else self.description
doc_version = self.config.get("doc_version", "")
# Build skill content
skill_content = f"""---
name: {skill_name}
description: {desc}
doc_version: {doc_version}
---
# {repo_info.get("name", self.name)}
@@ -1003,10 +1006,11 @@ Use this skill when you need to:
# Repository info
skill_content += "### Repository Info\n"
skill_content += f"- **Homepage:** {repo_info.get('homepage', 'N/A')}\n"
skill_content += f"- **Homepage:** {repo_info.get('homepage') or 'N/A'}\n"
skill_content += f"- **Topics:** {', '.join(repo_info.get('topics', []))}\n"
skill_content += f"- **Open Issues:** {repo_info.get('open_issues', 0)}\n"
skill_content += f"- **Last Updated:** {repo_info.get('updated_at', 'N/A')[:10]}\n\n"
updated_at = repo_info.get("updated_at") or "N/A"
skill_content += f"- **Last Updated:** {updated_at[:10]}\n\n"
# Languages
skill_content += "### Languages\n"
@@ -1101,9 +1105,9 @@ Use this skill when you need to:
lines = []
for release in releases[:3]:
lines.append(
f"- **{release['tag_name']}** ({release['published_at'][:10]}): {release['name']}"
)
published_at = release.get("published_at") or "N/A"
release_name = release.get("name") or release["tag_name"]
lines.append(f"- **{release['tag_name']}** ({published_at[:10]}): {release_name}")
return "\n".join(lines)
@@ -1298,15 +1302,17 @@ Use this skill when you need to:
content += f"## Open Issues ({len(open_issues)})\n\n"
for issue in open_issues:
labels = ", ".join(issue["labels"]) if issue["labels"] else "No labels"
created_at = issue.get("created_at") or "N/A"
content += f"### #{issue['number']}: {issue['title']}\n"
content += f"**Labels:** {labels} | **Created:** {issue['created_at'][:10]}\n"
content += f"**Labels:** {labels} | **Created:** {created_at[:10]}\n"
content += f"[View on GitHub]({issue['url']})\n\n"
content += f"\n## Recently Closed Issues ({len(closed_issues)})\n\n"
for issue in closed_issues:
labels = ", ".join(issue["labels"]) if issue["labels"] else "No labels"
closed_at = issue.get("closed_at") or "N/A"
content += f"### #{issue['number']}: {issue['title']}\n"
content += f"**Labels:** {labels} | **Closed:** {issue['closed_at'][:10]}\n"
content += f"**Labels:** {labels} | **Closed:** {closed_at[:10]}\n"
content += f"[View on GitHub]({issue['url']})\n\n"
issues_path = f"{self.skill_dir}/references/issues.md"
@@ -1323,11 +1329,14 @@ Use this skill when you need to:
)
for release in releases:
content += f"## {release['tag_name']}: {release['name']}\n"
content += f"**Published:** {release['published_at'][:10]}\n"
published_at = release.get("published_at") or "N/A"
release_name = release.get("name") or release["tag_name"]
release_body = release.get("body") or ""
content += f"## {release['tag_name']}: {release_name}\n"
content += f"**Published:** {published_at[:10]}\n"
if release["prerelease"]:
content += "**Pre-release**\n"
content += f"\n{release['body']}\n\n"
content += f"\n{release_body}\n\n"
content += f"[View on GitHub]({release['url']})\n\n---\n\n"
releases_path = f"{self.skill_dir}/references/releases.md"

View File

@@ -325,8 +325,8 @@ def _handle_analyze_command(args: argparse.Namespace) -> int:
if getattr(args, "enhance_stage", None):
for stage in args.enhance_stage:
sys.argv.extend(["--enhance-stage", stage])
if getattr(args, "workflow_var", None):
for var in args.workflow_var:
if getattr(args, "var", None):
for var in args.var:
sys.argv.extend(["--var", var])
if getattr(args, "workflow_dry_run", False):
sys.argv.append("--workflow-dry-run")

View File

@@ -14,6 +14,8 @@ import os
import sys
from pathlib import Path
from skill_seekers.cli.arguments.common import DEFAULT_CHUNK_TOKENS, DEFAULT_CHUNK_OVERLAP_TOKENS
# Import utilities
try:
from quality_checker import SkillQualityChecker, print_report
@@ -45,8 +47,9 @@ def package_skill(
chunk_overlap=200,
batch_size=100,
enable_chunking=False,
chunk_max_tokens=512,
chunk_max_tokens=DEFAULT_CHUNK_TOKENS,
preserve_code_blocks=True,
chunk_overlap_tokens=DEFAULT_CHUNK_OVERLAP_TOKENS,
):
"""
Package a skill directory into platform-specific format
@@ -121,6 +124,7 @@ def package_skill(
"chroma",
"faiss",
"qdrant",
"pinecone",
]
if target in RAG_PLATFORMS and not enable_chunking:
@@ -156,6 +160,7 @@ def package_skill(
enable_chunking=enable_chunking,
chunk_max_tokens=chunk_max_tokens,
preserve_code_blocks=preserve_code_blocks,
chunk_overlap_tokens=chunk_overlap_tokens,
)
else:
package_path = adaptor.package(
@@ -164,6 +169,7 @@ def package_skill(
enable_chunking=enable_chunking,
chunk_max_tokens=chunk_max_tokens,
preserve_code_blocks=preserve_code_blocks,
chunk_overlap_tokens=chunk_overlap_tokens,
)
print(f" Output: {package_path}")
@@ -226,7 +232,8 @@ Examples:
batch_size=args.batch_size,
enable_chunking=args.chunk_for_rag,
chunk_max_tokens=args.chunk_tokens,
preserve_code_blocks=not args.no_preserve_code,
preserve_code_blocks=not args.no_preserve_code_blocks,
chunk_overlap_tokens=args.chunk_overlap_tokens,
)
if not success:

View File

@@ -14,6 +14,8 @@ Usage:
chunks = chunker.chunk_skill(Path("output/react"))
"""
from skill_seekers.cli.arguments.common import DEFAULT_CHUNK_TOKENS, DEFAULT_CHUNK_OVERLAP_TOKENS
import re
from pathlib import Path
import json
@@ -35,8 +37,8 @@ class RAGChunker:
def __init__(
self,
chunk_size: int = 512,
chunk_overlap: int = 50,
chunk_size: int = DEFAULT_CHUNK_TOKENS,
chunk_overlap: int = DEFAULT_CHUNK_OVERLAP_TOKENS,
preserve_code_blocks: bool = True,
preserve_paragraphs: bool = True,
min_chunk_size: int = 100,
@@ -383,9 +385,14 @@ def main():
)
parser.add_argument("skill_dir", type=Path, help="Path to skill directory")
parser.add_argument("--output", "-o", type=Path, help="Output JSON file")
parser.add_argument("--chunk-tokens", type=int, default=512, help="Target chunk size in tokens")
parser.add_argument(
"--chunk-overlap-tokens", type=int, default=50, help="Overlap size in tokens"
"--chunk-tokens", type=int, default=DEFAULT_CHUNK_TOKENS, help="Target chunk size in tokens"
)
parser.add_argument(
"--chunk-overlap-tokens",
type=int,
default=DEFAULT_CHUNK_OVERLAP_TOKENS,
help="Overlap size in tokens",
)
parser.add_argument("--no-code-blocks", action="store_true", help="Don't preserve code blocks")
parser.add_argument("--no-paragraphs", action="store_true", help="Don't preserve paragraphs")

View File

@@ -1296,7 +1296,9 @@ This skill combines knowledge from multiple sources:
f.write(f"- **File**: `{ex.get('file_path', 'N/A')}`\n")
if ex.get("code_snippet"):
lang = ex.get("language", "text")
f.write(f"\n```{lang}\n{ex['code_snippet']}\n```\n") # Full code, no truncation
f.write(
f"\n```{lang}\n{ex['code_snippet']}\n```\n"
) # Full code, no truncation
f.write("\n")
logger.info(f" ✓ Test examples: {total} total, {high_value} high-value")

View File

@@ -79,7 +79,9 @@ class WordToSkillConverter:
self.config = config
self.name = config["name"]
self.docx_path = config.get("docx_path", "")
self.description = config.get("description") or f"Use when referencing {self.name} documentation"
self.description = (
config.get("description") or f"Use when referencing {self.name} documentation"
)
# Paths
self.skill_dir = f"output/{self.name}"
@@ -109,6 +111,9 @@ class WordToSkillConverter:
if not os.path.exists(self.docx_path):
raise FileNotFoundError(f"Word document not found: {self.docx_path}")
if not self.docx_path.lower().endswith(".docx"):
raise ValueError(f"Not a Word document (expected .docx): {self.docx_path}")
# --- Extract metadata via python-docx ---
doc = python_docx.Document(self.docx_path)
core_props = doc.core_properties
@@ -728,12 +733,13 @@ class WordToSkillConverter:
# HTML-to-sections helper (module-level for clarity)
# ---------------------------------------------------------------------------
def _build_section(
section_number: int,
heading: str | None,
heading_level: str | None,
elements: list,
doc,
doc, # noqa: ARG001
) -> dict:
"""Build a section dict from a list of BeautifulSoup elements.
@@ -769,10 +775,7 @@ def _build_section(
# Code blocks
if tag == "pre" or (tag == "code" and elem.find_parent("pre") is None):
code_elem = elem.find("code") if tag == "pre" else elem
if code_elem:
code_text = code_elem.get_text()
else:
code_text = elem.get_text()
code_text = code_elem.get_text() if code_elem else elem.get_text()
code_text = code_text.strip()
if code_text:
@@ -825,8 +828,8 @@ def _build_section(
raw_text = elem.get_text(separator="\n").strip()
# Exclude bullet-point / prose lists (•, *, -)
if raw_text and not re.search(r"^[•\-\*]\s", raw_text, re.MULTILINE):
if _score_code_quality(raw_text) >= 5.5:
quality_score = _score_code_quality(raw_text)
quality_score = _score_code_quality(raw_text)
if quality_score >= 5.5:
code_samples.append(
{"code": raw_text, "language": "", "quality_score": quality_score}
)
@@ -956,7 +959,8 @@ def main():
name = Path(args.from_json).stem.replace("_extracted", "")
config = {
"name": getattr(args, "name", None) or name,
"description": getattr(args, "description", None) or f"Use when referencing {name} documentation",
"description": getattr(args, "description", None)
or f"Use when referencing {name} documentation",
}
try:
converter = WordToSkillConverter(config)
@@ -1044,6 +1048,7 @@ def main():
except Exception as e:
print(f"\n❌ Unexpected error during Word processing: {e}", file=sys.stderr)
import traceback
traceback.print_exc()
sys.exit(1)