Bug fixes: - Fix --var flag silently dropped in create routing (args.workflow_var → args.var) - Fix double _score_code_quality() call in word scraper - Add .docx file extension validation in WordToSkillConverter - Fix weaviate ImportError masked by generic Exception handler - Fix RAG chunking crash using non-existent converter.output_dir Chunking pipeline improvements: - Wire --chunk-overlap-tokens through entire package pipeline (package_skill → adaptor.package → format_skill_md → _maybe_chunk_content → RAGChunker) - Add auto-scaling overlap: max(50, chunk_tokens//10) when chunk size is non-default - Rename --no-preserve-code to --no-preserve-code-blocks (backward-compat alias kept) - Replace hardcoded 512/50 chunk defaults with DEFAULT_CHUNK_TOKENS/DEFAULT_CHUNK_OVERLAP_TOKENS constants across all 12 concrete adaptors, rag_chunker, base, and package_skill Code quality: - Extract shared _generate_openai_embeddings() and _generate_st_embeddings() to SkillAdaptor base class, removing ~150 lines of duplication from chroma/weaviate/pinecone - Add Pinecone adaptor with full upload support (pinecone_adaptor.py) Tests (14 new): - chunk_overlap_tokens parameter wiring, auto-scaling overlap, preserve_code_blocks flag - .docx/.doc/no-extension file validation, --var flag routing E2E - Embedding method inheritance verification, backward-compatible flag aliases Docs: - Update CHANGELOG, CLI_REFERENCE, API_REFERENCE, packaging guide (EN+ZH) - Update README test count badge (1880+ → 2283+) All 2283 tests passing, 8 skipped, 0 failures. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
401 lines
15 KiB
Python
401 lines
15 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
Pinecone Adaptor
|
||
|
||
Implements Pinecone vector database format for RAG pipelines.
|
||
Converts Skill Seekers documentation into Pinecone-compatible format
|
||
with namespace support and batch upsert.
|
||
"""
|
||
|
||
import json
|
||
from pathlib import Path
|
||
from typing import Any
|
||
|
||
from .base import SkillAdaptor, SkillMetadata
|
||
from skill_seekers.cli.arguments.common import DEFAULT_CHUNK_TOKENS, DEFAULT_CHUNK_OVERLAP_TOKENS
|
||
|
||
# Pinecone metadata size limit: 40 KB of metadata per vector
|
||
PINECONE_METADATA_BYTES_LIMIT = 40_000
|
||
|
||
|
||
class PineconeAdaptor(SkillAdaptor):
    """
    Pinecone vector database adaptor.

    Handles:
    - Pinecone-compatible vector format with metadata
    - Namespace support for multi-tenant indexing
    - Batch upsert (100 vectors per batch)
    - OpenAI and sentence-transformers embedding generation
    - Metadata truncation to stay within Pinecone's 40KB limit
    """

    PLATFORM = "pinecone"
    PLATFORM_NAME = "Pinecone (Vector Database)"
    DEFAULT_API_ENDPOINT = None

    # Bytes held back from the metadata budget for every field other than "text".
    _METADATA_RESERVED_BYTES = 2000

    # Vectors sent per upsert call (100 per batch — Pinecone recommendation).
    _UPSERT_BATCH_SIZE = 100

    # Vector dimension produced by each supported embedding function.
    _EMBEDDING_DIMENSIONS = {
        "openai": 1536,  # text-embedding-3-small
        "sentence-transformers": 384,  # all-MiniLM-L6-v2
    }

    def _generate_id(self, content: str, metadata: dict) -> str:
        """Generate a deterministic hex ID from content and metadata.

        Delegates to the base-class ``_generate_deterministic_id`` helper
        with ``format="hex"``.
        """
        return self._generate_deterministic_id(content, metadata, format="hex")

    def _truncate_text_for_metadata(
        self, text: str, max_bytes: int = PINECONE_METADATA_BYTES_LIMIT
    ) -> str:
        """Truncate text to fit within Pinecone's metadata byte limit.

        Pinecone limits metadata to 40KB per vector. The text field is the
        largest metadata value, so it is the one that gets cut; 2000 bytes
        of the budget are held back for the remaining metadata fields.

        Args:
            text: Text content to potentially truncate
            max_bytes: Total metadata budget in bytes. The text itself may
                use at most ``max_bytes - 2000`` bytes (never less than 0).

        Returns:
            ``text`` unchanged when its UTF-8 encoding fits, otherwise the
            longest prefix whose UTF-8 encoding fits within the budget.
        """
        # Clamp at zero: a negative slice bound would cut from the END of the
        # byte string and return nearly all of the text instead of truncating.
        available = max(0, max_bytes - self._METADATA_RESERVED_BYTES)
        encoded = text.encode("utf-8")
        if len(encoded) <= available:
            return text
        # The cut may land inside a multi-byte character; errors="ignore"
        # drops that partial character rather than raising.
        return encoded[:available].decode("utf-8", errors="ignore")

    def _build_vectors(
        self,
        content: str,
        doc_metadata: dict[str, Any],
        source_file: str,
        enable_chunking: bool,
        chunk_options: dict[str, Any],
    ) -> list[dict[str, Any]]:
        """Chunk one document and turn each chunk into a vector record.

        Args:
            content: Document text
            doc_metadata: Metadata passed to the chunker for this document
            source_file: File name the content came from
            enable_chunking: Forwarded to ``_maybe_chunk_content``
            chunk_options: Optional ``chunk_max_tokens``,
                ``preserve_code_blocks`` and ``chunk_overlap_tokens``
                overrides; missing keys fall back to the module defaults.

        Returns:
            One ``{"id": ..., "metadata": {...}}`` dict per chunk. The chunk
            text is stored (truncated if needed) under ``metadata["text"]``.
        """
        chunks = self._maybe_chunk_content(
            content,
            doc_metadata,
            enable_chunking=enable_chunking,
            chunk_max_tokens=chunk_options.get("chunk_max_tokens", DEFAULT_CHUNK_TOKENS),
            preserve_code_blocks=chunk_options.get("preserve_code_blocks", True),
            source_file=source_file,
            chunk_overlap_tokens=chunk_options.get(
                "chunk_overlap_tokens", DEFAULT_CHUNK_OVERLAP_TOKENS
            ),
        )
        return [
            {
                "id": self._generate_id(chunk_text, chunk_meta),
                "metadata": {
                    **chunk_meta,
                    "text": self._truncate_text_for_metadata(chunk_text),
                },
            }
            for chunk_text, chunk_meta in chunks
        ]

    def format_skill_md(
        self, skill_dir: Path, metadata: SkillMetadata, enable_chunking: bool = False, **kwargs
    ) -> str:
        """
        Format skill as JSON for Pinecone ingestion.

        Creates a package with vectors ready for upsert:
        {
            "index_name": "...",
            "namespace": "...",
            "dimension": 1536,
            "metric": "cosine",
            "vectors": [
                {
                    "id": "hex-id",
                    "metadata": {
                        "text": "content",
                        "source": "...",
                        "category": "...",
                        ...
                    }
                }
            ]
        }

        No ``values`` field — embeddings are added at upload time.
        The index name and the namespace are both derived from the skill
        name (underscores replaced by hyphens, lower-cased).

        Args:
            skill_dir: Path to skill directory
            metadata: Skill metadata
            enable_chunking: Enable intelligent chunking for large documents
            **kwargs: Additional chunking parameters (``chunk_max_tokens``,
                ``preserve_code_blocks``, ``chunk_overlap_tokens``)

        Returns:
            JSON string containing Pinecone-compatible data
        """
        vectors: list[dict[str, Any]] = []

        # Convert SKILL.md (main documentation)
        if (skill_dir / "SKILL.md").exists():
            content = self._read_existing_content(skill_dir)
            if content.strip():
                overview_metadata = {
                    "source": metadata.name,
                    "category": "overview",
                    "file": "SKILL.md",
                    "type": "documentation",
                    "version": metadata.version,
                    "doc_version": metadata.doc_version,
                }
                vectors.extend(
                    self._build_vectors(
                        content, overview_metadata, "SKILL.md", enable_chunking, kwargs
                    )
                )

        # Convert all reference files; whitespace-only files are skipped
        for ref_file, ref_content in self._iterate_references(skill_dir):
            if not ref_content.strip():
                continue
            reference_metadata = {
                "source": metadata.name,
                # Category is the file stem with underscores turned into spaces
                "category": ref_file.stem.replace("_", " ").lower(),
                "file": ref_file.name,
                "type": "reference",
                "version": metadata.version,
                "doc_version": metadata.doc_version,
            }
            vectors.extend(
                self._build_vectors(
                    ref_content, reference_metadata, ref_file.name, enable_chunking, kwargs
                )
            )

        index_name = metadata.name.replace("_", "-").lower()

        return json.dumps(
            {
                "index_name": index_name,
                "namespace": index_name,
                "dimension": 1536,
                "metric": "cosine",
                "vectors": vectors,
            },
            indent=2,
            ensure_ascii=False,
        )

    def package(
        self,
        skill_dir: Path,
        output_path: Path,
        enable_chunking: bool = False,
        chunk_max_tokens: int = DEFAULT_CHUNK_TOKENS,
        preserve_code_blocks: bool = True,
        chunk_overlap_tokens: int = DEFAULT_CHUNK_OVERLAP_TOKENS,
    ) -> Path:
        """
        Package skill into JSON file for Pinecone.

        Creates a JSON file containing vectors with metadata, ready for
        embedding generation and upsert to a Pinecone index. A summary
        (vector count, index name, namespace, per-category counts) is
        printed to stdout.

        Args:
            skill_dir: Path to skill directory
            output_path: Output path/filename for JSON file
            enable_chunking: Enable intelligent chunking for large documents
            chunk_max_tokens: Maximum tokens per chunk
                (default: DEFAULT_CHUNK_TOKENS)
            preserve_code_blocks: Preserve code blocks during chunking
            chunk_overlap_tokens: Token overlap between consecutive chunks
                (default: DEFAULT_CHUNK_OVERLAP_TOKENS)

        Returns:
            Path to created JSON file
        """
        skill_dir = Path(skill_dir)

        output_path = self._format_output_path(skill_dir, Path(output_path), "-pinecone.json")
        output_path.parent.mkdir(parents=True, exist_ok=True)

        # Read metadata from SKILL.md frontmatter
        metadata = self._build_skill_metadata(skill_dir)

        pinecone_json = self.format_skill_md(
            skill_dir,
            metadata,
            enable_chunking=enable_chunking,
            chunk_max_tokens=chunk_max_tokens,
            preserve_code_blocks=preserve_code_blocks,
            chunk_overlap_tokens=chunk_overlap_tokens,
        )

        # Written as UTF-8 because the JSON is produced with ensure_ascii=False
        output_path.write_text(pinecone_json, encoding="utf-8")

        print("\n✅ Pinecone data packaged successfully!")
        print(f"📦 Output: {output_path}")

        data = json.loads(pinecone_json)
        print(f"📊 Total vectors: {len(data['vectors'])}")
        print(f"🗂️ Index name: {data['index_name']}")
        print(f"📁 Namespace: {data['namespace']}")
        print(f"📐 Default dimension: {data['dimension']} (auto-detected at upload time)")

        # Show category breakdown
        categories: dict[str, int] = {}
        for vec in data["vectors"]:
            cat = vec["metadata"].get("category", "unknown")
            categories[cat] = categories.get(cat, 0) + 1

        print("📁 Categories:")
        for cat, count in sorted(categories.items()):
            print(f" - {cat}: {count}")

        return output_path

    def upload(self, package_path: Path, api_key: str | None = None, **kwargs) -> dict[str, Any]:
        """
        Upload packaged skill to Pinecone.

        Embeddings are generated before the index is touched, then the
        index is created if it does not exist and the vectors are upserted
        in batches.

        Args:
            package_path: Path to packaged JSON
            api_key: Pinecone API key (or uses PINECONE_API_KEY env var)
            **kwargs:
                index_name: Override index name from JSON
                namespace: Override namespace from JSON
                dimension: Embedding dimension. When omitted it is derived
                    from ``embedding_function`` (openai: 1536,
                    sentence-transformers: 384).
                metric: Distance metric (default: value from JSON, else "cosine")
                embedding_function: "openai" (default) or "sentence-transformers"
                cloud: Cloud provider (default: "aws")
                region: Cloud region (default: "us-east-1")

        Returns:
            On success:
                {"success": True, "message": str, "url": None,
                 "index": str, "namespace": str, "count": int}
            On failure:
                {"success": False, "message": str}

        Raises:
            OSError / json.JSONDecodeError: if ``package_path`` cannot be
                read or parsed; loading happens outside the error handler.
        """
        import os

        try:
            from pinecone import Pinecone, ServerlessSpec
        except ImportError:
            return {
                "success": False,
                "message": "pinecone not installed. Run: pip install 'pinecone>=5.0.0'",
            }
        except Exception as exc:
            # Still best-effort (no crash), but report the real cause instead
            # of claiming that the package is missing.
            return {
                "success": False,
                "message": f"pinecone could not be imported: {exc}",
            }

        api_key = api_key or os.getenv("PINECONE_API_KEY")
        if not api_key:
            return {
                "success": False,
                "message": (
                    "PINECONE_API_KEY not set. "
                    "Set via env var or pass api_key parameter."
                ),
            }

        # Load package. package() writes UTF-8 with ensure_ascii=False, so
        # the file must not be decoded with the locale's default encoding.
        with open(package_path, encoding="utf-8") as f:
            data = json.load(f)

        index_name = kwargs.get("index_name", data.get("index_name", "skill-docs"))
        namespace = kwargs.get("namespace", data.get("namespace", ""))
        metric = kwargs.get("metric", data.get("metric", "cosine"))
        cloud = kwargs.get("cloud", "aws")
        region = kwargs.get("region", "us-east-1")

        # Auto-detect dimension from embedding model
        embedding_function = kwargs.get("embedding_function", "openai")
        # Priority: explicit kwarg > model-based auto-detect > JSON file > fallback
        # Note: format_skill_md() hardcodes dimension=1536 in the JSON, so the
        # model table must win over the file to handle sentence-transformers (384).
        dimension = kwargs.get(
            "dimension",
            self._EMBEDDING_DIMENSIONS.get(embedding_function, data.get("dimension", 1536)),
        )

        try:
            # Generate embeddings FIRST — before creating the index.
            # This avoids leaving an empty Pinecone index behind when
            # embedding generation fails (e.g. missing API key).
            texts = [vec["metadata"]["text"] for vec in data["vectors"]]

            if embedding_function == "openai":
                embeddings = self._generate_openai_embeddings(texts)
            elif embedding_function == "sentence-transformers":
                embeddings = self._generate_st_embeddings(texts)
            else:
                return {
                    "success": False,
                    "message": f"Unknown embedding_function: {embedding_function}. Use 'openai' or 'sentence-transformers'.",
                }

            pc = Pinecone(api_key=api_key)

            # Create index if it doesn't exist
            existing_indexes = [idx.name for idx in pc.list_indexes()]
            if index_name not in existing_indexes:
                print(f"🔧 Creating Pinecone index: {index_name} (dimension={dimension}, metric={metric})")
                pc.create_index(
                    name=index_name,
                    dimension=dimension,
                    metric=metric,
                    spec=ServerlessSpec(cloud=cloud, region=region),
                )
                print(f"✅ Index '{index_name}' created")
            else:
                print(f"ℹ️ Using existing index: {index_name}")

            index = pc.Index(index_name)

            # Pair every packaged vector with its embedding (same order as texts)
            vectors_to_upsert = [
                {
                    "id": vec["id"],
                    "values": embeddings[i],
                    "metadata": vec["metadata"],
                }
                for i, vec in enumerate(data["vectors"])
            ]

            total = len(vectors_to_upsert)
            print(f"🔄 Upserting {total} vectors to Pinecone...")

            # Batch upsert (100 per batch — Pinecone recommendation)
            batch_size = self._UPSERT_BATCH_SIZE
            for start in range(0, total, batch_size):
                batch = vectors_to_upsert[start : start + batch_size]
                index.upsert(vectors=batch, namespace=namespace)
                print(f" ✓ Upserted {min(start + batch_size, total)}/{total}")

            print(f"✅ Uploaded {total} vectors to Pinecone index '{index_name}'")

            return {
                "success": True,
                "message": f"Uploaded {total} vectors to Pinecone index '{index_name}' (namespace: '{namespace}')",
                "url": None,
                "index": index_name,
                "namespace": namespace,
                "count": total,
            }

        except Exception as e:
            # Boundary handler: any embedding or Pinecone failure is reported
            # to the caller as a failed result instead of propagating.
            return {"success": False, "message": f"Pinecone upload failed: {e}"}

    def validate_api_key(self, _api_key: str) -> bool:
        """Always return False; the key is not inspected.

        Pinecone doesn't need an API key for packaging, so no key format
        validation is performed here.
        """
        return False

    def get_env_var_name(self) -> str:
        """Return the expected env var for Pinecone API key."""
        return "PINECONE_API_KEY"

    def supports_enhancement(self) -> bool:
        """Pinecone format doesn't support AI enhancement."""
        return False

    def enhance(self, _skill_dir: Path, _api_key: str) -> bool:
        """Pinecone format doesn't support enhancement.

        Prints instructions for enhancing before packaging and returns
        False; both arguments are ignored.
        """
        print("❌ Pinecone format does not support enhancement")
        print(" Enhance before packaging:")
        print(" skill-seekers enhance output/skill/ --mode LOCAL")
        print(" skill-seekers package output/skill/ --target pinecone")
        return False
|