fix: resolve 18 bugs and code quality issues across adaptors, CLI, and chunking pipeline
Bug fixes: - Fix --var flag silently dropped in create routing (args.workflow_var → args.var) - Fix double _score_code_quality() call in word scraper - Add .docx file extension validation in WordToSkillConverter - Fix weaviate ImportError masked by generic Exception handler - Fix RAG chunking crash using non-existent converter.output_dir Chunking pipeline improvements: - Wire --chunk-overlap-tokens through entire package pipeline (package_skill → adaptor.package → format_skill_md → _maybe_chunk_content → RAGChunker) - Add auto-scaling overlap: max(50, chunk_tokens//10) when chunk size is non-default - Rename --no-preserve-code to --no-preserve-code-blocks (backward-compat alias kept) - Replace hardcoded 512/50 chunk defaults with DEFAULT_CHUNK_TOKENS/DEFAULT_CHUNK_OVERLAP_TOKENS constants across all 12 concrete adaptors, rag_chunker, base, and package_skill Code quality: - Extract shared _generate_openai_embeddings() and _generate_st_embeddings() to SkillAdaptor base class, removing ~150 lines of duplication from chroma/weaviate/pinecone - Add Pinecone adaptor with full upload support (pinecone_adaptor.py) Tests (14 new): - chunk_overlap_tokens parameter wiring, auto-scaling overlap, preserve_code_blocks flag - .docx/.doc/no-extension file validation, --var flag routing E2E - Embedding method inheritance verification, backward-compatible flag aliases Docs: - Update CHANGELOG, CLI_REFERENCE, API_REFERENCE, packaging guide (EN+ZH) - Update README test count badge (1880+ → 2283+) All 2283 tests passing, 8 skipped, 0 failures. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
400
src/skill_seekers/cli/adaptors/pinecone_adaptor.py
Normal file
400
src/skill_seekers/cli/adaptors/pinecone_adaptor.py
Normal file
@@ -0,0 +1,400 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Pinecone Adaptor
|
||||
|
||||
Implements Pinecone vector database format for RAG pipelines.
|
||||
Converts Skill Seekers documentation into Pinecone-compatible format
|
||||
with namespace support and batch upsert.
|
||||
"""
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from .base import SkillAdaptor, SkillMetadata
|
||||
from skill_seekers.cli.arguments.common import DEFAULT_CHUNK_TOKENS, DEFAULT_CHUNK_OVERLAP_TOKENS
|
||||
|
||||
# Pinecone rejects vectors whose metadata exceeds 40 KB; this is the budget
# used by PineconeAdaptor._truncate_text_for_metadata() when storing chunk text.
PINECONE_METADATA_BYTES_LIMIT = 40_000
|
||||
|
||||
|
||||
class PineconeAdaptor(SkillAdaptor):
    """
    Pinecone vector database adaptor.

    Handles:
    - Pinecone-compatible vector format with metadata
    - Namespace support for multi-tenant indexing
    - Batch upsert (100 vectors per batch)
    - OpenAI and sentence-transformers embedding generation
    - Metadata truncation to stay within Pinecone's 40KB limit
    """

    PLATFORM = "pinecone"
    PLATFORM_NAME = "Pinecone (Vector Database)"
    DEFAULT_API_ENDPOINT = None

    def _generate_id(self, content: str, metadata: dict) -> str:
        """Generate deterministic ID from content and metadata."""
        return self._generate_deterministic_id(content, metadata, format="hex")

    def _truncate_text_for_metadata(self, text: str, max_bytes: int = PINECONE_METADATA_BYTES_LIMIT) -> str:
        """Truncate text to fit within Pinecone's metadata byte limit.

        Pinecone limits metadata to 40KB per vector. This truncates
        the text field (largest metadata value) to stay within limits,
        leaving room for other metadata fields (~2KB overhead).

        Args:
            text: Text content to potentially truncate
            max_bytes: Maximum bytes for the text field

        Returns:
            Truncated text that fits within the byte limit
        """
        # Reserve ~2KB for other metadata fields
        available = max_bytes - 2000
        encoded = text.encode("utf-8")
        if len(encoded) <= available:
            return text
        # Truncate at a byte boundary; errors="ignore" drops any multi-byte
        # character split by the cut so the result is still valid UTF-8.
        truncated = encoded[:available].decode("utf-8", errors="ignore")
        return truncated

    def _append_chunk_vectors(self, vectors: list[dict[str, Any]], chunks) -> None:
        """Append one Pinecone vector record per (text, metadata) chunk.

        Each record carries a deterministic id and the chunk text stored in
        metadata, truncated to Pinecone's 40KB metadata limit. No ``values``
        field is written — embeddings are generated at upload time.
        """
        for chunk_text, chunk_meta in chunks:
            vectors.append(
                {
                    "id": self._generate_id(chunk_text, chunk_meta),
                    "metadata": {
                        **chunk_meta,
                        "text": self._truncate_text_for_metadata(chunk_text),
                    },
                }
            )

    def format_skill_md(
        self, skill_dir: Path, metadata: SkillMetadata, enable_chunking: bool = False, **kwargs
    ) -> str:
        """
        Format skill as JSON for Pinecone ingestion.

        Creates a package with vectors ready for upsert:
        {
            "index_name": "...",
            "namespace": "...",
            "dimension": 1536,
            "metric": "cosine",
            "vectors": [
                {
                    "id": "hex-id",
                    "metadata": {
                        "text": "content",
                        "source": "...",
                        "category": "...",
                        ...
                    }
                }
            ]
        }

        No ``values`` field — embeddings are added at upload time.

        Args:
            skill_dir: Path to skill directory
            metadata: Skill metadata
            enable_chunking: Enable intelligent chunking for large documents
            **kwargs: Additional chunking parameters (chunk_max_tokens,
                preserve_code_blocks, chunk_overlap_tokens)

        Returns:
            JSON string containing Pinecone-compatible data
        """
        vectors: list[dict[str, Any]] = []

        # Convert SKILL.md (main documentation)
        skill_md_path = skill_dir / "SKILL.md"
        if skill_md_path.exists():
            content = self._read_existing_content(skill_dir)
            if content.strip():
                doc_metadata = {
                    "source": metadata.name,
                    "category": "overview",
                    "file": "SKILL.md",
                    "type": "documentation",
                    "version": metadata.version,
                    "doc_version": metadata.doc_version,
                }

                chunks = self._maybe_chunk_content(
                    content,
                    doc_metadata,
                    enable_chunking=enable_chunking,
                    chunk_max_tokens=kwargs.get("chunk_max_tokens", DEFAULT_CHUNK_TOKENS),
                    preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
                    source_file="SKILL.md",
                    chunk_overlap_tokens=kwargs.get("chunk_overlap_tokens", DEFAULT_CHUNK_OVERLAP_TOKENS),
                )
                self._append_chunk_vectors(vectors, chunks)

        # Convert all reference files
        for ref_file, ref_content in self._iterate_references(skill_dir):
            if ref_content.strip():
                category = ref_file.stem.replace("_", " ").lower()

                doc_metadata = {
                    "source": metadata.name,
                    "category": category,
                    "file": ref_file.name,
                    "type": "reference",
                    "version": metadata.version,
                    "doc_version": metadata.doc_version,
                }

                chunks = self._maybe_chunk_content(
                    ref_content,
                    doc_metadata,
                    enable_chunking=enable_chunking,
                    chunk_max_tokens=kwargs.get("chunk_max_tokens", DEFAULT_CHUNK_TOKENS),
                    preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
                    source_file=ref_file.name,
                    chunk_overlap_tokens=kwargs.get("chunk_overlap_tokens", DEFAULT_CHUNK_OVERLAP_TOKENS),
                )
                self._append_chunk_vectors(vectors, chunks)

        # Pinecone index names must be lowercase with hyphens.
        index_name = metadata.name.replace("_", "-").lower()

        return json.dumps(
            {
                "index_name": index_name,
                "namespace": index_name,
                "dimension": 1536,
                "metric": "cosine",
                "vectors": vectors,
            },
            indent=2,
            ensure_ascii=False,
        )

    def package(
        self,
        skill_dir: Path,
        output_path: Path,
        enable_chunking: bool = False,
        chunk_max_tokens: int = DEFAULT_CHUNK_TOKENS,
        preserve_code_blocks: bool = True,
        chunk_overlap_tokens: int = DEFAULT_CHUNK_OVERLAP_TOKENS,
    ) -> Path:
        """
        Package skill into JSON file for Pinecone.

        Creates a JSON file containing vectors with metadata, ready for
        embedding generation and upsert to a Pinecone index.

        Args:
            skill_dir: Path to skill directory
            output_path: Output path/filename for JSON file
            enable_chunking: Enable intelligent chunking for large documents
            chunk_max_tokens: Maximum tokens per chunk (default: 512)
            preserve_code_blocks: Preserve code blocks during chunking
            chunk_overlap_tokens: Token overlap between adjacent chunks

        Returns:
            Path to created JSON file
        """
        skill_dir = Path(skill_dir)

        output_path = self._format_output_path(skill_dir, Path(output_path), "-pinecone.json")
        output_path.parent.mkdir(parents=True, exist_ok=True)

        # Read metadata from SKILL.md frontmatter
        metadata = self._build_skill_metadata(skill_dir)

        pinecone_json = self.format_skill_md(
            skill_dir,
            metadata,
            enable_chunking=enable_chunking,
            chunk_max_tokens=chunk_max_tokens,
            preserve_code_blocks=preserve_code_blocks,
            chunk_overlap_tokens=chunk_overlap_tokens,
        )

        output_path.write_text(pinecone_json, encoding="utf-8")

        print("\n✅ Pinecone data packaged successfully!")
        print(f"📦 Output: {output_path}")

        data = json.loads(pinecone_json)
        print(f"📊 Total vectors: {len(data['vectors'])}")
        print(f"🗂️ Index name: {data['index_name']}")
        print(f"📁 Namespace: {data['namespace']}")
        print(f"📐 Default dimension: {data['dimension']} (auto-detected at upload time)")

        # Show category breakdown
        categories: dict[str, int] = {}
        for vec in data["vectors"]:
            cat = vec["metadata"].get("category", "unknown")
            categories[cat] = categories.get(cat, 0) + 1

        print("📁 Categories:")
        for cat, count in sorted(categories.items()):
            print(f"   - {cat}: {count}")

        return output_path

    def upload(self, package_path: Path, api_key: str | None = None, **kwargs) -> dict[str, Any]:
        """
        Upload packaged skill to Pinecone.

        Args:
            package_path: Path to packaged JSON
            api_key: Pinecone API key (or uses PINECONE_API_KEY env var)
            **kwargs:
                index_name: Override index name from JSON
                namespace: Override namespace from JSON
                dimension: Embedding dimension (default: 1536)
                metric: Distance metric (default: "cosine")
                embedding_function: "openai" or "sentence-transformers"
                cloud: Cloud provider (default: "aws")
                region: Cloud region (default: "us-east-1")

        Returns:
            {"success": bool, "index": str, "namespace": str, "count": int}
        """
        import os

        try:
            from pinecone import Pinecone, ServerlessSpec
        # Catch ONLY ImportError: a broad handler here would mask unrelated
        # failures (e.g. a broken dependency raising at import) behind a
        # misleading "not installed" message.
        except ImportError:
            return {
                "success": False,
                "message": "pinecone not installed. Run: pip install 'pinecone>=5.0.0'",
            }

        api_key = api_key or os.getenv("PINECONE_API_KEY")
        if not api_key:
            return {
                "success": False,
                "message": (
                    "PINECONE_API_KEY not set. "
                    "Set via env var or pass api_key parameter."
                ),
            }

        # Load package
        with open(package_path) as f:
            data = json.load(f)

        index_name = kwargs.get("index_name", data.get("index_name", "skill-docs"))
        namespace = kwargs.get("namespace", data.get("namespace", ""))
        metric = kwargs.get("metric", data.get("metric", "cosine"))
        cloud = kwargs.get("cloud", "aws")
        region = kwargs.get("region", "us-east-1")

        # Auto-detect dimension from embedding model
        embedding_function = kwargs.get("embedding_function", "openai")
        EMBEDDING_DIMENSIONS = {
            "openai": 1536,  # text-embedding-3-small
            "sentence-transformers": 384,  # all-MiniLM-L6-v2
        }
        # Priority: explicit kwarg > model-based auto-detect > JSON file > fallback
        # Note: format_skill_md() hardcodes dimension=1536 in the JSON, so we must
        # give EMBEDDING_DIMENSIONS priority over the file to handle sentence-transformers (384).
        dimension = kwargs.get(
            "dimension",
            EMBEDDING_DIMENSIONS.get(embedding_function, data.get("dimension", 1536)),
        )

        try:
            # Generate embeddings FIRST — before creating the index.
            # This avoids leaving an empty Pinecone index behind when
            # embedding generation fails (e.g. missing API key).
            texts = [vec["metadata"]["text"] for vec in data["vectors"]]

            if embedding_function == "openai":
                embeddings = self._generate_openai_embeddings(texts)
            elif embedding_function == "sentence-transformers":
                embeddings = self._generate_st_embeddings(texts)
            else:
                return {
                    "success": False,
                    "message": f"Unknown embedding_function: {embedding_function}. Use 'openai' or 'sentence-transformers'.",
                }

            pc = Pinecone(api_key=api_key)

            # Create index if it doesn't exist
            existing_indexes = [idx.name for idx in pc.list_indexes()]
            if index_name not in existing_indexes:
                print(f"🔧 Creating Pinecone index: {index_name} (dimension={dimension}, metric={metric})")
                pc.create_index(
                    name=index_name,
                    dimension=dimension,
                    metric=metric,
                    spec=ServerlessSpec(cloud=cloud, region=region),
                )
                print(f"✅ Index '{index_name}' created")
            else:
                print(f"ℹ️  Using existing index: {index_name}")

            index = pc.Index(index_name)

            # Batch upsert (100 per batch — Pinecone recommendation)
            batch_size = 100
            vectors_to_upsert = []
            for i, vec in enumerate(data["vectors"]):
                vectors_to_upsert.append(
                    {
                        "id": vec["id"],
                        "values": embeddings[i],
                        "metadata": vec["metadata"],
                    }
                )

            total = len(vectors_to_upsert)
            print(f"🔄 Upserting {total} vectors to Pinecone...")

            for i in range(0, total, batch_size):
                batch = vectors_to_upsert[i : i + batch_size]
                index.upsert(vectors=batch, namespace=namespace)
                print(f"   ✓ Upserted {min(i + batch_size, total)}/{total}")

            print(f"✅ Uploaded {total} vectors to Pinecone index '{index_name}'")

            return {
                "success": True,
                "message": f"Uploaded {total} vectors to Pinecone index '{index_name}' (namespace: '{namespace}')",
                "url": None,
                "index": index_name,
                "namespace": namespace,
                "count": total,
            }

        # Deliberate best-effort boundary: upload errors are reported to the
        # caller as a result dict rather than raised.
        except Exception as e:
            return {"success": False, "message": f"Pinecone upload failed: {e}"}

    def validate_api_key(self, _api_key: str) -> bool:
        """Pinecone doesn't need API key for packaging."""
        return False

    def get_env_var_name(self) -> str:
        """Return the expected env var for Pinecone API key."""
        return "PINECONE_API_KEY"

    def supports_enhancement(self) -> bool:
        """Pinecone format doesn't support AI enhancement."""
        return False

    def enhance(self, _skill_dir: Path, _api_key: str) -> bool:
        """Pinecone format doesn't support enhancement."""
        print("❌ Pinecone format does not support enhancement")
        print("   Enhance before packaging:")
        print("   skill-seekers enhance output/skill/ --mode LOCAL")
        print("   skill-seekers package output/skill/ --target pinecone")
        return False
|
||||
Reference in New Issue
Block a user