fix: resolve 18 bugs and code quality issues across adaptors, CLI, and chunking pipeline
Bug fixes: - Fix --var flag silently dropped in create routing (args.workflow_var → args.var) - Fix double _score_code_quality() call in word scraper - Add .docx file extension validation in WordToSkillConverter - Fix weaviate ImportError masked by generic Exception handler - Fix RAG chunking crash using non-existent converter.output_dir Chunking pipeline improvements: - Wire --chunk-overlap-tokens through entire package pipeline (package_skill → adaptor.package → format_skill_md → _maybe_chunk_content → RAGChunker) - Add auto-scaling overlap: max(50, chunk_tokens//10) when chunk size is non-default - Rename --no-preserve-code to --no-preserve-code-blocks (backward-compat alias kept) - Replace hardcoded 512/50 chunk defaults with DEFAULT_CHUNK_TOKENS/DEFAULT_CHUNK_OVERLAP_TOKENS constants across all 12 concrete adaptors, rag_chunker, base, and package_skill Code quality: - Extract shared _generate_openai_embeddings() and _generate_st_embeddings() to SkillAdaptor base class, removing ~150 lines of duplication from chroma/weaviate/pinecone - Add Pinecone adaptor with full upload support (pinecone_adaptor.py) Tests (14 new): - chunk_overlap_tokens parameter wiring, auto-scaling overlap, preserve_code_blocks flag - .docx/.doc/no-extension file validation, --var flag routing E2E - Embedding method inheritance verification, backward-compatible flag aliases Docs: - Update CHANGELOG, CLI_REFERENCE, API_REFERENCE, packaging guide (EN+ZH) - Update README test count badge (1880+ → 2283+) All 2283 tests passing, 8 skipped, 0 failures. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -11,6 +11,7 @@ from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from .base import SkillAdaptor, SkillMetadata
|
||||
from skill_seekers.cli.arguments.common import DEFAULT_CHUNK_TOKENS, DEFAULT_CHUNK_OVERLAP_TOKENS
|
||||
|
||||
|
||||
class FAISSHelpers(SkillAdaptor):
|
||||
@@ -81,6 +82,7 @@ class FAISSHelpers(SkillAdaptor):
|
||||
"file": "SKILL.md",
|
||||
"type": "documentation",
|
||||
"version": metadata.version,
|
||||
"doc_version": metadata.doc_version,
|
||||
}
|
||||
|
||||
# Chunk if enabled
|
||||
@@ -88,9 +90,10 @@ class FAISSHelpers(SkillAdaptor):
|
||||
content,
|
||||
doc_metadata,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
|
||||
chunk_max_tokens=kwargs.get("chunk_max_tokens", DEFAULT_CHUNK_TOKENS),
|
||||
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
|
||||
source_file="SKILL.md",
|
||||
chunk_overlap_tokens=kwargs.get("chunk_overlap_tokens", DEFAULT_CHUNK_OVERLAP_TOKENS),
|
||||
)
|
||||
|
||||
# Add all chunks to parallel arrays
|
||||
@@ -110,6 +113,7 @@ class FAISSHelpers(SkillAdaptor):
|
||||
"file": ref_file.name,
|
||||
"type": "reference",
|
||||
"version": metadata.version,
|
||||
"doc_version": metadata.doc_version,
|
||||
}
|
||||
|
||||
# Chunk if enabled
|
||||
@@ -117,9 +121,10 @@ class FAISSHelpers(SkillAdaptor):
|
||||
ref_content,
|
||||
doc_metadata,
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=kwargs.get("chunk_max_tokens", 512),
|
||||
chunk_max_tokens=kwargs.get("chunk_max_tokens", DEFAULT_CHUNK_TOKENS),
|
||||
preserve_code_blocks=kwargs.get("preserve_code_blocks", True),
|
||||
source_file=ref_file.name,
|
||||
chunk_overlap_tokens=kwargs.get("chunk_overlap_tokens", DEFAULT_CHUNK_OVERLAP_TOKENS),
|
||||
)
|
||||
|
||||
# Add all chunks to parallel arrays
|
||||
@@ -155,8 +160,9 @@ class FAISSHelpers(SkillAdaptor):
|
||||
skill_dir: Path,
|
||||
output_path: Path,
|
||||
enable_chunking: bool = False,
|
||||
chunk_max_tokens: int = 512,
|
||||
chunk_max_tokens: int = DEFAULT_CHUNK_TOKENS,
|
||||
preserve_code_blocks: bool = True,
|
||||
chunk_overlap_tokens: int = DEFAULT_CHUNK_OVERLAP_TOKENS,
|
||||
) -> Path:
|
||||
"""
|
||||
Package skill into JSON file for FAISS.
|
||||
@@ -176,12 +182,8 @@ class FAISSHelpers(SkillAdaptor):
|
||||
output_path = self._format_output_path(skill_dir, Path(output_path), "-faiss.json")
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Read metadata
|
||||
metadata = SkillMetadata(
|
||||
name=skill_dir.name,
|
||||
description=f"FAISS data for {skill_dir.name}",
|
||||
version="1.0.0",
|
||||
)
|
||||
# Read metadata from SKILL.md frontmatter
|
||||
metadata = self._build_skill_metadata(skill_dir)
|
||||
|
||||
# Generate FAISS data
|
||||
faiss_json = self.format_skill_md(
|
||||
@@ -190,6 +192,7 @@ class FAISSHelpers(SkillAdaptor):
|
||||
enable_chunking=enable_chunking,
|
||||
chunk_max_tokens=chunk_max_tokens,
|
||||
preserve_code_blocks=preserve_code_blocks,
|
||||
chunk_overlap_tokens=chunk_overlap_tokens,
|
||||
)
|
||||
|
||||
# Write to file
|
||||
|
||||
Reference in New Issue
Block a user