fix: resolve 18 bugs and code quality issues across adaptors, CLI, and chunking pipeline

Bug fixes:
- Fix --var flag silently dropped in create routing (args.workflow_var → args.var)
- Fix double _score_code_quality() call in word scraper
- Add .docx file extension validation in WordToSkillConverter
- Fix weaviate ImportError masked by generic Exception handler
- Fix RAG chunking crash using non-existent converter.output_dir

Chunking pipeline improvements:
- Wire --chunk-overlap-tokens through entire package pipeline
  (package_skill → adaptor.package → format_skill_md → _maybe_chunk_content → RAGChunker)
- Add auto-scaling overlap: max(50, chunk_tokens//10) when chunk size is non-default
- Rename --no-preserve-code to --no-preserve-code-blocks (old flag kept as a backward-compatible alias)
- Replace hardcoded 512/50 chunk defaults with DEFAULT_CHUNK_TOKENS/DEFAULT_CHUNK_OVERLAP_TOKENS
  constants across all 12 concrete adaptors, rag_chunker, base, and package_skill

Code quality:
- Extract shared _generate_openai_embeddings() and _generate_st_embeddings() to SkillAdaptor
  base class, removing ~150 lines of duplication from chroma/weaviate/pinecone
- Add Pinecone adaptor with full upload support (pinecone_adaptor.py)

Tests (14 new):
- chunk_overlap_tokens parameter wiring, auto-scaling overlap, preserve_code_blocks flag
- .docx/.doc/no-extension file validation, --var flag routing E2E
- Embedding method inheritance verification, backward-compatible flag aliases

Docs:
- Update CHANGELOG, CLI_REFERENCE, API_REFERENCE, packaging guide (EN+ZH)
- Update README test count badge (1880+ → 2283+)

All 2283 tests passing, 8 skipped, 0 failures.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
yusyus
2026-02-28 21:57:59 +03:00
parent 3bad7cf365
commit 064405c052
41 changed files with 1864 additions and 237 deletions

56
uv.lock generated
View File

@@ -3621,11 +3621,11 @@ wheels = [
[[package]]
name = "packaging"
version = "25.0"
version = "24.2"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/a1/d4/1fc4078c65507b51b96ca8f8c3ba19e6a61c8253c72794544580a7b6c24d/packaging-25.0.tar.gz", hash = "sha256:d443872c98d677bf60f6a1f2f8c1cb748e8fe762d2bf9d3148b5599295b0fc4f", size = 165727, upload-time = "2025-04-19T11:48:59.673Z" }
sdist = { url = "https://files.pythonhosted.org/packages/d0/63/68dbb6eb2de9cb10ee4c9c14a0148804425e13c4fb20d61cce69f53106da/packaging-24.2.tar.gz", hash = "sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f", size = 163950, upload-time = "2024-11-08T09:47:47.202Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/20/12/38679034af332785aac8774540895e234f4d07f7545804097de4b666afd8/packaging-25.0-py3-none-any.whl", hash = "sha256:29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484", size = 66469, upload-time = "2025-04-19T11:48:57.875Z" },
{ url = "https://files.pythonhosted.org/packages/88/ef/eb23f262cca3c0c4eb7ab1933c3b1f03d021f2c48f54763065b6f0e321be/packaging-24.2-py3-none-any.whl", hash = "sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759", size = 65451, upload-time = "2024-11-08T09:47:44.722Z" },
]
[[package]]
@@ -3797,6 +3797,46 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/2d/71/64e9b1c7f04ae0027f788a248e6297d7fcc29571371fe7d45495a78172c0/pillow-12.1.0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:75af0b4c229ac519b155028fa1be632d812a519abba9b46b20e50c6caa184f19", size = 7029809, upload-time = "2026-01-02T09:13:26.541Z" },
]
[[package]]
name = "pinecone"
version = "8.1.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "certifi" },
{ name = "orjson" },
{ name = "pinecone-plugin-assistant" },
{ name = "pinecone-plugin-interface" },
{ name = "python-dateutil" },
{ name = "typing-extensions" },
{ name = "urllib3" },
]
sdist = { url = "https://files.pythonhosted.org/packages/e2/e4/8303133de5b3850c85d56caf9cc23cc38c74942bb8a940890b225245d7df/pinecone-8.1.0.tar.gz", hash = "sha256:48a00843fb232ccfd57eba618f0c0294e918b030e1bc7e853fb88d04f80ba569", size = 1041965, upload-time = "2026-02-19T20:08:32.999Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/4e/f7/beee7033ef92e5964e570fc29a048627e298745916e65c66105378405d06/pinecone-8.1.0-py3-none-any.whl", hash = "sha256:b0ba9c55c9a072fbe4fc7381bc3e5eb1b14550a8007233a3368ada74b1747534", size = 742745, upload-time = "2026-02-19T20:08:31.319Z" },
]
[[package]]
name = "pinecone-plugin-assistant"
version = "3.0.2"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "packaging" },
{ name = "requests" },
]
sdist = { url = "https://files.pythonhosted.org/packages/c4/16/dcaff42ddfeab75dccd17685a0db46489717c3d23753dc14c55770e12aa8/pinecone_plugin_assistant-3.0.2.tar.gz", hash = "sha256:04163af282ad7895b581ab89f850ed139e4ddcea72010cadfa4c573759d5c896", size = 152066, upload-time = "2026-02-01T09:08:48.04Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/4a/dd/8bc4f3baf6c03acfb0b300f5aba53d19cc3a319281da518182bf22671b92/pinecone_plugin_assistant-3.0.2-py3-none-any.whl", hash = "sha256:de21ff696219fcad6c7ec86a3d1f70875024314537758ab345b6230462342903", size = 280863, upload-time = "2026-02-01T09:08:49.384Z" },
]
[[package]]
name = "pinecone-plugin-interface"
version = "0.0.7"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/f4/fb/e8a4063264953ead9e2b24d9b390152c60f042c951c47f4592e9996e57ff/pinecone_plugin_interface-0.0.7.tar.gz", hash = "sha256:b8e6675e41847333aa13923cc44daa3f85676d7157324682dc1640588a982846", size = 3370, upload-time = "2024-06-05T01:57:52.093Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/3b/1d/a21fdfcd6d022cb64cef5c2a29ee6691c6c103c4566b41646b080b7536a5/pinecone_plugin_interface-0.0.7-py3-none-any.whl", hash = "sha256:875857ad9c9fc8bbc074dbe780d187a2afd21f5bfe0f3b08601924a61ef1bba8", size = 6249, upload-time = "2024-06-05T01:57:50.583Z" },
]
[[package]]
name = "platformdirs"
version = "4.9.2"
@@ -5405,6 +5445,7 @@ all = [
{ name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
{ name = "numpy", version = "2.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
{ name = "openai" },
{ name = "pinecone" },
{ name = "python-docx" },
{ name = "sentence-transformers" },
{ name = "sse-starlette" },
@@ -5457,8 +5498,12 @@ mcp = [
openai = [
{ name = "openai" },
]
pinecone = [
{ name = "pinecone" },
]
rag-upload = [
{ name = "chromadb" },
{ name = "pinecone" },
{ name = "sentence-transformers" },
{ name = "weaviate-client" },
]
@@ -5533,6 +5578,9 @@ requires-dist = [
{ name = "openai", marker = "extra == 'openai'", specifier = ">=1.0.0" },
{ name = "pathspec", specifier = ">=0.12.1" },
{ name = "pillow", specifier = ">=11.0.0" },
{ name = "pinecone", marker = "extra == 'all'", specifier = ">=5.0.0" },
{ name = "pinecone", marker = "extra == 'pinecone'", specifier = ">=5.0.0" },
{ name = "pinecone", marker = "extra == 'rag-upload'", specifier = ">=5.0.0" },
{ name = "pydantic", specifier = ">=2.12.3" },
{ name = "pydantic-settings", specifier = ">=2.11.0" },
{ name = "pygithub", specifier = ">=2.5.0" },
@@ -5563,7 +5611,7 @@ requires-dist = [
{ name = "weaviate-client", marker = "extra == 'rag-upload'", specifier = ">=3.25.0" },
{ name = "weaviate-client", marker = "extra == 'weaviate'", specifier = ">=3.25.0" },
]
provides-extras = ["mcp", "gemini", "openai", "all-llms", "s3", "gcs", "azure", "docx", "chroma", "weaviate", "sentence-transformers", "rag-upload", "all-cloud", "embedding", "all"]
provides-extras = ["mcp", "gemini", "openai", "all-llms", "s3", "gcs", "azure", "docx", "chroma", "weaviate", "sentence-transformers", "pinecone", "rag-upload", "all-cloud", "embedding", "all"]
[package.metadata.requires-dev]
dev = [