Merge branch 'development' into feature/video-scraper-pipeline

Sync with latest development changes including ruff formatting,
bug fixes, and pinecone adaptor additions.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
yusyus
2026-03-01 11:38:45 +03:00
43 changed files with 1988 additions and 261 deletions

View File

@@ -358,6 +358,107 @@ class TestChunkingCLIIntegration:
f"Small chunks ({len(data_small)}) should be more than large chunks ({len(data_large)})"
)
def test_chunk_overlap_tokens_parameter(self, tmp_path):
    """Test --chunk-overlap-tokens controls RAGChunker overlap."""
    from skill_seekers.cli.package_skill import package_skill

    skill_dir = create_test_skill(tmp_path, large_doc=True)

    def package_with_overlap(overlap):
        # Package the same skill, varying only the overlap setting,
        # and return the parsed JSON package contents.
        ok, path = package_skill(
            skill_dir=skill_dir,
            open_folder_after=False,
            skip_quality_check=True,
            target="langchain",
            enable_chunking=True,
            chunk_max_tokens=256,
            chunk_overlap_tokens=overlap,
        )
        assert ok
        assert path.exists()
        with open(path) as fh:
            return json.load(fh)

    # Default overlap (50), then a large overlap (128).
    data_default = package_with_overlap(50)
    data_large_overlap = package_with_overlap(128)
    # Large overlap should produce more chunks (more overlap = more chunks)
    assert len(data_large_overlap) >= len(data_default), (
        f"Large overlap ({len(data_large_overlap)}) should produce >= chunks than default ({len(data_default)})"
    )
def test_chunk_overlap_scales_with_chunk_size(self, tmp_path):
    """Test that overlap auto-scales when chunk_tokens is non-default but overlap is default."""
    from skill_seekers.cli.adaptors.base import (
        DEFAULT_CHUNK_TOKENS,
        DEFAULT_CHUNK_OVERLAP_TOKENS,
    )

    adaptor = get_adaptor("langchain")
    skill_dir = create_test_skill(tmp_path, large_doc=True)
    adaptor._build_skill_metadata(skill_dir)
    content = (skill_dir / "SKILL.md").read_text()

    def chunk_with_max_tokens(max_tokens):
        # Overlap is left at the default so any auto-scaling logic applies.
        return adaptor._maybe_chunk_content(
            content,
            {"source": "test"},
            enable_chunking=True,
            chunk_max_tokens=max_tokens,
            chunk_overlap_tokens=DEFAULT_CHUNK_OVERLAP_TOKENS,
        )

    # With default chunk size (512) and default overlap (50), overlap should be 50
    chunks_default = chunk_with_max_tokens(DEFAULT_CHUNK_TOKENS)
    # With large chunk size (1024) and default overlap (50),
    # overlap should auto-scale to max(50, 1024//10) = 102
    chunks_large = chunk_with_max_tokens(1024)

    # Both should produce valid chunks
    assert len(chunks_default) > 1
    assert len(chunks_large) >= 1
def test_preserve_code_blocks_flag(self, tmp_path):
    """Test --no-preserve-code-blocks parameter is accepted."""
    from skill_seekers.cli.package_skill import package_skill

    skill_dir = create_test_skill(tmp_path, large_doc=True)
    # Package with code block preservation disabled; packaging must still succeed.
    success, package_path = package_skill(
        skill_dir=skill_dir,
        open_folder_after=False,
        skip_quality_check=True,
        target="langchain",
        enable_chunking=True,
        chunk_max_tokens=256,
        preserve_code_blocks=False,
    )
    assert success is True
    assert package_path.exists() is True
# Allow running this test module directly (outside the pytest CLI).
if __name__ == "__main__":
pytest.main([__file__, "-v"])

View File

@@ -294,5 +294,84 @@ class TestE2EWorkflow:
assert "unrecognized arguments" not in result.stderr.lower()
class TestVarFlagRouting:
    """Test that --var flag is correctly routed through create command."""

    @staticmethod
    def _help_output(command):
        # Capture the --help text of a skill-seekers subcommand.
        completed = subprocess.run(
            ["skill-seekers", command, "--help"],
            capture_output=True,
            text=True,
        )
        return completed.stdout

    def test_var_flag_accepted_by_create(self):
        """Test that --var flag is accepted (not 'unrecognized') by create command."""
        assert "--var" in self._help_output("create"), "create --help should show --var flag"

    def test_var_flag_accepted_by_analyze(self):
        """Test that --var flag is accepted by analyze command."""
        assert "--var" in self._help_output("analyze"), "analyze --help should show --var flag"

    @pytest.mark.slow
    def test_var_flag_not_rejected_in_create_local(self, tmp_path):
        """Test --var KEY=VALUE doesn't cause 'unrecognized arguments' in create."""
        code_dir = tmp_path / "test_code"
        code_dir.mkdir()
        (code_dir / "test.py").write_text("def hello(): pass")
        completed = subprocess.run(
            [
                "skill-seekers",
                "create",
                str(code_dir),
                "--var",
                "foo=bar",
                "--dry-run",
            ],
            capture_output=True,
            text=True,
            timeout=15,
        )
        assert "unrecognized arguments" not in completed.stderr.lower(), (
            f"--var should be accepted, got stderr: {completed.stderr}"
        )
class TestBackwardCompatibleFlags:
"""Test that deprecated flag aliases still work."""
# NOTE(review): these tests only smoke-run `--help` and check the exit code;
# they never actually pass --no-preserve-code, so they do not prove the alias
# is accepted. Consider invoking the alias directly (as TestVarFlagRouting
# does for --var) and asserting "unrecognized arguments" is absent — TODO confirm
# that argparse's --help early-exit does not mask an unknown-flag error there.
def test_no_preserve_code_alias_accepted_by_package(self):
"""Test --no-preserve-code (old name) is still accepted by package command."""
result = subprocess.run(
["skill-seekers", "package", "--help"],
capture_output=True,
text=True,
)
# The old flag should not appear in --help (it's suppressed)
# but should not cause an error if used
assert result.returncode == 0
def test_no_preserve_code_alias_accepted_by_scrape(self):
"""Test --no-preserve-code (old name) is still accepted by scrape command."""
result = subprocess.run(
["skill-seekers", "scrape", "--help"],
capture_output=True,
text=True,
)
assert result.returncode == 0
def test_no_preserve_code_alias_accepted_by_create(self):
"""Test --no-preserve-code (old name) is still accepted by create command."""
# Uses --help-all (not --help) — presumably create hides some flags behind
# an extended help screen; verify against the create CLI definition.
result = subprocess.run(
["skill-seekers", "create", "--help-all"],
capture_output=True,
text=True,
)
assert result.returncode == 0
# Allow running this test module directly (outside the pytest CLI).
if __name__ == "__main__":
pytest.main([__file__, "-v", "-s"])

View File

@@ -25,8 +25,8 @@ class TestUniversalArguments:
"""Test universal argument definitions."""
def test_universal_count(self):
    """Should have exactly 19 universal arguments (after Phase 2 workflow integration + local_repo_path + doc_version)."""
    # The diff rendering interleaved the old (== 18) and new (== 19) lines;
    # only the new assertion is kept — doc_version raised the count to 19.
    assert len(UNIVERSAL_ARGUMENTS) == 19
def test_universal_argument_names(self):
"""Universal arguments should have expected names."""
@@ -50,6 +50,7 @@ class TestUniversalArguments:
"var",
"workflow_dry_run",
"local_repo_path", # GitHub local clone path for unlimited C3.x analysis
"doc_version", # Documentation version tag for RAG metadata
}
assert set(UNIVERSAL_ARGUMENTS.keys()) == expected_names
@@ -130,7 +131,9 @@ class TestArgumentHelpers:
"""Should return set of universal argument names."""
names = get_universal_argument_names()
assert isinstance(names, set)
assert len(names) == 18 # Phase 2: added 4 workflow arguments + local_repo_path
assert (
len(names) == 19
) # Phase 2: added 4 workflow arguments + local_repo_path + doc_version
assert "name" in names
assert "enhance_level" in names # Phase 1: consolidated flag
assert "enhance_workflow" in names # Phase 2: workflow support

View File

@@ -0,0 +1,764 @@
#!/usr/bin/env python3
"""
Tests for Pinecone adaptor and doc_version metadata flow.
"""
import json
import pytest
from skill_seekers.cli.adaptors.base import SkillMetadata
# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------
@pytest.fixture
def sample_skill_dir(tmp_path):
"""Create a minimal skill directory with SKILL.md and references."""
skill_dir = tmp_path / "test-skill"
skill_dir.mkdir()
# SKILL.md with YAML frontmatter carrying doc_version: 16.2 — the value the
# doc_version tests below expect to find propagated into package metadata.
# NOTE(review): blank lines inside this literal may have been stripped by the
# diff rendering — verify against the committed file.
skill_md = """---
name: test-skill
description: A test skill for pinecone
doc_version: 16.2
---
# Test Skill
This is a test skill for Pinecone adaptor testing.
## Quick Start
Get started quickly.
"""
(skill_dir / "SKILL.md").write_text(skill_md)
# Two reference docs so packaged output contains both overview and reference vectors.
refs_dir = skill_dir / "references"
refs_dir.mkdir()
(refs_dir / "api_reference.md").write_text("# API Reference\n\nSome API docs.\n")
(refs_dir / "getting_started.md").write_text(
"# Getting Started\n\nSome getting started docs.\n"
)
return skill_dir
@pytest.fixture
def sample_skill_dir_no_doc_version(tmp_path):
"""Create a skill directory without doc_version in frontmatter."""
skill_dir = tmp_path / "no-version-skill"
skill_dir.mkdir()
# Frontmatter deliberately omits doc_version so tests can verify the
# empty-string default is used.
skill_md = """---
name: no-version-skill
description: A test skill without doc_version
---
# No Version Skill
Content here.
"""
(skill_dir / "SKILL.md").write_text(skill_md)
refs_dir = skill_dir / "references"
refs_dir.mkdir()
(refs_dir / "api.md").write_text("# API\n\nAPI docs.\n")
return skill_dir
# ---------------------------------------------------------------------------
# Pinecone Adaptor Tests
# ---------------------------------------------------------------------------
class TestPineconeAdaptor:
"""Test Pinecone adaptor functionality."""
# --- Registration and platform constants ---
def test_import(self):
"""PineconeAdaptor can be imported."""
from skill_seekers.cli.adaptors.pinecone_adaptor import PineconeAdaptor
assert PineconeAdaptor is not None
def test_platform_constants(self):
"""Platform constants are set correctly."""
from skill_seekers.cli.adaptors.pinecone_adaptor import PineconeAdaptor
adaptor = PineconeAdaptor()
assert adaptor.PLATFORM == "pinecone"
assert adaptor.PLATFORM_NAME == "Pinecone (Vector Database)"
assert adaptor.DEFAULT_API_ENDPOINT is None
def test_registered_in_factory(self):
"""PineconeAdaptor is registered in the adaptor factory."""
from skill_seekers.cli.adaptors import ADAPTORS
assert "pinecone" in ADAPTORS
def test_get_adaptor(self):
"""get_adaptor('pinecone') returns PineconeAdaptor instance."""
from skill_seekers.cli.adaptors import get_adaptor
from skill_seekers.cli.adaptors.pinecone_adaptor import PineconeAdaptor
adaptor = get_adaptor("pinecone")
assert isinstance(adaptor, PineconeAdaptor)
# --- format_skill_md output structure ---
def test_format_skill_md_structure(self, sample_skill_dir):
"""format_skill_md returns valid JSON with expected structure."""
from skill_seekers.cli.adaptors.pinecone_adaptor import PineconeAdaptor
adaptor = PineconeAdaptor()
metadata = SkillMetadata(
name="test-skill",
description="Test skill",
version="1.0.0",
doc_version="16.2",
)
result = adaptor.format_skill_md(sample_skill_dir, metadata)
data = json.loads(result)
assert "index_name" in data
assert "namespace" in data
assert "dimension" in data
assert "metric" in data
assert "vectors" in data
# 1536 is the OpenAI text-embedding default dimension used by the adaptor.
assert data["dimension"] == 1536
assert data["metric"] == "cosine"
def test_format_skill_md_vectors_have_metadata(self, sample_skill_dir):
"""Each vector has id and metadata fields."""
from skill_seekers.cli.adaptors.pinecone_adaptor import PineconeAdaptor
adaptor = PineconeAdaptor()
metadata = SkillMetadata(
name="test-skill",
description="Test",
doc_version="16.2",
)
result = adaptor.format_skill_md(sample_skill_dir, metadata)
data = json.loads(result)
assert len(data["vectors"]) > 0
for vec in data["vectors"]:
assert "id" in vec
assert "metadata" in vec
assert "text" in vec["metadata"]
assert "source" in vec["metadata"]
assert "category" in vec["metadata"]
assert "file" in vec["metadata"]
assert "type" in vec["metadata"]
assert "version" in vec["metadata"]
assert "doc_version" in vec["metadata"]
def test_format_skill_md_doc_version_propagates(self, sample_skill_dir):
"""doc_version flows into every vector's metadata."""
from skill_seekers.cli.adaptors.pinecone_adaptor import PineconeAdaptor
adaptor = PineconeAdaptor()
metadata = SkillMetadata(
name="test-skill",
description="Test",
doc_version="16.2",
)
result = adaptor.format_skill_md(sample_skill_dir, metadata)
data = json.loads(result)
for vec in data["vectors"]:
assert vec["metadata"]["doc_version"] == "16.2"
def test_format_skill_md_empty_doc_version(self, sample_skill_dir):
"""Empty doc_version is preserved as empty string."""
from skill_seekers.cli.adaptors.pinecone_adaptor import PineconeAdaptor
adaptor = PineconeAdaptor()
metadata = SkillMetadata(name="test-skill", description="Test", doc_version="")
result = adaptor.format_skill_md(sample_skill_dir, metadata)
data = json.loads(result)
for vec in data["vectors"]:
assert vec["metadata"]["doc_version"] == ""
def test_format_skill_md_has_overview_and_references(self, sample_skill_dir):
"""Output includes overview (SKILL.md) and reference documents."""
from skill_seekers.cli.adaptors.pinecone_adaptor import PineconeAdaptor
adaptor = PineconeAdaptor()
metadata = SkillMetadata(name="test-skill", description="Test")
result = adaptor.format_skill_md(sample_skill_dir, metadata)
data = json.loads(result)
categories = {vec["metadata"]["category"] for vec in data["vectors"]}
types = {vec["metadata"]["type"] for vec in data["vectors"]}
assert "overview" in categories
assert "documentation" in types
assert "reference" in types
# --- package() behavior ---
def test_package_creates_file(self, sample_skill_dir, tmp_path):
"""package() creates a JSON file at expected path."""
from skill_seekers.cli.adaptors.pinecone_adaptor import PineconeAdaptor
adaptor = PineconeAdaptor()
output_path = adaptor.package(sample_skill_dir, tmp_path)
assert output_path.exists()
assert output_path.name.endswith("-pinecone.json")
data = json.loads(output_path.read_text())
assert "vectors" in data
assert len(data["vectors"]) > 0
def test_package_reads_frontmatter_metadata(self, sample_skill_dir, tmp_path):
"""package() reads doc_version from SKILL.md frontmatter."""
from skill_seekers.cli.adaptors.pinecone_adaptor import PineconeAdaptor
adaptor = PineconeAdaptor()
output_path = adaptor.package(sample_skill_dir, tmp_path)
data = json.loads(output_path.read_text())
for vec in data["vectors"]:
assert vec["metadata"]["doc_version"] == "16.2"
def test_package_with_chunking(self, sample_skill_dir, tmp_path):
"""package() with chunking enabled produces valid output."""
from skill_seekers.cli.adaptors.pinecone_adaptor import PineconeAdaptor
adaptor = PineconeAdaptor()
# Tiny chunk size (64 tokens) to force chunking on the small fixture docs.
output_path = adaptor.package(
sample_skill_dir, tmp_path, enable_chunking=True, chunk_max_tokens=64
)
data = json.loads(output_path.read_text())
assert "vectors" in data
assert len(data["vectors"]) > 0
def test_index_name_derived_from_skill_name(self, sample_skill_dir, tmp_path):
"""index_name and namespace are derived from skill directory name."""
from skill_seekers.cli.adaptors.pinecone_adaptor import PineconeAdaptor
adaptor = PineconeAdaptor()
output_path = adaptor.package(sample_skill_dir, tmp_path)
data = json.loads(output_path.read_text())
assert data["index_name"] == "test-skill"
assert data["namespace"] == "test-skill"
def test_no_values_field_in_vectors(self, sample_skill_dir, tmp_path):
"""Vectors have no 'values' field — embeddings are added at upload time."""
from skill_seekers.cli.adaptors.pinecone_adaptor import PineconeAdaptor
adaptor = PineconeAdaptor()
output_path = adaptor.package(sample_skill_dir, tmp_path)
data = json.loads(output_path.read_text())
for vec in data["vectors"]:
assert "values" not in vec
# --- Misc adaptor contract ---
def test_text_truncation(self):
"""_truncate_text_for_metadata respects byte limit."""
from skill_seekers.cli.adaptors.pinecone_adaptor import PineconeAdaptor
adaptor = PineconeAdaptor()
# Short text should not be truncated
assert adaptor._truncate_text_for_metadata("hello") == "hello"
# Very long text should be truncated
long_text = "x" * 50000
truncated = adaptor._truncate_text_for_metadata(long_text)
# 40000 bytes — presumably the Pinecone metadata size budget; verify in adaptor.
assert len(truncated.encode("utf-8")) <= 40000
def test_validate_api_key_returns_false(self):
"""validate_api_key returns False (no key needed for packaging)."""
from skill_seekers.cli.adaptors.pinecone_adaptor import PineconeAdaptor
adaptor = PineconeAdaptor()
assert adaptor.validate_api_key("some-key") is False
def test_get_env_var_name(self):
"""get_env_var_name returns PINECONE_API_KEY."""
from skill_seekers.cli.adaptors.pinecone_adaptor import PineconeAdaptor
adaptor = PineconeAdaptor()
assert adaptor.get_env_var_name() == "PINECONE_API_KEY"
def test_supports_enhancement_false(self):
"""Pinecone doesn't support enhancement."""
from skill_seekers.cli.adaptors.pinecone_adaptor import PineconeAdaptor
adaptor = PineconeAdaptor()
assert adaptor.supports_enhancement() is False
def test_upload_without_pinecone_installed(self, tmp_path):
"""upload() returns helpful error when pinecone not installed."""
from skill_seekers.cli.adaptors.pinecone_adaptor import PineconeAdaptor
adaptor = PineconeAdaptor()
# Create a dummy package file
pkg = tmp_path / "test-pinecone.json"
pkg.write_text(json.dumps({"vectors": [], "index_name": "test", "namespace": "test"}))
# This will either work (if pinecone is installed) or return error
result = adaptor.upload(pkg)
# Without API key, should fail
assert result["success"] is False
# --- Upload tests against a stubbed pinecone module ---
def _make_mock_pinecone(self, monkeypatch):
"""Helper: stub the pinecone module so upload() can run without a real server."""
import sys
import types
from unittest.mock import MagicMock
mock_module = types.ModuleType("pinecone")
mock_index = MagicMock()
mock_pc = MagicMock()
mock_pc.list_indexes.return_value = []  # no existing indexes
mock_pc.Index.return_value = mock_index
mock_module.Pinecone = MagicMock(return_value=mock_pc)
mock_module.ServerlessSpec = MagicMock()
# setitem on sys.modules is auto-undone by monkeypatch at test teardown.
monkeypatch.setitem(sys.modules, "pinecone", mock_module)
return mock_pc, mock_index
def _make_package(self, tmp_path, vectors=None):
"""Helper: create a minimal Pinecone package JSON."""
if vectors is None:
vectors = [{"id": "a", "metadata": {"text": "hello world"}}]
pkg = tmp_path / "test-pinecone.json"
pkg.write_text(
json.dumps(
{
"vectors": vectors,
"index_name": "test",
"namespace": "test",
"metric": "cosine",
"dimension": 1536,
}
)
)
return pkg
def test_upload_success_has_url_key(self, tmp_path, monkeypatch):
"""upload() success return dict includes 'url' key (prevents KeyError in package_skill.py)."""
from skill_seekers.cli.adaptors.pinecone_adaptor import PineconeAdaptor
adaptor = PineconeAdaptor()
mock_pc, _mock_index = self._make_mock_pinecone(monkeypatch)
# Stub embedding generation so no OpenAI call is made.
monkeypatch.setattr(
adaptor,
"_generate_openai_embeddings",
lambda docs: [[0.0] * 1536] * len(docs),
)
pkg = self._make_package(tmp_path)
result = adaptor.upload(pkg, api_key="fake-key")
assert result["success"] is True
assert "url" in result  # key must exist to avoid KeyError in package_skill.py
# Value should be None for Pinecone (no web URL)
assert result["url"] is None
def test_embedding_dimension_autodetect_st(self, tmp_path, monkeypatch):
"""sentence-transformers upload creates index with dimension=384."""
from skill_seekers.cli.adaptors.pinecone_adaptor import PineconeAdaptor
adaptor = PineconeAdaptor()
mock_pc, _mock_index = self._make_mock_pinecone(monkeypatch)
monkeypatch.setattr(
adaptor,
"_generate_st_embeddings",
lambda docs: [[0.0] * 384] * len(docs),
)
pkg = self._make_package(tmp_path)
result = adaptor.upload(
pkg,
api_key="fake-key",
embedding_function="sentence-transformers",
)
assert result["success"] is True
# Verify create_index was called with dimension=384
mock_pc.create_index.assert_called_once()
call_kwargs = mock_pc.create_index.call_args
assert call_kwargs.kwargs["dimension"] == 384
def test_embedding_dimension_autodetect_openai(self, tmp_path, monkeypatch):
"""openai upload creates index with dimension=1536."""
from skill_seekers.cli.adaptors.pinecone_adaptor import PineconeAdaptor
adaptor = PineconeAdaptor()
mock_pc, _mock_index = self._make_mock_pinecone(monkeypatch)
monkeypatch.setattr(
adaptor,
"_generate_openai_embeddings",
lambda docs: [[0.0] * 1536] * len(docs),
)
pkg = self._make_package(tmp_path)
result = adaptor.upload(
pkg,
api_key="fake-key",
embedding_function="openai",
)
assert result["success"] is True
mock_pc.create_index.assert_called_once()
call_kwargs = mock_pc.create_index.call_args
assert call_kwargs.kwargs["dimension"] == 1536
def test_embedding_before_index_creation(self, tmp_path, monkeypatch):
"""If embedding generation fails, index is never created (no side-effects)."""
from skill_seekers.cli.adaptors.pinecone_adaptor import PineconeAdaptor
adaptor = PineconeAdaptor()
mock_pc, _mock_index = self._make_mock_pinecone(monkeypatch)
def fail_embeddings(_docs):
raise RuntimeError("OPENAI_API_KEY not set")
monkeypatch.setattr(adaptor, "_generate_openai_embeddings", fail_embeddings)
pkg = self._make_package(tmp_path)
result = adaptor.upload(pkg, api_key="fake-key")
assert result["success"] is False
# Index must NOT have been created since embedding failed first
mock_pc.create_index.assert_not_called()
def test_embedding_dimension_explicit_override(self, tmp_path, monkeypatch):
"""Explicit dimension kwarg overrides both auto-detect and JSON file value."""
from skill_seekers.cli.adaptors.pinecone_adaptor import PineconeAdaptor
adaptor = PineconeAdaptor()
mock_pc, _mock_index = self._make_mock_pinecone(monkeypatch)
monkeypatch.setattr(
adaptor,
"_generate_openai_embeddings",
lambda docs: [[0.0] * 768] * len(docs),
)
pkg = self._make_package(tmp_path)
result = adaptor.upload(
pkg,
api_key="fake-key",
embedding_function="openai",
dimension=768,
)
assert result["success"] is True
mock_pc.create_index.assert_called_once()
call_kwargs = mock_pc.create_index.call_args
assert call_kwargs.kwargs["dimension"] == 768
def test_deterministic_ids(self, sample_skill_dir):
"""IDs are deterministic — same input produces same ID."""
from skill_seekers.cli.adaptors.pinecone_adaptor import PineconeAdaptor
adaptor = PineconeAdaptor()
metadata = SkillMetadata(name="test-skill", description="Test")
result1 = adaptor.format_skill_md(sample_skill_dir, metadata)
result2 = adaptor.format_skill_md(sample_skill_dir, metadata)
data1 = json.loads(result1)
data2 = json.loads(result2)
ids1 = [v["id"] for v in data1["vectors"]]
ids2 = [v["id"] for v in data2["vectors"]]
assert ids1 == ids2
# ---------------------------------------------------------------------------
# doc_version Metadata Tests (cross-adaptor)
# ---------------------------------------------------------------------------
class TestDocVersionMetadata:
"""Test doc_version flows through all RAG adaptors."""
# --- SkillMetadata dataclass ---
def test_skill_metadata_has_doc_version(self):
"""SkillMetadata dataclass has doc_version field."""
meta = SkillMetadata(name="test", description="test", doc_version="3.2")
assert meta.doc_version == "3.2"
def test_skill_metadata_doc_version_default_empty(self):
"""doc_version defaults to empty string."""
meta = SkillMetadata(name="test", description="test")
assert meta.doc_version == ""
# --- Frontmatter parsing (exercised via the Pinecone adaptor instance) ---
def test_read_frontmatter(self, sample_skill_dir):
"""_read_frontmatter reads doc_version from SKILL.md."""
from skill_seekers.cli.adaptors.pinecone_adaptor import PineconeAdaptor
adaptor = PineconeAdaptor()
fm = adaptor._read_frontmatter(sample_skill_dir)
assert fm["doc_version"] == "16.2"
assert fm["name"] == "test-skill"
def test_read_frontmatter_missing(self, sample_skill_dir_no_doc_version):
"""_read_frontmatter returns empty string when doc_version is absent."""
from skill_seekers.cli.adaptors.pinecone_adaptor import PineconeAdaptor
adaptor = PineconeAdaptor()
fm = adaptor._read_frontmatter(sample_skill_dir_no_doc_version)
assert fm.get("doc_version") is None  # key not present
def test_build_skill_metadata_reads_doc_version(self, sample_skill_dir):
"""_build_skill_metadata populates doc_version from frontmatter."""
from skill_seekers.cli.adaptors.pinecone_adaptor import PineconeAdaptor
adaptor = PineconeAdaptor()
meta = adaptor._build_skill_metadata(sample_skill_dir)
assert meta.doc_version == "16.2"
assert meta.name == "test-skill"
def test_build_skill_metadata_no_doc_version(self, sample_skill_dir_no_doc_version):
"""_build_skill_metadata defaults to empty string when frontmatter has no doc_version."""
from skill_seekers.cli.adaptors.pinecone_adaptor import PineconeAdaptor
adaptor = PineconeAdaptor()
meta = adaptor._build_skill_metadata(sample_skill_dir_no_doc_version)
assert meta.doc_version == ""
def test_build_metadata_dict_includes_doc_version(self):
"""_build_metadata_dict includes doc_version in output."""
from skill_seekers.cli.adaptors.pinecone_adaptor import PineconeAdaptor
adaptor = PineconeAdaptor()
meta = SkillMetadata(name="test", description="desc", doc_version="3.0")
result = adaptor._build_metadata_dict(meta)
assert "doc_version" in result
assert result["doc_version"] == "3.0"
def test_build_metadata_dict_empty_doc_version(self):
"""_build_metadata_dict preserves empty doc_version."""
from skill_seekers.cli.adaptors.pinecone_adaptor import PineconeAdaptor
adaptor = PineconeAdaptor()
meta = SkillMetadata(name="test", description="desc")
result = adaptor._build_metadata_dict(meta)
assert "doc_version" in result
assert result["doc_version"] == ""
# --- Cross-adaptor package output (qdrant/weaviate tested separately below) ---
@pytest.mark.parametrize(
"platform",
["chroma", "faiss", "langchain", "llama-index", "haystack", "pinecone"],
)
def test_doc_version_in_package_output(self, platform, sample_skill_dir, tmp_path):
"""doc_version appears in package output for all RAG adaptors."""
from skill_seekers.cli.adaptors import get_adaptor
adaptor = get_adaptor(platform)
output_path = adaptor.package(sample_skill_dir, tmp_path)
data = json.loads(output_path.read_text())
# Each adaptor has a different structure — extract metadata dicts
meta_list = _extract_metadata_from_package(platform, data)
assert len(meta_list) > 0, f"No metadata found in {platform} output"
for meta in meta_list:
assert "doc_version" in meta, f"doc_version missing in {platform} metadata: {meta}"
assert meta["doc_version"] == "16.2", (
f"doc_version mismatch in {platform}: expected '16.2', got '{meta['doc_version']}'"
)
@pytest.mark.parametrize(
"platform",
["chroma", "faiss", "langchain", "llama-index", "haystack", "pinecone"],
)
def test_empty_doc_version_in_package_output(
self, platform, sample_skill_dir_no_doc_version, tmp_path
):
"""Empty doc_version is preserved (not omitted) in all adaptors."""
from skill_seekers.cli.adaptors import get_adaptor
adaptor = get_adaptor(platform)
output_path = adaptor.package(sample_skill_dir_no_doc_version, tmp_path)
data = json.loads(output_path.read_text())
meta_list = _extract_metadata_from_package(platform, data)
assert len(meta_list) > 0
for meta in meta_list:
assert "doc_version" in meta
# Qdrant and Weaviate may not be installed — test separately if available
class TestDocVersionQdrant:
    """Test doc_version in Qdrant adaptor (may require qdrant client)."""

    def test_qdrant_doc_version(self, sample_skill_dir, tmp_path):
        from skill_seekers.cli.adaptors import ADAPTORS

        if "qdrant" not in ADAPTORS:
            pytest.skip("Qdrant adaptor not available")
        from skill_seekers.cli.adaptors import get_adaptor

        qdrant_adaptor = get_adaptor("qdrant")
        package_file = qdrant_adaptor.package(sample_skill_dir, tmp_path)
        package_data = json.loads(package_file.read_text())
        # Every point payload must carry the fixture's doc_version.
        payloads = [point["payload"] for point in package_data["points"]]
        for payload in payloads:
            assert "doc_version" in payload
            assert payload["doc_version"] == "16.2"
class TestWeaviateUploadReturnKeys:
"""Test Weaviate upload() return dict has required keys."""
def test_weaviate_upload_success_has_url_key(self, sample_skill_dir, tmp_path, monkeypatch):
"""Weaviate upload() success return includes 'url' key (prevents KeyError in package_skill.py)."""
import sys
import types
from unittest.mock import MagicMock
from skill_seekers.cli.adaptors import ADAPTORS
if "weaviate" not in ADAPTORS:
pytest.skip("Weaviate adaptor not available")
from skill_seekers.cli.adaptors.weaviate import WeaviateAdaptor
adaptor = WeaviateAdaptor()
# Stub the weaviate module
# (must be in sys.modules before upload() imports it; monkeypatch restores it)
mock_module = types.ModuleType("weaviate")
mock_client = MagicMock()
mock_client.is_ready.return_value = True
mock_module.Client = MagicMock(return_value=mock_client)
mock_module.AuthApiKey = MagicMock()
monkeypatch.setitem(sys.modules, "weaviate", mock_module)
# Create a minimal weaviate package
output_path = adaptor.package(sample_skill_dir, tmp_path)
result = adaptor.upload(output_path)
assert result["success"] is True
assert "url" in result
# None is expected: Weaviate uploads have no web URL to report.
assert result["url"] is None
class TestDocVersionWeaviate:
    """Test doc_version in Weaviate adaptor (may require weaviate client)."""

    @staticmethod
    def _package_data(sample_skill_dir, tmp_path):
        # Package the sample skill with the Weaviate adaptor and return the
        # parsed JSON, skipping when the adaptor is unavailable.
        from skill_seekers.cli.adaptors import ADAPTORS

        if "weaviate" not in ADAPTORS:
            pytest.skip("Weaviate adaptor not available")
        from skill_seekers.cli.adaptors import get_adaptor

        adaptor = get_adaptor("weaviate")
        output_path = adaptor.package(sample_skill_dir, tmp_path)
        return json.loads(output_path.read_text())

    def test_weaviate_doc_version(self, sample_skill_dir, tmp_path):
        data = self._package_data(sample_skill_dir, tmp_path)
        for obj in data["objects"]:
            assert "doc_version" in obj["properties"]
            assert obj["properties"]["doc_version"] == "16.2"

    def test_weaviate_schema_includes_doc_version(self, sample_skill_dir, tmp_path):
        data = self._package_data(sample_skill_dir, tmp_path)
        property_names = [p["name"] for p in data["schema"]["properties"]]
        assert "doc_version" in property_names
# ---------------------------------------------------------------------------
# CLI Flag Tests
# ---------------------------------------------------------------------------
class TestDocVersionCLIFlag:
    """Test --doc-version CLI flag is accepted."""

    def test_common_arguments_has_doc_version(self):
        """COMMON_ARGUMENTS includes doc_version."""
        from skill_seekers.cli.arguments.common import COMMON_ARGUMENTS

        assert "doc_version" in COMMON_ARGUMENTS

    def test_create_arguments_has_doc_version(self):
        """UNIVERSAL_ARGUMENTS includes doc_version."""
        from skill_seekers.cli.arguments.create import UNIVERSAL_ARGUMENTS

        assert "doc_version" in UNIVERSAL_ARGUMENTS

    @staticmethod
    def _parse(argv):
        # Build a parser carrying the common arguments and parse argv.
        import argparse

        from skill_seekers.cli.arguments.common import add_common_arguments

        parser = argparse.ArgumentParser()
        add_common_arguments(parser)
        return parser.parse_args(argv)

    def test_doc_version_flag_parsed(self):
        """--doc-version is parsed correctly by argparse."""
        parsed = self._parse(["--doc-version", "16.2"])
        assert parsed.doc_version == "16.2"

    def test_doc_version_default_empty(self):
        """--doc-version defaults to empty string."""
        parsed = self._parse([])
        assert parsed.doc_version == ""
# ---------------------------------------------------------------------------
# Package choices test
# ---------------------------------------------------------------------------
class TestPineconeInPackageChoices:
    """Test pinecone is in package CLI choices."""

    def test_pinecone_in_package_arguments(self):
        """pinecone is listed in package --target choices."""
        from skill_seekers.cli.arguments.package import PACKAGE_ARGUMENTS

        target_choices = PACKAGE_ARGUMENTS["target"]["kwargs"]["choices"]
        assert "pinecone" in target_choices
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _extract_metadata_from_package(platform: str, data: dict) -> list[dict]:
"""Extract metadata dicts from adaptor-specific package format."""
meta_list = []
if platform == "pinecone":
for vec in data.get("vectors", []):
meta_list.append(vec.get("metadata", {}))
elif platform == "chroma":
for meta in data.get("metadatas", []):
meta_list.append(meta)
elif platform == "faiss":
for meta in data.get("metadatas", []):
meta_list.append(meta)
elif platform == "langchain":
for doc in data if isinstance(data, list) else []:
meta_list.append(doc.get("metadata", {}))
elif platform == "llama-index":
for node in data if isinstance(data, list) else []:
meta_list.append(node.get("metadata", {}))
elif platform == "haystack":
for doc in data if isinstance(data, list) else []:
meta_list.append(doc.get("meta", {}))
elif platform == "qdrant":
for point in data.get("points", []):
meta_list.append(point.get("payload", {}))
elif platform == "weaviate":
for obj in data.get("objects", []):
meta_list.append(obj.get("properties", {}))
return meta_list

View File

@@ -151,6 +151,45 @@ class TestWeaviateUploadBasics:
assert hasattr(adaptor, "_generate_openai_embeddings")
class TestEmbeddingMethodInheritance:
    """Test that shared embedding methods are properly inherited from base."""

    def test_chroma_inherits_openai_embeddings(self):
        """Chroma's _generate_openai_embeddings resolves to the base implementation."""
        from skill_seekers.cli.adaptors.base import SkillAdaptor

        adaptor = get_adaptor("chroma")
        assert hasattr(adaptor, "_generate_openai_embeddings")
        # The bound method must be the base-class function, not a local override.
        base_impl = SkillAdaptor._generate_openai_embeddings
        assert adaptor._generate_openai_embeddings.__func__ is base_impl

    def test_weaviate_inherits_both_embedding_methods(self):
        """Weaviate inherits both OpenAI and sentence-transformer helpers from base."""
        from skill_seekers.cli.adaptors.base import SkillAdaptor

        adaptor = get_adaptor("weaviate")
        for method_name in ("_generate_openai_embeddings", "_generate_st_embeddings"):
            assert hasattr(adaptor, method_name)
            bound = getattr(adaptor, method_name)
            assert bound.__func__ is getattr(SkillAdaptor, method_name)

    def test_pinecone_inherits_both_embedding_methods(self):
        """Pinecone inherits both OpenAI and sentence-transformer helpers from base."""
        from skill_seekers.cli.adaptors.base import SkillAdaptor

        adaptor = get_adaptor("pinecone")
        for method_name in ("_generate_openai_embeddings", "_generate_st_embeddings"):
            assert hasattr(adaptor, method_name)
            bound = getattr(adaptor, method_name)
            assert bound.__func__ is getattr(SkillAdaptor, method_name)
class TestPackageStructure:
"""Test that packages are correctly structured for upload."""

View File

@@ -16,6 +16,7 @@ Tests cover:
"""
import json
import os
import shutil
import tempfile
import unittest
@@ -30,8 +31,9 @@ except ImportError:
WORD_AVAILABLE = False
def _make_sample_extracted_data(num_sections=2, include_code=False, include_tables=False,
include_images=False):
def _make_sample_extracted_data(
num_sections=2, include_code=False, include_tables=False, include_images=False
):
"""Helper to build a minimal extracted_data dict for testing."""
mock_image_bytes = (
b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01"
@@ -53,23 +55,29 @@ def _make_sample_extracted_data(num_sections=2, include_code=False, include_tabl
}
if include_code:
section["code_samples"] = [
{"code": f"def hello_{i}():\n return 'world'", "language": "python",
"quality_score": 7.5}
{
"code": f"def hello_{i}():\n return 'world'",
"language": "python",
"quality_score": 7.5,
}
]
if include_tables:
section["tables"] = [
{"headers": ["Col A", "Col B"], "rows": [["val1", "val2"], ["val3", "val4"]]}
]
if include_images:
section["images"] = [
{"index": 0, "data": mock_image_bytes, "width": 100, "height": 80}
]
section["images"] = [{"index": 0, "data": mock_image_bytes, "width": 100, "height": 80}]
pages.append(section)
return {
"source_file": "test.docx",
"metadata": {"title": "Test Doc", "author": "Test Author", "created": "", "modified": "",
"subject": ""},
"metadata": {
"title": "Test Doc",
"author": "Test Author",
"created": "",
"modified": "",
"subject": "",
},
"total_sections": num_sections,
"total_code_blocks": num_sections if include_code else 0,
"total_images": num_sections if include_images else 0,
@@ -85,6 +93,7 @@ class TestWordToSkillConverterInit(unittest.TestCase):
if not WORD_AVAILABLE:
self.skipTest("mammoth and python-docx not installed")
from skill_seekers.cli.word_scraper import WordToSkillConverter
self.WordToSkillConverter = WordToSkillConverter
self.temp_dir = tempfile.mkdtemp()
@@ -130,6 +139,7 @@ class TestWordToSkillConverterInit(unittest.TestCase):
def test_name_auto_detected_from_filename(self):
"""Test name can be extracted from filename via infer_description_from_word."""
from skill_seekers.cli.word_scraper import infer_description_from_word
desc = infer_description_from_word({}, name="my_doc")
self.assertIn("my_doc", desc)
@@ -141,6 +151,7 @@ class TestWordCategorization(unittest.TestCase):
if not WORD_AVAILABLE:
self.skipTest("mammoth and python-docx not installed")
from skill_seekers.cli.word_scraper import WordToSkillConverter
self.WordToSkillConverter = WordToSkillConverter
self.temp_dir = tempfile.mkdtemp()
@@ -174,10 +185,22 @@ class TestWordCategorization(unittest.TestCase):
converter.docx_path = ""
converter.extracted_data = {
"pages": [
{"section_number": 1, "heading": "API Reference", "text": "api reference docs",
"code_samples": [], "tables": [], "images": []},
{"section_number": 2, "heading": "Getting Started", "text": "getting started guide",
"code_samples": [], "tables": [], "images": []},
{
"section_number": 1,
"heading": "API Reference",
"text": "api reference docs",
"code_samples": [],
"tables": [],
"images": [],
},
{
"section_number": 2,
"heading": "Getting Started",
"text": "getting started guide",
"code_samples": [],
"tables": [],
"images": [],
},
]
}
@@ -204,6 +227,7 @@ class TestWordSkillBuilding(unittest.TestCase):
if not WORD_AVAILABLE:
self.skipTest("mammoth and python-docx not installed")
from skill_seekers.cli.word_scraper import WordToSkillConverter
self.WordToSkillConverter = WordToSkillConverter
self.temp_dir = tempfile.mkdtemp()
@@ -296,6 +320,7 @@ class TestWordCodeBlocks(unittest.TestCase):
if not WORD_AVAILABLE:
self.skipTest("mammoth and python-docx not installed")
from skill_seekers.cli.word_scraper import WordToSkillConverter
self.WordToSkillConverter = WordToSkillConverter
self.temp_dir = tempfile.mkdtemp()
@@ -350,6 +375,7 @@ class TestWordTables(unittest.TestCase):
if not WORD_AVAILABLE:
self.skipTest("mammoth and python-docx not installed")
from skill_seekers.cli.word_scraper import WordToSkillConverter
self.WordToSkillConverter = WordToSkillConverter
self.temp_dir = tempfile.mkdtemp()
@@ -392,6 +418,7 @@ class TestWordImages(unittest.TestCase):
if not WORD_AVAILABLE:
self.skipTest("mammoth and python-docx not installed")
from skill_seekers.cli.word_scraper import WordToSkillConverter
self.WordToSkillConverter = WordToSkillConverter
self.temp_dir = tempfile.mkdtemp()
@@ -433,6 +460,7 @@ class TestWordErrorHandling(unittest.TestCase):
if not WORD_AVAILABLE:
self.skipTest("mammoth and python-docx not installed")
from skill_seekers.cli.word_scraper import WordToSkillConverter
self.WordToSkillConverter = WordToSkillConverter
self.temp_dir = tempfile.mkdtemp()
@@ -456,6 +484,37 @@ class TestWordErrorHandling(unittest.TestCase):
with self.assertRaises((KeyError, TypeError)):
self.WordToSkillConverter({"docx_path": "test.docx"})
def test_non_docx_file_raises_value_error(self):
    """extract_docx raises ValueError for non-.docx files."""
    # A real on-disk file whose extension marks it as plain text.
    txt_path = os.path.join(self.temp_dir, "test.txt")
    with open(txt_path, "w") as fh:
        fh.write("not a docx")
    converter = self.WordToSkillConverter({"name": "test", "docx_path": txt_path})
    with self.assertRaises(ValueError):
        converter.extract_docx()
def test_doc_file_raises_value_error(self):
    """extract_docx raises ValueError for .doc (old Word format)."""
    legacy_path = os.path.join(self.temp_dir, "test.doc")
    with open(legacy_path, "w") as fh:
        fh.write("not a docx")
    converter = self.WordToSkillConverter({"name": "test", "docx_path": legacy_path})
    with self.assertRaises(ValueError):
        converter.extract_docx()
def test_no_extension_file_raises_value_error(self):
    """extract_docx raises ValueError for file with no extension."""
    bare_path = os.path.join(self.temp_dir, "document")
    with open(bare_path, "w") as fh:
        fh.write("not a docx")
    converter = self.WordToSkillConverter({"name": "test", "docx_path": bare_path})
    with self.assertRaises(ValueError):
        converter.extract_docx()
class TestWordJSONWorkflow(unittest.TestCase):
"""Test building skills from extracted JSON."""
@@ -464,6 +523,7 @@ class TestWordJSONWorkflow(unittest.TestCase):
if not WORD_AVAILABLE:
self.skipTest("mammoth and python-docx not installed")
from skill_seekers.cli.word_scraper import WordToSkillConverter
self.WordToSkillConverter = WordToSkillConverter
self.temp_dir = tempfile.mkdtemp()