Merge branch 'development' into feature/video-scraper-pipeline
Sync with latest development changes including ruff formatting, bug fixes, and pinecone adaptor additions. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -358,6 +358,107 @@ class TestChunkingCLIIntegration:
|
||||
f"Small chunks ({len(data_small)}) should be more than large chunks ({len(data_large)})"
|
||||
)
|
||||
|
||||
def test_chunk_overlap_tokens_parameter(self, tmp_path):
    """Test --chunk-overlap-tokens controls RAGChunker overlap."""
    from skill_seekers.cli.package_skill import package_skill

    skill_dir = create_test_skill(tmp_path, large_doc=True)

    def run_package(overlap):
        # Package the same skill with the given overlap and return parsed chunks.
        ok, pkg_path = package_skill(
            skill_dir=skill_dir,
            open_folder_after=False,
            skip_quality_check=True,
            target="langchain",
            enable_chunking=True,
            chunk_max_tokens=256,
            chunk_overlap_tokens=overlap,
        )
        assert ok
        assert pkg_path.exists()
        with open(pkg_path) as fh:
            return json.load(fh)

    # Package once with the default overlap (50), once with a large overlap (128).
    data_default = run_package(50)
    data_large_overlap = run_package(128)

    # More overlap duplicates more text across chunk boundaries, so the chunk
    # count should not shrink.
    assert len(data_large_overlap) >= len(data_default), (
        f"Large overlap ({len(data_large_overlap)}) should produce >= chunks than default ({len(data_default)})"
    )
def test_chunk_overlap_scales_with_chunk_size(self, tmp_path):
    """Test that overlap auto-scales when chunk_tokens is non-default but overlap is default."""
    from skill_seekers.cli.adaptors.base import (
        DEFAULT_CHUNK_OVERLAP_TOKENS,
        DEFAULT_CHUNK_TOKENS,
    )

    adaptor = get_adaptor("langchain")
    skill_dir = create_test_skill(tmp_path, large_doc=True)
    adaptor._build_skill_metadata(skill_dir)
    content = (skill_dir / "SKILL.md").read_text()

    def chunk_with(max_tokens):
        # Chunk the same content, varying only chunk_max_tokens.
        return adaptor._maybe_chunk_content(
            content,
            {"source": "test"},
            enable_chunking=True,
            chunk_max_tokens=max_tokens,
            chunk_overlap_tokens=DEFAULT_CHUNK_OVERLAP_TOKENS,
        )

    # With default chunk size (512) and default overlap (50), overlap stays 50.
    chunks_default = chunk_with(DEFAULT_CHUNK_TOKENS)
    # With large chunk size (1024) and default overlap (50), the overlap should
    # auto-scale to max(50, 1024 // 10) = 102.
    chunks_large = chunk_with(1024)

    # Both configurations should produce valid chunk lists.
    assert len(chunks_default) > 1
    assert len(chunks_large) >= 1
def test_preserve_code_blocks_flag(self, tmp_path):
    """Test --no-preserve-code-blocks parameter is accepted."""
    from skill_seekers.cli.package_skill import package_skill

    skill_dir = create_test_skill(tmp_path, large_doc=True)

    # Packaging with code-block preservation disabled must still succeed.
    ok, pkg_path = package_skill(
        skill_dir=skill_dir,
        open_folder_after=False,
        skip_quality_check=True,
        target="langchain",
        enable_chunking=True,
        chunk_max_tokens=256,
        preserve_code_blocks=False,
    )

    assert ok
    assert pkg_path.exists()
# Allow running this test module directly without the pytest launcher.
if __name__ == "__main__":
    pytest.main([__file__, "-v"])
@@ -294,5 +294,84 @@ class TestE2EWorkflow:
|
||||
assert "unrecognized arguments" not in result.stderr.lower()
|
||||
|
||||
|
||||
class TestVarFlagRouting:
    """Test that --var flag is correctly routed through create command."""

    @staticmethod
    def _help_stdout(command):
        """Run `skill-seekers <command> --help` and return the captured stdout."""
        proc = subprocess.run(
            ["skill-seekers", command, "--help"],
            capture_output=True,
            text=True,
        )
        return proc.stdout

    def test_var_flag_accepted_by_create(self):
        """Test that --var flag is accepted (not 'unrecognized') by create command."""
        assert "--var" in self._help_stdout("create"), "create --help should show --var flag"

    def test_var_flag_accepted_by_analyze(self):
        """Test that --var flag is accepted by analyze command."""
        assert "--var" in self._help_stdout("analyze"), "analyze --help should show --var flag"

    @pytest.mark.slow
    def test_var_flag_not_rejected_in_create_local(self, tmp_path):
        """Test --var KEY=VALUE doesn't cause 'unrecognized arguments' in create."""
        # Minimal local codebase for the create command to analyze.
        code_dir = tmp_path / "test_code"
        code_dir.mkdir()
        (code_dir / "test.py").write_text("def hello(): pass")

        argv = [
            "skill-seekers",
            "create",
            str(code_dir),
            "--var",
            "foo=bar",
            "--dry-run",
        ]
        proc = subprocess.run(argv, capture_output=True, text=True, timeout=15)
        assert "unrecognized arguments" not in proc.stderr.lower(), (
            f"--var should be accepted, got stderr: {proc.stderr}"
        )
class TestBackwardCompatibleFlags:
    """Test that deprecated flag aliases still work."""

    @staticmethod
    def _help_returncode(command, help_flag):
        """Return the exit code of `skill-seekers <command> <help_flag>`."""
        proc = subprocess.run(
            ["skill-seekers", command, help_flag],
            capture_output=True,
            text=True,
        )
        return proc.returncode

    def test_no_preserve_code_alias_accepted_by_package(self):
        """Test --no-preserve-code (old name) is still accepted by package command."""
        # The old flag is suppressed from --help output, but the command itself
        # must still exit cleanly (no parse error from the alias's presence).
        assert self._help_returncode("package", "--help") == 0

    def test_no_preserve_code_alias_accepted_by_scrape(self):
        """Test --no-preserve-code (old name) is still accepted by scrape command."""
        assert self._help_returncode("scrape", "--help") == 0

    def test_no_preserve_code_alias_accepted_by_create(self):
        """Test --no-preserve-code (old name) is still accepted by create command."""
        assert self._help_returncode("create", "--help-all") == 0
# Allow running this test module directly without the pytest launcher.
if __name__ == "__main__":
    pytest.main([__file__, "-v", "-s"])
@@ -25,8 +25,8 @@ class TestUniversalArguments:
|
||||
"""Test universal argument definitions."""
|
||||
|
||||
def test_universal_count(self):
|
||||
"""Should have exactly 18 universal arguments (after Phase 2 workflow integration + local_repo_path)."""
|
||||
assert len(UNIVERSAL_ARGUMENTS) == 18
|
||||
"""Should have exactly 19 universal arguments (after Phase 2 workflow integration + local_repo_path + doc_version)."""
|
||||
assert len(UNIVERSAL_ARGUMENTS) == 19
|
||||
|
||||
def test_universal_argument_names(self):
|
||||
"""Universal arguments should have expected names."""
|
||||
@@ -50,6 +50,7 @@ class TestUniversalArguments:
|
||||
"var",
|
||||
"workflow_dry_run",
|
||||
"local_repo_path", # GitHub local clone path for unlimited C3.x analysis
|
||||
"doc_version", # Documentation version tag for RAG metadata
|
||||
}
|
||||
assert set(UNIVERSAL_ARGUMENTS.keys()) == expected_names
|
||||
|
||||
@@ -130,7 +131,9 @@ class TestArgumentHelpers:
|
||||
"""Should return set of universal argument names."""
|
||||
names = get_universal_argument_names()
|
||||
assert isinstance(names, set)
|
||||
assert len(names) == 18 # Phase 2: added 4 workflow arguments + local_repo_path
|
||||
assert (
|
||||
len(names) == 19
|
||||
) # Phase 2: added 4 workflow arguments + local_repo_path + doc_version
|
||||
assert "name" in names
|
||||
assert "enhance_level" in names # Phase 1: consolidated flag
|
||||
assert "enhance_workflow" in names # Phase 2: workflow support
|
||||
|
||||
764
tests/test_pinecone_adaptor.py
Normal file
764
tests/test_pinecone_adaptor.py
Normal file
@@ -0,0 +1,764 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Tests for Pinecone adaptor and doc_version metadata flow.
|
||||
"""
|
||||
|
||||
import json
|
||||
|
||||
import pytest
|
||||
|
||||
from skill_seekers.cli.adaptors.base import SkillMetadata
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Fixtures
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.fixture
def sample_skill_dir(tmp_path):
    """Create a minimal skill directory with SKILL.md and references."""
    root = tmp_path / "test-skill"
    root.mkdir()

    # SKILL.md with YAML frontmatter carrying a doc_version tag.
    frontmatter_doc = """---
name: test-skill
description: A test skill for pinecone
doc_version: 16.2
---

# Test Skill

This is a test skill for Pinecone adaptor testing.

## Quick Start

Get started quickly.
"""
    (root / "SKILL.md").write_text(frontmatter_doc)

    # Two reference documents so adaptors emit both overview and reference vectors.
    references = root / "references"
    references.mkdir()
    (references / "api_reference.md").write_text("# API Reference\n\nSome API docs.\n")
    (references / "getting_started.md").write_text(
        "# Getting Started\n\nSome getting started docs.\n"
    )

    return root
||||
@pytest.fixture
def sample_skill_dir_no_doc_version(tmp_path):
    """Create a skill directory without doc_version in frontmatter."""
    root = tmp_path / "no-version-skill"
    root.mkdir()

    # Frontmatter deliberately omits doc_version to exercise the default path.
    frontmatter_doc = """---
name: no-version-skill
description: A test skill without doc_version
---

# No Version Skill

Content here.
"""
    (root / "SKILL.md").write_text(frontmatter_doc)

    references = root / "references"
    references.mkdir()
    (references / "api.md").write_text("# API\n\nAPI docs.\n")

    return root
# ---------------------------------------------------------------------------
|
||||
# Pinecone Adaptor Tests
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestPineconeAdaptor:
    """Test Pinecone adaptor functionality.

    Covers package-format structure, doc_version propagation, and upload()
    behavior against a stubbed `pinecone` module (no real server needed).
    """

    def test_import(self):
        """PineconeAdaptor can be imported."""
        from skill_seekers.cli.adaptors.pinecone_adaptor import PineconeAdaptor

        assert PineconeAdaptor is not None

    def test_platform_constants(self):
        """Platform constants are set correctly."""
        from skill_seekers.cli.adaptors.pinecone_adaptor import PineconeAdaptor

        adaptor = PineconeAdaptor()
        assert adaptor.PLATFORM == "pinecone"
        assert adaptor.PLATFORM_NAME == "Pinecone (Vector Database)"
        assert adaptor.DEFAULT_API_ENDPOINT is None

    def test_registered_in_factory(self):
        """PineconeAdaptor is registered in the adaptor factory."""
        from skill_seekers.cli.adaptors import ADAPTORS

        assert "pinecone" in ADAPTORS

    def test_get_adaptor(self):
        """get_adaptor('pinecone') returns PineconeAdaptor instance."""
        from skill_seekers.cli.adaptors import get_adaptor
        from skill_seekers.cli.adaptors.pinecone_adaptor import PineconeAdaptor

        adaptor = get_adaptor("pinecone")
        assert isinstance(adaptor, PineconeAdaptor)

    def test_format_skill_md_structure(self, sample_skill_dir):
        """format_skill_md returns valid JSON with expected structure."""
        from skill_seekers.cli.adaptors.pinecone_adaptor import PineconeAdaptor

        adaptor = PineconeAdaptor()
        metadata = SkillMetadata(
            name="test-skill",
            description="Test skill",
            version="1.0.0",
            doc_version="16.2",
        )
        result = adaptor.format_skill_md(sample_skill_dir, metadata)
        data = json.loads(result)

        # Top-level package keys expected by the Pinecone upload path.
        assert "index_name" in data
        assert "namespace" in data
        assert "dimension" in data
        assert "metric" in data
        assert "vectors" in data
        assert data["dimension"] == 1536
        assert data["metric"] == "cosine"

    def test_format_skill_md_vectors_have_metadata(self, sample_skill_dir):
        """Each vector has id and metadata fields."""
        from skill_seekers.cli.adaptors.pinecone_adaptor import PineconeAdaptor

        adaptor = PineconeAdaptor()
        metadata = SkillMetadata(
            name="test-skill",
            description="Test",
            doc_version="16.2",
        )
        result = adaptor.format_skill_md(sample_skill_dir, metadata)
        data = json.loads(result)

        assert len(data["vectors"]) > 0
        for vec in data["vectors"]:
            assert "id" in vec
            assert "metadata" in vec
            # Per-vector metadata keys consumed at query time.
            assert "text" in vec["metadata"]
            assert "source" in vec["metadata"]
            assert "category" in vec["metadata"]
            assert "file" in vec["metadata"]
            assert "type" in vec["metadata"]
            assert "version" in vec["metadata"]
            assert "doc_version" in vec["metadata"]

    def test_format_skill_md_doc_version_propagates(self, sample_skill_dir):
        """doc_version flows into every vector's metadata."""
        from skill_seekers.cli.adaptors.pinecone_adaptor import PineconeAdaptor

        adaptor = PineconeAdaptor()
        metadata = SkillMetadata(
            name="test-skill",
            description="Test",
            doc_version="16.2",
        )
        result = adaptor.format_skill_md(sample_skill_dir, metadata)
        data = json.loads(result)

        for vec in data["vectors"]:
            assert vec["metadata"]["doc_version"] == "16.2"

    def test_format_skill_md_empty_doc_version(self, sample_skill_dir):
        """Empty doc_version is preserved as empty string."""
        from skill_seekers.cli.adaptors.pinecone_adaptor import PineconeAdaptor

        adaptor = PineconeAdaptor()
        metadata = SkillMetadata(name="test-skill", description="Test", doc_version="")
        result = adaptor.format_skill_md(sample_skill_dir, metadata)
        data = json.loads(result)

        for vec in data["vectors"]:
            assert vec["metadata"]["doc_version"] == ""

    def test_format_skill_md_has_overview_and_references(self, sample_skill_dir):
        """Output includes overview (SKILL.md) and reference documents."""
        from skill_seekers.cli.adaptors.pinecone_adaptor import PineconeAdaptor

        adaptor = PineconeAdaptor()
        metadata = SkillMetadata(name="test-skill", description="Test")
        result = adaptor.format_skill_md(sample_skill_dir, metadata)
        data = json.loads(result)

        categories = {vec["metadata"]["category"] for vec in data["vectors"]}
        types = {vec["metadata"]["type"] for vec in data["vectors"]}
        assert "overview" in categories
        assert "documentation" in types
        assert "reference" in types

    def test_package_creates_file(self, sample_skill_dir, tmp_path):
        """package() creates a JSON file at expected path."""
        from skill_seekers.cli.adaptors.pinecone_adaptor import PineconeAdaptor

        adaptor = PineconeAdaptor()
        output_path = adaptor.package(sample_skill_dir, tmp_path)

        assert output_path.exists()
        assert output_path.name.endswith("-pinecone.json")

        data = json.loads(output_path.read_text())
        assert "vectors" in data
        assert len(data["vectors"]) > 0

    def test_package_reads_frontmatter_metadata(self, sample_skill_dir, tmp_path):
        """package() reads doc_version from SKILL.md frontmatter."""
        from skill_seekers.cli.adaptors.pinecone_adaptor import PineconeAdaptor

        adaptor = PineconeAdaptor()
        output_path = adaptor.package(sample_skill_dir, tmp_path)

        data = json.loads(output_path.read_text())
        for vec in data["vectors"]:
            # "16.2" comes from the sample_skill_dir fixture's frontmatter.
            assert vec["metadata"]["doc_version"] == "16.2"

    def test_package_with_chunking(self, sample_skill_dir, tmp_path):
        """package() with chunking enabled produces valid output."""
        from skill_seekers.cli.adaptors.pinecone_adaptor import PineconeAdaptor

        adaptor = PineconeAdaptor()
        output_path = adaptor.package(
            sample_skill_dir, tmp_path, enable_chunking=True, chunk_max_tokens=64
        )

        data = json.loads(output_path.read_text())
        assert "vectors" in data
        assert len(data["vectors"]) > 0

    def test_index_name_derived_from_skill_name(self, sample_skill_dir, tmp_path):
        """index_name and namespace are derived from skill directory name."""
        from skill_seekers.cli.adaptors.pinecone_adaptor import PineconeAdaptor

        adaptor = PineconeAdaptor()
        output_path = adaptor.package(sample_skill_dir, tmp_path)

        data = json.loads(output_path.read_text())
        assert data["index_name"] == "test-skill"
        assert data["namespace"] == "test-skill"

    def test_no_values_field_in_vectors(self, sample_skill_dir, tmp_path):
        """Vectors have no 'values' field — embeddings are added at upload time."""
        from skill_seekers.cli.adaptors.pinecone_adaptor import PineconeAdaptor

        adaptor = PineconeAdaptor()
        output_path = adaptor.package(sample_skill_dir, tmp_path)

        data = json.loads(output_path.read_text())
        for vec in data["vectors"]:
            assert "values" not in vec

    def test_text_truncation(self):
        """_truncate_text_for_metadata respects byte limit."""
        from skill_seekers.cli.adaptors.pinecone_adaptor import PineconeAdaptor

        adaptor = PineconeAdaptor()
        # Short text should not be truncated
        assert adaptor._truncate_text_for_metadata("hello") == "hello"

        # Very long text should be truncated
        long_text = "x" * 50000
        truncated = adaptor._truncate_text_for_metadata(long_text)
        # 40000 bytes — presumably headroom under Pinecone's metadata size cap;
        # NOTE(review): confirm against the adaptor's documented limit.
        assert len(truncated.encode("utf-8")) <= 40000

    def test_validate_api_key_returns_false(self):
        """validate_api_key returns False (no key needed for packaging)."""
        from skill_seekers.cli.adaptors.pinecone_adaptor import PineconeAdaptor

        adaptor = PineconeAdaptor()
        assert adaptor.validate_api_key("some-key") is False

    def test_get_env_var_name(self):
        """get_env_var_name returns PINECONE_API_KEY."""
        from skill_seekers.cli.adaptors.pinecone_adaptor import PineconeAdaptor

        adaptor = PineconeAdaptor()
        assert adaptor.get_env_var_name() == "PINECONE_API_KEY"

    def test_supports_enhancement_false(self):
        """Pinecone doesn't support enhancement."""
        from skill_seekers.cli.adaptors.pinecone_adaptor import PineconeAdaptor

        adaptor = PineconeAdaptor()
        assert adaptor.supports_enhancement() is False

    def test_upload_without_pinecone_installed(self, tmp_path):
        """upload() returns helpful error when pinecone not installed."""
        from skill_seekers.cli.adaptors.pinecone_adaptor import PineconeAdaptor

        adaptor = PineconeAdaptor()
        # Create a dummy package file
        pkg = tmp_path / "test-pinecone.json"
        pkg.write_text(json.dumps({"vectors": [], "index_name": "test", "namespace": "test"}))

        # This will either work (if pinecone is installed) or return error
        result = adaptor.upload(pkg)
        # Without API key, should fail
        assert result["success"] is False

    def _make_mock_pinecone(self, monkeypatch):
        """Helper: stub the pinecone module so upload() can run without a real server."""
        import sys
        import types
        from unittest.mock import MagicMock

        mock_module = types.ModuleType("pinecone")
        mock_index = MagicMock()
        mock_pc = MagicMock()
        mock_pc.list_indexes.return_value = []  # no existing indexes
        mock_pc.Index.return_value = mock_index
        mock_module.Pinecone = MagicMock(return_value=mock_pc)
        mock_module.ServerlessSpec = MagicMock()
        monkeypatch.setitem(sys.modules, "pinecone", mock_module)
        return mock_pc, mock_index

    def _make_package(self, tmp_path, vectors=None):
        """Helper: create a minimal Pinecone package JSON."""
        if vectors is None:
            vectors = [{"id": "a", "metadata": {"text": "hello world"}}]
        pkg = tmp_path / "test-pinecone.json"
        pkg.write_text(
            json.dumps(
                {
                    "vectors": vectors,
                    "index_name": "test",
                    "namespace": "test",
                    "metric": "cosine",
                    "dimension": 1536,
                }
            )
        )
        return pkg

    def test_upload_success_has_url_key(self, tmp_path, monkeypatch):
        """upload() success return dict includes 'url' key (prevents KeyError in package_skill.py)."""
        from skill_seekers.cli.adaptors.pinecone_adaptor import PineconeAdaptor

        adaptor = PineconeAdaptor()
        mock_pc, _mock_index = self._make_mock_pinecone(monkeypatch)
        # Stub embedding generation with fixed-size zero vectors.
        monkeypatch.setattr(
            adaptor,
            "_generate_openai_embeddings",
            lambda docs: [[0.0] * 1536] * len(docs),
        )
        pkg = self._make_package(tmp_path)

        result = adaptor.upload(pkg, api_key="fake-key")
        assert result["success"] is True
        assert "url" in result  # key must exist to avoid KeyError in package_skill.py
        # Value should be None for Pinecone (no web URL)
        assert result["url"] is None

    def test_embedding_dimension_autodetect_st(self, tmp_path, monkeypatch):
        """sentence-transformers upload creates index with dimension=384."""
        from skill_seekers.cli.adaptors.pinecone_adaptor import PineconeAdaptor

        adaptor = PineconeAdaptor()
        mock_pc, _mock_index = self._make_mock_pinecone(monkeypatch)
        monkeypatch.setattr(
            adaptor,
            "_generate_st_embeddings",
            lambda docs: [[0.0] * 384] * len(docs),
        )
        pkg = self._make_package(tmp_path)

        result = adaptor.upload(
            pkg,
            api_key="fake-key",
            embedding_function="sentence-transformers",
        )
        assert result["success"] is True
        # Verify create_index was called with dimension=384
        mock_pc.create_index.assert_called_once()
        call_kwargs = mock_pc.create_index.call_args
        assert call_kwargs.kwargs["dimension"] == 384

    def test_embedding_dimension_autodetect_openai(self, tmp_path, monkeypatch):
        """openai upload creates index with dimension=1536."""
        from skill_seekers.cli.adaptors.pinecone_adaptor import PineconeAdaptor

        adaptor = PineconeAdaptor()
        mock_pc, _mock_index = self._make_mock_pinecone(monkeypatch)
        monkeypatch.setattr(
            adaptor,
            "_generate_openai_embeddings",
            lambda docs: [[0.0] * 1536] * len(docs),
        )
        pkg = self._make_package(tmp_path)

        result = adaptor.upload(
            pkg,
            api_key="fake-key",
            embedding_function="openai",
        )
        assert result["success"] is True
        mock_pc.create_index.assert_called_once()
        call_kwargs = mock_pc.create_index.call_args
        assert call_kwargs.kwargs["dimension"] == 1536

    def test_embedding_before_index_creation(self, tmp_path, monkeypatch):
        """If embedding generation fails, index is never created (no side-effects)."""
        from skill_seekers.cli.adaptors.pinecone_adaptor import PineconeAdaptor

        adaptor = PineconeAdaptor()
        mock_pc, _mock_index = self._make_mock_pinecone(monkeypatch)

        def fail_embeddings(_docs):
            raise RuntimeError("OPENAI_API_KEY not set")

        monkeypatch.setattr(adaptor, "_generate_openai_embeddings", fail_embeddings)
        pkg = self._make_package(tmp_path)

        result = adaptor.upload(pkg, api_key="fake-key")
        assert result["success"] is False
        # Index must NOT have been created since embedding failed first
        mock_pc.create_index.assert_not_called()

    def test_embedding_dimension_explicit_override(self, tmp_path, monkeypatch):
        """Explicit dimension kwarg overrides both auto-detect and JSON file value."""
        from skill_seekers.cli.adaptors.pinecone_adaptor import PineconeAdaptor

        adaptor = PineconeAdaptor()
        mock_pc, _mock_index = self._make_mock_pinecone(monkeypatch)
        monkeypatch.setattr(
            adaptor,
            "_generate_openai_embeddings",
            lambda docs: [[0.0] * 768] * len(docs),
        )
        pkg = self._make_package(tmp_path)

        result = adaptor.upload(
            pkg,
            api_key="fake-key",
            embedding_function="openai",
            dimension=768,
        )
        assert result["success"] is True
        mock_pc.create_index.assert_called_once()
        call_kwargs = mock_pc.create_index.call_args
        assert call_kwargs.kwargs["dimension"] == 768

    def test_deterministic_ids(self, sample_skill_dir):
        """IDs are deterministic — same input produces same ID."""
        from skill_seekers.cli.adaptors.pinecone_adaptor import PineconeAdaptor

        adaptor = PineconeAdaptor()
        metadata = SkillMetadata(name="test-skill", description="Test")

        result1 = adaptor.format_skill_md(sample_skill_dir, metadata)
        result2 = adaptor.format_skill_md(sample_skill_dir, metadata)

        data1 = json.loads(result1)
        data2 = json.loads(result2)

        ids1 = [v["id"] for v in data1["vectors"]]
        ids2 = [v["id"] for v in data2["vectors"]]
        assert ids1 == ids2
||||
# ---------------------------------------------------------------------------
|
||||
# doc_version Metadata Tests (cross-adaptor)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestDocVersionMetadata:
    """Test doc_version flows through all RAG adaptors.

    Uses the Pinecone adaptor's frontmatter helpers directly, then checks the
    packaged output of every RAG adaptor via `_extract_metadata_from_package`
    (a module-level helper defined elsewhere in this file).
    """

    def test_skill_metadata_has_doc_version(self):
        """SkillMetadata dataclass has doc_version field."""
        meta = SkillMetadata(name="test", description="test", doc_version="3.2")
        assert meta.doc_version == "3.2"

    def test_skill_metadata_doc_version_default_empty(self):
        """doc_version defaults to empty string."""
        meta = SkillMetadata(name="test", description="test")
        assert meta.doc_version == ""

    def test_read_frontmatter(self, sample_skill_dir):
        """_read_frontmatter reads doc_version from SKILL.md."""
        from skill_seekers.cli.adaptors.pinecone_adaptor import PineconeAdaptor

        adaptor = PineconeAdaptor()
        fm = adaptor._read_frontmatter(sample_skill_dir)
        assert fm["doc_version"] == "16.2"
        assert fm["name"] == "test-skill"

    def test_read_frontmatter_missing(self, sample_skill_dir_no_doc_version):
        """_read_frontmatter returns empty string when doc_version is absent."""
        from skill_seekers.cli.adaptors.pinecone_adaptor import PineconeAdaptor

        adaptor = PineconeAdaptor()
        fm = adaptor._read_frontmatter(sample_skill_dir_no_doc_version)
        assert fm.get("doc_version") is None  # key not present

    def test_build_skill_metadata_reads_doc_version(self, sample_skill_dir):
        """_build_skill_metadata populates doc_version from frontmatter."""
        from skill_seekers.cli.adaptors.pinecone_adaptor import PineconeAdaptor

        adaptor = PineconeAdaptor()
        meta = adaptor._build_skill_metadata(sample_skill_dir)
        assert meta.doc_version == "16.2"
        assert meta.name == "test-skill"

    def test_build_skill_metadata_no_doc_version(self, sample_skill_dir_no_doc_version):
        """_build_skill_metadata defaults to empty string when frontmatter has no doc_version."""
        from skill_seekers.cli.adaptors.pinecone_adaptor import PineconeAdaptor

        adaptor = PineconeAdaptor()
        meta = adaptor._build_skill_metadata(sample_skill_dir_no_doc_version)
        assert meta.doc_version == ""

    def test_build_metadata_dict_includes_doc_version(self):
        """_build_metadata_dict includes doc_version in output."""
        from skill_seekers.cli.adaptors.pinecone_adaptor import PineconeAdaptor

        adaptor = PineconeAdaptor()
        meta = SkillMetadata(name="test", description="desc", doc_version="3.0")
        result = adaptor._build_metadata_dict(meta)
        assert "doc_version" in result
        assert result["doc_version"] == "3.0"

    def test_build_metadata_dict_empty_doc_version(self):
        """_build_metadata_dict preserves empty doc_version."""
        from skill_seekers.cli.adaptors.pinecone_adaptor import PineconeAdaptor

        adaptor = PineconeAdaptor()
        meta = SkillMetadata(name="test", description="desc")
        result = adaptor._build_metadata_dict(meta)
        assert "doc_version" in result
        assert result["doc_version"] == ""

    @pytest.mark.parametrize(
        "platform",
        ["chroma", "faiss", "langchain", "llama-index", "haystack", "pinecone"],
    )
    def test_doc_version_in_package_output(self, platform, sample_skill_dir, tmp_path):
        """doc_version appears in package output for all RAG adaptors."""
        from skill_seekers.cli.adaptors import get_adaptor

        adaptor = get_adaptor(platform)
        output_path = adaptor.package(sample_skill_dir, tmp_path)

        data = json.loads(output_path.read_text())

        # Each adaptor has a different structure — extract metadata dicts
        meta_list = _extract_metadata_from_package(platform, data)
        assert len(meta_list) > 0, f"No metadata found in {platform} output"

        for meta in meta_list:
            assert "doc_version" in meta, f"doc_version missing in {platform} metadata: {meta}"
            assert meta["doc_version"] == "16.2", (
                f"doc_version mismatch in {platform}: expected '16.2', got '{meta['doc_version']}'"
            )

    @pytest.mark.parametrize(
        "platform",
        ["chroma", "faiss", "langchain", "llama-index", "haystack", "pinecone"],
    )
    def test_empty_doc_version_in_package_output(
        self, platform, sample_skill_dir_no_doc_version, tmp_path
    ):
        """Empty doc_version is preserved (not omitted) in all adaptors."""
        from skill_seekers.cli.adaptors import get_adaptor

        adaptor = get_adaptor(platform)
        output_path = adaptor.package(sample_skill_dir_no_doc_version, tmp_path)

        data = json.loads(output_path.read_text())
        meta_list = _extract_metadata_from_package(platform, data)
        assert len(meta_list) > 0

        for meta in meta_list:
            assert "doc_version" in meta
|
||||
# Qdrant and Weaviate may not be installed — test separately if available
|
||||
class TestDocVersionQdrant:
    """Test doc_version in Qdrant adaptor (may require qdrant client)."""

    def test_qdrant_doc_version(self, sample_skill_dir, tmp_path):
        from skill_seekers.cli.adaptors import ADAPTORS

        # Qdrant support is optional — bail out when the adaptor isn't registered.
        if "qdrant" not in ADAPTORS:
            pytest.skip("Qdrant adaptor not available")
        from skill_seekers.cli.adaptors import get_adaptor

        package_path = get_adaptor("qdrant").package(sample_skill_dir, tmp_path)
        package_data = json.loads(package_path.read_text())

        # Every point's payload must carry the frontmatter doc_version.
        for point in package_data["points"]:
            payload = point["payload"]
            assert "doc_version" in payload
            assert payload["doc_version"] == "16.2"
|
||||
class TestWeaviateUploadReturnKeys:
|
||||
"""Test Weaviate upload() return dict has required keys."""
|
||||
|
||||
def test_weaviate_upload_success_has_url_key(self, sample_skill_dir, tmp_path, monkeypatch):
|
||||
"""Weaviate upload() success return includes 'url' key (prevents KeyError in package_skill.py)."""
|
||||
import sys
|
||||
import types
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
from skill_seekers.cli.adaptors import ADAPTORS
|
||||
|
||||
if "weaviate" not in ADAPTORS:
|
||||
pytest.skip("Weaviate adaptor not available")
|
||||
|
||||
from skill_seekers.cli.adaptors.weaviate import WeaviateAdaptor
|
||||
|
||||
adaptor = WeaviateAdaptor()
|
||||
|
||||
# Stub the weaviate module
|
||||
mock_module = types.ModuleType("weaviate")
|
||||
mock_client = MagicMock()
|
||||
mock_client.is_ready.return_value = True
|
||||
mock_module.Client = MagicMock(return_value=mock_client)
|
||||
mock_module.AuthApiKey = MagicMock()
|
||||
monkeypatch.setitem(sys.modules, "weaviate", mock_module)
|
||||
|
||||
# Create a minimal weaviate package
|
||||
output_path = adaptor.package(sample_skill_dir, tmp_path)
|
||||
result = adaptor.upload(output_path)
|
||||
|
||||
assert result["success"] is True
|
||||
assert "url" in result
|
||||
assert result["url"] is None
|
||||
|
||||
|
||||
class TestDocVersionWeaviate:
|
||||
"""Test doc_version in Weaviate adaptor (may require weaviate client)."""
|
||||
|
||||
def test_weaviate_doc_version(self, sample_skill_dir, tmp_path):
|
||||
from skill_seekers.cli.adaptors import ADAPTORS
|
||||
|
||||
if "weaviate" not in ADAPTORS:
|
||||
pytest.skip("Weaviate adaptor not available")
|
||||
from skill_seekers.cli.adaptors import get_adaptor
|
||||
|
||||
adaptor = get_adaptor("weaviate")
|
||||
output_path = adaptor.package(sample_skill_dir, tmp_path)
|
||||
data = json.loads(output_path.read_text())
|
||||
|
||||
for obj in data["objects"]:
|
||||
assert "doc_version" in obj["properties"]
|
||||
assert obj["properties"]["doc_version"] == "16.2"
|
||||
|
||||
def test_weaviate_schema_includes_doc_version(self, sample_skill_dir, tmp_path):
|
||||
from skill_seekers.cli.adaptors import ADAPTORS
|
||||
|
||||
if "weaviate" not in ADAPTORS:
|
||||
pytest.skip("Weaviate adaptor not available")
|
||||
from skill_seekers.cli.adaptors import get_adaptor
|
||||
|
||||
adaptor = get_adaptor("weaviate")
|
||||
output_path = adaptor.package(sample_skill_dir, tmp_path)
|
||||
data = json.loads(output_path.read_text())
|
||||
|
||||
property_names = [p["name"] for p in data["schema"]["properties"]]
|
||||
assert "doc_version" in property_names
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# CLI Flag Tests
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestDocVersionCLIFlag:
|
||||
"""Test --doc-version CLI flag is accepted."""
|
||||
|
||||
def test_common_arguments_has_doc_version(self):
|
||||
"""COMMON_ARGUMENTS includes doc_version."""
|
||||
from skill_seekers.cli.arguments.common import COMMON_ARGUMENTS
|
||||
|
||||
assert "doc_version" in COMMON_ARGUMENTS
|
||||
|
||||
def test_create_arguments_has_doc_version(self):
|
||||
"""UNIVERSAL_ARGUMENTS includes doc_version."""
|
||||
from skill_seekers.cli.arguments.create import UNIVERSAL_ARGUMENTS
|
||||
|
||||
assert "doc_version" in UNIVERSAL_ARGUMENTS
|
||||
|
||||
def test_doc_version_flag_parsed(self):
|
||||
"""--doc-version is parsed correctly by argparse."""
|
||||
import argparse
|
||||
from skill_seekers.cli.arguments.common import add_common_arguments
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
add_common_arguments(parser)
|
||||
args = parser.parse_args(["--doc-version", "16.2"])
|
||||
assert args.doc_version == "16.2"
|
||||
|
||||
def test_doc_version_default_empty(self):
|
||||
"""--doc-version defaults to empty string."""
|
||||
import argparse
|
||||
from skill_seekers.cli.arguments.common import add_common_arguments
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
add_common_arguments(parser)
|
||||
args = parser.parse_args([])
|
||||
assert args.doc_version == ""
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Package choices test
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestPineconeInPackageChoices:
|
||||
"""Test pinecone is in package CLI choices."""
|
||||
|
||||
def test_pinecone_in_package_arguments(self):
|
||||
"""pinecone is listed in package --target choices."""
|
||||
from skill_seekers.cli.arguments.package import PACKAGE_ARGUMENTS
|
||||
|
||||
choices = PACKAGE_ARGUMENTS["target"]["kwargs"]["choices"]
|
||||
assert "pinecone" in choices
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _extract_metadata_from_package(platform: str, data: dict) -> list[dict]:
|
||||
"""Extract metadata dicts from adaptor-specific package format."""
|
||||
meta_list = []
|
||||
|
||||
if platform == "pinecone":
|
||||
for vec in data.get("vectors", []):
|
||||
meta_list.append(vec.get("metadata", {}))
|
||||
elif platform == "chroma":
|
||||
for meta in data.get("metadatas", []):
|
||||
meta_list.append(meta)
|
||||
elif platform == "faiss":
|
||||
for meta in data.get("metadatas", []):
|
||||
meta_list.append(meta)
|
||||
elif platform == "langchain":
|
||||
for doc in data if isinstance(data, list) else []:
|
||||
meta_list.append(doc.get("metadata", {}))
|
||||
elif platform == "llama-index":
|
||||
for node in data if isinstance(data, list) else []:
|
||||
meta_list.append(node.get("metadata", {}))
|
||||
elif platform == "haystack":
|
||||
for doc in data if isinstance(data, list) else []:
|
||||
meta_list.append(doc.get("meta", {}))
|
||||
elif platform == "qdrant":
|
||||
for point in data.get("points", []):
|
||||
meta_list.append(point.get("payload", {}))
|
||||
elif platform == "weaviate":
|
||||
for obj in data.get("objects", []):
|
||||
meta_list.append(obj.get("properties", {}))
|
||||
|
||||
return meta_list
|
||||
@@ -151,6 +151,45 @@ class TestWeaviateUploadBasics:
|
||||
assert hasattr(adaptor, "_generate_openai_embeddings")
|
||||
|
||||
|
||||
class TestEmbeddingMethodInheritance:
|
||||
"""Test that shared embedding methods are properly inherited from base."""
|
||||
|
||||
def test_chroma_inherits_openai_embeddings(self):
|
||||
"""Test chroma adaptor gets _generate_openai_embeddings from base."""
|
||||
adaptor = get_adaptor("chroma")
|
||||
assert hasattr(adaptor, "_generate_openai_embeddings")
|
||||
# Verify it's the base class method, not a local override
|
||||
from skill_seekers.cli.adaptors.base import SkillAdaptor
|
||||
|
||||
assert (
|
||||
adaptor._generate_openai_embeddings.__func__ is SkillAdaptor._generate_openai_embeddings
|
||||
)
|
||||
|
||||
def test_weaviate_inherits_both_embedding_methods(self):
|
||||
"""Test weaviate adaptor gets both embedding methods from base."""
|
||||
adaptor = get_adaptor("weaviate")
|
||||
assert hasattr(adaptor, "_generate_openai_embeddings")
|
||||
assert hasattr(adaptor, "_generate_st_embeddings")
|
||||
from skill_seekers.cli.adaptors.base import SkillAdaptor
|
||||
|
||||
assert (
|
||||
adaptor._generate_openai_embeddings.__func__ is SkillAdaptor._generate_openai_embeddings
|
||||
)
|
||||
assert adaptor._generate_st_embeddings.__func__ is SkillAdaptor._generate_st_embeddings
|
||||
|
||||
def test_pinecone_inherits_both_embedding_methods(self):
|
||||
"""Test pinecone adaptor gets both embedding methods from base."""
|
||||
adaptor = get_adaptor("pinecone")
|
||||
assert hasattr(adaptor, "_generate_openai_embeddings")
|
||||
assert hasattr(adaptor, "_generate_st_embeddings")
|
||||
from skill_seekers.cli.adaptors.base import SkillAdaptor
|
||||
|
||||
assert (
|
||||
adaptor._generate_openai_embeddings.__func__ is SkillAdaptor._generate_openai_embeddings
|
||||
)
|
||||
assert adaptor._generate_st_embeddings.__func__ is SkillAdaptor._generate_st_embeddings
|
||||
|
||||
|
||||
class TestPackageStructure:
|
||||
"""Test that packages are correctly structured for upload."""
|
||||
|
||||
|
||||
@@ -16,6 +16,7 @@ Tests cover:
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import tempfile
|
||||
import unittest
|
||||
@@ -30,8 +31,9 @@ except ImportError:
|
||||
WORD_AVAILABLE = False
|
||||
|
||||
|
||||
def _make_sample_extracted_data(num_sections=2, include_code=False, include_tables=False,
|
||||
include_images=False):
|
||||
def _make_sample_extracted_data(
|
||||
num_sections=2, include_code=False, include_tables=False, include_images=False
|
||||
):
|
||||
"""Helper to build a minimal extracted_data dict for testing."""
|
||||
mock_image_bytes = (
|
||||
b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01"
|
||||
@@ -53,23 +55,29 @@ def _make_sample_extracted_data(num_sections=2, include_code=False, include_tabl
|
||||
}
|
||||
if include_code:
|
||||
section["code_samples"] = [
|
||||
{"code": f"def hello_{i}():\n return 'world'", "language": "python",
|
||||
"quality_score": 7.5}
|
||||
{
|
||||
"code": f"def hello_{i}():\n return 'world'",
|
||||
"language": "python",
|
||||
"quality_score": 7.5,
|
||||
}
|
||||
]
|
||||
if include_tables:
|
||||
section["tables"] = [
|
||||
{"headers": ["Col A", "Col B"], "rows": [["val1", "val2"], ["val3", "val4"]]}
|
||||
]
|
||||
if include_images:
|
||||
section["images"] = [
|
||||
{"index": 0, "data": mock_image_bytes, "width": 100, "height": 80}
|
||||
]
|
||||
section["images"] = [{"index": 0, "data": mock_image_bytes, "width": 100, "height": 80}]
|
||||
pages.append(section)
|
||||
|
||||
return {
|
||||
"source_file": "test.docx",
|
||||
"metadata": {"title": "Test Doc", "author": "Test Author", "created": "", "modified": "",
|
||||
"subject": ""},
|
||||
"metadata": {
|
||||
"title": "Test Doc",
|
||||
"author": "Test Author",
|
||||
"created": "",
|
||||
"modified": "",
|
||||
"subject": "",
|
||||
},
|
||||
"total_sections": num_sections,
|
||||
"total_code_blocks": num_sections if include_code else 0,
|
||||
"total_images": num_sections if include_images else 0,
|
||||
@@ -85,6 +93,7 @@ class TestWordToSkillConverterInit(unittest.TestCase):
|
||||
if not WORD_AVAILABLE:
|
||||
self.skipTest("mammoth and python-docx not installed")
|
||||
from skill_seekers.cli.word_scraper import WordToSkillConverter
|
||||
|
||||
self.WordToSkillConverter = WordToSkillConverter
|
||||
self.temp_dir = tempfile.mkdtemp()
|
||||
|
||||
@@ -130,6 +139,7 @@ class TestWordToSkillConverterInit(unittest.TestCase):
|
||||
def test_name_auto_detected_from_filename(self):
|
||||
"""Test name can be extracted from filename via infer_description_from_word."""
|
||||
from skill_seekers.cli.word_scraper import infer_description_from_word
|
||||
|
||||
desc = infer_description_from_word({}, name="my_doc")
|
||||
self.assertIn("my_doc", desc)
|
||||
|
||||
@@ -141,6 +151,7 @@ class TestWordCategorization(unittest.TestCase):
|
||||
if not WORD_AVAILABLE:
|
||||
self.skipTest("mammoth and python-docx not installed")
|
||||
from skill_seekers.cli.word_scraper import WordToSkillConverter
|
||||
|
||||
self.WordToSkillConverter = WordToSkillConverter
|
||||
self.temp_dir = tempfile.mkdtemp()
|
||||
|
||||
@@ -174,10 +185,22 @@ class TestWordCategorization(unittest.TestCase):
|
||||
converter.docx_path = ""
|
||||
converter.extracted_data = {
|
||||
"pages": [
|
||||
{"section_number": 1, "heading": "API Reference", "text": "api reference docs",
|
||||
"code_samples": [], "tables": [], "images": []},
|
||||
{"section_number": 2, "heading": "Getting Started", "text": "getting started guide",
|
||||
"code_samples": [], "tables": [], "images": []},
|
||||
{
|
||||
"section_number": 1,
|
||||
"heading": "API Reference",
|
||||
"text": "api reference docs",
|
||||
"code_samples": [],
|
||||
"tables": [],
|
||||
"images": [],
|
||||
},
|
||||
{
|
||||
"section_number": 2,
|
||||
"heading": "Getting Started",
|
||||
"text": "getting started guide",
|
||||
"code_samples": [],
|
||||
"tables": [],
|
||||
"images": [],
|
||||
},
|
||||
]
|
||||
}
|
||||
|
||||
@@ -204,6 +227,7 @@ class TestWordSkillBuilding(unittest.TestCase):
|
||||
if not WORD_AVAILABLE:
|
||||
self.skipTest("mammoth and python-docx not installed")
|
||||
from skill_seekers.cli.word_scraper import WordToSkillConverter
|
||||
|
||||
self.WordToSkillConverter = WordToSkillConverter
|
||||
self.temp_dir = tempfile.mkdtemp()
|
||||
|
||||
@@ -296,6 +320,7 @@ class TestWordCodeBlocks(unittest.TestCase):
|
||||
if not WORD_AVAILABLE:
|
||||
self.skipTest("mammoth and python-docx not installed")
|
||||
from skill_seekers.cli.word_scraper import WordToSkillConverter
|
||||
|
||||
self.WordToSkillConverter = WordToSkillConverter
|
||||
self.temp_dir = tempfile.mkdtemp()
|
||||
|
||||
@@ -350,6 +375,7 @@ class TestWordTables(unittest.TestCase):
|
||||
if not WORD_AVAILABLE:
|
||||
self.skipTest("mammoth and python-docx not installed")
|
||||
from skill_seekers.cli.word_scraper import WordToSkillConverter
|
||||
|
||||
self.WordToSkillConverter = WordToSkillConverter
|
||||
self.temp_dir = tempfile.mkdtemp()
|
||||
|
||||
@@ -392,6 +418,7 @@ class TestWordImages(unittest.TestCase):
|
||||
if not WORD_AVAILABLE:
|
||||
self.skipTest("mammoth and python-docx not installed")
|
||||
from skill_seekers.cli.word_scraper import WordToSkillConverter
|
||||
|
||||
self.WordToSkillConverter = WordToSkillConverter
|
||||
self.temp_dir = tempfile.mkdtemp()
|
||||
|
||||
@@ -433,6 +460,7 @@ class TestWordErrorHandling(unittest.TestCase):
|
||||
if not WORD_AVAILABLE:
|
||||
self.skipTest("mammoth and python-docx not installed")
|
||||
from skill_seekers.cli.word_scraper import WordToSkillConverter
|
||||
|
||||
self.WordToSkillConverter = WordToSkillConverter
|
||||
self.temp_dir = tempfile.mkdtemp()
|
||||
|
||||
@@ -456,6 +484,37 @@ class TestWordErrorHandling(unittest.TestCase):
|
||||
with self.assertRaises((KeyError, TypeError)):
|
||||
self.WordToSkillConverter({"docx_path": "test.docx"})
|
||||
|
||||
def test_non_docx_file_raises_value_error(self):
|
||||
"""extract_docx raises ValueError for non-.docx files."""
|
||||
# Create a real file with wrong extension
|
||||
txt_path = os.path.join(self.temp_dir, "test.txt")
|
||||
with open(txt_path, "w") as f:
|
||||
f.write("not a docx")
|
||||
config = {"name": "test", "docx_path": txt_path}
|
||||
converter = self.WordToSkillConverter(config)
|
||||
with self.assertRaises(ValueError):
|
||||
converter.extract_docx()
|
||||
|
||||
def test_doc_file_raises_value_error(self):
|
||||
"""extract_docx raises ValueError for .doc (old Word format)."""
|
||||
doc_path = os.path.join(self.temp_dir, "test.doc")
|
||||
with open(doc_path, "w") as f:
|
||||
f.write("not a docx")
|
||||
config = {"name": "test", "docx_path": doc_path}
|
||||
converter = self.WordToSkillConverter(config)
|
||||
with self.assertRaises(ValueError):
|
||||
converter.extract_docx()
|
||||
|
||||
def test_no_extension_file_raises_value_error(self):
|
||||
"""extract_docx raises ValueError for file with no extension."""
|
||||
no_ext_path = os.path.join(self.temp_dir, "document")
|
||||
with open(no_ext_path, "w") as f:
|
||||
f.write("not a docx")
|
||||
config = {"name": "test", "docx_path": no_ext_path}
|
||||
converter = self.WordToSkillConverter(config)
|
||||
with self.assertRaises(ValueError):
|
||||
converter.extract_docx()
|
||||
|
||||
|
||||
class TestWordJSONWorkflow(unittest.TestCase):
|
||||
"""Test building skills from extracted JSON."""
|
||||
@@ -464,6 +523,7 @@ class TestWordJSONWorkflow(unittest.TestCase):
|
||||
if not WORD_AVAILABLE:
|
||||
self.skipTest("mammoth and python-docx not installed")
|
||||
from skill_seekers.cli.word_scraper import WordToSkillConverter
|
||||
|
||||
self.WordToSkillConverter = WordToSkillConverter
|
||||
self.temp_dir = tempfile.mkdtemp()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user