feat: add 10 new skill source types (17 total) with full pipeline integration

Add Jupyter Notebook, Local HTML, OpenAPI/Swagger, AsciiDoc, PowerPoint,
RSS/Atom, Man Pages, Confluence, Notion, and Slack/Discord Chat as new
skill source types. Each type is fully integrated across:

- Standalone CLI commands (skill-seekers <type>)
- Auto-detection via 'skill-seekers create' (file extension + content sniffing)
- Unified multi-source configs (scraped_data, dispatch, config validation)
- Unified skill builder (generic merge + source-attributed synthesis)
- MCP server (scrape_generic tool with per-type flag mapping)
- pyproject.toml (entry points, optional deps, [all] group)

Also fixes: EPUB unified pipeline gap, missing word/video config validators,
OpenAPI yaml import guard, MCP flag mismatch for all 10 types, stale
docstrings, and adds 77 integration tests + complex-merge workflow.

50 files changed, +20,193 lines
This commit is contained in:
yusyus
2026-03-15 15:30:15 +03:00
parent 64403a3686
commit 53b911b697
50 changed files with 20193 additions and 856 deletions

View File

@@ -24,12 +24,12 @@ class TestParserRegistry:
def test_all_parsers_registered(self):
"""Test that all parsers are registered."""
assert len(PARSERS) == 25, f"Expected 25 parsers, got {len(PARSERS)}"
assert len(PARSERS) == 35, f"Expected 35 parsers, got {len(PARSERS)}"
def test_get_parser_names(self):
"""Test getting list of parser names."""
names = get_parser_names()
assert len(names) == 25
assert len(names) == 35
assert "scrape" in names
assert "github" in names
assert "package" in names
@@ -243,9 +243,9 @@ class TestBackwardCompatibility:
assert cmd in names, f"Command '{cmd}' not found in parser registry!"
def test_command_count_matches(self):
"""Test that we have exactly 25 commands (includes create, workflows, word, epub, video, and sync-config)."""
assert len(PARSERS) == 25
assert len(get_parser_names()) == 25
"""Test that we have exactly 35 commands (25 original + 10 new source types)."""
assert len(PARSERS) == 35
assert len(get_parser_names()) == 35
if __name__ == "__main__":

View File

@@ -0,0 +1,824 @@
#!/usr/bin/env python3
"""
Tests for v3.2.0 new source type integration points.
Covers source detection, config validation, generic merge, CLI wiring,
and source validation for the 10 new source types: jupyter, html, openapi,
asciidoc, pptx, rss, manpage, confluence, notion, chat.
"""
import os
import textwrap
import pytest
from skill_seekers.cli.config_validator import ConfigValidator
from skill_seekers.cli.main import COMMAND_MODULES
from skill_seekers.cli.parsers import PARSERS, get_parser_names
from skill_seekers.cli.source_detector import SourceDetector, SourceInfo
from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder
# ---------------------------------------------------------------------------
# 1. SourceDetector — new type detection
# ---------------------------------------------------------------------------
class TestSourceDetectorNewTypes:
    """Test that SourceDetector.detect() maps new extensions to correct types."""

    # -- Jupyter --
    def test_detect_ipynb(self):
        """Test .ipynb → jupyter detection."""
        info = SourceDetector.detect("analysis.ipynb")
        assert info.type == "jupyter"
        assert info.parsed["file_path"] == "analysis.ipynb"
        assert info.suggested_name == "analysis"

    # -- HTML --
    def test_detect_html_extension(self):
        """Test .html → html detection."""
        info = SourceDetector.detect("page.html")
        assert info.type == "html"
        assert info.parsed["file_path"] == "page.html"

    def test_detect_htm_extension(self):
        """Test .htm → html detection."""
        info = SourceDetector.detect("index.HTM")
        assert info.type == "html"
        assert info.parsed["file_path"] == "index.HTM"

    # -- PowerPoint --
    def test_detect_pptx(self):
        """Test .pptx → pptx detection."""
        info = SourceDetector.detect("slides.pptx")
        assert info.type == "pptx"
        assert info.parsed["file_path"] == "slides.pptx"
        assert info.suggested_name == "slides"

    # -- AsciiDoc --
    def test_detect_adoc(self):
        """Test .adoc → asciidoc detection."""
        info = SourceDetector.detect("manual.adoc")
        assert info.type == "asciidoc"
        assert info.parsed["file_path"] == "manual.adoc"

    def test_detect_asciidoc_extension(self):
        """Test .asciidoc → asciidoc detection."""
        info = SourceDetector.detect("guide.ASCIIDOC")
        assert info.type == "asciidoc"
        assert info.parsed["file_path"] == "guide.ASCIIDOC"

    # -- Man pages --
    def test_detect_man_extension(self):
        """Test .man → manpage detection."""
        info = SourceDetector.detect("curl.man")
        assert info.type == "manpage"
        assert info.parsed["file_path"] == "curl.man"

    @pytest.mark.parametrize("section", range(1, 9))
    def test_detect_man_sections(self, section):
        """Test .1 through .8 → manpage for simple basenames."""
        filename = f"git.{section}"
        info = SourceDetector.detect(filename)
        # Fix: the failure message previously contained the literal text
        # "(unknown)" instead of interpolating the filename under test.
        assert info.type == "manpage", f"{filename} should detect as manpage"
        assert info.suggested_name == "git"

    def test_man_section_with_dotted_basename_not_detected(self):
        """Test that 'access.log.1' is NOT detected as a man page.

        The heuristic checks that the basename (without extension) has no dots.
        """
        # This should fall through to web/domain detection (has a dot, not a path)
        info = SourceDetector.detect("access.log.1")
        # access.log.1 has a dot in the basename-without-ext ("access.log"),
        # so it should NOT be detected as manpage. It falls through to the
        # domain inference branch because it contains a dot and doesn't start
        # with '/'.
        assert info.type != "manpage"

    # -- RSS/Atom --
    def test_detect_rss_extension(self):
        """Test .rss → rss detection."""
        info = SourceDetector.detect("feed.rss")
        assert info.type == "rss"
        assert info.parsed["file_path"] == "feed.rss"

    def test_detect_atom_extension(self):
        """Test .atom → rss detection."""
        info = SourceDetector.detect("updates.atom")
        assert info.type == "rss"
        assert info.parsed["file_path"] == "updates.atom"

    def test_xml_not_detected_as_rss(self):
        """Test .xml is NOT detected as rss (too generic).

        The fix ensures .xml files do not get incorrectly classified as RSS feeds.
        """
        # .xml has no special handling — it will fall through to domain inference
        # or raise ValueError depending on contents. Either way, it must not
        # be classified as "rss".
        info = SourceDetector.detect("data.xml")
        assert info.type != "rss"

    # -- OpenAPI --
    def test_yaml_with_openapi_content_detected(self, tmp_path):
        """Test .yaml with 'openapi:' key → openapi detection."""
        spec = tmp_path / "petstore.yaml"
        spec.write_text(
            textwrap.dedent("""\
                openapi: "3.0.0"
                info:
                  title: Petstore
                  version: "1.0.0"
                paths: {}
            """)
        )
        info = SourceDetector.detect(str(spec))
        assert info.type == "openapi"
        assert info.parsed["file_path"] == str(spec)
        assert info.suggested_name == "petstore"

    def test_yaml_with_swagger_content_detected(self, tmp_path):
        """Test .yaml with 'swagger:' key → openapi detection."""
        spec = tmp_path / "legacy.yml"
        spec.write_text(
            textwrap.dedent("""\
                swagger: "2.0"
                info:
                  title: Legacy API
                basePath: /v1
            """)
        )
        info = SourceDetector.detect(str(spec))
        assert info.type == "openapi"

    def test_yaml_without_openapi_not_detected(self, tmp_path):
        """Test .yaml without OpenAPI content is NOT detected as openapi.

        When the YAML file doesn't contain openapi/swagger keys the detector
        skips OpenAPI and falls through. For an absolute path it will raise
        ValueError (cannot determine type), which still confirms it was NOT
        classified as openapi.
        """
        plain = tmp_path / "config.yaml"
        plain.write_text("name: my-project\nversion: 1.0\n")
        # Absolute path falls through to ValueError (no matching type).
        # Either way, it must NOT be "openapi".
        try:
            info = SourceDetector.detect(str(plain))
            assert info.type != "openapi"
        except ValueError:
            # Raised because source type cannot be determined — this is fine,
            # the important thing is it was not classified as openapi.
            pass

    def test_looks_like_openapi_returns_false_for_missing_file(self):
        """Test _looks_like_openapi returns False for non-existent file."""
        assert SourceDetector._looks_like_openapi("/nonexistent/spec.yaml") is False

    def test_looks_like_openapi_json_key_format(self, tmp_path):
        """Test _looks_like_openapi detects JSON-style keys (quoted)."""
        spec = tmp_path / "api.yaml"
        spec.write_text('"openapi": "3.0.0"\n')
        assert SourceDetector._looks_like_openapi(str(spec)) is True
# ---------------------------------------------------------------------------
# 2. ConfigValidator — new source type validation
# ---------------------------------------------------------------------------
class TestConfigValidatorNewTypes:
    """Test ConfigValidator VALID_SOURCE_TYPES and per-type validation."""

    # All 17 expected types
    EXPECTED_TYPES = {
        "documentation",
        "github",
        "pdf",
        "local",
        "word",
        "video",
        "epub",
        "jupyter",
        "html",
        "openapi",
        "asciidoc",
        "pptx",
        "confluence",
        "notion",
        "rss",
        "manpage",
        "chat",
    }

    def _make_config(self, source: dict) -> dict:
        """Helper: wrap a source dict in a valid config structure."""
        return {"name": "test", "description": "test", "sources": [source]}

    def test_all_17_types_present(self):
        """Test that VALID_SOURCE_TYPES contains all 17 types."""
        assert ConfigValidator.VALID_SOURCE_TYPES == self.EXPECTED_TYPES

    def test_unknown_type_rejected(self):
        """Test that an unknown source type is rejected during validation."""
        with pytest.raises(ValueError, match="Invalid type 'foobar'"):
            ConfigValidator(self._make_config({"type": "foobar"})).validate()

    # --- Per-type required-field validation ---
    def test_epub_requires_path(self):
        """Test epub source validation requires 'path'."""
        with pytest.raises(ValueError, match="Missing required field 'path'"):
            ConfigValidator(self._make_config({"type": "epub"})).validate()

    def test_jupyter_requires_path(self):
        """Test jupyter source validation requires 'path'."""
        with pytest.raises(ValueError, match="Missing required field 'path'"):
            ConfigValidator(self._make_config({"type": "jupyter"})).validate()

    def test_html_requires_path(self):
        """Test html source validation requires 'path'."""
        with pytest.raises(ValueError, match="Missing required field 'path'"):
            ConfigValidator(self._make_config({"type": "html"})).validate()

    def test_openapi_requires_path_or_url(self):
        """Test openapi source validation requires 'path' or 'url'."""
        with pytest.raises(ValueError, match="Missing required field 'path' or 'url'"):
            ConfigValidator(self._make_config({"type": "openapi"})).validate()

    def test_openapi_accepts_url(self):
        """Test openapi source passes validation with 'url'."""
        cfg = self._make_config({"type": "openapi", "url": "https://example.com/spec.yaml"})
        assert ConfigValidator(cfg).validate() is True

    def test_pptx_requires_path(self):
        """Test pptx source validation requires 'path'."""
        with pytest.raises(ValueError, match="Missing required field 'path'"):
            ConfigValidator(self._make_config({"type": "pptx"})).validate()

    def test_asciidoc_requires_path(self):
        """Test asciidoc source validation requires 'path'."""
        with pytest.raises(ValueError, match="Missing required field 'path'"):
            ConfigValidator(self._make_config({"type": "asciidoc"})).validate()

    def test_confluence_requires_url_or_path(self):
        """Test confluence requires 'url'/'base_url' or 'path'."""
        with pytest.raises(ValueError, match="Missing required field"):
            ConfigValidator(self._make_config({"type": "confluence"})).validate()

    def test_confluence_accepts_base_url(self):
        """Test confluence passes with base_url + space_key."""
        cfg = self._make_config(
            {
                "type": "confluence",
                "base_url": "https://wiki.example.com",
                "space_key": "DEV",
            }
        )
        assert ConfigValidator(cfg).validate() is True

    def test_confluence_accepts_path(self):
        """Test confluence passes with export path."""
        cfg = self._make_config({"type": "confluence", "path": "/exports/wiki"})
        assert ConfigValidator(cfg).validate() is True

    def test_notion_requires_url_or_path(self):
        """Test notion requires 'url'/'database_id'/'page_id' or 'path'."""
        with pytest.raises(ValueError, match="Missing required field"):
            ConfigValidator(self._make_config({"type": "notion"})).validate()

    def test_notion_accepts_page_id(self):
        """Test notion passes with page_id."""
        cfg = self._make_config({"type": "notion", "page_id": "abc123"})
        assert ConfigValidator(cfg).validate() is True

    def test_notion_accepts_database_id(self):
        """Test notion passes with database_id."""
        cfg = self._make_config({"type": "notion", "database_id": "db-456"})
        assert ConfigValidator(cfg).validate() is True

    def test_rss_requires_url_or_path(self):
        """Test rss source validation requires 'url' or 'path'."""
        with pytest.raises(ValueError, match="Missing required field 'url' or 'path'"):
            ConfigValidator(self._make_config({"type": "rss"})).validate()

    def test_rss_accepts_url(self):
        """Test rss passes with url."""
        cfg = self._make_config({"type": "rss", "url": "https://blog.example.com/feed.xml"})
        assert ConfigValidator(cfg).validate() is True

    def test_manpage_requires_path_or_names(self):
        """Test manpage source validation requires 'path' or 'names'."""
        with pytest.raises(ValueError, match="Missing required field 'path' or 'names'"):
            ConfigValidator(self._make_config({"type": "manpage"})).validate()

    def test_manpage_accepts_names(self):
        """Test manpage passes with 'names' list."""
        cfg = self._make_config({"type": "manpage", "names": ["git", "curl"]})
        assert ConfigValidator(cfg).validate() is True

    def test_chat_requires_path_or_token(self):
        """Test chat source validation requires 'path' or 'token'."""
        with pytest.raises(ValueError, match="Missing required field 'path'.*or 'token'"):
            ConfigValidator(self._make_config({"type": "chat"})).validate()

    def test_chat_accepts_path(self):
        """Test chat passes with export path."""
        cfg = self._make_config({"type": "chat", "path": "/exports/slack"})
        assert ConfigValidator(cfg).validate() is True

    def test_chat_accepts_token_with_channel(self):
        """Test chat passes with API token + channel."""
        cfg = self._make_config(
            {
                "type": "chat",
                "token": "xoxb-fake",
                "channel": "#general",
            }
        )
        assert ConfigValidator(cfg).validate() is True
# ---------------------------------------------------------------------------
# 3. UnifiedSkillBuilder — generic merge system
# ---------------------------------------------------------------------------
class TestUnifiedSkillBuilderGenericMerge:
    """Test _generic_merge, _append_extra_sources, and _SOURCE_LABELS."""

    def _make_builder(self, tmp_path) -> UnifiedSkillBuilder:
        """Create a minimal builder instance for testing."""
        builder = UnifiedSkillBuilder(
            config={
                "name": "test_project",
                "description": "A test project for merge testing",
                "sources": [
                    {"type": "jupyter", "path": "nb.ipynb"},
                    {"type": "rss", "url": "https://example.com/feed.rss"},
                ],
            },
            scraped_data={},
            cache_dir=str(tmp_path / "cache"),
        )
        # Redirect output into the pytest tmp dir; creating references/ with
        # exist_ok=True also creates the skill dir itself as a parent.
        builder.skill_dir = str(tmp_path / "output" / "test_project")
        os.makedirs(os.path.join(builder.skill_dir, "references"), exist_ok=True)
        return builder

    def test_generic_merge_produces_valid_markdown(self, tmp_path):
        """Test _generic_merge with two source types produces markdown."""
        per_source = {
            "jupyter": "## When to Use\n\nFor data analysis.\n\n## Quick Reference\n\nImport pandas.",
            "rss": "## When to Use\n\nFor feed monitoring.\n\n## Feed Items\n\nLatest entries.",
        }
        merged = self._make_builder(tmp_path)._generic_merge(per_source)
        # Must be non-empty markdown containing the project title
        assert len(merged) > 100
        assert "Test Project" in merged

    def test_generic_merge_includes_yaml_frontmatter(self, tmp_path):
        """Test _generic_merge includes YAML frontmatter."""
        merged = self._make_builder(tmp_path)._generic_merge(
            {"html": "## Overview\n\nHTML content here."}
        )
        assert merged.startswith("---\n")
        assert "name: test-project" in merged
        assert "description: A test project" in merged

    def test_generic_merge_attributes_content_to_sources(self, tmp_path):
        """Test _generic_merge attributes content to correct source labels."""
        per_source = {
            "jupyter": "## Overview\n\nNotebook content.",
            "pptx": "## Overview\n\nSlide content.",
        }
        merged = self._make_builder(tmp_path)._generic_merge(per_source)
        # Both source labels should appear in the output
        assert "Jupyter Notebook" in merged
        assert "PowerPoint Presentation" in merged

    def test_generic_merge_single_source_section(self, tmp_path):
        """Test section unique to one source has 'From <Label>' attribution."""
        merged = self._make_builder(tmp_path)._generic_merge(
            {"manpage": "## Synopsis\n\ngit [options]"}
        )
        assert "*From Man Page*" in merged
        assert "## Synopsis" in merged

    def test_generic_merge_multi_source_section(self, tmp_path):
        """Test section shared by multiple sources gets sub-headings per source."""
        per_source = {
            "asciidoc": "## Quick Reference\n\nAsciiDoc quick ref.",
            "html": "## Quick Reference\n\nHTML quick ref.",
        }
        merged = self._make_builder(tmp_path)._generic_merge(per_source)
        # Both sources should be attributed under the shared section
        assert "### From AsciiDoc Document" in merged
        assert "### From HTML Document" in merged

    def test_generic_merge_footer(self, tmp_path):
        """Test _generic_merge ends with the standard footer."""
        merged = self._make_builder(tmp_path)._generic_merge(
            {"rss": "## Feeds\n\nSome feeds."}
        )
        assert "Generated by Skill Seeker" in merged

    def test_generic_merge_merged_from_line(self, tmp_path):
        """Test _generic_merge includes 'Merged from:' with correct labels."""
        per_source = {
            "confluence": "## Pages\n\nWiki pages.",
            "notion": "## Databases\n\nNotion DBs.",
        }
        merged = self._make_builder(tmp_path)._generic_merge(per_source)
        assert "*Merged from: Confluence Wiki, Notion Page*" in merged

    def test_append_extra_sources_adds_sections(self, tmp_path):
        """Test _append_extra_sources adds new sections to base content."""
        base = "# Test\n\nIntro.\n\n## Main Section\n\nContent.\n\n---\n\n*Footer*\n"
        extra = {"epub": "## Chapters\n\nChapter list.\n\n## Key Concepts\n\nConcept A."}
        merged = self._make_builder(tmp_path)._append_extra_sources(base, extra, {"epub"})
        # The extra source content should be inserted before the footer separator
        assert "EPUB E-book Content" in merged
        assert "Chapters" in merged
        assert "Key Concepts" in merged
        # Original content should still be present
        assert "# Test" in merged
        assert "## Main Section" in merged

    def test_append_extra_sources_preserves_footer(self, tmp_path):
        """Test _append_extra_sources keeps the footer intact."""
        base = "# Test\n\n---\n\n*Footer*\n"
        extra = {"chat": "## Messages\n\nChat history."}
        merged = self._make_builder(tmp_path)._append_extra_sources(base, extra, {"chat"})
        assert "*Footer*" in merged

    def test_source_labels_has_all_17_types(self):
        """Test _SOURCE_LABELS has entries for all 17 source types."""
        expected = {
            "documentation",
            "github",
            "pdf",
            "word",
            "epub",
            "video",
            "local",
            "jupyter",
            "html",
            "openapi",
            "asciidoc",
            "pptx",
            "confluence",
            "notion",
            "rss",
            "manpage",
            "chat",
        }
        assert set(UnifiedSkillBuilder._SOURCE_LABELS.keys()) == expected

    def test_source_labels_values_are_nonempty_strings(self):
        """Test all _SOURCE_LABELS values are non-empty strings."""
        for key, label in UnifiedSkillBuilder._SOURCE_LABELS.items():
            assert isinstance(label, str), f"Label for '{key}' is not a string"
            assert len(label) > 0, f"Label for '{key}' is empty"
# ---------------------------------------------------------------------------
# 4. COMMAND_MODULES and parser wiring
# ---------------------------------------------------------------------------
class TestCommandModules:
    """Test that all 10 new source types are wired into CLI."""

    NEW_COMMAND_NAMES = [
        "jupyter",
        "html",
        "openapi",
        "asciidoc",
        "pptx",
        "rss",
        "manpage",
        "confluence",
        "notion",
        "chat",
    ]

    def test_new_types_in_command_modules(self):
        """Test all 10 new source types are in COMMAND_MODULES."""
        for name in self.NEW_COMMAND_NAMES:
            assert name in COMMAND_MODULES, f"'{name}' not in COMMAND_MODULES"

    def test_command_modules_values_are_module_paths(self):
        """Test COMMAND_MODULES values look like importable module paths."""
        for name in self.NEW_COMMAND_NAMES:
            path = COMMAND_MODULES[name]
            assert path.startswith("skill_seekers.cli."), (
                f"Module path for '{name}' doesn't start with 'skill_seekers.cli.'"
            )

    def test_new_parser_names_include_all_10(self):
        """Test that get_parser_names() includes all 10 new source types."""
        registered = get_parser_names()
        for name in self.NEW_COMMAND_NAMES:
            assert name in registered, f"Parser '{name}' not registered"

    def test_total_parser_count(self):
        """Test total PARSERS count is 35 (25 original + 10 new)."""
        assert len(PARSERS) == 35

    def test_no_duplicate_parser_names(self):
        """Test no duplicate parser names exist."""
        registered = get_parser_names()
        assert len(set(registered)) == len(registered), "Duplicate parser names found!"

    def test_command_module_count(self):
        """Test COMMAND_MODULES has expected number of entries."""
        # 25 original + 10 new = 35
        assert len(COMMAND_MODULES) == 35
# ---------------------------------------------------------------------------
# 5. SourceDetector.validate_source — new types
# ---------------------------------------------------------------------------
class TestSourceDetectorValidation:
    """Test validate_source for new file-based source types."""

    @staticmethod
    def _file_info(source_type: str, path: str, name: str) -> SourceInfo:
        """Build a SourceInfo whose payload is a local file path."""
        return SourceInfo(
            type=source_type,
            parsed={"file_path": path},
            suggested_name=name,
            raw_input=path,
        )

    def test_validation_passes_for_existing_jupyter(self, tmp_path):
        """Test validation passes for an existing .ipynb file."""
        nb = tmp_path / "test.ipynb"
        nb.write_text('{"cells": []}')
        # Should not raise
        SourceDetector.validate_source(self._file_info("jupyter", str(nb), "test"))

    def test_validation_raises_for_nonexistent_jupyter(self):
        """Test validation raises ValueError for non-existent file."""
        info = self._file_info("jupyter", "/nonexistent/notebook.ipynb", "notebook")
        with pytest.raises(ValueError, match="does not exist"):
            SourceDetector.validate_source(info)

    def test_validation_passes_for_existing_html(self, tmp_path):
        """Test validation passes for an existing .html file."""
        page = tmp_path / "page.html"
        page.write_text("<html></html>")
        SourceDetector.validate_source(self._file_info("html", str(page), "page"))

    def test_validation_raises_for_nonexistent_pptx(self):
        """Test validation raises ValueError for non-existent pptx."""
        info = self._file_info("pptx", "/nonexistent/slides.pptx", "slides")
        with pytest.raises(ValueError, match="does not exist"):
            SourceDetector.validate_source(info)

    def test_validation_passes_for_existing_openapi(self, tmp_path):
        """Test validation passes for an existing OpenAPI spec file."""
        spec = tmp_path / "api.yaml"
        spec.write_text("openapi: '3.0.0'\n")
        SourceDetector.validate_source(self._file_info("openapi", str(spec), "api"))

    def test_validation_raises_for_nonexistent_asciidoc(self):
        """Test validation raises ValueError for non-existent asciidoc."""
        info = self._file_info("asciidoc", "/nonexistent/doc.adoc", "doc")
        with pytest.raises(ValueError, match="does not exist"):
            SourceDetector.validate_source(info)

    def test_validation_raises_for_nonexistent_manpage(self):
        """Test validation raises ValueError for non-existent manpage."""
        info = self._file_info("manpage", "/nonexistent/git.1", "git")
        with pytest.raises(ValueError, match="does not exist"):
            SourceDetector.validate_source(info)

    def test_validation_passes_for_existing_manpage(self, tmp_path):
        """Test validation passes for an existing man page file."""
        page = tmp_path / "curl.1"
        page.write_text(".TH CURL 1\n")
        SourceDetector.validate_source(self._file_info("manpage", str(page), "curl"))

    def test_rss_url_validation_no_file_check(self):
        """Test rss validation passes for URL-based source (no file check)."""
        info = SourceInfo(
            type="rss",
            parsed={"url": "https://example.com/feed.rss"},
            suggested_name="feed",
            raw_input="https://example.com/feed.rss",
        )
        # rss validation only checks file if file_path is present; URL should pass
        SourceDetector.validate_source(info)

    def test_rss_validation_raises_for_nonexistent_file(self):
        """Test rss validation raises for non-existent local file."""
        info = self._file_info("rss", "/nonexistent/feed.rss", "feed")
        with pytest.raises(ValueError, match="does not exist"):
            SourceDetector.validate_source(info)

    def test_rss_validation_passes_for_existing_file(self, tmp_path):
        """Test rss validation passes for an existing .rss file."""
        feed = tmp_path / "feed.rss"
        feed.write_text("<rss></rss>")
        SourceDetector.validate_source(self._file_info("rss", str(feed), "feed"))

    def test_validation_passes_for_directory_types(self, tmp_path):
        """Test validation passes when source is a directory (e.g., html dir)."""
        pages_dir = tmp_path / "pages"
        pages_dir.mkdir()
        # The validator allows directories for these types (isfile or isdir)
        SourceDetector.validate_source(self._file_info("html", str(pages_dir), "pages"))
# ---------------------------------------------------------------------------
# 6. CreateCommand._route_generic coverage
# ---------------------------------------------------------------------------
class TestCreateCommandRouting:
    """Test that CreateCommand._route_to_scraper maps new types to _route_generic."""

    # We can't easily call _route_to_scraper (it imports real scrapers),
    # so we verify the routing table is correct by inspecting the method source.
    GENERIC_ROUTES = {
        "jupyter": ("jupyter_scraper", "--notebook"),
        "html": ("html_scraper", "--html-path"),
        "openapi": ("openapi_scraper", "--spec"),
        "asciidoc": ("asciidoc_scraper", "--asciidoc-path"),
        "pptx": ("pptx_scraper", "--pptx"),
        "rss": ("rss_scraper", "--feed-path"),
        "manpage": ("man_scraper", "--man-path"),
        "confluence": ("confluence_scraper", "--export-path"),
        "notion": ("notion_scraper", "--export-path"),
        "chat": ("chat_scraper", "--export-path"),
    }

    @staticmethod
    def _route_source() -> str:
        """Return the source text of CreateCommand._route_to_scraper.

        Hoisted out of the two tests below (they previously duplicated an
        un-idiomatic ``__import__(..., fromlist=...)`` call); uses
        importlib.import_module, the documented programmatic import API.
        """
        import importlib
        import inspect

        module = importlib.import_module("skill_seekers.cli.create_command")
        return inspect.getsource(module.CreateCommand._route_to_scraper)

    def test_route_to_scraper_source_coverage(self):
        """Test _route_to_scraper method handles all 10 new types.

        We inspect the method source to verify each type has a branch.
        """
        source = self._route_source()
        for source_type in self.GENERIC_ROUTES:
            assert f'"{source_type}"' in source, (
                f"_route_to_scraper missing branch for '{source_type}'"
            )

    def test_generic_route_module_names(self):
        """Test _route_generic is called with correct module names."""
        source = self._route_source()
        for source_type, (module, flag) in self.GENERIC_ROUTES.items():
            assert f'"{module}"' in source, f"Module name '{module}' not found for '{source_type}'"
            assert f'"{flag}"' in source, f"Flag '{flag}' not found for '{source_type}'"
# Allow running this test module directly (python <file> -v) without pytest CLI.
if __name__ == "__main__":
    pytest.main([__file__, "-v"])