feat: add 10 new skill source types (17 total) with full pipeline integration

Add Jupyter Notebook, Local HTML, OpenAPI/Swagger, AsciiDoc, PowerPoint,
RSS/Atom, Man Pages, Confluence, Notion, and Slack/Discord Chat as new
skill source types. Each type is fully integrated across:

- Standalone CLI commands (skill-seekers <type>)
- Auto-detection via 'skill-seekers create' (file extension + content sniffing)
- Unified multi-source configs (scraped_data, dispatch, config validation)
- Unified skill builder (generic merge + source-attributed synthesis)
- MCP server (scrape_generic tool with per-type flag mapping)
- pyproject.toml (entry points, optional deps, [all] group)

Also fixes: EPUB unified pipeline gap, missing word/video config validators,
OpenAPI yaml import guard, MCP flag mismatch for all 10 types, stale
docstrings, and adds 77 integration tests + complex-merge workflow.

50 files changed, +20,193 lines
This commit is contained in:
yusyus
2026-03-15 15:30:15 +03:00
parent 64403a3686
commit 53b911b697
50 changed files with 20193 additions and 856 deletions

View File

@@ -24,12 +24,12 @@ class TestParserRegistry:
def test_all_parsers_registered(self):
"""Test that all parsers are registered."""
assert len(PARSERS) == 25, f"Expected 25 parsers, got {len(PARSERS)}"
assert len(PARSERS) == 35, f"Expected 35 parsers, got {len(PARSERS)}"
def test_get_parser_names(self):
"""Test getting list of parser names."""
names = get_parser_names()
assert len(names) == 25
assert len(names) == 35
assert "scrape" in names
assert "github" in names
assert "package" in names
@@ -243,9 +243,9 @@ class TestBackwardCompatibility:
assert cmd in names, f"Command '{cmd}' not found in parser registry!"
def test_command_count_matches(self):
"""Test that we have exactly 25 commands (includes create, workflows, word, epub, video, and sync-config)."""
assert len(PARSERS) == 25
assert len(get_parser_names()) == 25
"""Test that we have exactly 35 commands (25 original + 10 new source types)."""
assert len(PARSERS) == 35
assert len(get_parser_names()) == 35
if __name__ == "__main__":

View File

@@ -0,0 +1,824 @@
#!/usr/bin/env python3
"""
Tests for v3.2.0 new source type integration points.
Covers source detection, config validation, generic merge, CLI wiring,
and source validation for the 10 new source types: jupyter, html, openapi,
asciidoc, pptx, rss, manpage, confluence, notion, chat.
"""
import os
import textwrap
import pytest
from skill_seekers.cli.config_validator import ConfigValidator
from skill_seekers.cli.main import COMMAND_MODULES
from skill_seekers.cli.parsers import PARSERS, get_parser_names
from skill_seekers.cli.source_detector import SourceDetector, SourceInfo
from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder
# ---------------------------------------------------------------------------
# 1. SourceDetector — new type detection
# ---------------------------------------------------------------------------
class TestSourceDetectorNewTypes:
    """Test that SourceDetector.detect() maps new extensions to correct types."""

    # -- Jupyter --
    def test_detect_ipynb(self):
        """Test .ipynb → jupyter detection."""
        info = SourceDetector.detect("analysis.ipynb")
        assert info.type == "jupyter"
        assert info.parsed["file_path"] == "analysis.ipynb"
        assert info.suggested_name == "analysis"

    # -- HTML --
    def test_detect_html_extension(self):
        """Test .html → html detection."""
        info = SourceDetector.detect("page.html")
        assert info.type == "html"
        assert info.parsed["file_path"] == "page.html"

    def test_detect_htm_extension(self):
        """Test .htm → html detection."""
        info = SourceDetector.detect("index.HTM")
        assert info.type == "html"
        assert info.parsed["file_path"] == "index.HTM"

    # -- PowerPoint --
    def test_detect_pptx(self):
        """Test .pptx → pptx detection."""
        info = SourceDetector.detect("slides.pptx")
        assert info.type == "pptx"
        assert info.parsed["file_path"] == "slides.pptx"
        assert info.suggested_name == "slides"

    # -- AsciiDoc --
    def test_detect_adoc(self):
        """Test .adoc → asciidoc detection."""
        info = SourceDetector.detect("manual.adoc")
        assert info.type == "asciidoc"
        assert info.parsed["file_path"] == "manual.adoc"

    def test_detect_asciidoc_extension(self):
        """Test .asciidoc → asciidoc detection."""
        info = SourceDetector.detect("guide.ASCIIDOC")
        assert info.type == "asciidoc"
        assert info.parsed["file_path"] == "guide.ASCIIDOC"

    # -- Man pages --
    def test_detect_man_extension(self):
        """Test .man → manpage detection."""
        info = SourceDetector.detect("curl.man")
        assert info.type == "manpage"
        assert info.parsed["file_path"] == "curl.man"

    @pytest.mark.parametrize("section", range(1, 9))
    def test_detect_man_sections(self, section):
        """Test .1 through .8 → manpage for simple basenames."""
        filename = f"git.{section}"
        info = SourceDetector.detect(filename)
        # Fix: the failure message previously contained the literal text
        # "(unknown)" instead of interpolating the filename under test.
        assert info.type == "manpage", f"{filename} should detect as manpage"
        assert info.suggested_name == "git"

    def test_man_section_with_dotted_basename_not_detected(self):
        """Test that 'access.log.1' is NOT detected as a man page.

        The heuristic checks that the basename (without extension) has no dots.
        """
        # This should fall through to web/domain detection (has a dot, not a path)
        info = SourceDetector.detect("access.log.1")
        # access.log.1 has a dot in the basename-without-ext ("access.log"),
        # so it should NOT be detected as manpage. It falls through to the
        # domain inference branch because it contains a dot and doesn't start
        # with '/'.
        assert info.type != "manpage"

    # -- RSS/Atom --
    def test_detect_rss_extension(self):
        """Test .rss → rss detection."""
        info = SourceDetector.detect("feed.rss")
        assert info.type == "rss"
        assert info.parsed["file_path"] == "feed.rss"

    def test_detect_atom_extension(self):
        """Test .atom → rss detection."""
        info = SourceDetector.detect("updates.atom")
        assert info.type == "rss"
        assert info.parsed["file_path"] == "updates.atom"

    def test_xml_not_detected_as_rss(self):
        """Test .xml is NOT detected as rss (too generic).

        The fix ensures .xml files do not get incorrectly classified as RSS feeds.
        """
        # .xml has no special handling — it will fall through to domain inference
        # or raise ValueError depending on contents. Either way, it must not
        # be classified as "rss".
        info = SourceDetector.detect("data.xml")
        assert info.type != "rss"

    # -- OpenAPI --
    def test_yaml_with_openapi_content_detected(self, tmp_path):
        """Test .yaml with 'openapi:' key → openapi detection."""
        spec = tmp_path / "petstore.yaml"
        spec.write_text(
            textwrap.dedent("""\
                openapi: "3.0.0"
                info:
                  title: Petstore
                  version: "1.0.0"
                paths: {}
            """)
        )
        info = SourceDetector.detect(str(spec))
        assert info.type == "openapi"
        assert info.parsed["file_path"] == str(spec)
        assert info.suggested_name == "petstore"

    def test_yaml_with_swagger_content_detected(self, tmp_path):
        """Test .yaml with 'swagger:' key → openapi detection."""
        spec = tmp_path / "legacy.yml"
        spec.write_text(
            textwrap.dedent("""\
                swagger: "2.0"
                info:
                  title: Legacy API
                basePath: /v1
            """)
        )
        info = SourceDetector.detect(str(spec))
        assert info.type == "openapi"

    def test_yaml_without_openapi_not_detected(self, tmp_path):
        """Test .yaml without OpenAPI content is NOT detected as openapi.

        When the YAML file doesn't contain openapi/swagger keys the detector
        skips OpenAPI and falls through. For an absolute path it will raise
        ValueError (cannot determine type), which still confirms it was NOT
        classified as openapi.
        """
        plain = tmp_path / "config.yaml"
        plain.write_text("name: my-project\nversion: 1.0\n")
        # Absolute path falls through to ValueError (no matching type).
        # Either way, it must NOT be "openapi".
        try:
            info = SourceDetector.detect(str(plain))
            assert info.type != "openapi"
        except ValueError:
            # Raised because source type cannot be determined — this is fine,
            # the important thing is it was not classified as openapi.
            pass

    def test_looks_like_openapi_returns_false_for_missing_file(self):
        """Test _looks_like_openapi returns False for non-existent file."""
        assert SourceDetector._looks_like_openapi("/nonexistent/spec.yaml") is False

    def test_looks_like_openapi_json_key_format(self, tmp_path):
        """Test _looks_like_openapi detects JSON-style keys (quoted)."""
        spec = tmp_path / "api.yaml"
        spec.write_text('"openapi": "3.0.0"\n')
        assert SourceDetector._looks_like_openapi(str(spec)) is True
# ---------------------------------------------------------------------------
# 2. ConfigValidator — new source type validation
# ---------------------------------------------------------------------------
class TestConfigValidatorNewTypes:
    """Test ConfigValidator VALID_SOURCE_TYPES and per-type validation."""

    # All 17 expected types
    EXPECTED_TYPES = {
        "documentation",
        "github",
        "pdf",
        "local",
        "word",
        "video",
        "epub",
        "jupyter",
        "html",
        "openapi",
        "asciidoc",
        "pptx",
        "confluence",
        "notion",
        "rss",
        "manpage",
        "chat",
    }

    def _make_config(self, source: dict) -> dict:
        """Helper: wrap a source dict in a valid config structure."""
        return {"name": "test", "description": "test", "sources": [source]}

    def test_all_17_types_present(self):
        """Test that VALID_SOURCE_TYPES contains all 17 types."""
        assert ConfigValidator.VALID_SOURCE_TYPES == self.EXPECTED_TYPES

    def test_unknown_type_rejected(self):
        """Test that an unknown source type is rejected during validation."""
        with pytest.raises(ValueError, match="Invalid type 'foobar'"):
            ConfigValidator(self._make_config({"type": "foobar"})).validate()

    # --- Per-type required-field validation ---
    def test_epub_requires_path(self):
        """Test epub source validation requires 'path'."""
        with pytest.raises(ValueError, match="Missing required field 'path'"):
            ConfigValidator(self._make_config({"type": "epub"})).validate()

    def test_jupyter_requires_path(self):
        """Test jupyter source validation requires 'path'."""
        with pytest.raises(ValueError, match="Missing required field 'path'"):
            ConfigValidator(self._make_config({"type": "jupyter"})).validate()

    def test_html_requires_path(self):
        """Test html source validation requires 'path'."""
        with pytest.raises(ValueError, match="Missing required field 'path'"):
            ConfigValidator(self._make_config({"type": "html"})).validate()

    def test_openapi_requires_path_or_url(self):
        """Test openapi source validation requires 'path' or 'url'."""
        with pytest.raises(ValueError, match="Missing required field 'path' or 'url'"):
            ConfigValidator(self._make_config({"type": "openapi"})).validate()

    def test_openapi_accepts_url(self):
        """Test openapi source passes validation with 'url'."""
        cfg = self._make_config({"type": "openapi", "url": "https://example.com/spec.yaml"})
        assert ConfigValidator(cfg).validate() is True

    def test_pptx_requires_path(self):
        """Test pptx source validation requires 'path'."""
        with pytest.raises(ValueError, match="Missing required field 'path'"):
            ConfigValidator(self._make_config({"type": "pptx"})).validate()

    def test_asciidoc_requires_path(self):
        """Test asciidoc source validation requires 'path'."""
        with pytest.raises(ValueError, match="Missing required field 'path'"):
            ConfigValidator(self._make_config({"type": "asciidoc"})).validate()

    def test_confluence_requires_url_or_path(self):
        """Test confluence requires 'url'/'base_url' or 'path'."""
        with pytest.raises(ValueError, match="Missing required field"):
            ConfigValidator(self._make_config({"type": "confluence"})).validate()

    def test_confluence_accepts_base_url(self):
        """Test confluence passes with base_url + space_key."""
        cfg = self._make_config(
            {
                "type": "confluence",
                "base_url": "https://wiki.example.com",
                "space_key": "DEV",
            }
        )
        assert ConfigValidator(cfg).validate() is True

    def test_confluence_accepts_path(self):
        """Test confluence passes with export path."""
        cfg = self._make_config({"type": "confluence", "path": "/exports/wiki"})
        assert ConfigValidator(cfg).validate() is True

    def test_notion_requires_url_or_path(self):
        """Test notion requires 'url'/'database_id'/'page_id' or 'path'."""
        with pytest.raises(ValueError, match="Missing required field"):
            ConfigValidator(self._make_config({"type": "notion"})).validate()

    def test_notion_accepts_page_id(self):
        """Test notion passes with page_id."""
        cfg = self._make_config({"type": "notion", "page_id": "abc123"})
        assert ConfigValidator(cfg).validate() is True

    def test_notion_accepts_database_id(self):
        """Test notion passes with database_id."""
        cfg = self._make_config({"type": "notion", "database_id": "db-456"})
        assert ConfigValidator(cfg).validate() is True

    def test_rss_requires_url_or_path(self):
        """Test rss source validation requires 'url' or 'path'."""
        with pytest.raises(ValueError, match="Missing required field 'url' or 'path'"):
            ConfigValidator(self._make_config({"type": "rss"})).validate()

    def test_rss_accepts_url(self):
        """Test rss passes with url."""
        cfg = self._make_config({"type": "rss", "url": "https://blog.example.com/feed.xml"})
        assert ConfigValidator(cfg).validate() is True

    def test_manpage_requires_path_or_names(self):
        """Test manpage source validation requires 'path' or 'names'."""
        with pytest.raises(ValueError, match="Missing required field 'path' or 'names'"):
            ConfigValidator(self._make_config({"type": "manpage"})).validate()

    def test_manpage_accepts_names(self):
        """Test manpage passes with 'names' list."""
        cfg = self._make_config({"type": "manpage", "names": ["git", "curl"]})
        assert ConfigValidator(cfg).validate() is True

    def test_chat_requires_path_or_token(self):
        """Test chat source validation requires 'path' or 'token'."""
        with pytest.raises(ValueError, match="Missing required field 'path'.*or 'token'"):
            ConfigValidator(self._make_config({"type": "chat"})).validate()

    def test_chat_accepts_path(self):
        """Test chat passes with export path."""
        cfg = self._make_config({"type": "chat", "path": "/exports/slack"})
        assert ConfigValidator(cfg).validate() is True

    def test_chat_accepts_token_with_channel(self):
        """Test chat passes with API token + channel."""
        cfg = self._make_config(
            {
                "type": "chat",
                "token": "xoxb-fake",
                "channel": "#general",
            }
        )
        assert ConfigValidator(cfg).validate() is True
# ---------------------------------------------------------------------------
# 3. UnifiedSkillBuilder — generic merge system
# ---------------------------------------------------------------------------
class TestUnifiedSkillBuilderGenericMerge:
    """Test _generic_merge, _append_extra_sources, and _SOURCE_LABELS."""

    def _make_builder(self, tmp_path) -> UnifiedSkillBuilder:
        """Create a minimal builder instance for testing."""
        builder = UnifiedSkillBuilder(
            config={
                "name": "test_project",
                "description": "A test project for merge testing",
                "sources": [
                    {"type": "jupyter", "path": "nb.ipynb"},
                    {"type": "rss", "url": "https://example.com/feed.rss"},
                ],
            },
            scraped_data={},
            cache_dir=str(tmp_path / "cache"),
        )
        # Redirect output into the pytest tmp dir; creating references/ with
        # exist_ok=True also creates the skill dir itself as a parent.
        builder.skill_dir = str(tmp_path / "output" / "test_project")
        os.makedirs(os.path.join(builder.skill_dir, "references"), exist_ok=True)
        return builder

    def test_generic_merge_produces_valid_markdown(self, tmp_path):
        """Test _generic_merge with two source types produces markdown."""
        per_source = {
            "jupyter": "## When to Use\n\nFor data analysis.\n\n## Quick Reference\n\nImport pandas.",
            "rss": "## When to Use\n\nFor feed monitoring.\n\n## Feed Items\n\nLatest entries.",
        }
        merged = self._make_builder(tmp_path)._generic_merge(per_source)
        # Must be non-empty markdown containing the project title
        assert len(merged) > 100
        assert "Test Project" in merged

    def test_generic_merge_includes_yaml_frontmatter(self, tmp_path):
        """Test _generic_merge includes YAML frontmatter."""
        merged = self._make_builder(tmp_path)._generic_merge(
            {"html": "## Overview\n\nHTML content here."}
        )
        assert merged.startswith("---\n")
        assert "name: test-project" in merged
        assert "description: A test project" in merged

    def test_generic_merge_attributes_content_to_sources(self, tmp_path):
        """Test _generic_merge attributes content to correct source labels."""
        per_source = {
            "jupyter": "## Overview\n\nNotebook content.",
            "pptx": "## Overview\n\nSlide content.",
        }
        merged = self._make_builder(tmp_path)._generic_merge(per_source)
        # Both source labels should appear in the output
        assert "Jupyter Notebook" in merged
        assert "PowerPoint Presentation" in merged

    def test_generic_merge_single_source_section(self, tmp_path):
        """Test section unique to one source has 'From <Label>' attribution."""
        merged = self._make_builder(tmp_path)._generic_merge(
            {"manpage": "## Synopsis\n\ngit [options]"}
        )
        assert "*From Man Page*" in merged
        assert "## Synopsis" in merged

    def test_generic_merge_multi_source_section(self, tmp_path):
        """Test section shared by multiple sources gets sub-headings per source."""
        per_source = {
            "asciidoc": "## Quick Reference\n\nAsciiDoc quick ref.",
            "html": "## Quick Reference\n\nHTML quick ref.",
        }
        merged = self._make_builder(tmp_path)._generic_merge(per_source)
        # Both sources should be attributed under the shared section
        assert "### From AsciiDoc Document" in merged
        assert "### From HTML Document" in merged

    def test_generic_merge_footer(self, tmp_path):
        """Test _generic_merge ends with the standard footer."""
        merged = self._make_builder(tmp_path)._generic_merge(
            {"rss": "## Feeds\n\nSome feeds."}
        )
        assert "Generated by Skill Seeker" in merged

    def test_generic_merge_merged_from_line(self, tmp_path):
        """Test _generic_merge includes 'Merged from:' with correct labels."""
        per_source = {
            "confluence": "## Pages\n\nWiki pages.",
            "notion": "## Databases\n\nNotion DBs.",
        }
        merged = self._make_builder(tmp_path)._generic_merge(per_source)
        assert "*Merged from: Confluence Wiki, Notion Page*" in merged

    def test_append_extra_sources_adds_sections(self, tmp_path):
        """Test _append_extra_sources adds new sections to base content."""
        base = "# Test\n\nIntro.\n\n## Main Section\n\nContent.\n\n---\n\n*Footer*\n"
        extra = {"epub": "## Chapters\n\nChapter list.\n\n## Key Concepts\n\nConcept A."}
        merged = self._make_builder(tmp_path)._append_extra_sources(base, extra, {"epub"})
        # The extra source content should be inserted before the footer separator
        assert "EPUB E-book Content" in merged
        assert "Chapters" in merged
        assert "Key Concepts" in merged
        # Original content should still be present
        assert "# Test" in merged
        assert "## Main Section" in merged

    def test_append_extra_sources_preserves_footer(self, tmp_path):
        """Test _append_extra_sources keeps the footer intact."""
        base = "# Test\n\n---\n\n*Footer*\n"
        extra = {"chat": "## Messages\n\nChat history."}
        merged = self._make_builder(tmp_path)._append_extra_sources(base, extra, {"chat"})
        assert "*Footer*" in merged

    def test_source_labels_has_all_17_types(self):
        """Test _SOURCE_LABELS has entries for all 17 source types."""
        expected = {
            "documentation",
            "github",
            "pdf",
            "word",
            "epub",
            "video",
            "local",
            "jupyter",
            "html",
            "openapi",
            "asciidoc",
            "pptx",
            "confluence",
            "notion",
            "rss",
            "manpage",
            "chat",
        }
        assert set(UnifiedSkillBuilder._SOURCE_LABELS.keys()) == expected

    def test_source_labels_values_are_nonempty_strings(self):
        """Test all _SOURCE_LABELS values are non-empty strings."""
        for key, label in UnifiedSkillBuilder._SOURCE_LABELS.items():
            assert isinstance(label, str), f"Label for '{key}' is not a string"
            assert len(label) > 0, f"Label for '{key}' is empty"
# ---------------------------------------------------------------------------
# 4. COMMAND_MODULES and parser wiring
# ---------------------------------------------------------------------------
class TestCommandModules:
    """Test that all 10 new source types are wired into CLI."""

    NEW_COMMAND_NAMES = [
        "jupyter",
        "html",
        "openapi",
        "asciidoc",
        "pptx",
        "rss",
        "manpage",
        "confluence",
        "notion",
        "chat",
    ]

    def test_new_types_in_command_modules(self):
        """Test all 10 new source types are in COMMAND_MODULES."""
        for name in self.NEW_COMMAND_NAMES:
            assert name in COMMAND_MODULES, f"'{name}' not in COMMAND_MODULES"

    def test_command_modules_values_are_module_paths(self):
        """Test COMMAND_MODULES values look like importable module paths."""
        for name in self.NEW_COMMAND_NAMES:
            path = COMMAND_MODULES[name]
            assert path.startswith("skill_seekers.cli."), (
                f"Module path for '{name}' doesn't start with 'skill_seekers.cli.'"
            )

    def test_new_parser_names_include_all_10(self):
        """Test that get_parser_names() includes all 10 new source types."""
        registered = get_parser_names()
        for name in self.NEW_COMMAND_NAMES:
            assert name in registered, f"Parser '{name}' not registered"

    def test_total_parser_count(self):
        """Test total PARSERS count is 35 (25 original + 10 new)."""
        assert len(PARSERS) == 35

    def test_no_duplicate_parser_names(self):
        """Test no duplicate parser names exist."""
        registered = get_parser_names()
        assert len(set(registered)) == len(registered), "Duplicate parser names found!"

    def test_command_module_count(self):
        """Test COMMAND_MODULES has expected number of entries."""
        # 25 original + 10 new = 35
        assert len(COMMAND_MODULES) == 35
# ---------------------------------------------------------------------------
# 5. SourceDetector.validate_source — new types
# ---------------------------------------------------------------------------
class TestSourceDetectorValidation:
    """Test validate_source for new file-based source types."""

    @staticmethod
    def _file_info(source_type: str, path: str, name: str) -> SourceInfo:
        """Build a SourceInfo whose payload is a local file path."""
        return SourceInfo(
            type=source_type,
            parsed={"file_path": path},
            suggested_name=name,
            raw_input=path,
        )

    def test_validation_passes_for_existing_jupyter(self, tmp_path):
        """Test validation passes for an existing .ipynb file."""
        nb = tmp_path / "test.ipynb"
        nb.write_text('{"cells": []}')
        # Should not raise
        SourceDetector.validate_source(self._file_info("jupyter", str(nb), "test"))

    def test_validation_raises_for_nonexistent_jupyter(self):
        """Test validation raises ValueError for non-existent file."""
        info = self._file_info("jupyter", "/nonexistent/notebook.ipynb", "notebook")
        with pytest.raises(ValueError, match="does not exist"):
            SourceDetector.validate_source(info)

    def test_validation_passes_for_existing_html(self, tmp_path):
        """Test validation passes for an existing .html file."""
        page = tmp_path / "page.html"
        page.write_text("<html></html>")
        SourceDetector.validate_source(self._file_info("html", str(page), "page"))

    def test_validation_raises_for_nonexistent_pptx(self):
        """Test validation raises ValueError for non-existent pptx."""
        info = self._file_info("pptx", "/nonexistent/slides.pptx", "slides")
        with pytest.raises(ValueError, match="does not exist"):
            SourceDetector.validate_source(info)

    def test_validation_passes_for_existing_openapi(self, tmp_path):
        """Test validation passes for an existing OpenAPI spec file."""
        spec = tmp_path / "api.yaml"
        spec.write_text("openapi: '3.0.0'\n")
        SourceDetector.validate_source(self._file_info("openapi", str(spec), "api"))

    def test_validation_raises_for_nonexistent_asciidoc(self):
        """Test validation raises ValueError for non-existent asciidoc."""
        info = self._file_info("asciidoc", "/nonexistent/doc.adoc", "doc")
        with pytest.raises(ValueError, match="does not exist"):
            SourceDetector.validate_source(info)

    def test_validation_raises_for_nonexistent_manpage(self):
        """Test validation raises ValueError for non-existent manpage."""
        info = self._file_info("manpage", "/nonexistent/git.1", "git")
        with pytest.raises(ValueError, match="does not exist"):
            SourceDetector.validate_source(info)

    def test_validation_passes_for_existing_manpage(self, tmp_path):
        """Test validation passes for an existing man page file."""
        page = tmp_path / "curl.1"
        page.write_text(".TH CURL 1\n")
        SourceDetector.validate_source(self._file_info("manpage", str(page), "curl"))

    def test_rss_url_validation_no_file_check(self):
        """Test rss validation passes for URL-based source (no file check)."""
        info = SourceInfo(
            type="rss",
            parsed={"url": "https://example.com/feed.rss"},
            suggested_name="feed",
            raw_input="https://example.com/feed.rss",
        )
        # rss validation only checks file if file_path is present; URL should pass
        SourceDetector.validate_source(info)

    def test_rss_validation_raises_for_nonexistent_file(self):
        """Test rss validation raises for non-existent local file."""
        info = self._file_info("rss", "/nonexistent/feed.rss", "feed")
        with pytest.raises(ValueError, match="does not exist"):
            SourceDetector.validate_source(info)

    def test_rss_validation_passes_for_existing_file(self, tmp_path):
        """Test rss validation passes for an existing .rss file."""
        feed = tmp_path / "feed.rss"
        feed.write_text("<rss></rss>")
        SourceDetector.validate_source(self._file_info("rss", str(feed), "feed"))

    def test_validation_passes_for_directory_types(self, tmp_path):
        """Test validation passes when source is a directory (e.g., html dir)."""
        pages_dir = tmp_path / "pages"
        pages_dir.mkdir()
        # The validator allows directories for these types (isfile or isdir)
        SourceDetector.validate_source(self._file_info("html", str(pages_dir), "pages"))
# ---------------------------------------------------------------------------
# 6. CreateCommand._route_generic coverage
# ---------------------------------------------------------------------------
class TestCreateCommandRouting:
    """Test that CreateCommand._route_to_scraper maps new types to _route_generic."""

    # We can't easily call _route_to_scraper (it imports real scrapers),
    # so we verify the routing table is correct by inspecting the method source.
    GENERIC_ROUTES = {
        "jupyter": ("jupyter_scraper", "--notebook"),
        "html": ("html_scraper", "--html-path"),
        "openapi": ("openapi_scraper", "--spec"),
        "asciidoc": ("asciidoc_scraper", "--asciidoc-path"),
        "pptx": ("pptx_scraper", "--pptx"),
        "rss": ("rss_scraper", "--feed-path"),
        "manpage": ("man_scraper", "--man-path"),
        "confluence": ("confluence_scraper", "--export-path"),
        "notion": ("notion_scraper", "--export-path"),
        "chat": ("chat_scraper", "--export-path"),
    }

    @staticmethod
    def _route_source() -> str:
        """Return the source text of CreateCommand._route_to_scraper.

        Hoisted out of the two tests below (they previously duplicated an
        un-idiomatic ``__import__(..., fromlist=...)`` call); uses
        importlib.import_module, the documented programmatic import API.
        """
        import importlib
        import inspect

        module = importlib.import_module("skill_seekers.cli.create_command")
        return inspect.getsource(module.CreateCommand._route_to_scraper)

    def test_route_to_scraper_source_coverage(self):
        """Test _route_to_scraper method handles all 10 new types.

        We inspect the method source to verify each type has a branch.
        """
        source = self._route_source()
        for source_type in self.GENERIC_ROUTES:
            assert f'"{source_type}"' in source, (
                f"_route_to_scraper missing branch for '{source_type}'"
            )

    def test_generic_route_module_names(self):
        """Test _route_generic is called with correct module names."""
        source = self._route_source()
        for source_type, (module, flag) in self.GENERIC_ROUTES.items():
            assert f'"{module}"' in source, f"Module name '{module}' not found for '{source_type}'"
            assert f'"{flag}"' in source, f"Flag '{flag}' not found for '{source_type}'"
# Allow running this test module directly (python <file> -v) without pytest CLI.
if __name__ == "__main__":
    pytest.main([__file__, "-v"])