feat: add 10 new skill source types (17 total) with full pipeline integration

Add Jupyter Notebook, Local HTML, OpenAPI/Swagger, AsciiDoc, PowerPoint,
RSS/Atom, Man Pages, Confluence, Notion, and Slack/Discord Chat as new
skill source types. Each type is fully integrated across:

- Standalone CLI commands (skill-seekers <type>)
- Auto-detection via 'skill-seekers create' (file extension + content sniffing)
- Unified multi-source configs (scraped_data, dispatch, config validation)
- Unified skill builder (generic merge + source-attributed synthesis)
- MCP server (scrape_generic tool with per-type flag mapping)
- pyproject.toml (entry points, optional deps, [all] group)

Also fixes: EPUB unified pipeline gap, missing word/video config validators,
OpenAPI yaml import guard, MCP flag mismatch for all 10 types, stale
docstrings, and adds 77 integration tests + complex-merge workflow.

50 files changed, +20,193 lines
This commit is contained in:
yusyus
2026-03-15 15:30:15 +03:00
parent 64403a3686
commit 53b911b697
50 changed files with 20193 additions and 856 deletions

View File

@@ -63,6 +63,9 @@ from .scraping_tools import (
from .scraping_tools import (
scrape_pdf_tool as scrape_pdf_impl,
)
from .scraping_tools import (
scrape_generic_tool as scrape_generic_impl,
)
from .scraping_tools import (
scrape_video_tool as scrape_video_impl,
)
@@ -135,6 +138,7 @@ __all__ = [
"extract_test_examples_impl",
"build_how_to_guides_impl",
"extract_config_patterns_impl",
"scrape_generic_impl",
# Packaging tools
"package_skill_impl",
"upload_skill_impl",

View File

@@ -205,6 +205,18 @@ async def validate_config(args: dict) -> list[TextContent]:
)
elif source["type"] == "pdf":
result += f" Path: {source.get('path', 'N/A')}\n"
elif source["type"] in (
"jupyter",
"html",
"openapi",
"asciidoc",
"pptx",
"manpage",
"chat",
):
result += f" Path: {source.get('path', 'N/A')}\n"
elif source["type"] in ("confluence", "notion", "rss"):
result += f" URL: {source.get('url', 'N/A')}\n"
# Show merge settings if applicable
if validator.needs_api_merge():

View File

@@ -7,6 +7,8 @@ This module contains all scraping-related MCP tool implementations:
- scrape_github_tool: Scrape GitHub repositories
- scrape_pdf_tool: Scrape PDF documentation
- scrape_codebase_tool: Analyze local codebase and extract code knowledge
- scrape_generic_tool: Generic scraper for new source types (jupyter, html,
openapi, asciidoc, pptx, confluence, notion, rss, manpage, chat)
Extracted from server.py for better modularity and organization.
"""
@@ -1005,3 +1007,155 @@ async def extract_config_patterns_tool(args: dict) -> list[TextContent]:
return [TextContent(type="text", text=output_text)]
else:
return [TextContent(type="text", text=f"{output_text}\n\n❌ Error:\n{stderr}")]
# All source types understood by the generic scraper. Order is significant:
# it is echoed verbatim in the "unknown source_type" error message.
GENERIC_SOURCE_TYPES = tuple(
    "jupyter html openapi asciidoc pptx confluence notion rss manpage chat".split()
)
# Types whose primary input is a URL (--url-style flag); every other type
# takes a filesystem path as its primary input.
_URL_BASED_TYPES = {"confluence", "notion", "rss"}
# Emoji shown in the progress header for each source type.
_SOURCE_EMOJIS = dict(
    jupyter="📓",
    html="🌐",
    openapi="📡",
    asciidoc="📄",
    pptx="📊",
    confluence="🏢",
    notion="📝",
    rss="📰",
    manpage="📖",
    chat="💬",
)
# Source type -> CLI module name. Most scrapers follow the "<type>_scraper"
# convention; only the exceptions are listed here.
_MODULE_NAMES: dict[str, str] = {
    "manpage": "man_scraper",
}
# Source type -> CLI flag for the file/directory input. Each scraper defines
# its own flag name, so a generic --path would not be understood.
_PATH_FLAGS: dict[str, str] = {
    "jupyter": "--notebook",
    "html": "--html-path",
    "openapi": "--spec",
    "asciidoc": "--asciidoc-path",
    "pptx": "--pptx",
    "manpage": "--man-path",
    "confluence": "--export-path",
    "notion": "--export-path",
    "rss": "--feed-path",
    "chat": "--export-path",
}
# Source type -> CLI flag for a URL input. Types absent from this mapping do
# not accept URLs at all (their scrapers expose no such flag).
_URL_FLAGS: dict[str, str] = {
    "confluence": "--base-url",
    "notion": "--page-id",
    "rss": "--feed-url",
    "openapi": "--spec-url",
}


def _validate_generic_args(
    source_type: str, path: str | None, url: str | None, name: str | None
) -> str | None:
    """Return a user-facing error string for invalid args, or None if usable."""
    if source_type not in GENERIC_SOURCE_TYPES:
        return (
            f"❌ Error: Unknown source_type '{source_type}'. "
            f"Must be one of: {', '.join(GENERIC_SOURCE_TYPES)}"
        )
    if not path and not url:
        return "❌ Error: Must specify either 'path' (file/directory) or 'url'"
    if not name:
        return "❌ Error: 'name' parameter is required"
    return None


def _build_generic_cmd(
    source_type: str, path: str | None, url: str | None, name: str
) -> tuple[list[str] | None, str | None]:
    """Build the subprocess command for a generic scrape.

    Returns (cmd, None) on success, or (None, error_message) when the given
    input cannot be mapped to a flag the target scraper understands.
    """
    module_name = _MODULE_NAMES.get(source_type, f"{source_type}_scraper")
    cmd = [sys.executable, "-m", f"skill_seekers.cli.{module_name}"]

    if source_type in _URL_BASED_TYPES and url:
        # Every URL-based type has an explicit entry in _URL_FLAGS.
        cmd.extend([_URL_FLAGS[source_type], url])
    elif path:
        cmd.extend([_PATH_FLAGS.get(source_type, "--path"), path])
    else:
        # Only a URL was supplied for a file-based type. That is allowed only
        # when the scraper actually exposes a URL flag (e.g. openapi's
        # --spec-url); a generic --url would just crash the subprocess with an
        # argparse error, so fail early with a clear message instead.
        url_flag = _URL_FLAGS.get(source_type)
        if url_flag is None:
            return None, (
                f"❌ Error: source_type '{source_type}' does not accept a "
                "'url' input; pass 'path' instead"
            )
        cmd.extend([url_flag, url])

    cmd.extend(["--name", name])
    return cmd, None


async def scrape_generic_tool(args: dict) -> list[TextContent]:
    """
    Generic scraper for new source types.

    Validates the arguments, builds the appropriate
    ``python -m skill_seekers.cli.<module>`` command for the requested source
    type, and delegates the actual scraping to that CLI module via a
    streaming subprocess.

    Supported source types: jupyter, html, openapi, asciidoc, pptx,
    confluence, notion, rss, manpage, chat.

    Args:
        args: Dictionary containing:
            - source_type (str): One of GENERIC_SOURCE_TYPES
            - path (str, optional): File or directory path (for file-based sources)
            - url (str, optional): URL (for URL-based sources like confluence, notion, rss)
            - name (str): Skill name for the output

    Returns:
        list[TextContent]: Progress header plus scraper output on success,
        or a ❌ error message on validation or subprocess failure.
    """
    source_type = args.get("source_type", "")
    path = args.get("path")
    url = args.get("url")
    name = args.get("name")

    error = _validate_generic_args(source_type, path, url, name)
    if error is not None:
        return [TextContent(type="text", text=error)]

    cmd, cmd_error = _build_generic_cmd(source_type, path, url, name)
    if cmd is None:
        return [TextContent(type="text", text=cmd_error)]

    # Generous ceiling: large exports (e.g. big Confluence spaces) can be slow.
    timeout = 600  # 10 minutes

    emoji = _SOURCE_EMOJIS.get(source_type, "🔧")
    progress_msg = f"{emoji} Scraping {source_type} source...\n"
    if path:
        progress_msg += f"📁 Path: {path}\n"
    if url:
        progress_msg += f"🔗 URL: {url}\n"
    progress_msg += f"📛 Name: {name}\n"
    progress_msg += f"⏱️ Maximum time: {timeout // 60} minutes\n\n"

    stdout, stderr, returncode = run_subprocess_with_streaming(cmd, timeout=timeout)
    output = progress_msg + stdout
    if returncode == 0:
        return [TextContent(type="text", text=output)]
    return [TextContent(type="text", text=f"{output}\n\n❌ Error:\n{stderr}")]

View File

@@ -106,7 +106,9 @@ async def split_config(args: dict) -> list[TextContent]:
Supports both documentation and unified (multi-source) configs:
- Documentation configs: Split by categories, size, or create router skills
- Unified configs: Split by source type (documentation, github, pdf)
- Unified configs: Split by source type (documentation, github, pdf,
jupyter, html, openapi, asciidoc, pptx, confluence, notion, rss,
manpage, chat)
For large documentation sites (10K+ pages), this tool splits the config into
multiple smaller configs. For unified configs with multiple sources, splits