feat: add 10 new skill source types (17 total) with full pipeline integration

Add Jupyter Notebook, Local HTML, OpenAPI/Swagger, AsciiDoc, PowerPoint,
RSS/Atom, Man Pages, Confluence, Notion, and Slack/Discord Chat as new
skill source types. Each type is fully integrated across:

- Standalone CLI commands (skill-seekers <type>)
- Auto-detection via 'skill-seekers create' (file extension + content sniffing)
- Unified multi-source configs (scraped_data, dispatch, config validation)
- Unified skill builder (generic merge + source-attributed synthesis)
- MCP server (scrape_generic tool with per-type flag mapping)
- pyproject.toml (entry points, optional deps, [all] group)

Also fixes: EPUB unified pipeline gap, missing word/video config validators,
OpenAPI yaml import guard, MCP flag mismatch for all 10 types, stale
docstrings, and adds 77 integration tests + complex-merge workflow.

50 files changed, +20,193 lines
This commit is contained in:
yusyus
2026-03-15 15:30:15 +03:00
parent 64403a3686
commit 53b911b697
50 changed files with 20193 additions and 856 deletions

View File

@@ -63,6 +63,9 @@ from .scraping_tools import (
from .scraping_tools import (
scrape_pdf_tool as scrape_pdf_impl,
)
from .scraping_tools import (
scrape_generic_tool as scrape_generic_impl,
)
from .scraping_tools import (
scrape_video_tool as scrape_video_impl,
)
@@ -135,6 +138,7 @@ __all__ = [
"extract_test_examples_impl",
"build_how_to_guides_impl",
"extract_config_patterns_impl",
"scrape_generic_impl",
# Packaging tools
"package_skill_impl",
"upload_skill_impl",

View File

@@ -205,6 +205,18 @@ async def validate_config(args: dict) -> list[TextContent]:
)
elif source["type"] == "pdf":
result += f" Path: {source.get('path', 'N/A')}\n"
elif source["type"] in (
"jupyter",
"html",
"openapi",
"asciidoc",
"pptx",
"manpage",
"chat",
):
result += f" Path: {source.get('path', 'N/A')}\n"
elif source["type"] in ("confluence", "notion", "rss"):
result += f" URL: {source.get('url', 'N/A')}\n"
# Show merge settings if applicable
if validator.needs_api_merge():

View File

@@ -7,6 +7,8 @@ This module contains all scraping-related MCP tool implementations:
- scrape_github_tool: Scrape GitHub repositories
- scrape_pdf_tool: Scrape PDF documentation
- scrape_codebase_tool: Analyze local codebase and extract code knowledge
- scrape_generic_tool: Generic scraper for new source types (jupyter, html,
openapi, asciidoc, pptx, confluence, notion, rss, manpage, chat)
Extracted from server.py for better modularity and organization.
"""
@@ -1005,3 +1007,155 @@ async def extract_config_patterns_tool(args: dict) -> list[TextContent]:
return [TextContent(type="text", text=output_text)]
else:
return [TextContent(type="text", text=f"{output_text}\n\n❌ Error:\n{stderr}")]
# All source types understood by the generic scraper. Order is significant:
# it is echoed verbatim in the "unknown source_type" error message.
GENERIC_SOURCE_TYPES = tuple(
    "jupyter html openapi asciidoc pptx confluence notion rss manpage chat".split()
)
# Types whose primary input is a URL (--url-style flag); every other type
# takes a filesystem path as its primary input.
_URL_BASED_TYPES = {"confluence", "notion", "rss"}
# Emoji shown in the progress header for each source type.
_SOURCE_EMOJIS = dict(
    jupyter="📓",
    html="🌐",
    openapi="📡",
    asciidoc="📄",
    pptx="📊",
    confluence="🏢",
    notion="📝",
    rss="📰",
    manpage="📖",
    chat="💬",
)
# Source type -> CLI module name. Most scrapers follow the "<type>_scraper"
# convention; only the exceptions are listed here.
_MODULE_NAMES: dict[str, str] = {
    "manpage": "man_scraper",
}
# Source type -> CLI flag for the file/directory input. Each scraper defines
# its own flag name, so a generic --path would not be understood.
_PATH_FLAGS: dict[str, str] = {
    "jupyter": "--notebook",
    "html": "--html-path",
    "openapi": "--spec",
    "asciidoc": "--asciidoc-path",
    "pptx": "--pptx",
    "manpage": "--man-path",
    "confluence": "--export-path",
    "notion": "--export-path",
    "rss": "--feed-path",
    "chat": "--export-path",
}
# Source type -> CLI flag for a URL input. Types absent from this mapping do
# not accept URLs at all (their scrapers expose no such flag).
_URL_FLAGS: dict[str, str] = {
    "confluence": "--base-url",
    "notion": "--page-id",
    "rss": "--feed-url",
    "openapi": "--spec-url",
}


def _validate_generic_args(
    source_type: str, path: str | None, url: str | None, name: str | None
) -> str | None:
    """Return a user-facing error string for invalid args, or None if usable."""
    if source_type not in GENERIC_SOURCE_TYPES:
        return (
            f"❌ Error: Unknown source_type '{source_type}'. "
            f"Must be one of: {', '.join(GENERIC_SOURCE_TYPES)}"
        )
    if not path and not url:
        return "❌ Error: Must specify either 'path' (file/directory) or 'url'"
    if not name:
        return "❌ Error: 'name' parameter is required"
    return None


def _build_generic_cmd(
    source_type: str, path: str | None, url: str | None, name: str
) -> tuple[list[str] | None, str | None]:
    """Build the subprocess command for a generic scrape.

    Returns (cmd, None) on success, or (None, error_message) when the given
    input cannot be mapped to a flag the target scraper understands.
    """
    module_name = _MODULE_NAMES.get(source_type, f"{source_type}_scraper")
    cmd = [sys.executable, "-m", f"skill_seekers.cli.{module_name}"]

    if source_type in _URL_BASED_TYPES and url:
        # Every URL-based type has an explicit entry in _URL_FLAGS.
        cmd.extend([_URL_FLAGS[source_type], url])
    elif path:
        cmd.extend([_PATH_FLAGS.get(source_type, "--path"), path])
    else:
        # Only a URL was supplied for a file-based type. That is allowed only
        # when the scraper actually exposes a URL flag (e.g. openapi's
        # --spec-url); a generic --url would just crash the subprocess with an
        # argparse error, so fail early with a clear message instead.
        url_flag = _URL_FLAGS.get(source_type)
        if url_flag is None:
            return None, (
                f"❌ Error: source_type '{source_type}' does not accept a "
                "'url' input; pass 'path' instead"
            )
        cmd.extend([url_flag, url])

    cmd.extend(["--name", name])
    return cmd, None


async def scrape_generic_tool(args: dict) -> list[TextContent]:
    """
    Generic scraper for new source types.

    Validates the arguments, builds the appropriate
    ``python -m skill_seekers.cli.<module>`` command for the requested source
    type, and delegates the actual scraping to that CLI module via a
    streaming subprocess.

    Supported source types: jupyter, html, openapi, asciidoc, pptx,
    confluence, notion, rss, manpage, chat.

    Args:
        args: Dictionary containing:
            - source_type (str): One of GENERIC_SOURCE_TYPES
            - path (str, optional): File or directory path (for file-based sources)
            - url (str, optional): URL (for URL-based sources like confluence, notion, rss)
            - name (str): Skill name for the output

    Returns:
        list[TextContent]: Progress header plus scraper output on success,
        or a ❌ error message on validation or subprocess failure.
    """
    source_type = args.get("source_type", "")
    path = args.get("path")
    url = args.get("url")
    name = args.get("name")

    error = _validate_generic_args(source_type, path, url, name)
    if error is not None:
        return [TextContent(type="text", text=error)]

    cmd, cmd_error = _build_generic_cmd(source_type, path, url, name)
    if cmd is None:
        return [TextContent(type="text", text=cmd_error)]

    # Generous ceiling: large exports (e.g. big Confluence spaces) can be slow.
    timeout = 600  # 10 minutes

    emoji = _SOURCE_EMOJIS.get(source_type, "🔧")
    progress_msg = f"{emoji} Scraping {source_type} source...\n"
    if path:
        progress_msg += f"📁 Path: {path}\n"
    if url:
        progress_msg += f"🔗 URL: {url}\n"
    progress_msg += f"📛 Name: {name}\n"
    progress_msg += f"⏱️ Maximum time: {timeout // 60} minutes\n\n"

    stdout, stderr, returncode = run_subprocess_with_streaming(cmd, timeout=timeout)
    output = progress_msg + stdout
    if returncode == 0:
        return [TextContent(type="text", text=output)]
    return [TextContent(type="text", text=f"{output}\n\n❌ Error:\n{stderr}")]

View File

@@ -106,7 +106,9 @@ async def split_config(args: dict) -> list[TextContent]:
Supports both documentation and unified (multi-source) configs:
- Documentation configs: Split by categories, size, or create router skills
- Unified configs: Split by source type (documentation, github, pdf)
- Unified configs: Split by source type (documentation, github, pdf,
jupyter, html, openapi, asciidoc, pptx, confluence, notion, rss,
manpage, chat)
For large documentation sites (10K+ pages), this tool splits the config into
multiple smaller configs. For unified configs with multiple sources, splits