feat: add 10 new skill source types (17 total) with full pipeline integration
Add Jupyter Notebook, Local HTML, OpenAPI/Swagger, AsciiDoc, PowerPoint, RSS/Atom, Man Pages, Confluence, Notion, and Slack/Discord Chat as new skill source types.

Each type is fully integrated across:
- Standalone CLI commands (skill-seekers <type>)
- Auto-detection via 'skill-seekers create' (file extension + content sniffing)
- Unified multi-source configs (scraped_data, dispatch, config validation)
- Unified skill builder (generic merge + source-attributed synthesis)
- MCP server (scrape_generic tool with per-type flag mapping)
- pyproject.toml (entry points, optional deps, [all] group)

Also fixes: EPUB unified pipeline gap, missing word/video config validators, OpenAPI yaml import guard, MCP flag mismatch for all 10 types, and stale docstrings; adds 77 integration tests and a complex-merge workflow.

50 files changed, +20,201 lines
This commit is contained in:
@@ -3,16 +3,16 @@
|
||||
Skill Seeker MCP Server (FastMCP Implementation)
|
||||
|
||||
Modern, decorator-based MCP server using FastMCP for simplified tool registration.
|
||||
Provides 33 tools for generating Claude AI skills from documentation.
|
||||
Provides 34 tools for generating Claude AI skills from documentation.
|
||||
|
||||
This is a streamlined alternative to server.py (2200 lines → 708 lines, 68% reduction).
|
||||
All tool implementations are delegated to modular tool files in tools/ directory.
|
||||
|
||||
**Architecture:**
|
||||
- FastMCP server with decorator-based tool registration
|
||||
- 33 tools organized into 7 categories:
|
||||
- 34 tools organized into 7 categories:
|
||||
* Config tools (3): generate_config, list_configs, validate_config
|
||||
* Scraping tools (10): estimate_pages, scrape_docs, scrape_github, scrape_pdf, scrape_video, scrape_codebase, detect_patterns, extract_test_examples, build_how_to_guides, extract_config_patterns
|
||||
* Scraping tools (11): estimate_pages, scrape_docs, scrape_github, scrape_pdf, scrape_video, scrape_codebase, detect_patterns, extract_test_examples, build_how_to_guides, extract_config_patterns, scrape_generic
|
||||
* Packaging tools (4): package_skill, upload_skill, enhance_skill, install_skill
|
||||
* Splitting tools (2): split_config, generate_router
|
||||
* Source tools (5): fetch_config, submit_config, add_config_source, list_config_sources, remove_config_source
|
||||
@@ -97,6 +97,7 @@ try:
|
||||
remove_config_source_impl,
|
||||
scrape_codebase_impl,
|
||||
scrape_docs_impl,
|
||||
scrape_generic_impl,
|
||||
scrape_github_impl,
|
||||
scrape_pdf_impl,
|
||||
scrape_video_impl,
|
||||
@@ -141,6 +142,7 @@ except ImportError:
|
||||
remove_config_source_impl,
|
||||
scrape_codebase_impl,
|
||||
scrape_docs_impl,
|
||||
scrape_generic_impl,
|
||||
scrape_github_impl,
|
||||
scrape_pdf_impl,
|
||||
scrape_video_impl,
|
||||
@@ -301,7 +303,7 @@ async def sync_config(
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# SCRAPING TOOLS (10 tools)
|
||||
# SCRAPING TOOLS (11 tools)
|
||||
# ============================================================================
|
||||
|
||||
|
||||
@@ -823,6 +825,50 @@ async def extract_config_patterns(
|
||||
return str(result)
|
||||
|
||||
|
||||
@safe_tool_decorator(
    description="Scrape content from new source types: jupyter, html, openapi, asciidoc, pptx, confluence, notion, rss, manpage, chat. A generic entry point that delegates to the appropriate CLI scraper module."
)
async def scrape_generic(
    source_type: str,
    name: str,
    path: str | None = None,
    url: str | None = None,
) -> str:
    """
    Build a skill from one of the generic source types.

    Thin MCP wrapper: packs the arguments into a dict, hands it to
    ``scrape_generic_impl`` and flattens the reply into a plain string.

    Args:
        source_type: Which scraper to run. One of: jupyter, html, openapi,
            asciidoc, pptx, confluence, notion, rss, manpage, chat.
        name: Skill name for the output.
        path: File or directory path; typically used by the file-based
            types (jupyter, html, openapi, asciidoc, pptx, manpage, chat).
            Omitted from the request when empty or None.
        url: URL; typically used by the URL-based types (confluence,
            notion, rss). Omitted from the request when empty or None.

    Returns:
        The ``text`` of the first result item when the implementation
        returns a non-empty list whose first item has a ``text`` attribute;
        otherwise the string form of that first item, or of the whole
        result when it is not a non-empty list.
    """
    # Required keys first, then only the optional inputs that were supplied.
    payload = {"source_type": source_type, "name": name}
    optional_inputs = {"path": path, "url": url}
    payload.update({key: value for key, value in optional_inputs.items() if value})

    outcome = await scrape_generic_impl(payload)

    # Anything that is not a non-empty list is stringified as a whole.
    if not (isinstance(outcome, list) and outcome):
        return str(outcome)

    first_item = outcome[0]
    if hasattr(first_item, "text"):
        return first_item.text
    return str(first_item)
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# PACKAGING TOOLS (4 tools)
|
||||
# ============================================================================
|
||||
|
||||
@@ -63,6 +63,9 @@ from .scraping_tools import (
|
||||
from .scraping_tools import (
|
||||
scrape_pdf_tool as scrape_pdf_impl,
|
||||
)
|
||||
from .scraping_tools import (
|
||||
scrape_generic_tool as scrape_generic_impl,
|
||||
)
|
||||
from .scraping_tools import (
|
||||
scrape_video_tool as scrape_video_impl,
|
||||
)
|
||||
@@ -135,6 +138,7 @@ __all__ = [
|
||||
"extract_test_examples_impl",
|
||||
"build_how_to_guides_impl",
|
||||
"extract_config_patterns_impl",
|
||||
"scrape_generic_impl",
|
||||
# Packaging tools
|
||||
"package_skill_impl",
|
||||
"upload_skill_impl",
|
||||
|
||||
@@ -205,6 +205,18 @@ async def validate_config(args: dict) -> list[TextContent]:
|
||||
)
|
||||
elif source["type"] == "pdf":
|
||||
result += f" Path: {source.get('path', 'N/A')}\n"
|
||||
elif source["type"] in (
|
||||
"jupyter",
|
||||
"html",
|
||||
"openapi",
|
||||
"asciidoc",
|
||||
"pptx",
|
||||
"manpage",
|
||||
"chat",
|
||||
):
|
||||
result += f" Path: {source.get('path', 'N/A')}\n"
|
||||
elif source["type"] in ("confluence", "notion", "rss"):
|
||||
result += f" URL: {source.get('url', 'N/A')}\n"
|
||||
|
||||
# Show merge settings if applicable
|
||||
if validator.needs_api_merge():
|
||||
|
||||
@@ -7,6 +7,8 @@ This module contains all scraping-related MCP tool implementations:
|
||||
- scrape_github_tool: Scrape GitHub repositories
|
||||
- scrape_pdf_tool: Scrape PDF documentation
|
||||
- scrape_codebase_tool: Analyze local codebase and extract code knowledge
|
||||
- scrape_generic_tool: Generic scraper for new source types (jupyter, html,
|
||||
openapi, asciidoc, pptx, confluence, notion, rss, manpage, chat)
|
||||
|
||||
Extracted from server.py for better modularity and organization.
|
||||
"""
|
||||
@@ -1005,3 +1007,155 @@ async def extract_config_patterns_tool(args: dict) -> list[TextContent]:
|
||||
return [TextContent(type="text", text=output_text)]
|
||||
else:
|
||||
return [TextContent(type="text", text=f"{output_text}\n\n❌ Error:\n{stderr}")]
|
||||
|
||||
|
||||
# Source types accepted by the generic scraper. The order here is the order
# in which they are listed in the "unknown source_type" error message.
GENERIC_SOURCE_TYPES = (
    "jupyter",
    "html",
    "openapi",
    "asciidoc",
    "pptx",
    "confluence",
    "notion",
    "rss",
    "manpage",
    "chat",
)

# Source types whose primary input is normally a URL / remote identifier
# rather than a local path. For these types a supplied 'url' takes precedence
# over 'path'. There is no shared --url/--path flag: every scraper has its own
# flag names, which scrape_generic_tool maps per source type.
_URL_BASED_TYPES = frozenset({"confluence", "notion", "rss"})

# Friendly emoji labels per source type, used in the progress header.
_SOURCE_EMOJIS = {
    "jupyter": "📓",
    "html": "🌐",
    "openapi": "📡",
    "asciidoc": "📄",
    "pptx": "📊",
    "confluence": "🏢",
    "notion": "📝",
    "rss": "📰",
    "manpage": "📖",
    "chat": "💬",
}
|
||||
|
||||
|
||||
async def scrape_generic_tool(args: dict) -> list[TextContent]:
    """
    Generic scraper for the new source types.

    Validates the request, builds a ``python -m skill_seekers.cli.<module>``
    command with the scraper-specific input flag, runs it through
    ``run_subprocess_with_streaming`` and returns the captured output.

    Supported source types: jupyter, html, openapi, asciidoc, pptx,
    confluence, notion, rss, manpage, chat.

    Input selection:
        - For URL-based types (confluence, notion, rss) a supplied ``url``
          wins over ``path``.
        - For every other type ``path`` wins; ``url`` is used only when no
          ``path`` is given and the scraper has a URL flag (currently only
          openapi). Otherwise an error is returned instead of invoking the
          scraper with a flag it does not define.

    Args:
        args: Dictionary containing:
            - source_type (str): One of the supported source types
            - path (str, optional): File or directory path (for file-based sources)
            - url (str, optional): URL (for URL-based sources like confluence, notion, rss)
            - name (str): Skill name for the output

    Returns:
        List[TextContent]: A single text item holding either a validation
        error, or the progress header followed by the subprocess stdout
        (plus stderr when the exit code is non-zero).
    """

    def _error(message: str) -> list[TextContent]:
        # All validation failures share the same single-item reply shape.
        return [TextContent(type="text", text=f"❌ Error: {message}")]

    source_type = args.get("source_type", "")
    path = args.get("path")
    url = args.get("url")
    name = args.get("name")

    # Validate source_type
    if source_type not in GENERIC_SOURCE_TYPES:
        return _error(
            f"Unknown source_type '{source_type}'. "
            f"Must be one of: {', '.join(GENERIC_SOURCE_TYPES)}"
        )

    # Validate that we have either path or url
    if not path and not url:
        return _error("Must specify either 'path' (file/directory) or 'url'")

    if not name:
        return _error("'name' parameter is required")

    # Map source type to module name (most are <type>_scraper, but some differ)
    module_overrides = {
        "manpage": "man_scraper",
    }
    module_name = module_overrides.get(source_type, f"{source_type}_scraper")
    cmd = [sys.executable, "-m", f"skill_seekers.cli.{module_name}"]

    # Each scraper has its own flag name for file/path input and URL input —
    # using a generic --path or --url would fail.
    path_flags: dict[str, str] = {
        "jupyter": "--notebook",
        "html": "--html-path",
        "openapi": "--spec",
        "asciidoc": "--asciidoc-path",
        "pptx": "--pptx",
        "manpage": "--man-path",
        "confluence": "--export-path",
        "notion": "--export-path",
        "rss": "--feed-path",
        "chat": "--export-path",
    }
    # NOTE(review): for notion the 'url' value is forwarded as --page-id;
    # confirm callers pass a page ID rather than a full URL.
    url_flags: dict[str, str] = {
        "confluence": "--base-url",
        "notion": "--page-id",
        "rss": "--feed-url",
        "openapi": "--spec-url",
    }

    # URL input is used when the type is URL-based, or when it is the only
    # input supplied; in every other case 'path' is guaranteed non-empty by
    # the validation above.
    use_url = bool(url) and (source_type in _URL_BASED_TYPES or not path)
    if use_url:
        input_flag = url_flags.get(source_type)
        if input_flag is None:
            # Previously fell back to a generic --url flag the scraper does
            # not define; fail early with a message the caller can act on.
            return _error(
                f"source_type '{source_type}' does not accept a 'url'; "
                "provide 'path' instead"
            )
        cmd.extend([input_flag, url])
    else:
        input_flag = path_flags.get(source_type)
        if input_flag is None:
            return _error(f"source_type '{source_type}' does not accept a 'path'")
        cmd.extend([input_flag, path])

    cmd.extend(["--name", name])

    # Set a reasonable timeout
    timeout = 600  # 10 minutes

    emoji = _SOURCE_EMOJIS.get(source_type, "🔧")
    progress_msg = f"{emoji} Scraping {source_type} source...\n"
    if path:
        progress_msg += f"📁 Path: {path}\n"
    if url:
        progress_msg += f"🔗 URL: {url}\n"
    progress_msg += f"📛 Name: {name}\n"
    progress_msg += f"⏱️ Maximum time: {timeout // 60} minutes\n\n"

    stdout, stderr, returncode = run_subprocess_with_streaming(cmd, timeout=timeout)

    output = progress_msg + stdout

    if returncode == 0:
        return [TextContent(type="text", text=output)]
    return [TextContent(type="text", text=f"{output}\n\n❌ Error:\n{stderr}")]
|
||||
|
||||
@@ -106,7 +106,9 @@ async def split_config(args: dict) -> list[TextContent]:
|
||||
|
||||
Supports both documentation and unified (multi-source) configs:
|
||||
- Documentation configs: Split by categories, size, or create router skills
|
||||
- Unified configs: Split by source type (documentation, github, pdf)
|
||||
- Unified configs: Split by source type (documentation, github, pdf,
|
||||
jupyter, html, openapi, asciidoc, pptx, confluence, notion, rss,
|
||||
manpage, chat)
|
||||
|
||||
For large documentation sites (10K+ pages), this tool splits the config into
|
||||
multiple smaller configs. For unified configs with multiple sources, splits
|
||||
|
||||
Reference in New Issue
Block a user