feat: add 10 new skill source types (17 total) with full pipeline integration
Add Jupyter Notebook, Local HTML, OpenAPI/Swagger, AsciiDoc, PowerPoint, RSS/Atom, Man Pages, Confluence, Notion, and Slack/Discord Chat as new skill source types. Each type is fully integrated across: - Standalone CLI commands (skill-seekers <type>) - Auto-detection via 'skill-seekers create' (file extension + content sniffing) - Unified multi-source configs (scraped_data, dispatch, config validation) - Unified skill builder (generic merge + source-attributed synthesis) - MCP server (scrape_generic tool with per-type flag mapping) - pyproject.toml (entry points, optional deps, [all] group) Also fixes: EPUB unified pipeline gap, missing word/video config validators, OpenAPI yaml import guard, MCP flag mismatch for all 10 types, stale docstrings, and adds 77 integration tests + complex-merge workflow. 50 files changed, +20,201 lines
This commit is contained in:
68
src/skill_seekers/cli/arguments/asciidoc.py
Normal file
68
src/skill_seekers/cli/arguments/asciidoc.py
Normal file
@@ -0,0 +1,68 @@
|
||||
"""AsciiDoc command argument definitions.
|
||||
|
||||
This module defines ALL arguments for the asciidoc command in ONE place.
|
||||
Both asciidoc_scraper.py (standalone) and parsers/asciidoc_parser.py (unified CLI)
|
||||
import and use these definitions.
|
||||
|
||||
Shared arguments (name, description, output, enhance-level, api-key,
|
||||
dry-run, verbose, quiet, workflow args) come from common.py / workflow.py
|
||||
via ``add_all_standard_arguments()``.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from typing import Any
|
||||
|
||||
from .common import add_all_standard_arguments
|
||||
|
||||
# AsciiDoc-specific argument definitions, keyed by argparse dest name.
# NOTE: Shared args (name, description, output, enhance_level, api_key, dry_run,
# verbose, quiet, workflow args) are registered by add_all_standard_arguments().
# Key order is preserved: it controls the order flags appear in --help output.
ASCIIDOC_ARGUMENTS: dict[str, dict[str, Any]] = {
    "asciidoc_path": {
        "flags": ("--asciidoc-path",),
        "kwargs": {
            "type": str,
            "metavar": "PATH",
            "help": "Path to AsciiDoc file or directory containing .adoc files",
        },
    },
    "from_json": {
        "flags": ("--from-json",),
        "kwargs": {
            "type": str,
            "metavar": "FILE",
            "help": "Build skill from extracted JSON",
        },
    },
}
|
||||
|
||||
|
||||
def add_asciidoc_arguments(parser: argparse.ArgumentParser) -> None:
    """Add all asciidoc command arguments to a parser.

    Registers shared args (name, description, output, enhance-level, api-key,
    dry-run, verbose, quiet, workflow args) via add_all_standard_arguments(),
    then adds AsciiDoc-specific args on top.

    The default for --enhance-level is overridden to 0 (disabled) for AsciiDoc.

    Args:
        parser: The parser (or subparser) to populate in place.
    """
    # Shared universal args first
    add_all_standard_arguments(parser)

    # Override enhance-level default to 0 for AsciiDoc.
    # parser._actions is private argparse API, but argparse exposes no public
    # way to re-default an already-registered argument.
    for action in parser._actions:
        # Every argparse Action defines `dest`, so no hasattr() guard is needed.
        if action.dest == "enhance_level":
            action.default = 0
            action.help = (
                "AI enhancement level (auto-detects API vs LOCAL mode): "
                "0=disabled (default for AsciiDoc), 1=SKILL.md only, "
                "2=+architecture/config, 3=full enhancement. "
                "Mode selection: uses API if ANTHROPIC_API_KEY is set, "
                "otherwise LOCAL (Claude Code)"
            )
            break  # dest values are unique; stop once found

    # AsciiDoc-specific args (keys are dest names; only flags/kwargs are used here)
    for arg_def in ASCIIDOC_ARGUMENTS.values():
        parser.add_argument(*arg_def["flags"], **arg_def["kwargs"])
|
||||
102
src/skill_seekers/cli/arguments/chat.py
Normal file
102
src/skill_seekers/cli/arguments/chat.py
Normal file
@@ -0,0 +1,102 @@
|
||||
"""Chat command argument definitions.
|
||||
|
||||
This module defines ALL arguments for the chat command in ONE place.
|
||||
Both chat_scraper.py (standalone) and parsers/chat_parser.py (unified CLI)
|
||||
import and use these definitions.
|
||||
|
||||
Shared arguments (name, description, output, enhance-level, api-key,
|
||||
dry-run, verbose, quiet, workflow args) come from common.py / workflow.py
|
||||
via ``add_all_standard_arguments()``.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from typing import Any
|
||||
|
||||
from .common import add_all_standard_arguments
|
||||
|
||||
# Chat-specific argument definitions, keyed by argparse dest name.
# NOTE: Shared args (name, description, output, enhance_level, api_key, dry_run,
# verbose, quiet, workflow args) are registered by add_all_standard_arguments().
# Key order is preserved: it controls the order flags appear in --help output.
CHAT_ARGUMENTS: dict[str, dict[str, Any]] = {
    "export_path": {
        "flags": ("--export-path",),
        "kwargs": {
            "type": str,
            "metavar": "PATH",
            "help": "Path to chat export directory or file",
        },
    },
    "platform": {
        "flags": ("--platform",),
        "kwargs": {
            "type": str,
            "choices": ["slack", "discord"],
            "default": "slack",
            "help": "Chat platform type (default: slack)",
        },
    },
    "token": {
        "flags": ("--token",),
        "kwargs": {
            "type": str,
            "metavar": "TOKEN",
            "help": "API token for chat platform authentication",
        },
    },
    "channel": {
        "flags": ("--channel",),
        "kwargs": {
            "type": str,
            "metavar": "CHANNEL",
            "help": "Channel name or ID to extract from",
        },
    },
    "max_messages": {
        "flags": ("--max-messages",),
        "kwargs": {
            "type": int,
            "default": 10000,
            "metavar": "N",
            "help": "Maximum number of messages to extract (default: 10000)",
        },
    },
    "from_json": {
        "flags": ("--from-json",),
        "kwargs": {
            "type": str,
            "metavar": "FILE",
            "help": "Build skill from extracted JSON",
        },
    },
}
|
||||
|
||||
|
||||
def add_chat_arguments(parser: argparse.ArgumentParser) -> None:
    """Add all chat command arguments to a parser.

    Registers shared args (name, description, output, enhance-level, api-key,
    dry-run, verbose, quiet, workflow args) via add_all_standard_arguments(),
    then adds Chat-specific args on top.

    The default for --enhance-level is overridden to 0 (disabled) for Chat.

    Args:
        parser: The parser (or subparser) to populate in place.
    """
    # Shared universal args first
    add_all_standard_arguments(parser)

    # Override enhance-level default to 0 for Chat.
    # parser._actions is private argparse API, but argparse exposes no public
    # way to re-default an already-registered argument.
    for action in parser._actions:
        # Every argparse Action defines `dest`, so no hasattr() guard is needed.
        if action.dest == "enhance_level":
            action.default = 0
            action.help = (
                "AI enhancement level (auto-detects API vs LOCAL mode): "
                "0=disabled (default for Chat), 1=SKILL.md only, "
                "2=+architecture/config, 3=full enhancement. "
                "Mode selection: uses API if ANTHROPIC_API_KEY is set, "
                "otherwise LOCAL (Claude Code)"
            )
            break  # dest values are unique; stop once found

    # Chat-specific args (keys are dest names; only flags/kwargs are used here)
    for arg_def in CHAT_ARGUMENTS.values():
        parser.add_argument(*arg_def["flags"], **arg_def["kwargs"])
|
||||
109
src/skill_seekers/cli/arguments/confluence.py
Normal file
109
src/skill_seekers/cli/arguments/confluence.py
Normal file
@@ -0,0 +1,109 @@
|
||||
"""Confluence command argument definitions.
|
||||
|
||||
This module defines ALL arguments for the confluence command in ONE place.
|
||||
Both confluence_scraper.py (standalone) and parsers/confluence_parser.py (unified CLI)
|
||||
import and use these definitions.
|
||||
|
||||
Shared arguments (name, description, output, enhance-level, api-key,
|
||||
dry-run, verbose, quiet, workflow args) come from common.py / workflow.py
|
||||
via ``add_all_standard_arguments()``.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from typing import Any
|
||||
|
||||
from .common import add_all_standard_arguments
|
||||
|
||||
# Confluence-specific argument definitions, keyed by argparse dest name.
# NOTE: Shared args (name, description, output, enhance_level, api_key, dry_run,
# verbose, quiet, workflow args) are registered by add_all_standard_arguments().
# Key order is preserved: it controls the order flags appear in --help output.
CONFLUENCE_ARGUMENTS: dict[str, dict[str, Any]] = {
    "base_url": {
        "flags": ("--base-url",),
        "kwargs": {
            "type": str,
            "metavar": "URL",
            "help": "Confluence instance base URL",
        },
    },
    "space_key": {
        "flags": ("--space-key",),
        "kwargs": {
            "type": str,
            "metavar": "KEY",
            "help": "Confluence space key to extract from",
        },
    },
    "export_path": {
        "flags": ("--export-path",),
        "kwargs": {
            "type": str,
            "metavar": "PATH",
            "help": "Path to Confluence HTML/XML export directory",
        },
    },
    "username": {
        "flags": ("--username",),
        "kwargs": {
            "type": str,
            "metavar": "USER",
            "help": "Confluence username for API authentication",
        },
    },
    "token": {
        "flags": ("--token",),
        "kwargs": {
            "type": str,
            "metavar": "TOKEN",
            "help": "Confluence API token for authentication",
        },
    },
    "max_pages": {
        "flags": ("--max-pages",),
        "kwargs": {
            "type": int,
            "default": 500,
            "metavar": "N",
            "help": "Maximum number of pages to extract (default: 500)",
        },
    },
    "from_json": {
        "flags": ("--from-json",),
        "kwargs": {
            "type": str,
            "metavar": "FILE",
            "help": "Build skill from extracted JSON",
        },
    },
}
|
||||
|
||||
|
||||
def add_confluence_arguments(parser: argparse.ArgumentParser) -> None:
    """Add all confluence command arguments to a parser.

    Registers shared args (name, description, output, enhance-level, api-key,
    dry-run, verbose, quiet, workflow args) via add_all_standard_arguments(),
    then adds Confluence-specific args on top.

    The default for --enhance-level is overridden to 0 (disabled) for Confluence.

    Args:
        parser: The parser (or subparser) to populate in place.
    """
    # Shared universal args first
    add_all_standard_arguments(parser)

    # Override enhance-level default to 0 for Confluence.
    # parser._actions is private argparse API, but argparse exposes no public
    # way to re-default an already-registered argument.
    for action in parser._actions:
        # Every argparse Action defines `dest`, so no hasattr() guard is needed.
        if action.dest == "enhance_level":
            action.default = 0
            action.help = (
                "AI enhancement level (auto-detects API vs LOCAL mode): "
                "0=disabled (default for Confluence), 1=SKILL.md only, "
                "2=+architecture/config, 3=full enhancement. "
                "Mode selection: uses API if ANTHROPIC_API_KEY is set, "
                "otherwise LOCAL (Claude Code)"
            )
            break  # dest values are unique; stop once found

    # Confluence-specific args (keys are dest names; only flags/kwargs are used here)
    for arg_def in CONFLUENCE_ARGUMENTS.values():
        parser.add_argument(*arg_def["flags"], **arg_def["kwargs"])
|
||||
@@ -549,6 +549,121 @@ CONFIG_ARGUMENTS: dict[str, dict[str, Any]] = {
|
||||
# For unified config files, use `skill-seekers unified --fresh` directly.
|
||||
}
|
||||
|
||||
# New source type arguments (v3.2.0+)
# These are minimal dicts since most flags are handled by each scraper's own argument module.
# The create command only needs the primary input flag for routing.
#
# NOTE(review): some flags here are deliberately prefixed (--conf-*, --notion-*,
# --chat-*) unlike the standalone scrapers' unprefixed equivalents — presumably
# to avoid collisions when the create command registers several source types on
# one parser (e.g. three different "export path" flags). Confirm against the
# standalone argument modules before renaming anything.

# Jupyter Notebook source: a single routing flag for the .ipynb input.
JUPYTER_ARGUMENTS: dict[str, dict[str, Any]] = {
    "notebook": {
        "flags": ("--notebook",),
        "kwargs": {"type": str, "help": "Jupyter Notebook file path (.ipynb)", "metavar": "PATH"},
    },
}

# Local HTML source: file or directory input.
HTML_ARGUMENTS: dict[str, dict[str, Any]] = {
    "html_path": {
        "flags": ("--html-path",),
        "kwargs": {"type": str, "help": "Local HTML file or directory path", "metavar": "PATH"},
    },
}

# OpenAPI/Swagger source: spec may come from a local file or a URL.
OPENAPI_ARGUMENTS: dict[str, dict[str, Any]] = {
    "spec": {
        "flags": ("--spec",),
        "kwargs": {"type": str, "help": "OpenAPI/Swagger spec file path", "metavar": "PATH"},
    },
    "spec_url": {
        "flags": ("--spec-url",),
        "kwargs": {"type": str, "help": "OpenAPI/Swagger spec URL", "metavar": "URL"},
    },
}

# AsciiDoc source: file or directory input.
ASCIIDOC_ARGUMENTS: dict[str, dict[str, Any]] = {
    "asciidoc_path": {
        "flags": ("--asciidoc-path",),
        "kwargs": {"type": str, "help": "AsciiDoc file or directory path", "metavar": "PATH"},
    },
}

# PowerPoint source: single .pptx file input.
PPTX_ARGUMENTS: dict[str, dict[str, Any]] = {
    "pptx": {
        "flags": ("--pptx",),
        "kwargs": {"type": str, "help": "PowerPoint file path (.pptx)", "metavar": "PATH"},
    },
}

# RSS/Atom source: feed may come from a URL or a local file.
RSS_ARGUMENTS: dict[str, dict[str, Any]] = {
    "feed_url": {
        "flags": ("--feed-url",),
        "kwargs": {"type": str, "help": "RSS/Atom feed URL", "metavar": "URL"},
    },
    "feed_path": {
        "flags": ("--feed-path",),
        "kwargs": {"type": str, "help": "RSS/Atom feed file path", "metavar": "PATH"},
    },
}

# Man page source: named pages (comma-separated) or a directory of page files.
MANPAGE_ARGUMENTS: dict[str, dict[str, Any]] = {
    "man_names": {
        "flags": ("--man-names",),
        "kwargs": {
            "type": str,
            "help": "Comma-separated man page names (e.g., 'git,curl')",
            "metavar": "NAMES",
        },
    },
    "man_path": {
        "flags": ("--man-path",),
        "kwargs": {"type": str, "help": "Directory of man page files", "metavar": "PATH"},
    },
}

# Confluence source: live instance (base URL + space key) or a local export.
# Flags carry a conf- prefix here (unlike confluence.py's --base-url /
# --export-path); see NOTE(review) above.
CONFLUENCE_ARGUMENTS: dict[str, dict[str, Any]] = {
    "conf_base_url": {
        "flags": ("--conf-base-url",),
        "kwargs": {"type": str, "help": "Confluence base URL", "metavar": "URL"},
    },
    "space_key": {
        "flags": ("--space-key",),
        "kwargs": {"type": str, "help": "Confluence space key", "metavar": "KEY"},
    },
    "conf_export_path": {
        "flags": ("--conf-export-path",),
        "kwargs": {"type": str, "help": "Confluence export directory", "metavar": "PATH"},
    },
}

# Notion source: database ID, page ID, or a local export directory.
NOTION_ARGUMENTS: dict[str, dict[str, Any]] = {
    "database_id": {
        "flags": ("--database-id",),
        "kwargs": {"type": str, "help": "Notion database ID", "metavar": "ID"},
    },
    "page_id": {
        "flags": ("--page-id",),
        "kwargs": {"type": str, "help": "Notion page ID", "metavar": "ID"},
    },
    "notion_export_path": {
        "flags": ("--notion-export-path",),
        "kwargs": {"type": str, "help": "Notion export directory", "metavar": "PATH"},
    },
}

# Slack/Discord chat source: export directory plus platform selector.
CHAT_ARGUMENTS: dict[str, dict[str, Any]] = {
    "chat_export_path": {
        "flags": ("--chat-export-path",),
        "kwargs": {"type": str, "help": "Slack/Discord export directory", "metavar": "PATH"},
    },
    "platform": {
        "flags": ("--platform",),
        "kwargs": {
            "type": str,
            "choices": ["slack", "discord"],
            "default": "slack",
            "help": "Chat platform (default: slack)",
        },
    },
}
|
||||
|
||||
# =============================================================================
|
||||
# TIER 3: ADVANCED/RARE ARGUMENTS
|
||||
# =============================================================================
|
||||
@@ -613,6 +728,17 @@ def get_source_specific_arguments(source_type: str) -> dict[str, dict[str, Any]]
|
||||
"epub": EPUB_ARGUMENTS,
|
||||
"video": VIDEO_ARGUMENTS,
|
||||
"config": CONFIG_ARGUMENTS,
|
||||
# New source types (v3.2.0+)
|
||||
"jupyter": JUPYTER_ARGUMENTS,
|
||||
"html": HTML_ARGUMENTS,
|
||||
"openapi": OPENAPI_ARGUMENTS,
|
||||
"asciidoc": ASCIIDOC_ARGUMENTS,
|
||||
"pptx": PPTX_ARGUMENTS,
|
||||
"rss": RSS_ARGUMENTS,
|
||||
"manpage": MANPAGE_ARGUMENTS,
|
||||
"confluence": CONFLUENCE_ARGUMENTS,
|
||||
"notion": NOTION_ARGUMENTS,
|
||||
"chat": CHAT_ARGUMENTS,
|
||||
}
|
||||
return source_args.get(source_type, {})
|
||||
|
||||
@@ -703,6 +829,24 @@ def add_create_arguments(parser: argparse.ArgumentParser, mode: str = "default")
|
||||
for arg_name, arg_def in CONFIG_ARGUMENTS.items():
|
||||
parser.add_argument(*arg_def["flags"], **arg_def["kwargs"])
|
||||
|
||||
# New source types (v3.2.0+)
|
||||
_NEW_SOURCE_ARGS = {
|
||||
"jupyter": JUPYTER_ARGUMENTS,
|
||||
"html": HTML_ARGUMENTS,
|
||||
"openapi": OPENAPI_ARGUMENTS,
|
||||
"asciidoc": ASCIIDOC_ARGUMENTS,
|
||||
"pptx": PPTX_ARGUMENTS,
|
||||
"rss": RSS_ARGUMENTS,
|
||||
"manpage": MANPAGE_ARGUMENTS,
|
||||
"confluence": CONFLUENCE_ARGUMENTS,
|
||||
"notion": NOTION_ARGUMENTS,
|
||||
"chat": CHAT_ARGUMENTS,
|
||||
}
|
||||
for stype, sargs in _NEW_SOURCE_ARGS.items():
|
||||
if mode in [stype, "all"]:
|
||||
for arg_name, arg_def in sargs.items():
|
||||
parser.add_argument(*arg_def["flags"], **arg_def["kwargs"])
|
||||
|
||||
# Add advanced arguments if requested
|
||||
if mode in ["advanced", "all"]:
|
||||
for arg_name, arg_def in ADVANCED_ARGUMENTS.items():
|
||||
|
||||
68
src/skill_seekers/cli/arguments/html.py
Normal file
68
src/skill_seekers/cli/arguments/html.py
Normal file
@@ -0,0 +1,68 @@
|
||||
"""HTML command argument definitions.
|
||||
|
||||
This module defines ALL arguments for the html command in ONE place.
|
||||
Both html_scraper.py (standalone) and parsers/html_parser.py (unified CLI)
|
||||
import and use these definitions.
|
||||
|
||||
Shared arguments (name, description, output, enhance-level, api-key,
|
||||
dry-run, verbose, quiet, workflow args) come from common.py / workflow.py
|
||||
via ``add_all_standard_arguments()``.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from typing import Any
|
||||
|
||||
from .common import add_all_standard_arguments
|
||||
|
||||
# HTML-specific argument definitions, keyed by argparse dest name.
# NOTE: Shared args (name, description, output, enhance_level, api_key, dry_run,
# verbose, quiet, workflow args) are registered by add_all_standard_arguments().
# Key order is preserved: it controls the order flags appear in --help output.
HTML_ARGUMENTS: dict[str, dict[str, Any]] = {
    "html_path": {
        "flags": ("--html-path",),
        "kwargs": {
            "type": str,
            "metavar": "PATH",
            "help": "Path to HTML file or directory containing HTML files",
        },
    },
    "from_json": {
        "flags": ("--from-json",),
        "kwargs": {
            "type": str,
            "metavar": "FILE",
            "help": "Build skill from extracted JSON",
        },
    },
}
|
||||
|
||||
|
||||
def add_html_arguments(parser: argparse.ArgumentParser) -> None:
    """Add all html command arguments to a parser.

    Registers shared args (name, description, output, enhance-level, api-key,
    dry-run, verbose, quiet, workflow args) via add_all_standard_arguments(),
    then adds HTML-specific args on top.

    The default for --enhance-level is overridden to 0 (disabled) for HTML.

    Args:
        parser: The parser (or subparser) to populate in place.
    """
    # Shared universal args first
    add_all_standard_arguments(parser)

    # Override enhance-level default to 0 for HTML.
    # parser._actions is private argparse API, but argparse exposes no public
    # way to re-default an already-registered argument.
    for action in parser._actions:
        # Every argparse Action defines `dest`, so no hasattr() guard is needed.
        if action.dest == "enhance_level":
            action.default = 0
            action.help = (
                "AI enhancement level (auto-detects API vs LOCAL mode): "
                "0=disabled (default for HTML), 1=SKILL.md only, "
                "2=+architecture/config, 3=full enhancement. "
                "Mode selection: uses API if ANTHROPIC_API_KEY is set, "
                "otherwise LOCAL (Claude Code)"
            )
            break  # dest values are unique; stop once found

    # HTML-specific args (keys are dest names; only flags/kwargs are used here)
    for arg_def in HTML_ARGUMENTS.values():
        parser.add_argument(*arg_def["flags"], **arg_def["kwargs"])
|
||||
68
src/skill_seekers/cli/arguments/jupyter.py
Normal file
68
src/skill_seekers/cli/arguments/jupyter.py
Normal file
@@ -0,0 +1,68 @@
|
||||
"""Jupyter Notebook command argument definitions.
|
||||
|
||||
This module defines ALL arguments for the jupyter command in ONE place.
|
||||
Both jupyter_scraper.py (standalone) and parsers/jupyter_parser.py (unified CLI)
|
||||
import and use these definitions.
|
||||
|
||||
Shared arguments (name, description, output, enhance-level, api-key,
|
||||
dry-run, verbose, quiet, workflow args) come from common.py / workflow.py
|
||||
via ``add_all_standard_arguments()``.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from typing import Any
|
||||
|
||||
from .common import add_all_standard_arguments
|
||||
|
||||
# Jupyter-specific argument definitions, keyed by argparse dest name.
# NOTE: Shared args (name, description, output, enhance_level, api_key, dry_run,
# verbose, quiet, workflow args) are registered by add_all_standard_arguments().
# Key order is preserved: it controls the order flags appear in --help output.
JUPYTER_ARGUMENTS: dict[str, dict[str, Any]] = {
    "notebook": {
        "flags": ("--notebook",),
        "kwargs": {
            "type": str,
            "metavar": "PATH",
            "help": "Path to .ipynb file or directory containing notebooks",
        },
    },
    "from_json": {
        "flags": ("--from-json",),
        "kwargs": {
            "type": str,
            "metavar": "FILE",
            "help": "Build skill from extracted JSON",
        },
    },
}
|
||||
|
||||
|
||||
def add_jupyter_arguments(parser: argparse.ArgumentParser) -> None:
    """Add all jupyter command arguments to a parser.

    Registers shared args (name, description, output, enhance-level, api-key,
    dry-run, verbose, quiet, workflow args) via add_all_standard_arguments(),
    then adds Jupyter-specific args on top.

    The default for --enhance-level is overridden to 0 (disabled) for Jupyter.

    Args:
        parser: The parser (or subparser) to populate in place.
    """
    # Shared universal args first
    add_all_standard_arguments(parser)

    # Override enhance-level default to 0 for Jupyter.
    # parser._actions is private argparse API, but argparse exposes no public
    # way to re-default an already-registered argument.
    for action in parser._actions:
        # Every argparse Action defines `dest`, so no hasattr() guard is needed.
        if action.dest == "enhance_level":
            action.default = 0
            action.help = (
                "AI enhancement level (auto-detects API vs LOCAL mode): "
                "0=disabled (default for Jupyter), 1=SKILL.md only, "
                "2=+architecture/config, 3=full enhancement. "
                "Mode selection: uses API if ANTHROPIC_API_KEY is set, "
                "otherwise LOCAL (Claude Code)"
            )
            break  # dest values are unique; stop once found

    # Jupyter-specific args (keys are dest names; only flags/kwargs are used here)
    for arg_def in JUPYTER_ARGUMENTS.values():
        parser.add_argument(*arg_def["flags"], **arg_def["kwargs"])
|
||||
84
src/skill_seekers/cli/arguments/manpage.py
Normal file
84
src/skill_seekers/cli/arguments/manpage.py
Normal file
@@ -0,0 +1,84 @@
|
||||
"""Man page command argument definitions.
|
||||
|
||||
This module defines ALL arguments for the manpage command in ONE place.
|
||||
Both manpage_scraper.py (standalone) and parsers/manpage_parser.py (unified CLI)
|
||||
import and use these definitions.
|
||||
|
||||
Shared arguments (name, description, output, enhance-level, api-key,
|
||||
dry-run, verbose, quiet, workflow args) come from common.py / workflow.py
|
||||
via ``add_all_standard_arguments()``.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from typing import Any
|
||||
|
||||
from .common import add_all_standard_arguments
|
||||
|
||||
# ManPage-specific argument definitions, keyed by argparse dest name.
# NOTE: Shared args (name, description, output, enhance_level, api_key, dry_run,
# verbose, quiet, workflow args) are registered by add_all_standard_arguments().
# Key order is preserved: it controls the order flags appear in --help output.
MANPAGE_ARGUMENTS: dict[str, dict[str, Any]] = {
    "man_names": {
        "flags": ("--man-names",),
        "kwargs": {
            "type": str,
            "metavar": "NAMES",
            "help": "Comma-separated list of man page names (e.g., 'ls,grep,find')",
        },
    },
    "man_path": {
        "flags": ("--man-path",),
        "kwargs": {
            "type": str,
            "metavar": "PATH",
            "help": "Path to directory containing man page files",
        },
    },
    "sections": {
        "flags": ("--sections",),
        "kwargs": {
            "type": str,
            "metavar": "SECTIONS",
            "help": "Comma-separated section numbers to include (e.g., '1,3,8')",
        },
    },
    "from_json": {
        "flags": ("--from-json",),
        "kwargs": {
            "type": str,
            "metavar": "FILE",
            "help": "Build skill from extracted JSON",
        },
    },
}
|
||||
|
||||
|
||||
def add_manpage_arguments(parser: argparse.ArgumentParser) -> None:
    """Add all manpage command arguments to a parser.

    Registers shared args (name, description, output, enhance-level, api-key,
    dry-run, verbose, quiet, workflow args) via add_all_standard_arguments(),
    then adds ManPage-specific args on top.

    The default for --enhance-level is overridden to 0 (disabled) for ManPage.

    Args:
        parser: The parser (or subparser) to populate in place.
    """
    # Shared universal args first
    add_all_standard_arguments(parser)

    # Override enhance-level default to 0 for ManPage.
    # parser._actions is private argparse API, but argparse exposes no public
    # way to re-default an already-registered argument.
    for action in parser._actions:
        # Every argparse Action defines `dest`, so no hasattr() guard is needed.
        if action.dest == "enhance_level":
            action.default = 0
            action.help = (
                "AI enhancement level (auto-detects API vs LOCAL mode): "
                "0=disabled (default for ManPage), 1=SKILL.md only, "
                "2=+architecture/config, 3=full enhancement. "
                "Mode selection: uses API if ANTHROPIC_API_KEY is set, "
                "otherwise LOCAL (Claude Code)"
            )
            break  # dest values are unique; stop once found

    # ManPage-specific args (keys are dest names; only flags/kwargs are used here)
    for arg_def in MANPAGE_ARGUMENTS.values():
        parser.add_argument(*arg_def["flags"], **arg_def["kwargs"])
|
||||
101
src/skill_seekers/cli/arguments/notion.py
Normal file
101
src/skill_seekers/cli/arguments/notion.py
Normal file
@@ -0,0 +1,101 @@
|
||||
"""Notion command argument definitions.
|
||||
|
||||
This module defines ALL arguments for the notion command in ONE place.
|
||||
Both notion_scraper.py (standalone) and parsers/notion_parser.py (unified CLI)
|
||||
import and use these definitions.
|
||||
|
||||
Shared arguments (name, description, output, enhance-level, api-key,
|
||||
dry-run, verbose, quiet, workflow args) come from common.py / workflow.py
|
||||
via ``add_all_standard_arguments()``.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from typing import Any
|
||||
|
||||
from .common import add_all_standard_arguments
|
||||
|
||||
# Notion-specific argument definitions, keyed by argparse dest name.
# NOTE: Shared args (name, description, output, enhance_level, api_key, dry_run,
# verbose, quiet, workflow args) are registered by add_all_standard_arguments().
# Key order is preserved: it controls the order flags appear in --help output.
NOTION_ARGUMENTS: dict[str, dict[str, Any]] = {
    "database_id": {
        "flags": ("--database-id",),
        "kwargs": {
            "type": str,
            "metavar": "ID",
            "help": "Notion database ID to extract from",
        },
    },
    "page_id": {
        "flags": ("--page-id",),
        "kwargs": {
            "type": str,
            "metavar": "ID",
            "help": "Notion page ID to extract from",
        },
    },
    "export_path": {
        "flags": ("--export-path",),
        "kwargs": {
            "type": str,
            "metavar": "PATH",
            "help": "Path to Notion export directory",
        },
    },
    "token": {
        "flags": ("--token",),
        "kwargs": {
            "type": str,
            "metavar": "TOKEN",
            "help": "Notion integration token for API authentication",
        },
    },
    "max_pages": {
        "flags": ("--max-pages",),
        "kwargs": {
            "type": int,
            "default": 500,
            "metavar": "N",
            "help": "Maximum number of pages to extract (default: 500)",
        },
    },
    "from_json": {
        "flags": ("--from-json",),
        "kwargs": {
            "type": str,
            "metavar": "FILE",
            "help": "Build skill from extracted JSON",
        },
    },
}
|
||||
|
||||
|
||||
def add_notion_arguments(parser: argparse.ArgumentParser) -> None:
    """Add all notion command arguments to a parser.

    Registers shared args (name, description, output, enhance-level, api-key,
    dry-run, verbose, quiet, workflow args) via add_all_standard_arguments(),
    then adds Notion-specific args on top.

    The default for --enhance-level is overridden to 0 (disabled) for Notion.

    Args:
        parser: The parser (or subparser) to populate in place.
    """
    # Shared universal args first
    add_all_standard_arguments(parser)

    # Override enhance-level default to 0 for Notion.
    # parser._actions is private argparse API, but argparse exposes no public
    # way to re-default an already-registered argument.
    for action in parser._actions:
        # Every argparse Action defines `dest`, so no hasattr() guard is needed.
        if action.dest == "enhance_level":
            action.default = 0
            action.help = (
                "AI enhancement level (auto-detects API vs LOCAL mode): "
                "0=disabled (default for Notion), 1=SKILL.md only, "
                "2=+architecture/config, 3=full enhancement. "
                "Mode selection: uses API if ANTHROPIC_API_KEY is set, "
                "otherwise LOCAL (Claude Code)"
            )
            break  # dest values are unique; stop once found

    # Notion-specific args (keys are dest names; only flags/kwargs are used here)
    for arg_def in NOTION_ARGUMENTS.values():
        parser.add_argument(*arg_def["flags"], **arg_def["kwargs"])
|
||||
76
src/skill_seekers/cli/arguments/openapi.py
Normal file
76
src/skill_seekers/cli/arguments/openapi.py
Normal file
@@ -0,0 +1,76 @@
|
||||
"""OpenAPI command argument definitions.
|
||||
|
||||
This module defines ALL arguments for the openapi command in ONE place.
|
||||
Both openapi_scraper.py (standalone) and parsers/openapi_parser.py (unified CLI)
|
||||
import and use these definitions.
|
||||
|
||||
Shared arguments (name, description, output, enhance-level, api-key,
|
||||
dry-run, verbose, quiet, workflow args) come from common.py / workflow.py
|
||||
via ``add_all_standard_arguments()``.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from typing import Any
|
||||
|
||||
from .common import add_all_standard_arguments
|
||||
|
||||
# OpenAPI-specific argument definitions, keyed by argparse dest name.
# NOTE: Shared args (name, description, output, enhance_level, api_key, dry_run,
# verbose, quiet, workflow args) are registered by add_all_standard_arguments().
# Key order is preserved: it controls the order flags appear in --help output.
OPENAPI_ARGUMENTS: dict[str, dict[str, Any]] = {
    "spec": {
        "flags": ("--spec",),
        "kwargs": {
            "type": str,
            "metavar": "PATH",
            "help": "Path to OpenAPI/Swagger spec file",
        },
    },
    "spec_url": {
        "flags": ("--spec-url",),
        "kwargs": {
            "type": str,
            "metavar": "URL",
            "help": "URL to OpenAPI/Swagger spec",
        },
    },
    "from_json": {
        "flags": ("--from-json",),
        "kwargs": {
            "type": str,
            "metavar": "FILE",
            "help": "Build skill from extracted JSON",
        },
    },
}
|
||||
|
||||
|
||||
def add_openapi_arguments(parser: argparse.ArgumentParser) -> None:
    """Add all openapi command arguments to a parser.

    Registers shared args (name, description, output, enhance-level, api-key,
    dry-run, verbose, quiet, workflow args) via add_all_standard_arguments(),
    then adds OpenAPI-specific args on top.

    The default for --enhance-level is overridden to 0 (disabled) for OpenAPI.

    Args:
        parser: The parser (or subparser) to populate in place.
    """
    # Shared universal args first
    add_all_standard_arguments(parser)

    # Override enhance-level default to 0 for OpenAPI.
    # parser._actions is private argparse API, but argparse exposes no public
    # way to re-default an already-registered argument.
    for action in parser._actions:
        # Every argparse Action defines `dest`, so no hasattr() guard is needed.
        if action.dest == "enhance_level":
            action.default = 0
            action.help = (
                "AI enhancement level (auto-detects API vs LOCAL mode): "
                "0=disabled (default for OpenAPI), 1=SKILL.md only, "
                "2=+architecture/config, 3=full enhancement. "
                "Mode selection: uses API if ANTHROPIC_API_KEY is set, "
                "otherwise LOCAL (Claude Code)"
            )
            break  # dest values are unique; stop once found

    # OpenAPI-specific args (keys are dest names; only flags/kwargs are used here)
    for arg_def in OPENAPI_ARGUMENTS.values():
        parser.add_argument(*arg_def["flags"], **arg_def["kwargs"])
|
||||
68
src/skill_seekers/cli/arguments/pptx.py
Normal file
68
src/skill_seekers/cli/arguments/pptx.py
Normal file
@@ -0,0 +1,68 @@
|
||||
"""PPTX command argument definitions.
|
||||
|
||||
This module defines ALL arguments for the pptx command in ONE place.
|
||||
Both pptx_scraper.py (standalone) and parsers/pptx_parser.py (unified CLI)
|
||||
import and use these definitions.
|
||||
|
||||
Shared arguments (name, description, output, enhance-level, api-key,
|
||||
dry-run, verbose, quiet, workflow args) come from common.py / workflow.py
|
||||
via ``add_all_standard_arguments()``.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from typing import Any
|
||||
|
||||
from .common import add_all_standard_arguments
|
||||
|
||||
# PPTX-specific argument definitions as data structure
|
||||
# NOTE: Shared args (name, description, output, enhance_level, api_key, dry_run,
|
||||
# verbose, quiet, workflow args) are registered by add_all_standard_arguments().
|
||||
# PPTX-specific argument definitions, keyed by dest-style name.
# Each entry carries the CLI flags plus the kwargs handed to add_argument().
PPTX_ARGUMENTS: dict[str, dict[str, Any]] = {
    # Presentation file to extract from.
    "pptx": {
        "flags": ("--pptx",),
        "kwargs": {"type": str, "help": "Path to PowerPoint file (.pptx)", "metavar": "PATH"},
    },
    # Skip extraction; build the skill from previously extracted JSON.
    "from_json": {
        "flags": ("--from-json",),
        "kwargs": {"type": str, "help": "Build skill from extracted JSON", "metavar": "FILE"},
    },
}
|
||||
|
||||
|
||||
def add_pptx_arguments(parser: argparse.ArgumentParser) -> None:
    """Add all pptx command arguments to a parser.

    Registers shared args (name, description, output, enhance-level, api-key,
    dry-run, verbose, quiet, workflow args) via add_all_standard_arguments(),
    then adds PPTX-specific args on top.

    The default for --enhance-level is overridden to 0 (disabled) for PPTX.

    Args:
        parser: Parser to register arguments on; mutated in place.
    """
    # Shared universal args first
    add_all_standard_arguments(parser)

    # Override enhance-level default to 0 for PPTX.
    # NOTE: reaches into argparse's private _actions because argparse has no
    # public API for retuning an already-registered argument.
    for action in parser._actions:
        if getattr(action, "dest", None) == "enhance_level":
            action.default = 0
            action.help = (
                "AI enhancement level (auto-detects API vs LOCAL mode): "
                "0=disabled (default for PPTX), 1=SKILL.md only, "
                "2=+architecture/config, 3=full enhancement. "
                "Mode selection: uses API if ANTHROPIC_API_KEY is set, "
                "otherwise LOCAL (Claude Code)"
            )
            break  # dests are unique per parser; stop at the first match

    # PPTX-specific args (dict keys are documentation only; flags carry the CLI name)
    for arg_def in PPTX_ARGUMENTS.values():
        parser.add_argument(*arg_def["flags"], **arg_def["kwargs"])
|
||||
101
src/skill_seekers/cli/arguments/rss.py
Normal file
101
src/skill_seekers/cli/arguments/rss.py
Normal file
@@ -0,0 +1,101 @@
|
||||
"""RSS command argument definitions.
|
||||
|
||||
This module defines ALL arguments for the rss command in ONE place.
|
||||
Both rss_scraper.py (standalone) and parsers/rss_parser.py (unified CLI)
|
||||
import and use these definitions.
|
||||
|
||||
Shared arguments (name, description, output, enhance-level, api-key,
|
||||
dry-run, verbose, quiet, workflow args) come from common.py / workflow.py
|
||||
via ``add_all_standard_arguments()``.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from typing import Any
|
||||
|
||||
from .common import add_all_standard_arguments
|
||||
|
||||
# RSS-specific argument definitions as data structure
|
||||
# NOTE: Shared args (name, description, output, enhance_level, api_key, dry_run,
|
||||
# verbose, quiet, workflow args) are registered by add_all_standard_arguments().
|
||||
# RSS-specific argument definitions, keyed by dest-style name.
# Each entry carries the CLI flags plus the kwargs handed to add_argument().
RSS_ARGUMENTS: dict[str, dict[str, Any]] = {
    # Remote feed fetched over HTTP(S).
    "feed_url": {
        "flags": ("--feed-url",),
        "kwargs": {"type": str, "help": "URL of the RSS/Atom feed", "metavar": "URL"},
    },
    # Feed already saved to disk.
    "feed_path": {
        "flags": ("--feed-path",),
        "kwargs": {"type": str, "help": "Path to local RSS/Atom feed file", "metavar": "PATH"},
    },
    # --follow-links / --no-follow-links form an on/off pair sharing the
    # 'follow_links' dest; following links is the default.
    "follow_links": {
        "flags": ("--follow-links",),
        "kwargs": {
            "action": "store_true",
            "default": True,
            "help": "Follow article links and extract full content (default: True)",
        },
    },
    "no_follow_links": {
        "flags": ("--no-follow-links",),
        "kwargs": {
            "action": "store_false",
            "dest": "follow_links",
            "help": "Do not follow article links; use feed summary only",
        },
    },
    "max_articles": {
        "flags": ("--max-articles",),
        "kwargs": {
            "type": int,
            "default": 50,
            "help": "Maximum number of articles to extract (default: 50)",
            "metavar": "N",
        },
    },
    # Skip extraction; build the skill from previously extracted JSON.
    "from_json": {
        "flags": ("--from-json",),
        "kwargs": {"type": str, "help": "Build skill from extracted JSON", "metavar": "FILE"},
    },
}
|
||||
|
||||
|
||||
def add_rss_arguments(parser: argparse.ArgumentParser) -> None:
    """Add all rss command arguments to a parser.

    Registers shared args (name, description, output, enhance-level, api-key,
    dry-run, verbose, quiet, workflow args) via add_all_standard_arguments(),
    then adds RSS-specific args on top.

    The default for --enhance-level is overridden to 0 (disabled) for RSS.

    Args:
        parser: Parser to register arguments on; mutated in place.
    """
    # Shared universal args first
    add_all_standard_arguments(parser)

    # Override enhance-level default to 0 for RSS.
    # NOTE: reaches into argparse's private _actions because argparse has no
    # public API for retuning an already-registered argument.
    for action in parser._actions:
        if getattr(action, "dest", None) == "enhance_level":
            action.default = 0
            action.help = (
                "AI enhancement level (auto-detects API vs LOCAL mode): "
                "0=disabled (default for RSS), 1=SKILL.md only, "
                "2=+architecture/config, 3=full enhancement. "
                "Mode selection: uses API if ANTHROPIC_API_KEY is set, "
                "otherwise LOCAL (Claude Code)"
            )
            break  # dests are unique per parser; stop at the first match

    # RSS-specific args (dict keys are documentation only; flags carry the CLI name)
    for arg_def in RSS_ARGUMENTS.values():
        parser.add_argument(*arg_def["flags"], **arg_def["kwargs"])
|
||||
1085
src/skill_seekers/cli/asciidoc_scraper.py
Normal file
1085
src/skill_seekers/cli/asciidoc_scraper.py
Normal file
File diff suppressed because it is too large
Load Diff
1920
src/skill_seekers/cli/chat_scraper.py
Normal file
1920
src/skill_seekers/cli/chat_scraper.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -7,6 +7,19 @@ Validates unified config format that supports multiple sources:
|
||||
- github (repository scraping)
|
||||
- pdf (PDF document scraping)
|
||||
- local (local codebase analysis)
|
||||
- word (Word .docx document scraping)
|
||||
- video (video transcript/visual extraction)
|
||||
- epub (EPUB e-book extraction)
|
||||
- jupyter (Jupyter Notebook extraction)
|
||||
- html (local HTML file extraction)
|
||||
- openapi (OpenAPI/Swagger spec extraction)
|
||||
- asciidoc (AsciiDoc document extraction)
|
||||
- pptx (PowerPoint presentation extraction)
|
||||
- confluence (Confluence wiki extraction)
|
||||
- notion (Notion page extraction)
|
||||
- rss (RSS/Atom feed extraction)
|
||||
- manpage (man page extraction)
|
||||
- chat (Slack/Discord chat export extraction)
|
||||
|
||||
Legacy config format support removed in v2.11.0.
|
||||
All configs must use unified format with 'sources' array.
|
||||
@@ -27,7 +40,25 @@ class ConfigValidator:
|
||||
"""
|
||||
|
||||
# Valid source types
|
||||
VALID_SOURCE_TYPES = {"documentation", "github", "pdf", "local", "word", "video"}
|
||||
# All source types accepted in a unified config's 'sources' array.
# Keep in sync with the _validate_<type>_source dispatch below.
VALID_SOURCE_TYPES = {
    "documentation", "github", "pdf", "local",
    "word", "video", "epub", "jupyter", "html",
    "openapi", "asciidoc", "pptx", "confluence",
    "notion", "rss", "manpage", "chat",
}
|
||||
|
||||
# Valid merge modes
|
||||
VALID_MERGE_MODES = {"rule-based", "claude-enhanced"}
|
||||
@@ -159,6 +190,32 @@ class ConfigValidator:
|
||||
self._validate_pdf_source(source, index)
|
||||
elif source_type == "local":
|
||||
self._validate_local_source(source, index)
|
||||
elif source_type == "word":
|
||||
self._validate_word_source(source, index)
|
||||
elif source_type == "video":
|
||||
self._validate_video_source(source, index)
|
||||
elif source_type == "epub":
|
||||
self._validate_epub_source(source, index)
|
||||
elif source_type == "jupyter":
|
||||
self._validate_jupyter_source(source, index)
|
||||
elif source_type == "html":
|
||||
self._validate_html_source(source, index)
|
||||
elif source_type == "openapi":
|
||||
self._validate_openapi_source(source, index)
|
||||
elif source_type == "asciidoc":
|
||||
self._validate_asciidoc_source(source, index)
|
||||
elif source_type == "pptx":
|
||||
self._validate_pptx_source(source, index)
|
||||
elif source_type == "confluence":
|
||||
self._validate_confluence_source(source, index)
|
||||
elif source_type == "notion":
|
||||
self._validate_notion_source(source, index)
|
||||
elif source_type == "rss":
|
||||
self._validate_rss_source(source, index)
|
||||
elif source_type == "manpage":
|
||||
self._validate_manpage_source(source, index)
|
||||
elif source_type == "chat":
|
||||
self._validate_chat_source(source, index)
|
||||
|
||||
def _validate_documentation_source(self, source: dict[str, Any], index: int):
|
||||
"""Validate documentation source configuration."""
|
||||
@@ -253,12 +310,126 @@ class ConfigValidator:
|
||||
f"Source {index} (local): Invalid ai_mode '{ai_mode}'. Must be one of {self.VALID_AI_MODES}"
|
||||
)
|
||||
|
||||
def _validate_word_source(self, source: dict[str, Any], index: int):
|
||||
"""Validate Word document (.docx) source configuration."""
|
||||
if "path" not in source:
|
||||
raise ValueError(f"Source {index} (word): Missing required field 'path'")
|
||||
word_path = source["path"]
|
||||
if not Path(word_path).exists():
|
||||
logger.warning(f"Source {index} (word): File not found: {word_path}")
|
||||
|
||||
def _validate_video_source(self, source: dict[str, Any], index: int):
|
||||
"""Validate video source configuration."""
|
||||
has_url = "url" in source
|
||||
has_path = "path" in source
|
||||
has_playlist = "playlist" in source
|
||||
if not has_url and not has_path and not has_playlist:
|
||||
raise ValueError(
|
||||
f"Source {index} (video): Missing required field 'url', 'path', or 'playlist'"
|
||||
)
|
||||
|
||||
def _validate_epub_source(self, source: dict[str, Any], index: int):
|
||||
"""Validate EPUB source configuration."""
|
||||
if "path" not in source:
|
||||
raise ValueError(f"Source {index} (epub): Missing required field 'path'")
|
||||
epub_path = source["path"]
|
||||
if not Path(epub_path).exists():
|
||||
logger.warning(f"Source {index} (epub): File not found: {epub_path}")
|
||||
|
||||
def _validate_jupyter_source(self, source: dict[str, Any], index: int):
|
||||
"""Validate Jupyter Notebook source configuration."""
|
||||
if "path" not in source:
|
||||
raise ValueError(f"Source {index} (jupyter): Missing required field 'path'")
|
||||
nb_path = source["path"]
|
||||
if not Path(nb_path).exists():
|
||||
logger.warning(f"Source {index} (jupyter): Path not found: {nb_path}")
|
||||
|
||||
def _validate_html_source(self, source: dict[str, Any], index: int):
|
||||
"""Validate local HTML source configuration."""
|
||||
if "path" not in source:
|
||||
raise ValueError(f"Source {index} (html): Missing required field 'path'")
|
||||
html_path = source["path"]
|
||||
if not Path(html_path).exists():
|
||||
logger.warning(f"Source {index} (html): Path not found: {html_path}")
|
||||
|
||||
def _validate_openapi_source(self, source: dict[str, Any], index: int):
|
||||
"""Validate OpenAPI/Swagger source configuration."""
|
||||
if "path" not in source and "url" not in source:
|
||||
raise ValueError(f"Source {index} (openapi): Missing required field 'path' or 'url'")
|
||||
if "path" in source and not Path(source["path"]).exists():
|
||||
logger.warning(f"Source {index} (openapi): File not found: {source['path']}")
|
||||
|
||||
def _validate_asciidoc_source(self, source: dict[str, Any], index: int):
|
||||
"""Validate AsciiDoc source configuration."""
|
||||
if "path" not in source:
|
||||
raise ValueError(f"Source {index} (asciidoc): Missing required field 'path'")
|
||||
adoc_path = source["path"]
|
||||
if not Path(adoc_path).exists():
|
||||
logger.warning(f"Source {index} (asciidoc): Path not found: {adoc_path}")
|
||||
|
||||
def _validate_pptx_source(self, source: dict[str, Any], index: int):
|
||||
"""Validate PowerPoint source configuration."""
|
||||
if "path" not in source:
|
||||
raise ValueError(f"Source {index} (pptx): Missing required field 'path'")
|
||||
pptx_path = source["path"]
|
||||
if not Path(pptx_path).exists():
|
||||
logger.warning(f"Source {index} (pptx): File not found: {pptx_path}")
|
||||
|
||||
def _validate_confluence_source(self, source: dict[str, Any], index: int):
|
||||
"""Validate Confluence source configuration."""
|
||||
has_url = "url" in source or "base_url" in source
|
||||
has_path = "path" in source
|
||||
if not has_url and not has_path:
|
||||
raise ValueError(
|
||||
f"Source {index} (confluence): Missing required field 'url'/'base_url' "
|
||||
f"(for API) or 'path' (for export)"
|
||||
)
|
||||
if has_url and "space_key" not in source and "path" not in source:
|
||||
logger.warning(f"Source {index} (confluence): No 'space_key' specified for API mode")
|
||||
|
||||
def _validate_notion_source(self, source: dict[str, Any], index: int):
|
||||
"""Validate Notion source configuration."""
|
||||
has_url = "url" in source or "database_id" in source or "page_id" in source
|
||||
has_path = "path" in source
|
||||
if not has_url and not has_path:
|
||||
raise ValueError(
|
||||
f"Source {index} (notion): Missing required field 'url'/'database_id'/'page_id' "
|
||||
f"(for API) or 'path' (for export)"
|
||||
)
|
||||
|
||||
def _validate_rss_source(self, source: dict[str, Any], index: int):
|
||||
"""Validate RSS/Atom feed source configuration."""
|
||||
if "url" not in source and "path" not in source:
|
||||
raise ValueError(f"Source {index} (rss): Missing required field 'url' or 'path'")
|
||||
|
||||
def _validate_manpage_source(self, source: dict[str, Any], index: int):
|
||||
"""Validate man page source configuration."""
|
||||
if "path" not in source and "names" not in source:
|
||||
raise ValueError(f"Source {index} (manpage): Missing required field 'path' or 'names'")
|
||||
if "path" in source and not Path(source["path"]).exists():
|
||||
logger.warning(f"Source {index} (manpage): Path not found: {source['path']}")
|
||||
|
||||
def _validate_chat_source(self, source: dict[str, Any], index: int):
|
||||
"""Validate Slack/Discord chat source configuration."""
|
||||
has_path = "path" in source
|
||||
has_api = "token" in source or "webhook_url" in source
|
||||
has_channel = "channel" in source or "channel_id" in source
|
||||
if not has_path and not has_api:
|
||||
raise ValueError(
|
||||
f"Source {index} (chat): Missing required field 'path' (for export) "
|
||||
f"or 'token' (for API)"
|
||||
)
|
||||
if has_api and not has_channel:
|
||||
logger.warning(
|
||||
f"Source {index} (chat): No 'channel' or 'channel_id' specified for API mode"
|
||||
)
|
||||
|
||||
def get_sources_by_type(self, source_type: str) -> list[dict[str, Any]]:
|
||||
"""
|
||||
Get all sources of a specific type.
|
||||
|
||||
Args:
|
||||
source_type: 'documentation', 'github', 'pdf', or 'local'
|
||||
source_type: Any valid source type string
|
||||
|
||||
Returns:
|
||||
List of sources matching the type
|
||||
|
||||
2166
src/skill_seekers/cli/confluence_scraper.py
Normal file
2166
src/skill_seekers/cli/confluence_scraper.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -140,6 +140,26 @@ class CreateCommand:
|
||||
return self._route_video()
|
||||
elif self.source_info.type == "config":
|
||||
return self._route_config()
|
||||
elif self.source_info.type == "jupyter":
|
||||
return self._route_generic("jupyter_scraper", "--notebook")
|
||||
elif self.source_info.type == "html":
|
||||
return self._route_generic("html_scraper", "--html-path")
|
||||
elif self.source_info.type == "openapi":
|
||||
return self._route_generic("openapi_scraper", "--spec")
|
||||
elif self.source_info.type == "asciidoc":
|
||||
return self._route_generic("asciidoc_scraper", "--asciidoc-path")
|
||||
elif self.source_info.type == "pptx":
|
||||
return self._route_generic("pptx_scraper", "--pptx")
|
||||
elif self.source_info.type == "rss":
|
||||
return self._route_generic("rss_scraper", "--feed-path")
|
||||
elif self.source_info.type == "manpage":
|
||||
return self._route_generic("man_scraper", "--man-path")
|
||||
elif self.source_info.type == "confluence":
|
||||
return self._route_generic("confluence_scraper", "--export-path")
|
||||
elif self.source_info.type == "notion":
|
||||
return self._route_generic("notion_scraper", "--export-path")
|
||||
elif self.source_info.type == "chat":
|
||||
return self._route_generic("chat_scraper", "--export-path")
|
||||
else:
|
||||
logger.error(f"Unknown source type: {self.source_info.type}")
|
||||
return 1
|
||||
@@ -485,6 +505,40 @@ class CreateCommand:
|
||||
finally:
|
||||
sys.argv = original_argv
|
||||
|
||||
def _route_generic(self, module_name: str, file_flag: str) -> int:
    """Generic routing for new source types.

    Most new source types (jupyter, html, openapi, asciidoc, pptx, rss,
    manpage, confluence, notion, chat) follow the same pattern: import the
    scraper module, build argv with ``<file_flag> <file_path>``, append the
    common args, and call the module's ``main()``.

    Args:
        module_name: Python module name under skill_seekers.cli
            (e.g., "jupyter_scraper")
        file_flag: CLI flag for the source file (e.g., "--notebook")

    Returns:
        Exit code from the scraper's main().
    """
    import importlib

    scraper = importlib.import_module(f"skill_seekers.cli.{module_name}")

    # argv[0] stands in for the program name; the scraper's parser skips it.
    argv = [module_name]
    file_path = self.source_info.parsed.get("file_path", "")
    if file_path:
        argv += [file_flag, file_path]
    self._add_common_args(argv)

    logger.debug(f"Calling {module_name} with argv: {argv}")
    # Temporarily swap sys.argv so the scraper's argparse sees our args,
    # restoring the original even if main() raises.
    saved_argv = sys.argv
    sys.argv = argv
    try:
        return scraper.main()
    finally:
        sys.argv = saved_argv
|
||||
|
||||
def _add_common_args(self, argv: list[str]) -> None:
|
||||
"""Add truly universal arguments to argv list.
|
||||
|
||||
|
||||
1942
src/skill_seekers/cli/html_scraper.py
Normal file
1942
src/skill_seekers/cli/html_scraper.py
Normal file
File diff suppressed because it is too large
Load Diff
1209
src/skill_seekers/cli/jupyter_scraper.py
Normal file
1209
src/skill_seekers/cli/jupyter_scraper.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -15,7 +15,17 @@ Commands:
|
||||
word Extract from Word (.docx) file
|
||||
epub Extract from EPUB e-book (.epub)
|
||||
video Extract from video (YouTube or local)
|
||||
unified Multi-source scraping (docs + GitHub + PDF)
|
||||
jupyter Extract from Jupyter Notebook (.ipynb)
|
||||
html Extract from local HTML files
|
||||
openapi Extract from OpenAPI/Swagger spec
|
||||
asciidoc Extract from AsciiDoc documents (.adoc)
|
||||
pptx Extract from PowerPoint (.pptx)
|
||||
rss Extract from RSS/Atom feeds
|
||||
manpage Extract from man pages
|
||||
confluence Extract from Confluence wiki
|
||||
notion Extract from Notion pages
|
||||
chat Extract from Slack/Discord chat exports
|
||||
unified Multi-source scraping (docs + GitHub + PDF + more)
|
||||
analyze Analyze local codebase and extract code knowledge
|
||||
enhance AI-powered enhancement (auto: API or LOCAL mode)
|
||||
enhance-status Check enhancement status (for background/daemon modes)
|
||||
@@ -70,6 +80,17 @@ COMMAND_MODULES = {
|
||||
"quality": "skill_seekers.cli.quality_metrics",
|
||||
"workflows": "skill_seekers.cli.workflows_command",
|
||||
"sync-config": "skill_seekers.cli.sync_config",
|
||||
# New source types (v3.2.0+)
|
||||
"jupyter": "skill_seekers.cli.jupyter_scraper",
|
||||
"html": "skill_seekers.cli.html_scraper",
|
||||
"openapi": "skill_seekers.cli.openapi_scraper",
|
||||
"asciidoc": "skill_seekers.cli.asciidoc_scraper",
|
||||
"pptx": "skill_seekers.cli.pptx_scraper",
|
||||
"rss": "skill_seekers.cli.rss_scraper",
|
||||
"manpage": "skill_seekers.cli.man_scraper",
|
||||
"confluence": "skill_seekers.cli.confluence_scraper",
|
||||
"notion": "skill_seekers.cli.notion_scraper",
|
||||
"chat": "skill_seekers.cli.chat_scraper",
|
||||
}
|
||||
|
||||
|
||||
|
||||
1513
src/skill_seekers/cli/man_scraper.py
Normal file
1513
src/skill_seekers/cli/man_scraper.py
Normal file
File diff suppressed because it is too large
Load Diff
1023
src/skill_seekers/cli/notion_scraper.py
Normal file
1023
src/skill_seekers/cli/notion_scraper.py
Normal file
File diff suppressed because it is too large
Load Diff
1959
src/skill_seekers/cli/openapi_scraper.py
Normal file
1959
src/skill_seekers/cli/openapi_scraper.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -33,6 +33,18 @@ from .quality_parser import QualityParser
|
||||
from .workflows_parser import WorkflowsParser
|
||||
from .sync_config_parser import SyncConfigParser
|
||||
|
||||
# New source type parsers (v3.2.0+)
|
||||
from .jupyter_parser import JupyterParser
|
||||
from .html_parser import HtmlParser
|
||||
from .openapi_parser import OpenAPIParser
|
||||
from .asciidoc_parser import AsciiDocParser
|
||||
from .pptx_parser import PptxParser
|
||||
from .rss_parser import RssParser
|
||||
from .manpage_parser import ManPageParser
|
||||
from .confluence_parser import ConfluenceParser
|
||||
from .notion_parser import NotionParser
|
||||
from .chat_parser import ChatParser
|
||||
|
||||
# Registry of all parsers (in order of usage frequency)
|
||||
PARSERS = [
|
||||
CreateParser(), # NEW: Unified create command (placed first for prominence)
|
||||
@@ -60,6 +72,17 @@ PARSERS = [
|
||||
QualityParser(),
|
||||
WorkflowsParser(),
|
||||
SyncConfigParser(),
|
||||
# New source types (v3.2.0+)
|
||||
JupyterParser(),
|
||||
HtmlParser(),
|
||||
OpenAPIParser(),
|
||||
AsciiDocParser(),
|
||||
PptxParser(),
|
||||
RssParser(),
|
||||
ManPageParser(),
|
||||
ConfluenceParser(),
|
||||
NotionParser(),
|
||||
ChatParser(),
|
||||
]
|
||||
|
||||
|
||||
|
||||
32
src/skill_seekers/cli/parsers/asciidoc_parser.py
Normal file
32
src/skill_seekers/cli/parsers/asciidoc_parser.py
Normal file
@@ -0,0 +1,32 @@
|
||||
"""AsciiDoc subcommand parser.
|
||||
|
||||
Uses shared argument definitions from arguments.asciidoc to ensure
|
||||
consistency with the standalone asciidoc_scraper module.
|
||||
"""
|
||||
|
||||
from .base import SubcommandParser
|
||||
from skill_seekers.cli.arguments.asciidoc import add_asciidoc_arguments
|
||||
|
||||
|
||||
class AsciiDocParser(SubcommandParser):
    """Parser for the ``asciidoc`` subcommand.

    Thin wrapper: all argument definitions live in ``arguments.asciidoc``
    so the standalone asciidoc_scraper and the unified CLI stay in sync.
    """

    @property
    def name(self) -> str:
        """Subcommand name as typed on the command line."""
        return "asciidoc"

    @property
    def help(self) -> str:
        """One-line help shown in the top-level command list."""
        return "Extract from AsciiDoc documents (.adoc)"

    @property
    def description(self) -> str:
        """Longer description shown in the subcommand's own --help."""
        return "Extract content from AsciiDoc documents (.adoc) and generate skill"

    def add_arguments(self, parser):
        """Register all asciidoc arguments via the shared definitions module."""
        add_asciidoc_arguments(parser)
|
||||
32
src/skill_seekers/cli/parsers/chat_parser.py
Normal file
32
src/skill_seekers/cli/parsers/chat_parser.py
Normal file
@@ -0,0 +1,32 @@
|
||||
"""Chat subcommand parser.
|
||||
|
||||
Uses shared argument definitions from arguments.chat to ensure
|
||||
consistency with the standalone chat_scraper module.
|
||||
"""
|
||||
|
||||
from .base import SubcommandParser
|
||||
from skill_seekers.cli.arguments.chat import add_chat_arguments
|
||||
|
||||
|
||||
class ChatParser(SubcommandParser):
    """Parser for the ``chat`` subcommand.

    Thin wrapper: all argument definitions live in ``arguments.chat``
    so the standalone chat_scraper and the unified CLI stay in sync.
    """

    @property
    def name(self) -> str:
        """Subcommand name as typed on the command line."""
        return "chat"

    @property
    def help(self) -> str:
        """One-line help shown in the top-level command list."""
        return "Extract from Slack/Discord chat exports"

    @property
    def description(self) -> str:
        """Longer description shown in the subcommand's own --help."""
        return "Extract content from Slack/Discord chat exports and generate skill"

    def add_arguments(self, parser):
        """Register all chat arguments via the shared definitions module."""
        add_chat_arguments(parser)
|
||||
32
src/skill_seekers/cli/parsers/confluence_parser.py
Normal file
32
src/skill_seekers/cli/parsers/confluence_parser.py
Normal file
@@ -0,0 +1,32 @@
|
||||
"""Confluence subcommand parser.
|
||||
|
||||
Uses shared argument definitions from arguments.confluence to ensure
|
||||
consistency with the standalone confluence_scraper module.
|
||||
"""
|
||||
|
||||
from .base import SubcommandParser
|
||||
from skill_seekers.cli.arguments.confluence import add_confluence_arguments
|
||||
|
||||
|
||||
class ConfluenceParser(SubcommandParser):
    """Parser for the ``confluence`` subcommand.

    Thin wrapper: all argument definitions live in ``arguments.confluence``
    so the standalone confluence_scraper and the unified CLI stay in sync.
    """

    @property
    def name(self) -> str:
        """Subcommand name as typed on the command line."""
        return "confluence"

    @property
    def help(self) -> str:
        """One-line help shown in the top-level command list."""
        return "Extract from Confluence wiki"

    @property
    def description(self) -> str:
        """Longer description shown in the subcommand's own --help."""
        return "Extract content from Confluence wiki and generate skill"

    def add_arguments(self, parser):
        """Register all confluence arguments via the shared definitions module."""
        add_confluence_arguments(parser)
|
||||
32
src/skill_seekers/cli/parsers/html_parser.py
Normal file
32
src/skill_seekers/cli/parsers/html_parser.py
Normal file
@@ -0,0 +1,32 @@
|
||||
"""HTML subcommand parser.
|
||||
|
||||
Uses shared argument definitions from arguments.html to ensure
|
||||
consistency with the standalone html_scraper module.
|
||||
"""
|
||||
|
||||
from .base import SubcommandParser
|
||||
from skill_seekers.cli.arguments.html import add_html_arguments
|
||||
|
||||
|
||||
class HtmlParser(SubcommandParser):
    """Parser for the ``html`` subcommand.

    Thin wrapper: all argument definitions live in ``arguments.html``
    so the standalone html_scraper and the unified CLI stay in sync.
    """

    @property
    def name(self) -> str:
        """Subcommand name as typed on the command line."""
        return "html"

    @property
    def help(self) -> str:
        """One-line help shown in the top-level command list."""
        return "Extract from local HTML files (.html/.htm)"

    @property
    def description(self) -> str:
        """Longer description shown in the subcommand's own --help."""
        return "Extract content from local HTML files (.html/.htm) and generate skill"

    def add_arguments(self, parser):
        """Register all html arguments via the shared definitions module."""
        add_html_arguments(parser)
|
||||
32
src/skill_seekers/cli/parsers/jupyter_parser.py
Normal file
32
src/skill_seekers/cli/parsers/jupyter_parser.py
Normal file
@@ -0,0 +1,32 @@
|
||||
"""Jupyter Notebook subcommand parser.
|
||||
|
||||
Uses shared argument definitions from arguments.jupyter to ensure
|
||||
consistency with the standalone jupyter_scraper module.
|
||||
"""
|
||||
|
||||
from .base import SubcommandParser
|
||||
from skill_seekers.cli.arguments.jupyter import add_jupyter_arguments
|
||||
|
||||
|
||||
class JupyterParser(SubcommandParser):
    """Parser for the ``jupyter`` subcommand.

    Thin wrapper: all argument definitions live in ``arguments.jupyter``
    so the standalone jupyter_scraper and the unified CLI stay in sync.
    """

    @property
    def name(self) -> str:
        """Subcommand name as typed on the command line."""
        return "jupyter"

    @property
    def help(self) -> str:
        """One-line help shown in the top-level command list."""
        return "Extract from Jupyter Notebook (.ipynb)"

    @property
    def description(self) -> str:
        """Longer description shown in the subcommand's own --help."""
        return "Extract content from Jupyter Notebook (.ipynb) and generate skill"

    def add_arguments(self, parser):
        """Register all jupyter arguments via the shared definitions module."""
        add_jupyter_arguments(parser)
|
||||
32
src/skill_seekers/cli/parsers/manpage_parser.py
Normal file
32
src/skill_seekers/cli/parsers/manpage_parser.py
Normal file
@@ -0,0 +1,32 @@
|
||||
"""Man page subcommand parser.
|
||||
|
||||
Uses shared argument definitions from arguments.manpage to ensure
|
||||
consistency with the standalone man_scraper module.
|
||||
"""
|
||||
|
||||
from .base import SubcommandParser
|
||||
from skill_seekers.cli.arguments.manpage import add_manpage_arguments
|
||||
|
||||
|
||||
class ManPageParser(SubcommandParser):
    """Parser for the ``manpage`` subcommand.

    Thin wrapper: all argument definitions live in ``arguments.manpage``
    so the standalone man_scraper and the unified CLI stay in sync.
    """

    @property
    def name(self) -> str:
        """Subcommand name as typed on the command line."""
        return "manpage"

    @property
    def help(self) -> str:
        """One-line help shown in the top-level command list."""
        return "Extract from man pages"

    @property
    def description(self) -> str:
        """Longer description shown in the subcommand's own --help."""
        return "Extract content from man pages and generate skill"

    def add_arguments(self, parser):
        """Register all manpage arguments via the shared definitions module."""
        add_manpage_arguments(parser)
|
||||
32
src/skill_seekers/cli/parsers/notion_parser.py
Normal file
32
src/skill_seekers/cli/parsers/notion_parser.py
Normal file
@@ -0,0 +1,32 @@
|
||||
"""Notion subcommand parser.
|
||||
|
||||
Uses shared argument definitions from arguments.notion to ensure
|
||||
consistency with the standalone notion_scraper module.
|
||||
"""
|
||||
|
||||
from .base import SubcommandParser
|
||||
from skill_seekers.cli.arguments.notion import add_notion_arguments
|
||||
|
||||
|
||||
class NotionParser(SubcommandParser):
    """Parser for the ``notion`` subcommand.

    Thin wrapper: all argument definitions live in ``arguments.notion``
    so the standalone notion_scraper and the unified CLI stay in sync.
    """

    @property
    def name(self) -> str:
        """Subcommand name as typed on the command line."""
        return "notion"

    @property
    def help(self) -> str:
        """One-line help shown in the top-level command list."""
        return "Extract from Notion pages"

    @property
    def description(self) -> str:
        """Longer description shown in the subcommand's own --help."""
        return "Extract content from Notion pages and generate skill"

    def add_arguments(self, parser):
        """Register all notion arguments via the shared definitions module."""
        add_notion_arguments(parser)
|
||||
32
src/skill_seekers/cli/parsers/openapi_parser.py
Normal file
32
src/skill_seekers/cli/parsers/openapi_parser.py
Normal file
@@ -0,0 +1,32 @@
|
||||
"""OpenAPI subcommand parser.
|
||||
|
||||
Uses shared argument definitions from arguments.openapi to ensure
|
||||
consistency with the standalone openapi_scraper module.
|
||||
"""
|
||||
|
||||
from .base import SubcommandParser
|
||||
from skill_seekers.cli.arguments.openapi import add_openapi_arguments
|
||||
|
||||
|
||||
class OpenAPIParser(SubcommandParser):
    """Parser for the ``openapi`` subcommand.

    Thin wrapper: all argument definitions live in ``arguments.openapi``
    so the standalone openapi_scraper and the unified CLI stay in sync.
    """

    @property
    def name(self) -> str:
        """Subcommand name as typed on the command line."""
        return "openapi"

    @property
    def help(self) -> str:
        """One-line help shown in the top-level command list."""
        return "Extract from OpenAPI/Swagger spec"

    @property
    def description(self) -> str:
        """Longer description shown in the subcommand's own --help."""
        return "Extract content from OpenAPI/Swagger spec and generate skill"

    def add_arguments(self, parser):
        """Register all openapi arguments via the shared definitions module."""
        add_openapi_arguments(parser)
|
||||
32
src/skill_seekers/cli/parsers/pptx_parser.py
Normal file
32
src/skill_seekers/cli/parsers/pptx_parser.py
Normal file
@@ -0,0 +1,32 @@
|
||||
"""PPTX subcommand parser.
|
||||
|
||||
Uses shared argument definitions from arguments.pptx to ensure
|
||||
consistency with the standalone pptx_scraper module.
|
||||
"""
|
||||
|
||||
from .base import SubcommandParser
|
||||
from skill_seekers.cli.arguments.pptx import add_pptx_arguments
|
||||
|
||||
|
||||
class PptxParser(SubcommandParser):
    """Parser for the ``pptx`` subcommand.

    Argument registration is delegated to the shared definitions in
    ``skill_seekers.cli.arguments.pptx`` so this parser and the
    standalone ``pptx_scraper`` always accept identical flags.
    """

    @property
    def name(self) -> str:
        # Subcommand token as typed on the command line.
        return "pptx"

    @property
    def help(self) -> str:
        # One-line summary for the top-level --help listing.
        return "Extract from PowerPoint presentations (.pptx)"

    @property
    def description(self) -> str:
        # Longer text for the subcommand's own --help output.
        return "Extract content from PowerPoint presentations (.pptx) and generate skill"

    def add_arguments(self, parser):
        """Register pptx-specific CLI arguments on *parser*."""
        # Single source of truth shared with the standalone scraper.
        add_pptx_arguments(parser)
|
||||
32
src/skill_seekers/cli/parsers/rss_parser.py
Normal file
32
src/skill_seekers/cli/parsers/rss_parser.py
Normal file
@@ -0,0 +1,32 @@
|
||||
"""RSS subcommand parser.
|
||||
|
||||
Uses shared argument definitions from arguments.rss to ensure
|
||||
consistency with the standalone rss_scraper module.
|
||||
"""
|
||||
|
||||
from .base import SubcommandParser
|
||||
from skill_seekers.cli.arguments.rss import add_rss_arguments
|
||||
|
||||
|
||||
class RssParser(SubcommandParser):
    """Parser for the ``rss`` subcommand.

    Argument registration is delegated to the shared definitions in
    ``skill_seekers.cli.arguments.rss`` so this parser and the
    standalone ``rss_scraper`` always accept identical flags.
    """

    @property
    def name(self) -> str:
        # Subcommand token as typed on the command line.
        return "rss"

    @property
    def help(self) -> str:
        # One-line summary for the top-level --help listing.
        return "Extract from RSS/Atom feeds"

    @property
    def description(self) -> str:
        # Longer text for the subcommand's own --help output.
        return "Extract content from RSS/Atom feeds and generate skill"

    def add_arguments(self, parser):
        """Register rss-specific CLI arguments on *parser*."""
        # Single source of truth shared with the standalone scraper.
        add_rss_arguments(parser)
|
||||
1821
src/skill_seekers/cli/pptx_scraper.py
Normal file
1821
src/skill_seekers/cli/pptx_scraper.py
Normal file
File diff suppressed because it is too large
Load Diff
1087
src/skill_seekers/cli/rss_scraper.py
Normal file
1087
src/skill_seekers/cli/rss_scraper.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -1,7 +1,12 @@
|
||||
"""Source type detection for unified create command.
|
||||
|
||||
Auto-detects whether a source is a web URL, GitHub repository,
|
||||
local directory, PDF file, or config file based on patterns.
|
||||
Auto-detects source type from user input — supports web URLs, GitHub repos,
|
||||
local directories, and 14+ file types (PDF, DOCX, EPUB, IPYNB, HTML, YAML/OpenAPI,
|
||||
AsciiDoc, PPTX, RSS/Atom, man pages, video files, and config JSON).
|
||||
|
||||
Note: Confluence, Notion, and Slack/Discord chat sources are API/export-based
|
||||
and cannot be auto-detected from a single argument. Use their dedicated
|
||||
subcommands (``skill-seekers confluence``, ``notion``, ``chat``) instead.
|
||||
"""
|
||||
|
||||
import os
|
||||
@@ -66,11 +71,49 @@ class SourceDetector:
|
||||
if source.endswith(".epub"):
|
||||
return cls._detect_epub(source)
|
||||
|
||||
if source.endswith(".ipynb"):
|
||||
return cls._detect_jupyter(source)
|
||||
|
||||
if source.lower().endswith((".html", ".htm")):
|
||||
return cls._detect_html(source)
|
||||
|
||||
if source.endswith(".pptx"):
|
||||
return cls._detect_pptx(source)
|
||||
|
||||
if source.lower().endswith((".adoc", ".asciidoc")):
|
||||
return cls._detect_asciidoc(source)
|
||||
|
||||
# Man page file extensions (.1 through .8, .man)
|
||||
# Only match if the basename looks like a man page (e.g., "git.1", not "log.1")
|
||||
# Require basename without the extension to be a plausible command name
|
||||
if source.lower().endswith(".man"):
|
||||
return cls._detect_manpage(source)
|
||||
MAN_SECTION_EXTENSIONS = (".1", ".2", ".3", ".4", ".5", ".6", ".7", ".8")
|
||||
if source.lower().endswith(MAN_SECTION_EXTENSIONS):
|
||||
# Heuristic: man pages have a simple basename (no dots before extension)
|
||||
# e.g., "git.1" is a man page, "access.log.1" is not
|
||||
basename_no_ext = os.path.splitext(os.path.basename(source))[0]
|
||||
if "." not in basename_no_ext:
|
||||
return cls._detect_manpage(source)
|
||||
|
||||
# Video file extensions
|
||||
VIDEO_EXTENSIONS = (".mp4", ".mkv", ".avi", ".mov", ".webm", ".flv", ".wmv")
|
||||
if source.lower().endswith(VIDEO_EXTENSIONS):
|
||||
return cls._detect_video_file(source)
|
||||
|
||||
# RSS/Atom feed file extensions (only .rss and .atom — .xml is too generic)
|
||||
if source.lower().endswith((".rss", ".atom")):
|
||||
return cls._detect_rss(source)
|
||||
|
||||
# OpenAPI/Swagger spec detection (YAML files with OpenAPI content)
|
||||
# Sniff file content for 'openapi:' or 'swagger:' keys before committing
|
||||
if (
|
||||
source.lower().endswith((".yaml", ".yml"))
|
||||
and os.path.isfile(source)
|
||||
and cls._looks_like_openapi(source)
|
||||
):
|
||||
return cls._detect_openapi(source)
|
||||
|
||||
# 2. Video URL detection (before directory check)
|
||||
video_url_info = cls._detect_video_url(source)
|
||||
if video_url_info:
|
||||
@@ -97,15 +140,22 @@ class SourceDetector:
|
||||
raise ValueError(
|
||||
f"Cannot determine source type for: {source}\n\n"
|
||||
"Examples:\n"
|
||||
" Web: skill-seekers create https://docs.react.dev/\n"
|
||||
" GitHub: skill-seekers create facebook/react\n"
|
||||
" Local: skill-seekers create ./my-project\n"
|
||||
" PDF: skill-seekers create tutorial.pdf\n"
|
||||
" DOCX: skill-seekers create document.docx\n"
|
||||
" EPUB: skill-seekers create ebook.epub\n"
|
||||
" Video: skill-seekers create https://youtube.com/watch?v=...\n"
|
||||
" Video: skill-seekers create recording.mp4\n"
|
||||
" Config: skill-seekers create configs/react.json"
|
||||
" Web: skill-seekers create https://docs.react.dev/\n"
|
||||
" GitHub: skill-seekers create facebook/react\n"
|
||||
" Local: skill-seekers create ./my-project\n"
|
||||
" PDF: skill-seekers create tutorial.pdf\n"
|
||||
" DOCX: skill-seekers create document.docx\n"
|
||||
" EPUB: skill-seekers create ebook.epub\n"
|
||||
" Jupyter: skill-seekers create notebook.ipynb\n"
|
||||
" HTML: skill-seekers create page.html\n"
|
||||
" OpenAPI: skill-seekers create openapi.yaml\n"
|
||||
" AsciiDoc: skill-seekers create document.adoc\n"
|
||||
" PowerPoint: skill-seekers create presentation.pptx\n"
|
||||
" RSS: skill-seekers create feed.rss\n"
|
||||
" Man page: skill-seekers create command.1\n"
|
||||
" Video: skill-seekers create https://youtube.com/watch?v=...\n"
|
||||
" Video: skill-seekers create recording.mp4\n"
|
||||
" Config: skill-seekers create configs/react.json"
|
||||
)
|
||||
|
||||
@classmethod
|
||||
@@ -140,6 +190,90 @@ class SourceDetector:
|
||||
type="epub", parsed={"file_path": source}, suggested_name=name, raw_input=source
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def _detect_jupyter(cls, source: str) -> SourceInfo:
|
||||
"""Detect Jupyter Notebook file source."""
|
||||
name = os.path.splitext(os.path.basename(source))[0]
|
||||
return SourceInfo(
|
||||
type="jupyter", parsed={"file_path": source}, suggested_name=name, raw_input=source
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def _detect_html(cls, source: str) -> SourceInfo:
|
||||
"""Detect local HTML file source."""
|
||||
name = os.path.splitext(os.path.basename(source))[0]
|
||||
return SourceInfo(
|
||||
type="html", parsed={"file_path": source}, suggested_name=name, raw_input=source
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def _detect_pptx(cls, source: str) -> SourceInfo:
|
||||
"""Detect PowerPoint file source."""
|
||||
name = os.path.splitext(os.path.basename(source))[0]
|
||||
return SourceInfo(
|
||||
type="pptx", parsed={"file_path": source}, suggested_name=name, raw_input=source
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def _detect_asciidoc(cls, source: str) -> SourceInfo:
|
||||
"""Detect AsciiDoc file source."""
|
||||
name = os.path.splitext(os.path.basename(source))[0]
|
||||
return SourceInfo(
|
||||
type="asciidoc", parsed={"file_path": source}, suggested_name=name, raw_input=source
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def _detect_manpage(cls, source: str) -> SourceInfo:
|
||||
"""Detect man page file source."""
|
||||
name = os.path.splitext(os.path.basename(source))[0]
|
||||
return SourceInfo(
|
||||
type="manpage", parsed={"file_path": source}, suggested_name=name, raw_input=source
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def _detect_rss(cls, source: str) -> SourceInfo:
|
||||
"""Detect RSS/Atom feed file source."""
|
||||
name = os.path.splitext(os.path.basename(source))[0]
|
||||
return SourceInfo(
|
||||
type="rss", parsed={"file_path": source}, suggested_name=name, raw_input=source
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def _looks_like_openapi(cls, source: str) -> bool:
|
||||
"""Check if a YAML/JSON file looks like an OpenAPI or Swagger spec.
|
||||
|
||||
Reads the first few lines to look for 'openapi:' or 'swagger:' keys.
|
||||
|
||||
Args:
|
||||
source: Path to the file
|
||||
|
||||
Returns:
|
||||
True if the file appears to be an OpenAPI/Swagger spec
|
||||
"""
|
||||
try:
|
||||
with open(source, encoding="utf-8", errors="replace") as f:
|
||||
# Read first 20 lines — the openapi/swagger key is always near the top
|
||||
for _ in range(20):
|
||||
line = f.readline()
|
||||
if not line:
|
||||
break
|
||||
stripped = line.strip().lower()
|
||||
if stripped.startswith("openapi:") or stripped.startswith("swagger:"):
|
||||
return True
|
||||
if stripped.startswith('"openapi"') or stripped.startswith('"swagger"'):
|
||||
return True
|
||||
except OSError:
|
||||
pass
|
||||
return False
|
||||
|
||||
@classmethod
|
||||
def _detect_openapi(cls, source: str) -> SourceInfo:
|
||||
"""Detect OpenAPI/Swagger spec file source."""
|
||||
name = os.path.splitext(os.path.basename(source))[0]
|
||||
return SourceInfo(
|
||||
type="openapi", parsed={"file_path": source}, suggested_name=name, raw_input=source
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def _detect_video_file(cls, source: str) -> SourceInfo:
|
||||
"""Detect local video file source."""
|
||||
@@ -312,5 +446,19 @@ class SourceDetector:
|
||||
if not os.path.isfile(config_path):
|
||||
raise ValueError(f"Path is not a file: {config_path}")
|
||||
|
||||
# For web and github, validation happens during scraping
|
||||
# (URL accessibility, repo existence)
|
||||
elif source_info.type in ("jupyter", "html", "pptx", "asciidoc", "manpage", "openapi"):
|
||||
file_path = source_info.parsed.get("file_path", "")
|
||||
if file_path:
|
||||
type_label = source_info.type.upper()
|
||||
if not os.path.exists(file_path):
|
||||
raise ValueError(f"{type_label} file does not exist: {file_path}")
|
||||
if not os.path.isfile(file_path) and not os.path.isdir(file_path):
|
||||
raise ValueError(f"Path is not a file or directory: {file_path}")
|
||||
|
||||
elif source_info.type == "rss":
|
||||
file_path = source_info.parsed.get("file_path", "")
|
||||
if file_path and not os.path.exists(file_path):
|
||||
raise ValueError(f"RSS/Atom file does not exist: {file_path}")
|
||||
|
||||
# For web, github, confluence, notion, chat, rss (URL), validation happens
|
||||
# during scraping (URL accessibility, API auth, etc.)
|
||||
|
||||
@@ -76,6 +76,17 @@ class UnifiedScraper:
|
||||
"word": [], # List of word sources
|
||||
"video": [], # List of video sources
|
||||
"local": [], # List of local sources (docs or code)
|
||||
"epub": [], # List of epub sources
|
||||
"jupyter": [], # List of Jupyter notebook sources
|
||||
"html": [], # List of local HTML sources
|
||||
"openapi": [], # List of OpenAPI/Swagger spec sources
|
||||
"asciidoc": [], # List of AsciiDoc sources
|
||||
"pptx": [], # List of PowerPoint sources
|
||||
"confluence": [], # List of Confluence wiki sources
|
||||
"notion": [], # List of Notion page sources
|
||||
"rss": [], # List of RSS/Atom feed sources
|
||||
"manpage": [], # List of man page sources
|
||||
"chat": [], # List of Slack/Discord chat sources
|
||||
}
|
||||
|
||||
# Track source index for unique naming (multi-source support)
|
||||
@@ -86,6 +97,17 @@ class UnifiedScraper:
|
||||
"word": 0,
|
||||
"video": 0,
|
||||
"local": 0,
|
||||
"epub": 0,
|
||||
"jupyter": 0,
|
||||
"html": 0,
|
||||
"openapi": 0,
|
||||
"asciidoc": 0,
|
||||
"pptx": 0,
|
||||
"confluence": 0,
|
||||
"notion": 0,
|
||||
"rss": 0,
|
||||
"manpage": 0,
|
||||
"chat": 0,
|
||||
}
|
||||
|
||||
# Output paths - cleaner organization
|
||||
@@ -166,6 +188,28 @@ class UnifiedScraper:
|
||||
self._scrape_video(source)
|
||||
elif source_type == "local":
|
||||
self._scrape_local(source)
|
||||
elif source_type == "epub":
|
||||
self._scrape_epub(source)
|
||||
elif source_type == "jupyter":
|
||||
self._scrape_jupyter(source)
|
||||
elif source_type == "html":
|
||||
self._scrape_html(source)
|
||||
elif source_type == "openapi":
|
||||
self._scrape_openapi(source)
|
||||
elif source_type == "asciidoc":
|
||||
self._scrape_asciidoc(source)
|
||||
elif source_type == "pptx":
|
||||
self._scrape_pptx(source)
|
||||
elif source_type == "confluence":
|
||||
self._scrape_confluence(source)
|
||||
elif source_type == "notion":
|
||||
self._scrape_notion(source)
|
||||
elif source_type == "rss":
|
||||
self._scrape_rss(source)
|
||||
elif source_type == "manpage":
|
||||
self._scrape_manpage(source)
|
||||
elif source_type == "chat":
|
||||
self._scrape_chat(source)
|
||||
else:
|
||||
logger.warning(f"Unknown source type: {source_type}")
|
||||
except Exception as e:
|
||||
@@ -571,6 +615,7 @@ class UnifiedScraper:
|
||||
{
|
||||
"docx_path": docx_path,
|
||||
"docx_id": docx_id,
|
||||
"word_id": docx_id, # Alias for generic reference generation
|
||||
"idx": idx,
|
||||
"data": word_data,
|
||||
"data_file": cache_word_data,
|
||||
@@ -788,6 +833,595 @@ class UnifiedScraper:
|
||||
logger.debug(f"Traceback: {traceback.format_exc()}")
|
||||
raise
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# New source type handlers (v3.2.0+)
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _scrape_epub(self, source: dict[str, Any]):
|
||||
"""Scrape EPUB e-book (.epub)."""
|
||||
try:
|
||||
from skill_seekers.cli.epub_scraper import EpubToSkillConverter
|
||||
except ImportError:
|
||||
logger.error(
|
||||
"EPUB scraper dependencies not installed.\n"
|
||||
" Install with: pip install skill-seekers[epub]"
|
||||
)
|
||||
return
|
||||
|
||||
idx = self._source_counters["epub"]
|
||||
self._source_counters["epub"] += 1
|
||||
|
||||
epub_path = source["path"]
|
||||
epub_id = os.path.splitext(os.path.basename(epub_path))[0]
|
||||
|
||||
epub_config = {
|
||||
"name": f"{self.name}_epub_{idx}_{epub_id}",
|
||||
"epub_path": source["path"],
|
||||
"description": source.get("description", f"{epub_id} e-book"),
|
||||
}
|
||||
|
||||
logger.info(f"Scraping EPUB: {source['path']}")
|
||||
converter = EpubToSkillConverter(epub_config)
|
||||
converter.extract_epub()
|
||||
|
||||
epub_data_file = converter.data_file
|
||||
with open(epub_data_file, encoding="utf-8") as f:
|
||||
epub_data = json.load(f)
|
||||
|
||||
cache_epub_data = os.path.join(self.data_dir, f"epub_data_{idx}_{epub_id}.json")
|
||||
shutil.copy(epub_data_file, cache_epub_data)
|
||||
|
||||
self.scraped_data["epub"].append(
|
||||
{
|
||||
"epub_path": epub_path,
|
||||
"epub_id": epub_id,
|
||||
"idx": idx,
|
||||
"data": epub_data,
|
||||
"data_file": cache_epub_data,
|
||||
}
|
||||
)
|
||||
|
||||
try:
|
||||
converter.build_skill()
|
||||
logger.info("✅ EPUB: Standalone SKILL.md created")
|
||||
except Exception as e:
|
||||
logger.warning(f"⚠️ Failed to build standalone EPUB SKILL.md: {e}")
|
||||
|
||||
logger.info(f"✅ EPUB: {len(epub_data.get('chapters', []))} chapters extracted")
|
||||
|
||||
def _scrape_jupyter(self, source: dict[str, Any]):
|
||||
"""Scrape Jupyter Notebook (.ipynb)."""
|
||||
try:
|
||||
from skill_seekers.cli.jupyter_scraper import JupyterToSkillConverter
|
||||
except ImportError:
|
||||
logger.error(
|
||||
"Jupyter scraper dependencies not installed.\n"
|
||||
" Install with: pip install skill-seekers[jupyter]"
|
||||
)
|
||||
return
|
||||
|
||||
idx = self._source_counters["jupyter"]
|
||||
self._source_counters["jupyter"] += 1
|
||||
|
||||
nb_path = source["path"]
|
||||
nb_id = os.path.splitext(os.path.basename(nb_path))[0]
|
||||
|
||||
nb_config = {
|
||||
"name": f"{self.name}_jupyter_{idx}_{nb_id}",
|
||||
"notebook_path": source["path"],
|
||||
"description": source.get("description", f"{nb_id} notebook"),
|
||||
}
|
||||
|
||||
logger.info(f"Scraping Jupyter Notebook: {source['path']}")
|
||||
converter = JupyterToSkillConverter(nb_config)
|
||||
converter.extract_notebook()
|
||||
|
||||
nb_data_file = converter.data_file
|
||||
with open(nb_data_file, encoding="utf-8") as f:
|
||||
nb_data = json.load(f)
|
||||
|
||||
cache_nb_data = os.path.join(self.data_dir, f"jupyter_data_{idx}_{nb_id}.json")
|
||||
shutil.copy(nb_data_file, cache_nb_data)
|
||||
|
||||
self.scraped_data["jupyter"].append(
|
||||
{
|
||||
"notebook_path": nb_path,
|
||||
"notebook_id": nb_id,
|
||||
"idx": idx,
|
||||
"data": nb_data,
|
||||
"data_file": cache_nb_data,
|
||||
}
|
||||
)
|
||||
|
||||
try:
|
||||
converter.build_skill()
|
||||
logger.info("✅ Jupyter: Standalone SKILL.md created")
|
||||
except Exception as e:
|
||||
logger.warning(f"⚠️ Failed to build standalone Jupyter SKILL.md: {e}")
|
||||
|
||||
logger.info(f"✅ Jupyter: {len(nb_data.get('cells', []))} cells extracted")
|
||||
|
||||
def _scrape_html(self, source: dict[str, Any]):
|
||||
"""Scrape local HTML file(s)."""
|
||||
try:
|
||||
from skill_seekers.cli.html_scraper import HtmlToSkillConverter
|
||||
except ImportError:
|
||||
logger.error("html_scraper.py not found")
|
||||
return
|
||||
|
||||
idx = self._source_counters["html"]
|
||||
self._source_counters["html"] += 1
|
||||
|
||||
html_path = source["path"]
|
||||
html_id = os.path.splitext(os.path.basename(html_path.rstrip("/")))[0]
|
||||
|
||||
html_config = {
|
||||
"name": f"{self.name}_html_{idx}_{html_id}",
|
||||
"html_path": source["path"],
|
||||
"description": source.get("description", f"{html_id} HTML content"),
|
||||
}
|
||||
|
||||
logger.info(f"Scraping local HTML: {source['path']}")
|
||||
converter = HtmlToSkillConverter(html_config)
|
||||
converter.extract_html()
|
||||
|
||||
html_data_file = converter.data_file
|
||||
with open(html_data_file, encoding="utf-8") as f:
|
||||
html_data = json.load(f)
|
||||
|
||||
cache_html_data = os.path.join(self.data_dir, f"html_data_{idx}_{html_id}.json")
|
||||
shutil.copy(html_data_file, cache_html_data)
|
||||
|
||||
self.scraped_data["html"].append(
|
||||
{
|
||||
"html_path": html_path,
|
||||
"html_id": html_id,
|
||||
"idx": idx,
|
||||
"data": html_data,
|
||||
"data_file": cache_html_data,
|
||||
}
|
||||
)
|
||||
|
||||
try:
|
||||
converter.build_skill()
|
||||
logger.info("✅ HTML: Standalone SKILL.md created")
|
||||
except Exception as e:
|
||||
logger.warning(f"⚠️ Failed to build standalone HTML SKILL.md: {e}")
|
||||
|
||||
logger.info(f"✅ HTML: {len(html_data.get('pages', []))} pages extracted")
|
||||
|
||||
def _scrape_openapi(self, source: dict[str, Any]):
|
||||
"""Scrape OpenAPI/Swagger specification."""
|
||||
try:
|
||||
from skill_seekers.cli.openapi_scraper import OpenAPIToSkillConverter
|
||||
except ImportError:
|
||||
logger.error("openapi_scraper.py not found")
|
||||
return
|
||||
|
||||
idx = self._source_counters["openapi"]
|
||||
self._source_counters["openapi"] += 1
|
||||
|
||||
spec_path = source.get("path", source.get("url", ""))
|
||||
spec_id = os.path.splitext(os.path.basename(spec_path))[0] if spec_path else f"spec_{idx}"
|
||||
|
||||
openapi_config = {
|
||||
"name": f"{self.name}_openapi_{idx}_{spec_id}",
|
||||
"spec_path": source.get("path"),
|
||||
"spec_url": source.get("url"),
|
||||
"description": source.get("description", f"{spec_id} API spec"),
|
||||
}
|
||||
|
||||
logger.info(f"Scraping OpenAPI spec: {spec_path}")
|
||||
converter = OpenAPIToSkillConverter(openapi_config)
|
||||
converter.extract_spec()
|
||||
|
||||
api_data_file = converter.data_file
|
||||
with open(api_data_file, encoding="utf-8") as f:
|
||||
api_data = json.load(f)
|
||||
|
||||
cache_api_data = os.path.join(self.data_dir, f"openapi_data_{idx}_{spec_id}.json")
|
||||
shutil.copy(api_data_file, cache_api_data)
|
||||
|
||||
self.scraped_data["openapi"].append(
|
||||
{
|
||||
"spec_path": spec_path,
|
||||
"spec_id": spec_id,
|
||||
"idx": idx,
|
||||
"data": api_data,
|
||||
"data_file": cache_api_data,
|
||||
}
|
||||
)
|
||||
|
||||
try:
|
||||
converter.build_skill()
|
||||
logger.info("✅ OpenAPI: Standalone SKILL.md created")
|
||||
except Exception as e:
|
||||
logger.warning(f"⚠️ Failed to build standalone OpenAPI SKILL.md: {e}")
|
||||
|
||||
logger.info(f"✅ OpenAPI: {len(api_data.get('endpoints', []))} endpoints extracted")
|
||||
|
||||
def _scrape_asciidoc(self, source: dict[str, Any]):
|
||||
"""Scrape AsciiDoc document(s)."""
|
||||
try:
|
||||
from skill_seekers.cli.asciidoc_scraper import AsciiDocToSkillConverter
|
||||
except ImportError:
|
||||
logger.error(
|
||||
"AsciiDoc scraper dependencies not installed.\n"
|
||||
" Install with: pip install skill-seekers[asciidoc]"
|
||||
)
|
||||
return
|
||||
|
||||
idx = self._source_counters["asciidoc"]
|
||||
self._source_counters["asciidoc"] += 1
|
||||
|
||||
adoc_path = source["path"]
|
||||
adoc_id = os.path.splitext(os.path.basename(adoc_path.rstrip("/")))[0]
|
||||
|
||||
adoc_config = {
|
||||
"name": f"{self.name}_asciidoc_{idx}_{adoc_id}",
|
||||
"asciidoc_path": source["path"],
|
||||
"description": source.get("description", f"{adoc_id} AsciiDoc content"),
|
||||
}
|
||||
|
||||
logger.info(f"Scraping AsciiDoc: {source['path']}")
|
||||
converter = AsciiDocToSkillConverter(adoc_config)
|
||||
converter.extract_asciidoc()
|
||||
|
||||
adoc_data_file = converter.data_file
|
||||
with open(adoc_data_file, encoding="utf-8") as f:
|
||||
adoc_data = json.load(f)
|
||||
|
||||
cache_adoc_data = os.path.join(self.data_dir, f"asciidoc_data_{idx}_{adoc_id}.json")
|
||||
shutil.copy(adoc_data_file, cache_adoc_data)
|
||||
|
||||
self.scraped_data["asciidoc"].append(
|
||||
{
|
||||
"asciidoc_path": adoc_path,
|
||||
"asciidoc_id": adoc_id,
|
||||
"idx": idx,
|
||||
"data": adoc_data,
|
||||
"data_file": cache_adoc_data,
|
||||
}
|
||||
)
|
||||
|
||||
try:
|
||||
converter.build_skill()
|
||||
logger.info("✅ AsciiDoc: Standalone SKILL.md created")
|
||||
except Exception as e:
|
||||
logger.warning(f"⚠️ Failed to build standalone AsciiDoc SKILL.md: {e}")
|
||||
|
||||
logger.info(f"✅ AsciiDoc: {len(adoc_data.get('sections', []))} sections extracted")
|
||||
|
||||
def _scrape_pptx(self, source: dict[str, Any]):
|
||||
"""Scrape PowerPoint presentation (.pptx)."""
|
||||
try:
|
||||
from skill_seekers.cli.pptx_scraper import PptxToSkillConverter
|
||||
except ImportError:
|
||||
logger.error(
|
||||
"PowerPoint scraper dependencies not installed.\n"
|
||||
" Install with: pip install skill-seekers[pptx]"
|
||||
)
|
||||
return
|
||||
|
||||
idx = self._source_counters["pptx"]
|
||||
self._source_counters["pptx"] += 1
|
||||
|
||||
pptx_path = source["path"]
|
||||
pptx_id = os.path.splitext(os.path.basename(pptx_path))[0]
|
||||
|
||||
pptx_config = {
|
||||
"name": f"{self.name}_pptx_{idx}_{pptx_id}",
|
||||
"pptx_path": source["path"],
|
||||
"description": source.get("description", f"{pptx_id} presentation"),
|
||||
}
|
||||
|
||||
logger.info(f"Scraping PowerPoint: {source['path']}")
|
||||
converter = PptxToSkillConverter(pptx_config)
|
||||
converter.extract_pptx()
|
||||
|
||||
pptx_data_file = converter.data_file
|
||||
with open(pptx_data_file, encoding="utf-8") as f:
|
||||
pptx_data = json.load(f)
|
||||
|
||||
cache_pptx_data = os.path.join(self.data_dir, f"pptx_data_{idx}_{pptx_id}.json")
|
||||
shutil.copy(pptx_data_file, cache_pptx_data)
|
||||
|
||||
self.scraped_data["pptx"].append(
|
||||
{
|
||||
"pptx_path": pptx_path,
|
||||
"pptx_id": pptx_id,
|
||||
"idx": idx,
|
||||
"data": pptx_data,
|
||||
"data_file": cache_pptx_data,
|
||||
}
|
||||
)
|
||||
|
||||
try:
|
||||
converter.build_skill()
|
||||
logger.info("✅ PowerPoint: Standalone SKILL.md created")
|
||||
except Exception as e:
|
||||
logger.warning(f"⚠️ Failed to build standalone PowerPoint SKILL.md: {e}")
|
||||
|
||||
logger.info(f"✅ PowerPoint: {len(pptx_data.get('slides', []))} slides extracted")
|
||||
|
||||
def _scrape_confluence(self, source: dict[str, Any]):
|
||||
"""Scrape Confluence wiki (API or exported HTML/XML)."""
|
||||
try:
|
||||
from skill_seekers.cli.confluence_scraper import ConfluenceToSkillConverter
|
||||
except ImportError:
|
||||
logger.error(
|
||||
"Confluence scraper dependencies not installed.\n"
|
||||
" Install with: pip install skill-seekers[confluence]"
|
||||
)
|
||||
return
|
||||
|
||||
idx = self._source_counters["confluence"]
|
||||
self._source_counters["confluence"] += 1
|
||||
|
||||
source_id = source.get("space_key", source.get("path", f"confluence_{idx}"))
|
||||
if isinstance(source_id, str) and "/" in source_id:
|
||||
source_id = os.path.basename(source_id.rstrip("/"))
|
||||
|
||||
conf_config = {
|
||||
"name": f"{self.name}_confluence_{idx}_{source_id}",
|
||||
"base_url": source.get("base_url", source.get("url")),
|
||||
"space_key": source.get("space_key"),
|
||||
"export_path": source.get("path"),
|
||||
"username": source.get("username"),
|
||||
"token": source.get("token"),
|
||||
"description": source.get("description", f"{source_id} Confluence content"),
|
||||
"max_pages": source.get("max_pages", 500),
|
||||
}
|
||||
|
||||
logger.info(f"Scraping Confluence: {source_id}")
|
||||
converter = ConfluenceToSkillConverter(conf_config)
|
||||
converter.extract_confluence()
|
||||
|
||||
conf_data_file = converter.data_file
|
||||
with open(conf_data_file, encoding="utf-8") as f:
|
||||
conf_data = json.load(f)
|
||||
|
||||
cache_conf_data = os.path.join(self.data_dir, f"confluence_data_{idx}_{source_id}.json")
|
||||
shutil.copy(conf_data_file, cache_conf_data)
|
||||
|
||||
self.scraped_data["confluence"].append(
|
||||
{
|
||||
"source_id": source_id,
|
||||
"idx": idx,
|
||||
"data": conf_data,
|
||||
"data_file": cache_conf_data,
|
||||
}
|
||||
)
|
||||
|
||||
try:
|
||||
converter.build_skill()
|
||||
logger.info("✅ Confluence: Standalone SKILL.md created")
|
||||
except Exception as e:
|
||||
logger.warning(f"⚠️ Failed to build standalone Confluence SKILL.md: {e}")
|
||||
|
||||
logger.info(f"✅ Confluence: {len(conf_data.get('pages', []))} pages extracted")
|
||||
|
||||
def _scrape_notion(self, source: dict[str, Any]):
|
||||
"""Scrape Notion pages (API or exported Markdown)."""
|
||||
try:
|
||||
from skill_seekers.cli.notion_scraper import NotionToSkillConverter
|
||||
except ImportError:
|
||||
logger.error(
|
||||
"Notion scraper dependencies not installed.\n"
|
||||
" Install with: pip install skill-seekers[notion]"
|
||||
)
|
||||
return
|
||||
|
||||
idx = self._source_counters["notion"]
|
||||
self._source_counters["notion"] += 1
|
||||
|
||||
source_id = source.get(
|
||||
"database_id", source.get("page_id", source.get("path", f"notion_{idx}"))
|
||||
)
|
||||
if isinstance(source_id, str) and "/" in source_id:
|
||||
source_id = os.path.basename(source_id.rstrip("/"))
|
||||
|
||||
notion_config = {
|
||||
"name": f"{self.name}_notion_{idx}_{source_id}",
|
||||
"database_id": source.get("database_id"),
|
||||
"page_id": source.get("page_id"),
|
||||
"export_path": source.get("path"),
|
||||
"token": source.get("token"),
|
||||
"description": source.get("description", f"{source_id} Notion content"),
|
||||
"max_pages": source.get("max_pages", 500),
|
||||
}
|
||||
|
||||
logger.info(f"Scraping Notion: {source_id}")
|
||||
converter = NotionToSkillConverter(notion_config)
|
||||
converter.extract_notion()
|
||||
|
||||
notion_data_file = converter.data_file
|
||||
with open(notion_data_file, encoding="utf-8") as f:
|
||||
notion_data = json.load(f)
|
||||
|
||||
cache_notion_data = os.path.join(self.data_dir, f"notion_data_{idx}_{source_id}.json")
|
||||
shutil.copy(notion_data_file, cache_notion_data)
|
||||
|
||||
self.scraped_data["notion"].append(
|
||||
{
|
||||
"source_id": source_id,
|
||||
"idx": idx,
|
||||
"data": notion_data,
|
||||
"data_file": cache_notion_data,
|
||||
}
|
||||
)
|
||||
|
||||
try:
|
||||
converter.build_skill()
|
||||
logger.info("✅ Notion: Standalone SKILL.md created")
|
||||
except Exception as e:
|
||||
logger.warning(f"⚠️ Failed to build standalone Notion SKILL.md: {e}")
|
||||
|
||||
logger.info(f"✅ Notion: {len(notion_data.get('pages', []))} pages extracted")
|
||||
|
||||
def _scrape_rss(self, source: dict[str, Any]):
|
||||
"""Scrape RSS/Atom feed (with optional full article scraping)."""
|
||||
try:
|
||||
from skill_seekers.cli.rss_scraper import RssToSkillConverter
|
||||
except ImportError:
|
||||
logger.error(
|
||||
"RSS scraper dependencies not installed.\n"
|
||||
" Install with: pip install skill-seekers[rss]"
|
||||
)
|
||||
return
|
||||
|
||||
idx = self._source_counters["rss"]
|
||||
self._source_counters["rss"] += 1
|
||||
|
||||
feed_url = source.get("url", source.get("path", ""))
|
||||
feed_id = feed_url.split("/")[-1].split(".")[0] if feed_url else f"feed_{idx}"
|
||||
|
||||
rss_config = {
|
||||
"name": f"{self.name}_rss_{idx}_{feed_id}",
|
||||
"feed_url": source.get("url"),
|
||||
"feed_path": source.get("path"),
|
||||
"follow_links": source.get("follow_links", True),
|
||||
"max_articles": source.get("max_articles", 50),
|
||||
"description": source.get("description", f"{feed_id} RSS/Atom feed"),
|
||||
}
|
||||
|
||||
logger.info(f"Scraping RSS/Atom feed: {feed_url}")
|
||||
converter = RssToSkillConverter(rss_config)
|
||||
converter.extract_feed()
|
||||
|
||||
rss_data_file = converter.data_file
|
||||
with open(rss_data_file, encoding="utf-8") as f:
|
||||
rss_data = json.load(f)
|
||||
|
||||
cache_rss_data = os.path.join(self.data_dir, f"rss_data_{idx}_{feed_id}.json")
|
||||
shutil.copy(rss_data_file, cache_rss_data)
|
||||
|
||||
self.scraped_data["rss"].append(
|
||||
{
|
||||
"feed_url": feed_url,
|
||||
"feed_id": feed_id,
|
||||
"idx": idx,
|
||||
"data": rss_data,
|
||||
"data_file": cache_rss_data,
|
||||
}
|
||||
)
|
||||
|
||||
try:
|
||||
converter.build_skill()
|
||||
logger.info("✅ RSS: Standalone SKILL.md created")
|
||||
except Exception as e:
|
||||
logger.warning(f"⚠️ Failed to build standalone RSS SKILL.md: {e}")
|
||||
|
||||
logger.info(f"✅ RSS: {len(rss_data.get('articles', []))} articles extracted")
|
||||
|
||||
def _scrape_manpage(self, source: dict[str, Any]):
|
||||
"""Scrape man page(s)."""
|
||||
try:
|
||||
from skill_seekers.cli.man_scraper import ManPageToSkillConverter
|
||||
except ImportError:
|
||||
logger.error("man_scraper.py not found")
|
||||
return
|
||||
|
||||
idx = self._source_counters["manpage"]
|
||||
self._source_counters["manpage"] += 1
|
||||
|
||||
man_names = source.get("names", [])
|
||||
man_path = source.get("path", "")
|
||||
man_id = man_names[0] if man_names else os.path.basename(man_path.rstrip("/"))
|
||||
|
||||
man_config = {
|
||||
"name": f"{self.name}_manpage_{idx}_{man_id}",
|
||||
"man_names": man_names,
|
||||
"man_path": man_path,
|
||||
"sections": source.get("sections", []),
|
||||
"description": source.get("description", f"{man_id} man pages"),
|
||||
}
|
||||
|
||||
logger.info(f"Scraping man pages: {man_id}")
|
||||
converter = ManPageToSkillConverter(man_config)
|
||||
converter.extract_manpages()
|
||||
|
||||
man_data_file = converter.data_file
|
||||
with open(man_data_file, encoding="utf-8") as f:
|
||||
man_data = json.load(f)
|
||||
|
||||
cache_man_data = os.path.join(self.data_dir, f"manpage_data_{idx}_{man_id}.json")
|
||||
shutil.copy(man_data_file, cache_man_data)
|
||||
|
||||
self.scraped_data["manpage"].append(
|
||||
{
|
||||
"man_id": man_id,
|
||||
"idx": idx,
|
||||
"data": man_data,
|
||||
"data_file": cache_man_data,
|
||||
}
|
||||
)
|
||||
|
||||
try:
|
||||
converter.build_skill()
|
||||
logger.info("✅ Man pages: Standalone SKILL.md created")
|
||||
except Exception as e:
|
||||
logger.warning(f"⚠️ Failed to build standalone man page SKILL.md: {e}")
|
||||
|
||||
logger.info(f"✅ Man pages: {len(man_data.get('pages', []))} man pages extracted")
|
||||
|
||||
def _scrape_chat(self, source: dict[str, Any]):
    """Scrape a Slack/Discord chat export (or API channel) into this skill.

    Delegates extraction to ``ChatToSkillConverter``, caches the raw JSON
    output under ``self.data_dir``, records the result in
    ``self.scraped_data["chat"]``, and attempts a standalone SKILL.md build
    (non-fatal on failure).
    """
    try:
        from skill_seekers.cli.chat_scraper import ChatToSkillConverter
    except ImportError:
        logger.error(
            "Chat scraper dependencies not installed.\n"
            " Install with: pip install skill-seekers[chat]"
        )
        return

    # Allocate a unique per-type index for this chat source.
    source_idx = self._source_counters["chat"]
    self._source_counters["chat"] += 1

    raw_path = source.get("path", "")
    channel_ref = source.get("channel", source.get("channel_id", ""))
    # Identifier preference: channel, then export basename, then synthetic.
    chat_id = channel_ref or os.path.basename(raw_path.rstrip("/")) or f"chat_{source_idx}"

    platform = source.get("platform", "slack")
    converter_config = {
        "name": f"{self.name}_chat_{source_idx}_{chat_id}",
        "export_path": source.get("path"),
        "platform": platform,
        "token": source.get("token"),
        "channel": channel_ref,
        "max_messages": source.get("max_messages", 10000),
        "description": source.get("description", f"{chat_id} chat export"),
    }

    logger.info(f"Scraping chat: {chat_id}")
    converter = ChatToSkillConverter(converter_config)
    converter.extract_chat()

    # Read the converter's raw output, then keep our own cached copy.
    with open(converter.data_file, encoding="utf-8") as handle:
        chat_data = json.load(handle)

    cached_copy = os.path.join(self.data_dir, f"chat_data_{source_idx}_{chat_id}.json")
    shutil.copy(converter.data_file, cached_copy)

    self.scraped_data["chat"].append(
        {
            "chat_id": chat_id,
            "platform": platform,
            "idx": source_idx,
            "data": chat_data,
            "data_file": cached_copy,
        }
    )

    # Best-effort standalone SKILL.md; unified build continues regardless.
    try:
        converter.build_skill()
        logger.info("✅ Chat: Standalone SKILL.md created")
    except Exception as e:
        logger.warning(f"⚠️ Failed to build standalone chat SKILL.md: {e}")

    logger.info(f"✅ Chat: {len(chat_data.get('messages', []))} messages extracted")
|
||||
|
||||
def _load_json(self, file_path: Path) -> dict:
|
||||
"""
|
||||
Load JSON file safely.
|
||||
@@ -1297,14 +1931,33 @@ Examples:
|
||||
if args.dry_run:
|
||||
logger.info("🔍 DRY RUN MODE - Preview only, no scraping will occur")
|
||||
logger.info(f"\nWould scrape {len(scraper.config.get('sources', []))} sources:")
|
||||
# Source type display config: type -> (label, key for detail)
|
||||
_SOURCE_DISPLAY = {
|
||||
"documentation": ("Documentation", "base_url"),
|
||||
"github": ("GitHub", "repo"),
|
||||
"pdf": ("PDF", "path"),
|
||||
"word": ("Word", "path"),
|
||||
"epub": ("EPUB", "path"),
|
||||
"video": ("Video", "url"),
|
||||
"local": ("Local Codebase", "path"),
|
||||
"jupyter": ("Jupyter Notebook", "path"),
|
||||
"html": ("HTML", "path"),
|
||||
"openapi": ("OpenAPI Spec", "path"),
|
||||
"asciidoc": ("AsciiDoc", "path"),
|
||||
"pptx": ("PowerPoint", "path"),
|
||||
"confluence": ("Confluence", "base_url"),
|
||||
"notion": ("Notion", "page_id"),
|
||||
"rss": ("RSS/Atom Feed", "url"),
|
||||
"manpage": ("Man Page", "names"),
|
||||
"chat": ("Chat Export", "path"),
|
||||
}
|
||||
for idx, source in enumerate(scraper.config.get("sources", []), 1):
    # All source types — including documentation/github/pdf — are rendered
    # through _SOURCE_DISPLAY. The former dedicated if/elif branches for
    # those three types duplicated the generic line below (each source was
    # logged twice), and the pdf branch read the stale 'pdf_path' key while
    # unified configs use 'path'.
    source_type = source.get("type", "unknown")
    label, key = _SOURCE_DISPLAY.get(source_type, (source_type.title(), "path"))
    detail = source.get(key, "N/A")
    if isinstance(detail, list):
        detail = ", ".join(str(d) for d in detail)
    logger.info(f" {idx}. {label}: {detail}")
|
||||
logger.info(f"\nOutput directory: {scraper.output_dir}")
|
||||
logger.info(f"Merge mode: {scraper.merge_mode}")
|
||||
return
|
||||
|
||||
@@ -136,6 +136,44 @@ class UnifiedSkillBuilder:
|
||||
skill_mds["pdf"] = "\n\n---\n\n".join(pdf_sources)
|
||||
logger.debug(f"Combined {len(pdf_sources)} PDF SKILL.md files")
|
||||
|
||||
# Load additional source types using generic glob pattern
|
||||
# Each source type uses: {name}_{type}_{idx}_*/ or {name}_{type}_*/
|
||||
_extra_types = [
|
||||
"word",
|
||||
"epub",
|
||||
"video",
|
||||
"jupyter",
|
||||
"html",
|
||||
"openapi",
|
||||
"asciidoc",
|
||||
"pptx",
|
||||
"confluence",
|
||||
"notion",
|
||||
"rss",
|
||||
"manpage",
|
||||
"chat",
|
||||
]
|
||||
for source_type in _extra_types:
|
||||
type_sources = []
|
||||
for type_dir in sources_dir.glob(f"{self.name}_{source_type}_*"):
|
||||
type_skill_path = type_dir / "SKILL.md"
|
||||
if type_skill_path.exists():
|
||||
try:
|
||||
content = type_skill_path.read_text(encoding="utf-8")
|
||||
type_sources.append(content)
|
||||
logger.debug(
|
||||
f"Loaded {source_type} SKILL.md from {type_dir.name} "
|
||||
f"({len(content)} chars)"
|
||||
)
|
||||
except OSError as e:
|
||||
logger.warning(
|
||||
f"Failed to read {source_type} SKILL.md from {type_dir.name}: {e}"
|
||||
)
|
||||
|
||||
if type_sources:
|
||||
skill_mds[source_type] = "\n\n---\n\n".join(type_sources)
|
||||
logger.debug(f"Combined {len(type_sources)} {source_type} SKILL.md files")
|
||||
|
||||
logger.info(f"Loaded {len(skill_mds)} source SKILL.md files")
|
||||
return skill_mds
|
||||
|
||||
@@ -477,6 +515,18 @@ This skill synthesizes knowledge from multiple sources:
|
||||
logger.info("Using PDF SKILL.md as-is")
|
||||
content = skill_mds["pdf"]
|
||||
|
||||
# Generic merge for additional source types not covered by pairwise methods
|
||||
if not content and skill_mds:
|
||||
# At least one source SKILL.md exists but not docs/github/pdf
|
||||
logger.info(f"Generic merge for source types: {list(skill_mds.keys())}")
|
||||
content = self._generic_merge(skill_mds)
|
||||
elif content and len(skill_mds) > (int(has_docs) + int(has_github) + int(has_pdf)):
|
||||
# Pairwise synthesis handled the core types; append additional sources
|
||||
extra_types = set(skill_mds.keys()) - {"documentation", "github", "pdf"}
|
||||
if extra_types:
|
||||
logger.info(f"Appending additional sources: {extra_types}")
|
||||
content = self._append_extra_sources(content, skill_mds, extra_types)
|
||||
|
||||
# Fallback: generate minimal SKILL.md (legacy behavior)
|
||||
if not content:
|
||||
logger.warning("No source SKILL.md files found, generating minimal SKILL.md (legacy)")
|
||||
@@ -574,6 +624,165 @@ This skill synthesizes knowledge from multiple sources:
|
||||
|
||||
return "\n".join(lines)
|
||||
|
||||
# ------------------------------------------------------------------
# Generic merge system for any combination of source types (v3.2.0+)
# ------------------------------------------------------------------

# Human-readable labels for source types
# Lookups should fall back to str.title() on the type key for any type
# not listed here (see the .get(..., t.title()) call sites).
_SOURCE_LABELS: dict[str, str] = {
    "documentation": "Documentation",
    "github": "GitHub Repository",
    "pdf": "PDF Document",
    "word": "Word Document",
    "epub": "EPUB E-book",
    "video": "Video",
    "local": "Local Codebase",
    "jupyter": "Jupyter Notebook",
    "html": "HTML Document",
    "openapi": "OpenAPI/Swagger Spec",
    "asciidoc": "AsciiDoc Document",
    "pptx": "PowerPoint Presentation",
    "confluence": "Confluence Wiki",
    "notion": "Notion Page",
    "rss": "RSS/Atom Feed",
    "manpage": "Man Page",
    "chat": "Chat Export",
}
|
||||
|
||||
def _generic_merge(self, skill_mds: dict[str, str]) -> str:
    """Merge any combination of source SKILL.md files into one document.

    Each source file is split into its sections; section names are ordered
    by first appearance across sources, and sections shared by several
    sources are emitted once with per-source attribution. This complements
    the pairwise docs/github/pdf synthesis paths by handling arbitrary
    source-type combinations.

    Args:
        skill_mds: Mapping of source type -> SKILL.md content.

    Returns:
        Merged SKILL.md content string.
    """
    skill_name = self.name.lower().replace("_", "-").replace(" ", "-")[:64]
    # Slicing is a no-op for short strings, so the cap can be unconditional.
    desc = self.description[:1024]

    # Section breakdown per source type (insertion order preserved).
    parsed = {
        stype: self._parse_skill_md_sections(text)
        for stype, text in skill_mds.items()
    }

    # Section names in first-seen order across all sources.
    ordered_headings: list[str] = []
    for sections in parsed.values():
        for heading in sections:
            if heading not in ordered_headings:
                ordered_headings.append(heading)

    source_labels = ", ".join(self._SOURCE_LABELS.get(t, t.title()) for t in skill_mds)
    out = [
        "---",
        f"name: {skill_name}",
        f"description: {desc}",
        "---",
        "",
        f"# {self.name.replace('_', ' ').title()}",
        "",
        f"{self.description}",
        "",
        f"*Merged from: {source_labels}*",
        "",
    ]

    for heading in ordered_headings:
        contributors = [
            (stype, sections[heading])
            for stype, sections in parsed.items()
            if heading in sections
        ]

        out.append(f"## {heading}")
        out.append("")
        if len(contributors) == 1:
            # Only one source has this section — emit with a single credit.
            stype, body = contributors[0]
            out.append(f"*From {self._SOURCE_LABELS.get(stype, stype.title())}*")
            out.append("")
            out.append(body)
            out.append("")
        else:
            # Several sources share the heading — attribute each chunk.
            for stype, body in contributors:
                out.append(f"### From {self._SOURCE_LABELS.get(stype, stype.title())}")
                out.append("")
                out.append(body)
                out.append("")

    out.append("---")
    out.append("")
    out.append("*Generated by Skill Seeker's unified multi-source scraper*")

    return "\n".join(out)
|
||||
|
||||
def _append_extra_sources(
    self,
    base_content: str,
    skill_mds: dict[str, str],
    extra_types: set[str],
) -> str:
    """Insert extra-source sections into an already-synthesized SKILL.md.

    Used when the core docs+github+pdf synthesis produced the base content
    but additional source types (epub, jupyter, etc.) still need inclusion.
    The new material is inserted just before the document's final ``---``
    separator (or appended at the end when none exists).

    Args:
        base_content: Already-synthesized SKILL.md content.
        skill_mds: All source SKILL.md files keyed by type.
        extra_types: Extra source type keys to append.

    Returns:
        Extended SKILL.md content.
    """
    existing = base_content.split("\n")

    # Locate the last horizontal rule from the bottom; default to EOF.
    insert_at = len(existing)
    for pos in reversed(range(len(existing))):
        if existing[pos].strip() == "---":
            insert_at = pos
            break

    addition: list[str] = [""]
    for stype in sorted(extra_types):
        if stype not in skill_mds:
            continue
        label = self._SOURCE_LABELS.get(stype, stype.title())

        addition.append(f"## {label} Content")
        addition.append("")

        # Re-emit each parsed section of the extra source under its label.
        for heading, body in self._parse_skill_md_sections(skill_mds[stype]).items():
            addition.append(f"### {heading}")
            addition.append("")
            addition.append(body)
            addition.append("")

    existing[insert_at:insert_at] = addition

    return "\n".join(existing)
|
||||
|
||||
def _generate_minimal_skill_md(self) -> str:
|
||||
"""Generate minimal SKILL.md (legacy fallback behavior).
|
||||
|
||||
@@ -597,18 +806,42 @@ This skill combines knowledge from multiple sources:
|
||||
|
||||
"""
|
||||
|
||||
# Source type display keys: type -> (label, primary_key, extra_keys)
|
||||
_source_detail_map = {
|
||||
"documentation": ("Documentation", "base_url", [("Pages", "max_pages", "unlimited")]),
|
||||
"github": (
|
||||
"GitHub Repository",
|
||||
"repo",
|
||||
[("Code Analysis", "code_analysis_depth", "surface"), ("Issues", "max_issues", 0)],
|
||||
),
|
||||
"pdf": ("PDF Document", "path", []),
|
||||
"word": ("Word Document", "path", []),
|
||||
"epub": ("EPUB E-book", "path", []),
|
||||
"video": ("Video", "url", []),
|
||||
"local": ("Local Codebase", "path", [("Analysis Depth", "analysis_depth", "surface")]),
|
||||
"jupyter": ("Jupyter Notebook", "path", []),
|
||||
"html": ("HTML Document", "path", []),
|
||||
"openapi": ("OpenAPI Spec", "path", []),
|
||||
"asciidoc": ("AsciiDoc Document", "path", []),
|
||||
"pptx": ("PowerPoint", "path", []),
|
||||
"confluence": ("Confluence Wiki", "base_url", []),
|
||||
"notion": ("Notion Page", "page_id", []),
|
||||
"rss": ("RSS/Atom Feed", "url", []),
|
||||
"manpage": ("Man Page", "names", []),
|
||||
"chat": ("Chat Export", "path", []),
|
||||
}
|
||||
|
||||
# List sources
|
||||
# List sources — every type (documentation/github/pdf included) renders via
# _source_detail_map. The former dedicated if/elif branches for those three
# types were followed by the unconditional generic rendering below, so each
# such source was written to the SKILL.md twice; the map entries (including
# their `extras`) already produce the same lines.
for source in self.config.get("sources", []):
    source_type = source["type"]
    label, primary_key, extras = _source_detail_map.get(
        source_type, (source_type.title(), "path", [])
    )
    primary_val = source.get(primary_key, "N/A")
    if isinstance(primary_val, list):
        primary_val = ", ".join(str(v) for v in primary_val)
    content += f"- ✅ **{label}**: {primary_val}\n"
    for extra_label, extra_key, extra_default in extras:
        content += f" - {extra_label}: {source.get(extra_key, extra_default)}\n"
|
||||
|
||||
# C3.x Architecture & Code Analysis section (if available)
|
||||
github_data = self.scraped_data.get("github", {})
|
||||
@@ -796,6 +1029,27 @@ This skill combines knowledge from multiple sources:
|
||||
if pdf_list:
|
||||
self._generate_pdf_references(pdf_list)
|
||||
|
||||
# Generate references for all additional source types
|
||||
_extra_source_types = [
|
||||
"word",
|
||||
"epub",
|
||||
"video",
|
||||
"jupyter",
|
||||
"html",
|
||||
"openapi",
|
||||
"asciidoc",
|
||||
"pptx",
|
||||
"confluence",
|
||||
"notion",
|
||||
"rss",
|
||||
"manpage",
|
||||
"chat",
|
||||
]
|
||||
for source_type in _extra_source_types:
|
||||
source_list = self.scraped_data.get(source_type, [])
|
||||
if source_list:
|
||||
self._generate_generic_references(source_type, source_list)
|
||||
|
||||
# Generate merged API reference if available
|
||||
if self.merged_data:
|
||||
self._generate_merged_api_reference()
|
||||
@@ -977,6 +1231,63 @@ This skill combines knowledge from multiple sources:
|
||||
|
||||
logger.info(f"Created PDF references ({len(pdf_list)} sources)")
|
||||
|
||||
def _generate_generic_references(self, source_type: str, source_list: list[dict]):
    """Generate references for any source type using a generic approach.

    Creates a references/<source_type>/ directory containing an index.md that
    summarizes each scraped source, and copies each source's cached data file
    alongside it (best-effort; copy failures are silently ignored).

    Args:
        source_type: The source type key (e.g., 'epub', 'jupyter')
        source_list: List of scraped source dicts for this type
    """
    # Hoisted out of the per-source loop, where it was re-imported on
    # every iteration.
    import contextlib

    if not source_list:
        return

    label = self._SOURCE_LABELS.get(source_type, source_type.title())
    type_dir = os.path.join(self.skill_dir, "references", source_type)
    os.makedirs(type_dir, exist_ok=True)

    # Create index
    index_path = os.path.join(type_dir, "index.md")
    with open(index_path, "w", encoding="utf-8") as f:
        f.write(f"# {label} References\n\n")
        f.write(f"Reference from {len(source_list)} {label} source(s).\n\n")

        for i, source_data in enumerate(source_list):
            # Try common ID fields used by the various scrapers; fall back
            # to a positional id so an entry is never unnamed.
            source_id = (
                source_data.get("source_id")
                or source_data.get(f"{source_type}_id")
                or source_data.get("notebook_id")
                or source_data.get("spec_id")
                or source_data.get("feed_id")
                or source_data.get("man_id")
                or source_data.get("chat_id")
                or f"source_{i}"
            )
            f.write(f"## {source_id}\n\n")

            # Summarize a few well-known string fields of the extracted data.
            data = source_data.get("data", {})
            if isinstance(data, dict):
                for key in ["title", "description", "metadata"]:
                    if key in data:
                        val = data[key]
                        if isinstance(val, str) and val:
                            f.write(f"**{key.title()}:** {val}\n\n")

            # Copy the cached raw-data file next to the index (best-effort).
            data_file = source_data.get("data_file")
            if data_file and os.path.isfile(data_file):
                dest = os.path.join(type_dir, f"{source_id}_data.json")
                with contextlib.suppress(OSError):
                    shutil.copy(data_file, dest)

    logger.info(f"Created {label} references ({len(source_list)} sources)")
|
||||
|
||||
def _generate_merged_api_reference(self):
|
||||
"""Generate merged API reference file."""
|
||||
api_dir = os.path.join(self.skill_dir, "references", "api")
|
||||
|
||||
@@ -3,16 +3,16 @@
|
||||
Skill Seeker MCP Server (FastMCP Implementation)
|
||||
|
||||
Modern, decorator-based MCP server using FastMCP for simplified tool registration.
|
||||
Provides 33 tools for generating Claude AI skills from documentation.
|
||||
Provides 34 tools for generating Claude AI skills from documentation.
|
||||
|
||||
This is a streamlined alternative to server.py (2200 lines → 708 lines, 68% reduction).
|
||||
All tool implementations are delegated to modular tool files in tools/ directory.
|
||||
|
||||
**Architecture:**
|
||||
- FastMCP server with decorator-based tool registration
|
||||
- 33 tools organized into 7 categories:
|
||||
- 34 tools organized into 7 categories:
|
||||
* Config tools (3): generate_config, list_configs, validate_config
|
||||
* Scraping tools (10): estimate_pages, scrape_docs, scrape_github, scrape_pdf, scrape_video, scrape_codebase, detect_patterns, extract_test_examples, build_how_to_guides, extract_config_patterns
|
||||
* Scraping tools (11): estimate_pages, scrape_docs, scrape_github, scrape_pdf, scrape_video, scrape_codebase, detect_patterns, extract_test_examples, build_how_to_guides, extract_config_patterns, scrape_generic
|
||||
* Packaging tools (4): package_skill, upload_skill, enhance_skill, install_skill
|
||||
* Splitting tools (2): split_config, generate_router
|
||||
* Source tools (5): fetch_config, submit_config, add_config_source, list_config_sources, remove_config_source
|
||||
@@ -97,6 +97,7 @@ try:
|
||||
remove_config_source_impl,
|
||||
scrape_codebase_impl,
|
||||
scrape_docs_impl,
|
||||
scrape_generic_impl,
|
||||
scrape_github_impl,
|
||||
scrape_pdf_impl,
|
||||
scrape_video_impl,
|
||||
@@ -141,6 +142,7 @@ except ImportError:
|
||||
remove_config_source_impl,
|
||||
scrape_codebase_impl,
|
||||
scrape_docs_impl,
|
||||
scrape_generic_impl,
|
||||
scrape_github_impl,
|
||||
scrape_pdf_impl,
|
||||
scrape_video_impl,
|
||||
@@ -301,7 +303,7 @@ async def sync_config(
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# SCRAPING TOOLS (10 tools)
|
||||
# SCRAPING TOOLS (11 tools)
|
||||
# ============================================================================
|
||||
|
||||
|
||||
@@ -823,6 +825,50 @@ async def extract_config_patterns(
|
||||
return str(result)
|
||||
|
||||
|
||||
@safe_tool_decorator(
    description="Scrape content from new source types: jupyter, html, openapi, asciidoc, pptx, confluence, notion, rss, manpage, chat. A generic entry point that delegates to the appropriate CLI scraper module."
)
async def scrape_generic(
    source_type: str,
    name: str,
    path: str | None = None,
    url: str | None = None,
) -> str:
    """
    Scrape content from various source types and build a skill.

    Generic entry point for the 10 newer source types; the work is done by
    the matching CLI scraper module (e.g. skill_seekers.cli.jupyter_scraper)
    via scrape_generic_impl.

    File-based types (jupyter, html, openapi, asciidoc, pptx, manpage, chat)
    typically take 'path'; URL-based types (confluence, notion, rss)
    typically take 'url'.

    Args:
        source_type: Source type to scrape. One of: jupyter, html, openapi,
            asciidoc, pptx, confluence, notion, rss, manpage, chat.
        name: Skill name for the output
        path: File or directory path (for file-based sources like jupyter, html, pptx)
        url: URL (for URL-based sources like confluence, notion, rss)

    Returns:
        Scraping results with file paths and statistics.
    """
    impl_args: dict[str, str] = {"source_type": source_type, "name": name}
    # Forward only non-empty inputs; the impl validates that one is present.
    for key, value in (("path", path), ("url", url)):
        if value:
            impl_args[key] = value

    result = await scrape_generic_impl(impl_args)
    if not (isinstance(result, list) and result):
        return str(result)
    first = result[0]
    return first.text if hasattr(first, "text") else str(first)
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# PACKAGING TOOLS (4 tools)
|
||||
# ============================================================================
|
||||
|
||||
@@ -63,6 +63,9 @@ from .scraping_tools import (
|
||||
from .scraping_tools import (
|
||||
scrape_pdf_tool as scrape_pdf_impl,
|
||||
)
|
||||
from .scraping_tools import (
|
||||
scrape_generic_tool as scrape_generic_impl,
|
||||
)
|
||||
from .scraping_tools import (
|
||||
scrape_video_tool as scrape_video_impl,
|
||||
)
|
||||
@@ -135,6 +138,7 @@ __all__ = [
|
||||
"extract_test_examples_impl",
|
||||
"build_how_to_guides_impl",
|
||||
"extract_config_patterns_impl",
|
||||
"scrape_generic_impl",
|
||||
# Packaging tools
|
||||
"package_skill_impl",
|
||||
"upload_skill_impl",
|
||||
|
||||
@@ -205,6 +205,18 @@ async def validate_config(args: dict) -> list[TextContent]:
|
||||
)
|
||||
elif source["type"] == "pdf":
|
||||
result += f" Path: {source.get('path', 'N/A')}\n"
|
||||
elif source["type"] in (
|
||||
"jupyter",
|
||||
"html",
|
||||
"openapi",
|
||||
"asciidoc",
|
||||
"pptx",
|
||||
"manpage",
|
||||
"chat",
|
||||
):
|
||||
result += f" Path: {source.get('path', 'N/A')}\n"
|
||||
elif source["type"] in ("confluence", "notion", "rss"):
|
||||
result += f" URL: {source.get('url', 'N/A')}\n"
|
||||
|
||||
# Show merge settings if applicable
|
||||
if validator.needs_api_merge():
|
||||
|
||||
@@ -7,6 +7,8 @@ This module contains all scraping-related MCP tool implementations:
|
||||
- scrape_github_tool: Scrape GitHub repositories
|
||||
- scrape_pdf_tool: Scrape PDF documentation
|
||||
- scrape_codebase_tool: Analyze local codebase and extract code knowledge
|
||||
- scrape_generic_tool: Generic scraper for new source types (jupyter, html,
|
||||
openapi, asciidoc, pptx, confluence, notion, rss, manpage, chat)
|
||||
|
||||
Extracted from server.py for better modularity and organization.
|
||||
"""
|
||||
@@ -1005,3 +1007,155 @@ async def extract_config_patterns_tool(args: dict) -> list[TextContent]:
|
||||
return [TextContent(type="text", text=output_text)]
|
||||
else:
|
||||
return [TextContent(type="text", text=f"{output_text}\n\n❌ Error:\n{stderr}")]
|
||||
|
||||
|
||||
# Valid source types for the generic scraper.
# Must stay in sync with the per-type flag maps inside scrape_generic_tool.
GENERIC_SOURCE_TYPES = (
    "jupyter",
    "html",
    "openapi",
    "asciidoc",
    "pptx",
    "confluence",
    "notion",
    "rss",
    "manpage",
    "chat",
)

# Source types whose primary input is a URL rather than a filesystem path.
# The exact CLI flag for each type is chosen per type in scrape_generic_tool
# (each scraper defines its own flag name).
_URL_BASED_TYPES = {"confluence", "notion", "rss"}

# Friendly emoji labels per source type (progress messages only; unknown
# types fall back to a generic emoji at the call site).
_SOURCE_EMOJIS = {
    "jupyter": "📓",
    "html": "🌐",
    "openapi": "📡",
    "asciidoc": "📄",
    "pptx": "📊",
    "confluence": "🏢",
    "notion": "📝",
    "rss": "📰",
    "manpage": "📖",
    "chat": "💬",
}
|
||||
|
||||
|
||||
async def scrape_generic_tool(args: dict) -> list[TextContent]:
    """
    Generic scraper for new source types.

    Validates the request, builds the subprocess command for the matching
    CLI scraper module, runs it with streaming output, and returns the
    combined result.

    Supported source types: jupyter, html, openapi, asciidoc, pptx,
    confluence, notion, rss, manpage, chat.

    Args:
        args: Dictionary containing:
            - source_type (str): One of the supported source types
            - path (str, optional): File or directory path (for file-based sources)
            - url (str, optional): URL (for URL-based sources like confluence, notion, rss)
            - name (str): Skill name for the output

    Returns:
        List[TextContent]: Tool execution results
    """

    def _error(message: str) -> list[TextContent]:
        # Each validation failure becomes a single TextContent response.
        return [TextContent(type="text", text=message)]

    source_type = args.get("source_type", "")
    path = args.get("path")
    url = args.get("url")
    name = args.get("name")

    # --- Input validation (guard clauses) ---
    if source_type not in GENERIC_SOURCE_TYPES:
        return _error(
            f"❌ Error: Unknown source_type '{source_type}'. "
            f"Must be one of: {', '.join(GENERIC_SOURCE_TYPES)}"
        )
    if not path and not url:
        return _error("❌ Error: Must specify either 'path' (file/directory) or 'url'")
    if not name:
        return _error("❌ Error: 'name' parameter is required")

    # Most scrapers live in a module named <type>_scraper; exceptions here.
    module_overrides = {"manpage": "man_scraper"}
    module = module_overrides.get(source_type, f"{source_type}_scraper")
    command = [sys.executable, "-m", f"skill_seekers.cli.{module}"]

    # Per-scraper flag names for file/path and URL inputs — each CLI defines
    # its own flag, so a generic --path or --url would fail.
    path_flags: dict[str, str] = {
        "jupyter": "--notebook",
        "html": "--html-path",
        "openapi": "--spec",
        "asciidoc": "--asciidoc-path",
        "pptx": "--pptx",
        "manpage": "--man-path",
        "confluence": "--export-path",
        "notion": "--export-path",
        "rss": "--feed-path",
        "chat": "--export-path",
    }
    url_flags: dict[str, str] = {
        "confluence": "--base-url",
        "notion": "--page-id",
        "rss": "--feed-url",
        "openapi": "--spec-url",
    }

    # URL-based types prefer the URL flag; otherwise a path wins; a URL given
    # to a file-based type falls through to the URL flag as a best effort.
    if source_type in _URL_BASED_TYPES and url:
        command.extend([url_flags.get(source_type, "--url"), url])
    elif path:
        command.extend([path_flags.get(source_type, "--path"), path])
    elif url:
        command.extend([url_flags.get(source_type, "--url"), url])

    command.extend(["--name", name])

    timeout = 600  # 10 minutes

    # Assemble the progress banner shown ahead of the subprocess output.
    banner = [f"{_SOURCE_EMOJIS.get(source_type, '🔧')} Scraping {source_type} source...\n"]
    if path:
        banner.append(f"📁 Path: {path}\n")
    if url:
        banner.append(f"🔗 URL: {url}\n")
    banner.append(f"📛 Name: {name}\n")
    banner.append(f"⏱️ Maximum time: {timeout // 60} minutes\n\n")
    progress_msg = "".join(banner)

    stdout, stderr, returncode = run_subprocess_with_streaming(command, timeout=timeout)

    output = progress_msg + stdout

    if returncode == 0:
        return [TextContent(type="text", text=output)]
    return [TextContent(type="text", text=f"{output}\n\n❌ Error:\n{stderr}")]
|
||||
|
||||
@@ -106,7 +106,9 @@ async def split_config(args: dict) -> list[TextContent]:
|
||||
|
||||
Supports both documentation and unified (multi-source) configs:
|
||||
- Documentation configs: Split by categories, size, or create router skills
|
||||
- Unified configs: Split by source type (documentation, github, pdf)
|
||||
- Unified configs: Split by source type (documentation, github, pdf,
|
||||
jupyter, html, openapi, asciidoc, pptx, confluence, notion, rss,
|
||||
manpage, chat)
|
||||
|
||||
For large documentation sites (10K+ pages), this tool splits the config into
|
||||
multiple smaller configs. For unified configs with multiple sources, splits
|
||||
|
||||
222
src/skill_seekers/workflows/complex-merge.yaml
Normal file
222
src/skill_seekers/workflows/complex-merge.yaml
Normal file
@@ -0,0 +1,222 @@
|
||||
name: complex-merge
|
||||
description: Intelligent multi-source merging with conflict resolution, priority rules, and gap analysis
|
||||
version: "1.0"
|
||||
author: Skill Seekers
|
||||
tags:
|
||||
- merge
|
||||
- multi-source
|
||||
- conflict-resolution
|
||||
- synthesis
|
||||
applies_to:
|
||||
- doc_scraping
|
||||
- codebase_analysis
|
||||
- github_analysis
|
||||
variables:
|
||||
merge_strategy: priority
|
||||
source_priority_order: "official_docs,code,community"
|
||||
conflict_resolution: highest_priority
|
||||
min_sources_for_consensus: 2
|
||||
stages:
|
||||
- name: source_inventory
|
||||
type: custom
|
||||
target: inventory
|
||||
uses_history: false
|
||||
enabled: true
|
||||
prompt: >
|
||||
Catalog every source that contributed content to this skill extraction.
|
||||
For each source, classify its type and assess its characteristics.
|
||||
|
||||
For each source, determine:
|
||||
1. Source type (official_docs, codebase, github_repo, pdf, video, community, blog)
|
||||
2. Content scope — what topics or areas does this source cover?
|
||||
3. Freshness — how recent is the content? Look for version numbers, dates, deprecation notices
|
||||
4. Authority level — is this an official maintainer, core contributor, or third party?
|
||||
5. Content density — roughly how much substantive information does this source provide?
|
||||
6. Format characteristics — prose, code samples, API reference, tutorial, etc.
|
||||
|
||||
Output JSON with:
|
||||
- "sources": array of {id, type, scope_summary, topics_covered, freshness_estimate, authority, density, format}
|
||||
- "source_type_distribution": count of sources by type
|
||||
- "total_topics_identified": number of unique topics across all sources
|
||||
- "coverage_summary": brief overview of what the combined sources cover
|
||||
|
||||
- name: cross_reference
|
||||
type: custom
|
||||
target: cross_references
|
||||
uses_history: true
|
||||
enabled: true
|
||||
prompt: >
|
||||
Using the source inventory, identify overlapping topics across sources.
|
||||
Find where multiple sources discuss the same concept, API, feature, or pattern.
|
||||
|
||||
For each overlapping topic:
|
||||
1. List which sources cover it and how deeply
|
||||
2. Note whether sources agree, complement each other, or diverge
|
||||
3. Identify the richest source for that topic (most detail, best examples)
|
||||
4. Flag any terminology differences across sources for the same concept
|
||||
|
||||
Output JSON with:
|
||||
- "overlapping_topics": array of {topic, sources_covering, agreement_level, richest_source, terminology_variants}
|
||||
- "high_overlap_topics": topics covered by 3+ sources
|
||||
- "complementary_pairs": pairs of sources that cover different aspects of the same topic well
|
||||
- "terminology_map": dictionary mapping variant terms to a canonical term
|
||||
|
||||
- name: conflict_detection
|
||||
type: custom
|
||||
target: conflicts
|
||||
uses_history: true
|
||||
enabled: true
|
||||
prompt: >
|
||||
Examine the cross-referenced topics and identify genuine contradictions
|
||||
between sources. Distinguish between true conflicts and superficial differences.
|
||||
|
||||
Categories of conflict to detect:
|
||||
1. Factual contradictions — sources state opposite things about the same feature
|
||||
2. Version mismatches — sources describe different versions of an API or behavior
|
||||
3. Best practice disagreements — sources recommend conflicting approaches
|
||||
4. Deprecated vs current — one source shows deprecated usage another shows current
|
||||
5. Scope conflicts — sources disagree on what a feature can or cannot do
|
||||
|
||||
For each conflict:
|
||||
- Identify the specific claim from each source
|
||||
- Assess which source is more likely correct and why
|
||||
- Recommend a resolution strategy
|
||||
|
||||
Output JSON with:
|
||||
- "conflicts": array of {topic, type, source_a_claim, source_b_claim, likely_correct, resolution_rationale}
|
||||
- "conflict_count_by_type": breakdown of conflicts by category
|
||||
- "high_severity_conflicts": conflicts that would mislead users if unresolved
|
||||
- "auto_resolvable": conflicts that can be resolved by version/date alone
|
||||
|
||||
- name: priority_merge
|
||||
type: custom
|
||||
target: merged_content
|
||||
uses_history: true
|
||||
enabled: true
|
||||
prompt: >
|
||||
Merge content from all sources using the following priority hierarchy:
|
||||
1. Official documentation (highest authority)
|
||||
2. Source code and inline comments (ground truth for behavior)
|
||||
3. Community content — tutorials, blog posts, Stack Overflow (practical usage)
|
||||
|
||||
Merging rules:
|
||||
- When sources agree, combine the best explanation with the best examples
|
||||
- When sources conflict, prefer the higher-priority source but note the alternative
|
||||
- When only a lower-priority source covers a topic, include it but flag the authority level
|
||||
- Preserve code examples from any source, annotating their origin
|
||||
- Deduplicate content — do not repeat the same information from multiple sources
|
||||
- Normalize terminology using the canonical terms from cross-referencing
|
||||
|
||||
For each merged topic, produce:
|
||||
1. Authoritative explanation (from highest-priority source)
|
||||
2. Practical examples (best available from any source)
|
||||
3. Source attribution (which sources contributed)
|
||||
4. Confidence level (high if official docs confirm, medium if code-only, low if community-only)
|
||||
|
||||
Output JSON with:
|
||||
- "merged_topics": array of {topic, explanation, examples, sources_used, confidence, notes}
|
||||
- "merge_decisions": array of {topic, decision, rationale} for non-trivial merges
|
||||
- "source_contribution_stats": how much each source contributed to the final output
|
||||
|
||||
- name: gap_analysis
|
||||
type: custom
|
||||
target: gaps
|
||||
uses_history: true
|
||||
enabled: true
|
||||
prompt: >
|
||||
Analyse the merged content to identify gaps — topics or areas that are
|
||||
underrepresented or missing entirely.
|
||||
|
||||
Identify:
|
||||
1. Single-source topics — covered by only one source, making them fragile
|
||||
2. Missing fundamentals — core concepts that should be documented but are not
|
||||
3. Missing examples — topics explained in prose but lacking code samples
|
||||
4. Missing edge cases — common error scenarios or limitations not documented
|
||||
5. Broken references — topics that reference other topics not present in any source
|
||||
6. Audience gaps — content assumes knowledge that is never introduced
|
||||
|
||||
For each gap, assess:
|
||||
- Severity (critical, important, nice-to-have)
|
||||
- Whether the gap can be inferred from existing content
|
||||
- Suggested source type that would best fill this gap
|
||||
|
||||
Output JSON with:
|
||||
- "single_source_topics": array of {topic, sole_source, risk_level}
|
||||
- "missing_fundamentals": topics that should exist but do not
|
||||
- "example_gaps": topics needing code examples
|
||||
- "edge_case_gaps": undocumented error scenarios
|
||||
- "broken_references": internal references with no target
|
||||
- "gap_severity_summary": counts by severity level
|
||||
|
||||
- name: synthesis
|
||||
type: custom
|
||||
target: skill_md
|
||||
uses_history: true
|
||||
enabled: true
|
||||
prompt: >
|
||||
Create a unified, coherent narrative from the merged content. The output
|
||||
should read as if written by a single knowledgeable author, not as a
|
||||
patchwork of multiple sources.
|
||||
|
||||
Synthesis guidelines:
|
||||
1. Structure content logically — concepts build on each other
|
||||
2. Lead with the most important information for each topic
|
||||
3. Integrate code examples naturally within explanations
|
||||
4. Use consistent voice, terminology, and formatting throughout
|
||||
5. Add transition text between topics for narrative flow
|
||||
6. Include a "Sources and Confidence" appendix noting where information came from
|
||||
7. Mark any low-confidence or single-source claims with a caveat
|
||||
8. Fill minor gaps by inference where safe to do so, clearly marking inferred content
|
||||
|
||||
Output JSON with:
|
||||
- "synthesized_sections": array of {title, content, sources_used, confidence}
|
||||
- "section_order": recommended reading order
|
||||
- "inferred_content": content that was inferred rather than directly sourced
|
||||
- "caveats": any warnings about content reliability
|
||||
|
||||
- name: quality_check
|
||||
type: custom
|
||||
target: quality
|
||||
uses_history: true
|
||||
enabled: true
|
||||
prompt: >
|
||||
Perform a final quality review of the synthesized output. Evaluate the
|
||||
merge result against multiple quality dimensions.
|
||||
|
||||
Check for:
|
||||
1. Completeness — does the output cover all topics from all sources?
|
||||
2. Accuracy — are merged claims consistent and non-contradictory?
|
||||
3. Coherence — does the document flow logically as a unified piece?
|
||||
4. Attribution — are source contributions properly tracked?
|
||||
5. Confidence calibration — are confidence levels appropriate?
|
||||
6. Example quality — are code examples correct, runnable, and well-annotated?
|
||||
7. Terminology consistency — is the canonical terminology used throughout?
|
||||
8. Gap acknowledgment — are known gaps clearly communicated?
|
||||
|
||||
Scoring:
|
||||
- Rate each dimension 1-10
|
||||
- Provide specific issues found for any dimension scoring below 7
|
||||
- Suggest concrete fixes for each issue
|
||||
|
||||
Output JSON with:
|
||||
- "quality_scores": {completeness, accuracy, coherence, attribution, confidence_calibration, example_quality, terminology_consistency, gap_acknowledgment}
|
||||
- "overall_score": weighted average (accuracy and completeness weighted 2x)
|
||||
- "issues_found": array of {dimension, description, severity, suggested_fix}
|
||||
- "merge_health": "excellent" | "good" | "needs_review" | "poor" based on overall score
|
||||
- "recommendations": top 3 actions to improve merge quality
|
||||
|
||||
post_process:
|
||||
reorder_sections:
|
||||
- overview
|
||||
- core_concepts
|
||||
- api_reference
|
||||
- examples
|
||||
- advanced_topics
|
||||
- troubleshooting
|
||||
- sources_and_confidence
|
||||
add_metadata:
|
||||
enhanced: true
|
||||
workflow: complex-merge
|
||||
multi_source: true
|
||||
conflict_resolution: priority
|
||||
quality_checked: true
|
||||
Reference in New Issue
Block a user