feat: add 10 new skill source types (17 total) with full pipeline integration

Add Jupyter Notebook, Local HTML, OpenAPI/Swagger, AsciiDoc, PowerPoint,
RSS/Atom, Man Pages, Confluence, Notion, and Slack/Discord Chat as new
skill source types. Each type is fully integrated across:

- Standalone CLI commands (skill-seekers <type>)
- Auto-detection via 'skill-seekers create' (file extension + content sniffing)
- Unified multi-source configs (scraped_data, dispatch, config validation)
- Unified skill builder (generic merge + source-attributed synthesis)
- MCP server (scrape_generic tool with per-type flag mapping)
- pyproject.toml (entry points, optional deps, [all] group)

Also fixes: EPUB unified pipeline gap, missing word/video config validators,
OpenAPI yaml import guard, MCP flag mismatch for all 10 types, stale
docstrings, and adds 77 integration tests + complex-merge workflow.

50 files changed, +20,201 lines
This commit is contained in:
yusyus
2026-03-15 15:30:15 +03:00
parent 64403a3686
commit 53b911b697
50 changed files with 20193 additions and 856 deletions

View File

@@ -0,0 +1,68 @@
"""AsciiDoc command argument definitions.
This module defines ALL arguments for the asciidoc command in ONE place.
Both asciidoc_scraper.py (standalone) and parsers/asciidoc_parser.py (unified CLI)
import and use these definitions.
Shared arguments (name, description, output, enhance-level, api-key,
dry-run, verbose, quiet, workflow args) come from common.py / workflow.py
via ``add_all_standard_arguments()``.
"""
import argparse
from typing import Any
from .common import add_all_standard_arguments
# AsciiDoc-specific argument definitions as data structure
# NOTE: Shared args (name, description, output, enhance_level, api_key, dry_run,
# verbose, quiet, workflow args) are registered by add_all_standard_arguments().
ASCIIDOC_ARGUMENTS: dict[str, dict[str, Any]] = {
    # Primary input: a single .adoc file, or a directory scanned for .adoc files.
    "asciidoc_path": {
        "flags": ("--asciidoc-path",),
        "kwargs": {
            "type": str,
            "help": "Path to AsciiDoc file or directory containing .adoc files",
            "metavar": "PATH",
        },
    },
    # Skip extraction entirely and build the skill from previously extracted JSON.
    "from_json": {
        "flags": ("--from-json",),
        "kwargs": {
            "type": str,
            "help": "Build skill from extracted JSON",
            "metavar": "FILE",
        },
    },
}
def add_asciidoc_arguments(parser: argparse.ArgumentParser) -> None:
    """Add all asciidoc command arguments to a parser.

    Registers shared args (name, description, output, enhance-level, api-key,
    dry-run, verbose, quiet, workflow args) via add_all_standard_arguments(),
    then adds AsciiDoc-specific args on top.

    The default for --enhance-level is overridden to 0 (disabled) for AsciiDoc.

    Args:
        parser: The argparse parser (or subparser) to populate in place.
    """
    # Shared universal args first
    add_all_standard_arguments(parser)
    # Override enhance-level default to 0 for AsciiDoc. argparse offers no
    # public API to mutate an already-registered action, so scan the parser's
    # actions for the matching dest; dests are unique per parser, so stop at
    # the first (only) match.
    for action in parser._actions:
        if getattr(action, "dest", None) == "enhance_level":
            action.default = 0
            action.help = (
                "AI enhancement level (auto-detects API vs LOCAL mode): "
                "0=disabled (default for AsciiDoc), 1=SKILL.md only, "
                "2=+architecture/config, 3=full enhancement. "
                "Mode selection: uses API if ANTHROPIC_API_KEY is set, "
                "otherwise LOCAL (Claude Code)"
            )
            break
    # AsciiDoc-specific args defined in this module
    for arg_def in ASCIIDOC_ARGUMENTS.values():
        parser.add_argument(*arg_def["flags"], **arg_def["kwargs"])

View File

@@ -0,0 +1,102 @@
"""Chat command argument definitions.
This module defines ALL arguments for the chat command in ONE place.
Both chat_scraper.py (standalone) and parsers/chat_parser.py (unified CLI)
import and use these definitions.
Shared arguments (name, description, output, enhance-level, api-key,
dry-run, verbose, quiet, workflow args) come from common.py / workflow.py
via ``add_all_standard_arguments()``.
"""
import argparse
from typing import Any
from .common import add_all_standard_arguments
# Chat-specific argument definitions as data structure
# NOTE: Shared args (name, description, output, enhance_level, api_key, dry_run,
# verbose, quiet, workflow args) are registered by add_all_standard_arguments().
CHAT_ARGUMENTS: dict[str, dict[str, Any]] = {
    # Primary input: an exported chat archive (directory or single file).
    "export_path": {
        "flags": ("--export-path",),
        "kwargs": {
            "type": str,
            "help": "Path to chat export directory or file",
            "metavar": "PATH",
        },
    },
    # Selects the export format/parser; only Slack and Discord are supported.
    "platform": {
        "flags": ("--platform",),
        "kwargs": {
            "type": str,
            "choices": ["slack", "discord"],
            "default": "slack",
            "help": "Chat platform type (default: slack)",
        },
    },
    # API token for live extraction (alternative to --export-path).
    "token": {
        "flags": ("--token",),
        "kwargs": {
            "type": str,
            "help": "API token for chat platform authentication",
            "metavar": "TOKEN",
        },
    },
    # Restricts extraction to one channel when using API access.
    "channel": {
        "flags": ("--channel",),
        "kwargs": {
            "type": str,
            "help": "Channel name or ID to extract from",
            "metavar": "CHANNEL",
        },
    },
    # Safety cap so huge workspaces do not produce unbounded extractions.
    "max_messages": {
        "flags": ("--max-messages",),
        "kwargs": {
            "type": int,
            "default": 10000,
            "help": "Maximum number of messages to extract (default: 10000)",
            "metavar": "N",
        },
    },
    # Skip extraction entirely and build the skill from previously extracted JSON.
    "from_json": {
        "flags": ("--from-json",),
        "kwargs": {
            "type": str,
            "help": "Build skill from extracted JSON",
            "metavar": "FILE",
        },
    },
}
def add_chat_arguments(parser: argparse.ArgumentParser) -> None:
    """Add all chat command arguments to a parser.

    Registers shared args (name, description, output, enhance-level, api-key,
    dry-run, verbose, quiet, workflow args) via add_all_standard_arguments(),
    then adds Chat-specific args on top.

    The default for --enhance-level is overridden to 0 (disabled) for Chat.

    Args:
        parser: The argparse parser (or subparser) to populate in place.
    """
    # Shared universal args first
    add_all_standard_arguments(parser)
    # Override enhance-level default to 0 for Chat. argparse offers no public
    # API to mutate an already-registered action, so scan the parser's actions
    # for the matching dest; dests are unique per parser, so stop at the first
    # (only) match.
    for action in parser._actions:
        if getattr(action, "dest", None) == "enhance_level":
            action.default = 0
            action.help = (
                "AI enhancement level (auto-detects API vs LOCAL mode): "
                "0=disabled (default for Chat), 1=SKILL.md only, "
                "2=+architecture/config, 3=full enhancement. "
                "Mode selection: uses API if ANTHROPIC_API_KEY is set, "
                "otherwise LOCAL (Claude Code)"
            )
            break
    # Chat-specific args defined in this module
    for arg_def in CHAT_ARGUMENTS.values():
        parser.add_argument(*arg_def["flags"], **arg_def["kwargs"])

View File

@@ -0,0 +1,109 @@
"""Confluence command argument definitions.
This module defines ALL arguments for the confluence command in ONE place.
Both confluence_scraper.py (standalone) and parsers/confluence_parser.py (unified CLI)
import and use these definitions.
Shared arguments (name, description, output, enhance-level, api-key,
dry-run, verbose, quiet, workflow args) come from common.py / workflow.py
via ``add_all_standard_arguments()``.
"""
import argparse
from typing import Any
from .common import add_all_standard_arguments
# Confluence-specific argument definitions as data structure
# NOTE: Shared args (name, description, output, enhance_level, api_key, dry_run,
# verbose, quiet, workflow args) are registered by add_all_standard_arguments().
CONFLUENCE_ARGUMENTS: dict[str, dict[str, Any]] = {
    # API mode: root URL of the Confluence instance to pull pages from.
    "base_url": {
        "flags": ("--base-url",),
        "kwargs": {
            "type": str,
            "help": "Confluence instance base URL",
            "metavar": "URL",
        },
    },
    # API mode: which space to extract.
    "space_key": {
        "flags": ("--space-key",),
        "kwargs": {
            "type": str,
            "help": "Confluence space key to extract from",
            "metavar": "KEY",
        },
    },
    # Offline mode: a previously downloaded HTML/XML space export
    # (alternative to --base-url/--space-key API access).
    "export_path": {
        "flags": ("--export-path",),
        "kwargs": {
            "type": str,
            "help": "Path to Confluence HTML/XML export directory",
            "metavar": "PATH",
        },
    },
    # API credentials: username + token pair.
    "username": {
        "flags": ("--username",),
        "kwargs": {
            "type": str,
            "help": "Confluence username for API authentication",
            "metavar": "USER",
        },
    },
    "token": {
        "flags": ("--token",),
        "kwargs": {
            "type": str,
            "help": "Confluence API token for authentication",
            "metavar": "TOKEN",
        },
    },
    # Safety cap so large wikis do not produce unbounded extractions.
    "max_pages": {
        "flags": ("--max-pages",),
        "kwargs": {
            "type": int,
            "default": 500,
            "help": "Maximum number of pages to extract (default: 500)",
            "metavar": "N",
        },
    },
    # Skip extraction entirely and build the skill from previously extracted JSON.
    "from_json": {
        "flags": ("--from-json",),
        "kwargs": {
            "type": str,
            "help": "Build skill from extracted JSON",
            "metavar": "FILE",
        },
    },
}
def add_confluence_arguments(parser: argparse.ArgumentParser) -> None:
    """Add all confluence command arguments to a parser.

    Registers shared args (name, description, output, enhance-level, api-key,
    dry-run, verbose, quiet, workflow args) via add_all_standard_arguments(),
    then adds Confluence-specific args on top.

    The default for --enhance-level is overridden to 0 (disabled) for Confluence.

    Args:
        parser: The argparse parser (or subparser) to populate in place.
    """
    # Shared universal args first
    add_all_standard_arguments(parser)
    # Override enhance-level default to 0 for Confluence. argparse offers no
    # public API to mutate an already-registered action, so scan the parser's
    # actions for the matching dest; dests are unique per parser, so stop at
    # the first (only) match.
    for action in parser._actions:
        if getattr(action, "dest", None) == "enhance_level":
            action.default = 0
            action.help = (
                "AI enhancement level (auto-detects API vs LOCAL mode): "
                "0=disabled (default for Confluence), 1=SKILL.md only, "
                "2=+architecture/config, 3=full enhancement. "
                "Mode selection: uses API if ANTHROPIC_API_KEY is set, "
                "otherwise LOCAL (Claude Code)"
            )
            break
    # Confluence-specific args defined in this module
    for arg_def in CONFLUENCE_ARGUMENTS.values():
        parser.add_argument(*arg_def["flags"], **arg_def["kwargs"])

View File

@@ -549,6 +549,121 @@ CONFIG_ARGUMENTS: dict[str, dict[str, Any]] = {
# For unified config files, use `skill-seekers unified --fresh` directly.
}
# New source type arguments (v3.2.0+)
# These are minimal dicts since most flags are handled by each scraper's own argument module.
# The create command only needs the primary input flag for routing.
# Jupyter: routing input is a single .ipynb file (or directory of notebooks).
JUPYTER_ARGUMENTS: dict[str, dict[str, Any]] = {
    "notebook": {
        "flags": ("--notebook",),
        "kwargs": {"type": str, "help": "Jupyter Notebook file path (.ipynb)", "metavar": "PATH"},
    },
}
# Local HTML: routing input is a file or directory of HTML pages.
HTML_ARGUMENTS: dict[str, dict[str, Any]] = {
    "html_path": {
        "flags": ("--html-path",),
        "kwargs": {"type": str, "help": "Local HTML file or directory path", "metavar": "PATH"},
    },
}
# OpenAPI: spec may come from a local file OR a URL (two alternative flags).
OPENAPI_ARGUMENTS: dict[str, dict[str, Any]] = {
    "spec": {
        "flags": ("--spec",),
        "kwargs": {"type": str, "help": "OpenAPI/Swagger spec file path", "metavar": "PATH"},
    },
    "spec_url": {
        "flags": ("--spec-url",),
        "kwargs": {"type": str, "help": "OpenAPI/Swagger spec URL", "metavar": "URL"},
    },
}
# AsciiDoc: routing input is a .adoc file or directory.
ASCIIDOC_ARGUMENTS: dict[str, dict[str, Any]] = {
    "asciidoc_path": {
        "flags": ("--asciidoc-path",),
        "kwargs": {"type": str, "help": "AsciiDoc file or directory path", "metavar": "PATH"},
    },
}
# PowerPoint: routing input is a single .pptx file.
PPTX_ARGUMENTS: dict[str, dict[str, Any]] = {
    "pptx": {
        "flags": ("--pptx",),
        "kwargs": {"type": str, "help": "PowerPoint file path (.pptx)", "metavar": "PATH"},
    },
}
# RSS/Atom: feed may come from a URL OR a local file (two alternative flags).
RSS_ARGUMENTS: dict[str, dict[str, Any]] = {
    "feed_url": {
        "flags": ("--feed-url",),
        "kwargs": {"type": str, "help": "RSS/Atom feed URL", "metavar": "URL"},
    },
    "feed_path": {
        "flags": ("--feed-path",),
        "kwargs": {"type": str, "help": "RSS/Atom feed file path", "metavar": "PATH"},
    },
}
# Man pages: either installed pages looked up by name, or a directory of files.
MANPAGE_ARGUMENTS: dict[str, dict[str, Any]] = {
    "man_names": {
        "flags": ("--man-names",),
        "kwargs": {
            "type": str,
            "help": "Comma-separated man page names (e.g., 'git,curl')",
            "metavar": "NAMES",
        },
    },
    "man_path": {
        "flags": ("--man-path",),
        "kwargs": {"type": str, "help": "Directory of man page files", "metavar": "PATH"},
    },
}
# Confluence: flags are prefixed with conf- (unlike the standalone confluence
# command) so they cannot collide with other sources' flags when the create
# parser registers every group in mode "all".
CONFLUENCE_ARGUMENTS: dict[str, dict[str, Any]] = {
    "conf_base_url": {
        "flags": ("--conf-base-url",),
        "kwargs": {"type": str, "help": "Confluence base URL", "metavar": "URL"},
    },
    "space_key": {
        "flags": ("--space-key",),
        "kwargs": {"type": str, "help": "Confluence space key", "metavar": "KEY"},
    },
    "conf_export_path": {
        "flags": ("--conf-export-path",),
        "kwargs": {"type": str, "help": "Confluence export directory", "metavar": "PATH"},
    },
}
# Notion: export path flag is prefixed notion- for the same collision reason.
NOTION_ARGUMENTS: dict[str, dict[str, Any]] = {
    "database_id": {
        "flags": ("--database-id",),
        "kwargs": {"type": str, "help": "Notion database ID", "metavar": "ID"},
    },
    "page_id": {
        "flags": ("--page-id",),
        "kwargs": {"type": str, "help": "Notion page ID", "metavar": "ID"},
    },
    "notion_export_path": {
        "flags": ("--notion-export-path",),
        "kwargs": {"type": str, "help": "Notion export directory", "metavar": "PATH"},
    },
}
# Chat: export path flag is prefixed chat- for the same collision reason.
CHAT_ARGUMENTS: dict[str, dict[str, Any]] = {
    "chat_export_path": {
        "flags": ("--chat-export-path",),
        "kwargs": {"type": str, "help": "Slack/Discord export directory", "metavar": "PATH"},
    },
    "platform": {
        "flags": ("--platform",),
        "kwargs": {
            "type": str,
            "choices": ["slack", "discord"],
            "default": "slack",
            "help": "Chat platform (default: slack)",
        },
    },
}
# =============================================================================
# TIER 3: ADVANCED/RARE ARGUMENTS
# =============================================================================
@@ -613,6 +728,17 @@ def get_source_specific_arguments(source_type: str) -> dict[str, dict[str, Any]]
"epub": EPUB_ARGUMENTS,
"video": VIDEO_ARGUMENTS,
"config": CONFIG_ARGUMENTS,
# New source types (v3.2.0+)
"jupyter": JUPYTER_ARGUMENTS,
"html": HTML_ARGUMENTS,
"openapi": OPENAPI_ARGUMENTS,
"asciidoc": ASCIIDOC_ARGUMENTS,
"pptx": PPTX_ARGUMENTS,
"rss": RSS_ARGUMENTS,
"manpage": MANPAGE_ARGUMENTS,
"confluence": CONFLUENCE_ARGUMENTS,
"notion": NOTION_ARGUMENTS,
"chat": CHAT_ARGUMENTS,
}
return source_args.get(source_type, {})
@@ -703,6 +829,24 @@ def add_create_arguments(parser: argparse.ArgumentParser, mode: str = "default")
for arg_name, arg_def in CONFIG_ARGUMENTS.items():
parser.add_argument(*arg_def["flags"], **arg_def["kwargs"])
# New source types (v3.2.0+)
_NEW_SOURCE_ARGS = {
"jupyter": JUPYTER_ARGUMENTS,
"html": HTML_ARGUMENTS,
"openapi": OPENAPI_ARGUMENTS,
"asciidoc": ASCIIDOC_ARGUMENTS,
"pptx": PPTX_ARGUMENTS,
"rss": RSS_ARGUMENTS,
"manpage": MANPAGE_ARGUMENTS,
"confluence": CONFLUENCE_ARGUMENTS,
"notion": NOTION_ARGUMENTS,
"chat": CHAT_ARGUMENTS,
}
for stype, sargs in _NEW_SOURCE_ARGS.items():
if mode in [stype, "all"]:
for arg_name, arg_def in sargs.items():
parser.add_argument(*arg_def["flags"], **arg_def["kwargs"])
# Add advanced arguments if requested
if mode in ["advanced", "all"]:
for arg_name, arg_def in ADVANCED_ARGUMENTS.items():

View File

@@ -0,0 +1,68 @@
"""HTML command argument definitions.
This module defines ALL arguments for the html command in ONE place.
Both html_scraper.py (standalone) and parsers/html_parser.py (unified CLI)
import and use these definitions.
Shared arguments (name, description, output, enhance-level, api-key,
dry-run, verbose, quiet, workflow args) come from common.py / workflow.py
via ``add_all_standard_arguments()``.
"""
import argparse
from typing import Any
from .common import add_all_standard_arguments
# HTML-specific argument definitions as data structure
# NOTE: Shared args (name, description, output, enhance_level, api_key, dry_run,
# verbose, quiet, workflow args) are registered by add_all_standard_arguments().
HTML_ARGUMENTS: dict[str, dict[str, Any]] = {
    # Primary input: a single HTML file, or a directory scanned for HTML files.
    "html_path": {
        "flags": ("--html-path",),
        "kwargs": {
            "type": str,
            "help": "Path to HTML file or directory containing HTML files",
            "metavar": "PATH",
        },
    },
    # Skip extraction entirely and build the skill from previously extracted JSON.
    "from_json": {
        "flags": ("--from-json",),
        "kwargs": {
            "type": str,
            "help": "Build skill from extracted JSON",
            "metavar": "FILE",
        },
    },
}
def add_html_arguments(parser: argparse.ArgumentParser) -> None:
    """Add all html command arguments to a parser.

    Registers shared args (name, description, output, enhance-level, api-key,
    dry-run, verbose, quiet, workflow args) via add_all_standard_arguments(),
    then adds HTML-specific args on top.

    The default for --enhance-level is overridden to 0 (disabled) for HTML.

    Args:
        parser: The argparse parser (or subparser) to populate in place.
    """
    # Shared universal args first
    add_all_standard_arguments(parser)
    # Override enhance-level default to 0 for HTML. argparse offers no public
    # API to mutate an already-registered action, so scan the parser's actions
    # for the matching dest; dests are unique per parser, so stop at the first
    # (only) match.
    for action in parser._actions:
        if getattr(action, "dest", None) == "enhance_level":
            action.default = 0
            action.help = (
                "AI enhancement level (auto-detects API vs LOCAL mode): "
                "0=disabled (default for HTML), 1=SKILL.md only, "
                "2=+architecture/config, 3=full enhancement. "
                "Mode selection: uses API if ANTHROPIC_API_KEY is set, "
                "otherwise LOCAL (Claude Code)"
            )
            break
    # HTML-specific args defined in this module
    for arg_def in HTML_ARGUMENTS.values():
        parser.add_argument(*arg_def["flags"], **arg_def["kwargs"])

View File

@@ -0,0 +1,68 @@
"""Jupyter Notebook command argument definitions.
This module defines ALL arguments for the jupyter command in ONE place.
Both jupyter_scraper.py (standalone) and parsers/jupyter_parser.py (unified CLI)
import and use these definitions.
Shared arguments (name, description, output, enhance-level, api-key,
dry-run, verbose, quiet, workflow args) come from common.py / workflow.py
via ``add_all_standard_arguments()``.
"""
import argparse
from typing import Any
from .common import add_all_standard_arguments
# Jupyter-specific argument definitions as data structure
# NOTE: Shared args (name, description, output, enhance_level, api_key, dry_run,
# verbose, quiet, workflow args) are registered by add_all_standard_arguments().
JUPYTER_ARGUMENTS: dict[str, dict[str, Any]] = {
    # Primary input: one .ipynb file, or a directory scanned for notebooks.
    "notebook": {
        "flags": ("--notebook",),
        "kwargs": {
            "type": str,
            "help": "Path to .ipynb file or directory containing notebooks",
            "metavar": "PATH",
        },
    },
    # Skip extraction entirely and build the skill from previously extracted JSON.
    "from_json": {
        "flags": ("--from-json",),
        "kwargs": {
            "type": str,
            "help": "Build skill from extracted JSON",
            "metavar": "FILE",
        },
    },
}
def add_jupyter_arguments(parser: argparse.ArgumentParser) -> None:
    """Add all jupyter command arguments to a parser.

    Registers shared args (name, description, output, enhance-level, api-key,
    dry-run, verbose, quiet, workflow args) via add_all_standard_arguments(),
    then adds Jupyter-specific args on top.

    The default for --enhance-level is overridden to 0 (disabled) for Jupyter.

    Args:
        parser: The argparse parser (or subparser) to populate in place.
    """
    # Shared universal args first
    add_all_standard_arguments(parser)
    # Override enhance-level default to 0 for Jupyter. argparse offers no
    # public API to mutate an already-registered action, so scan the parser's
    # actions for the matching dest; dests are unique per parser, so stop at
    # the first (only) match.
    for action in parser._actions:
        if getattr(action, "dest", None) == "enhance_level":
            action.default = 0
            action.help = (
                "AI enhancement level (auto-detects API vs LOCAL mode): "
                "0=disabled (default for Jupyter), 1=SKILL.md only, "
                "2=+architecture/config, 3=full enhancement. "
                "Mode selection: uses API if ANTHROPIC_API_KEY is set, "
                "otherwise LOCAL (Claude Code)"
            )
            break
    # Jupyter-specific args defined in this module
    for arg_def in JUPYTER_ARGUMENTS.values():
        parser.add_argument(*arg_def["flags"], **arg_def["kwargs"])

View File

@@ -0,0 +1,84 @@
"""Man page command argument definitions.
This module defines ALL arguments for the manpage command in ONE place.
Both manpage_scraper.py (standalone) and parsers/manpage_parser.py (unified CLI)
import and use these definitions.
Shared arguments (name, description, output, enhance-level, api-key,
dry-run, verbose, quiet, workflow args) come from common.py / workflow.py
via ``add_all_standard_arguments()``.
"""
import argparse
from typing import Any
from .common import add_all_standard_arguments
# ManPage-specific argument definitions as data structure
# NOTE: Shared args (name, description, output, enhance_level, api_key, dry_run,
# verbose, quiet, workflow args) are registered by add_all_standard_arguments().
MANPAGE_ARGUMENTS: dict[str, dict[str, Any]] = {
    # Lookup mode: extract installed man pages by name.
    "man_names": {
        "flags": ("--man-names",),
        "kwargs": {
            "type": str,
            "help": "Comma-separated list of man page names (e.g., 'ls,grep,find')",
            "metavar": "NAMES",
        },
    },
    # File mode: extract from a directory of man page files
    # (alternative to --man-names).
    "man_path": {
        "flags": ("--man-path",),
        "kwargs": {
            "type": str,
            "help": "Path to directory containing man page files",
            "metavar": "PATH",
        },
    },
    # Optional filter restricting extraction to specific manual sections.
    "sections": {
        "flags": ("--sections",),
        "kwargs": {
            "type": str,
            "help": "Comma-separated section numbers to include (e.g., '1,3,8')",
            "metavar": "SECTIONS",
        },
    },
    # Skip extraction entirely and build the skill from previously extracted JSON.
    "from_json": {
        "flags": ("--from-json",),
        "kwargs": {
            "type": str,
            "help": "Build skill from extracted JSON",
            "metavar": "FILE",
        },
    },
}
def add_manpage_arguments(parser: argparse.ArgumentParser) -> None:
    """Add all manpage command arguments to a parser.

    Registers shared args (name, description, output, enhance-level, api-key,
    dry-run, verbose, quiet, workflow args) via add_all_standard_arguments(),
    then adds ManPage-specific args on top.

    The default for --enhance-level is overridden to 0 (disabled) for ManPage.

    Args:
        parser: The argparse parser (or subparser) to populate in place.
    """
    # Shared universal args first
    add_all_standard_arguments(parser)
    # Override enhance-level default to 0 for ManPage. argparse offers no
    # public API to mutate an already-registered action, so scan the parser's
    # actions for the matching dest; dests are unique per parser, so stop at
    # the first (only) match.
    for action in parser._actions:
        if getattr(action, "dest", None) == "enhance_level":
            action.default = 0
            action.help = (
                "AI enhancement level (auto-detects API vs LOCAL mode): "
                "0=disabled (default for ManPage), 1=SKILL.md only, "
                "2=+architecture/config, 3=full enhancement. "
                "Mode selection: uses API if ANTHROPIC_API_KEY is set, "
                "otherwise LOCAL (Claude Code)"
            )
            break
    # ManPage-specific args defined in this module
    for arg_def in MANPAGE_ARGUMENTS.values():
        parser.add_argument(*arg_def["flags"], **arg_def["kwargs"])

View File

@@ -0,0 +1,101 @@
"""Notion command argument definitions.
This module defines ALL arguments for the notion command in ONE place.
Both notion_scraper.py (standalone) and parsers/notion_parser.py (unified CLI)
import and use these definitions.
Shared arguments (name, description, output, enhance-level, api-key,
dry-run, verbose, quiet, workflow args) come from common.py / workflow.py
via ``add_all_standard_arguments()``.
"""
import argparse
from typing import Any
from .common import add_all_standard_arguments
# Notion-specific argument definitions as data structure
# NOTE: Shared args (name, description, output, enhance_level, api_key, dry_run,
# verbose, quiet, workflow args) are registered by add_all_standard_arguments().
NOTION_ARGUMENTS: dict[str, dict[str, Any]] = {
    # API mode: extract a whole database by ID.
    "database_id": {
        "flags": ("--database-id",),
        "kwargs": {
            "type": str,
            "help": "Notion database ID to extract from",
            "metavar": "ID",
        },
    },
    # API mode: extract a single page by ID.
    "page_id": {
        "flags": ("--page-id",),
        "kwargs": {
            "type": str,
            "help": "Notion page ID to extract from",
            "metavar": "ID",
        },
    },
    # Offline mode: a previously downloaded Notion export
    # (alternative to --database-id/--page-id API access).
    "export_path": {
        "flags": ("--export-path",),
        "kwargs": {
            "type": str,
            "help": "Path to Notion export directory",
            "metavar": "PATH",
        },
    },
    # Credential for API access.
    "token": {
        "flags": ("--token",),
        "kwargs": {
            "type": str,
            "help": "Notion integration token for API authentication",
            "metavar": "TOKEN",
        },
    },
    # Safety cap so large workspaces do not produce unbounded extractions.
    "max_pages": {
        "flags": ("--max-pages",),
        "kwargs": {
            "type": int,
            "default": 500,
            "help": "Maximum number of pages to extract (default: 500)",
            "metavar": "N",
        },
    },
    # Skip extraction entirely and build the skill from previously extracted JSON.
    "from_json": {
        "flags": ("--from-json",),
        "kwargs": {
            "type": str,
            "help": "Build skill from extracted JSON",
            "metavar": "FILE",
        },
    },
}
def add_notion_arguments(parser: argparse.ArgumentParser) -> None:
    """Add all notion command arguments to a parser.

    Registers shared args (name, description, output, enhance-level, api-key,
    dry-run, verbose, quiet, workflow args) via add_all_standard_arguments(),
    then adds Notion-specific args on top.

    The default for --enhance-level is overridden to 0 (disabled) for Notion.

    Args:
        parser: The argparse parser (or subparser) to populate in place.
    """
    # Shared universal args first
    add_all_standard_arguments(parser)
    # Override enhance-level default to 0 for Notion. argparse offers no
    # public API to mutate an already-registered action, so scan the parser's
    # actions for the matching dest; dests are unique per parser, so stop at
    # the first (only) match.
    for action in parser._actions:
        if getattr(action, "dest", None) == "enhance_level":
            action.default = 0
            action.help = (
                "AI enhancement level (auto-detects API vs LOCAL mode): "
                "0=disabled (default for Notion), 1=SKILL.md only, "
                "2=+architecture/config, 3=full enhancement. "
                "Mode selection: uses API if ANTHROPIC_API_KEY is set, "
                "otherwise LOCAL (Claude Code)"
            )
            break
    # Notion-specific args defined in this module
    for arg_def in NOTION_ARGUMENTS.values():
        parser.add_argument(*arg_def["flags"], **arg_def["kwargs"])

View File

@@ -0,0 +1,76 @@
"""OpenAPI command argument definitions.
This module defines ALL arguments for the openapi command in ONE place.
Both openapi_scraper.py (standalone) and parsers/openapi_parser.py (unified CLI)
import and use these definitions.
Shared arguments (name, description, output, enhance-level, api-key,
dry-run, verbose, quiet, workflow args) come from common.py / workflow.py
via ``add_all_standard_arguments()``.
"""
import argparse
from typing import Any
from .common import add_all_standard_arguments
# OpenAPI-specific argument definitions as data structure
# NOTE: Shared args (name, description, output, enhance_level, api_key, dry_run,
# verbose, quiet, workflow args) are registered by add_all_standard_arguments().
OPENAPI_ARGUMENTS: dict[str, dict[str, Any]] = {
    # File mode: spec loaded from local disk.
    "spec": {
        "flags": ("--spec",),
        "kwargs": {
            "type": str,
            "help": "Path to OpenAPI/Swagger spec file",
            "metavar": "PATH",
        },
    },
    # URL mode: spec fetched over the network (alternative to --spec).
    "spec_url": {
        "flags": ("--spec-url",),
        "kwargs": {
            "type": str,
            "help": "URL to OpenAPI/Swagger spec",
            "metavar": "URL",
        },
    },
    # Skip extraction entirely and build the skill from previously extracted JSON.
    "from_json": {
        "flags": ("--from-json",),
        "kwargs": {
            "type": str,
            "help": "Build skill from extracted JSON",
            "metavar": "FILE",
        },
    },
}
def add_openapi_arguments(parser: argparse.ArgumentParser) -> None:
    """Add all openapi command arguments to a parser.

    Registers shared args (name, description, output, enhance-level, api-key,
    dry-run, verbose, quiet, workflow args) via add_all_standard_arguments(),
    then adds OpenAPI-specific args on top.

    The default for --enhance-level is overridden to 0 (disabled) for OpenAPI.

    Args:
        parser: The argparse parser (or subparser) to populate in place.
    """
    # Shared universal args first
    add_all_standard_arguments(parser)
    # Override enhance-level default to 0 for OpenAPI. argparse offers no
    # public API to mutate an already-registered action, so scan the parser's
    # actions for the matching dest; dests are unique per parser, so stop at
    # the first (only) match.
    for action in parser._actions:
        if getattr(action, "dest", None) == "enhance_level":
            action.default = 0
            action.help = (
                "AI enhancement level (auto-detects API vs LOCAL mode): "
                "0=disabled (default for OpenAPI), 1=SKILL.md only, "
                "2=+architecture/config, 3=full enhancement. "
                "Mode selection: uses API if ANTHROPIC_API_KEY is set, "
                "otherwise LOCAL (Claude Code)"
            )
            break
    # OpenAPI-specific args defined in this module
    for arg_def in OPENAPI_ARGUMENTS.values():
        parser.add_argument(*arg_def["flags"], **arg_def["kwargs"])

View File

@@ -0,0 +1,68 @@
"""PPTX command argument definitions.
This module defines ALL arguments for the pptx command in ONE place.
Both pptx_scraper.py (standalone) and parsers/pptx_parser.py (unified CLI)
import and use these definitions.
Shared arguments (name, description, output, enhance-level, api-key,
dry-run, verbose, quiet, workflow args) come from common.py / workflow.py
via ``add_all_standard_arguments()``.
"""
import argparse
from typing import Any
from .common import add_all_standard_arguments
# PPTX-specific argument definitions as data structure
# NOTE: Shared args (name, description, output, enhance_level, api_key, dry_run,
# verbose, quiet, workflow args) are registered by add_all_standard_arguments().
PPTX_ARGUMENTS: dict[str, dict[str, Any]] = {
    # Primary input: a single .pptx presentation file.
    "pptx": {
        "flags": ("--pptx",),
        "kwargs": {
            "type": str,
            "help": "Path to PowerPoint file (.pptx)",
            "metavar": "PATH",
        },
    },
    # Skip extraction entirely and build the skill from previously extracted JSON.
    "from_json": {
        "flags": ("--from-json",),
        "kwargs": {
            "type": str,
            "help": "Build skill from extracted JSON",
            "metavar": "FILE",
        },
    },
}
def add_pptx_arguments(parser: argparse.ArgumentParser) -> None:
    """Add all pptx command arguments to a parser.

    Registers shared args (name, description, output, enhance-level, api-key,
    dry-run, verbose, quiet, workflow args) via add_all_standard_arguments(),
    then adds PPTX-specific args on top.

    The default for --enhance-level is overridden to 0 (disabled) for PPTX.

    Args:
        parser: The argparse parser (or subparser) to populate in place.
    """
    # Shared universal args first
    add_all_standard_arguments(parser)
    # Override enhance-level default to 0 for PPTX. argparse offers no public
    # API to mutate an already-registered action, so scan the parser's actions
    # for the matching dest; dests are unique per parser, so stop at the first
    # (only) match.
    for action in parser._actions:
        if getattr(action, "dest", None) == "enhance_level":
            action.default = 0
            action.help = (
                "AI enhancement level (auto-detects API vs LOCAL mode): "
                "0=disabled (default for PPTX), 1=SKILL.md only, "
                "2=+architecture/config, 3=full enhancement. "
                "Mode selection: uses API if ANTHROPIC_API_KEY is set, "
                "otherwise LOCAL (Claude Code)"
            )
            break
    # PPTX-specific args defined in this module
    for arg_def in PPTX_ARGUMENTS.values():
        parser.add_argument(*arg_def["flags"], **arg_def["kwargs"])

View File

@@ -0,0 +1,101 @@
"""RSS command argument definitions.
This module defines ALL arguments for the rss command in ONE place.
Both rss_scraper.py (standalone) and parsers/rss_parser.py (unified CLI)
import and use these definitions.
Shared arguments (name, description, output, enhance-level, api-key,
dry-run, verbose, quiet, workflow args) come from common.py / workflow.py
via ``add_all_standard_arguments()``.
"""
import argparse
from typing import Any
from .common import add_all_standard_arguments
# RSS-specific argument definitions as data structure
# NOTE: Shared args (name, description, output, enhance_level, api_key, dry_run,
# verbose, quiet, workflow args) are registered by add_all_standard_arguments().
RSS_ARGUMENTS: dict[str, dict[str, Any]] = {
    # Remote mode: fetch the feed from a URL.
    "feed_url": {
        "flags": ("--feed-url",),
        "kwargs": {
            "type": str,
            "help": "URL of the RSS/Atom feed",
            "metavar": "URL",
        },
    },
    # Local mode: read the feed from disk (alternative to --feed-url).
    "feed_path": {
        "flags": ("--feed-path",),
        "kwargs": {
            "type": str,
            "help": "Path to local RSS/Atom feed file",
            "metavar": "PATH",
        },
    },
    # NOTE(review): with default=True, passing --follow-links is a no-op; the
    # flag exists only as the explicit counterpart of --no-follow-links below.
    # argparse.BooleanOptionalAction would express this pair more directly.
    "follow_links": {
        "flags": ("--follow-links",),
        "kwargs": {
            "action": "store_true",
            "default": True,
            "help": "Follow article links and extract full content (default: True)",
        },
    },
    # Negation flag: writes False into the same 'follow_links' dest.
    "no_follow_links": {
        "flags": ("--no-follow-links",),
        "kwargs": {
            "action": "store_false",
            "dest": "follow_links",
            "help": "Do not follow article links; use feed summary only",
        },
    },
    # Safety cap so long-running feeds do not produce unbounded extractions.
    "max_articles": {
        "flags": ("--max-articles",),
        "kwargs": {
            "type": int,
            "default": 50,
            "help": "Maximum number of articles to extract (default: 50)",
            "metavar": "N",
        },
    },
    # Skip extraction entirely and build the skill from previously extracted JSON.
    "from_json": {
        "flags": ("--from-json",),
        "kwargs": {
            "type": str,
            "help": "Build skill from extracted JSON",
            "metavar": "FILE",
        },
    },
}
def add_rss_arguments(parser: argparse.ArgumentParser) -> None:
    """Add all rss command arguments to a parser.

    Registers shared args (name, description, output, enhance-level, api-key,
    dry-run, verbose, quiet, workflow args) via add_all_standard_arguments(),
    then adds RSS-specific args on top.

    The default for --enhance-level is overridden to 0 (disabled) for RSS.

    Args:
        parser: The argparse parser (or subparser) to populate in place.
    """
    # Shared universal args first
    add_all_standard_arguments(parser)
    # Override enhance-level default to 0 for RSS. argparse offers no public
    # API to mutate an already-registered action, so scan the parser's actions
    # for the matching dest; dests are unique per parser, so stop at the first
    # (only) match.
    for action in parser._actions:
        if getattr(action, "dest", None) == "enhance_level":
            action.default = 0
            action.help = (
                "AI enhancement level (auto-detects API vs LOCAL mode): "
                "0=disabled (default for RSS), 1=SKILL.md only, "
                "2=+architecture/config, 3=full enhancement. "
                "Mode selection: uses API if ANTHROPIC_API_KEY is set, "
                "otherwise LOCAL (Claude Code)"
            )
            break
    # RSS-specific args defined in this module
    for arg_def in RSS_ARGUMENTS.values():
        parser.add_argument(*arg_def["flags"], **arg_def["kwargs"])

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -7,6 +7,19 @@ Validates unified config format that supports multiple sources:
- github (repository scraping)
- pdf (PDF document scraping)
- local (local codebase analysis)
- word (Word .docx document scraping)
- video (video transcript/visual extraction)
- epub (EPUB e-book extraction)
- jupyter (Jupyter Notebook extraction)
- html (local HTML file extraction)
- openapi (OpenAPI/Swagger spec extraction)
- asciidoc (AsciiDoc document extraction)
- pptx (PowerPoint presentation extraction)
- confluence (Confluence wiki extraction)
- notion (Notion page extraction)
- rss (RSS/Atom feed extraction)
- manpage (man page extraction)
- chat (Slack/Discord chat export extraction)
Legacy config format support removed in v2.11.0.
All configs must use unified format with 'sources' array.
@@ -27,7 +40,25 @@ class ConfigValidator:
"""
# Valid source types
VALID_SOURCE_TYPES = {"documentation", "github", "pdf", "local", "word", "video"}
VALID_SOURCE_TYPES = {
"documentation",
"github",
"pdf",
"local",
"word",
"video",
"epub",
"jupyter",
"html",
"openapi",
"asciidoc",
"pptx",
"confluence",
"notion",
"rss",
"manpage",
"chat",
}
# Valid merge modes
VALID_MERGE_MODES = {"rule-based", "claude-enhanced"}
@@ -159,6 +190,32 @@ class ConfigValidator:
self._validate_pdf_source(source, index)
elif source_type == "local":
self._validate_local_source(source, index)
elif source_type == "word":
self._validate_word_source(source, index)
elif source_type == "video":
self._validate_video_source(source, index)
elif source_type == "epub":
self._validate_epub_source(source, index)
elif source_type == "jupyter":
self._validate_jupyter_source(source, index)
elif source_type == "html":
self._validate_html_source(source, index)
elif source_type == "openapi":
self._validate_openapi_source(source, index)
elif source_type == "asciidoc":
self._validate_asciidoc_source(source, index)
elif source_type == "pptx":
self._validate_pptx_source(source, index)
elif source_type == "confluence":
self._validate_confluence_source(source, index)
elif source_type == "notion":
self._validate_notion_source(source, index)
elif source_type == "rss":
self._validate_rss_source(source, index)
elif source_type == "manpage":
self._validate_manpage_source(source, index)
elif source_type == "chat":
self._validate_chat_source(source, index)
def _validate_documentation_source(self, source: dict[str, Any], index: int):
"""Validate documentation source configuration."""
@@ -253,12 +310,126 @@ class ConfigValidator:
f"Source {index} (local): Invalid ai_mode '{ai_mode}'. Must be one of {self.VALID_AI_MODES}"
)
def _validate_word_source(self, source: dict[str, Any], index: int):
"""Validate Word document (.docx) source configuration."""
if "path" not in source:
raise ValueError(f"Source {index} (word): Missing required field 'path'")
word_path = source["path"]
if not Path(word_path).exists():
logger.warning(f"Source {index} (word): File not found: {word_path}")
def _validate_video_source(self, source: dict[str, Any], index: int):
"""Validate video source configuration."""
has_url = "url" in source
has_path = "path" in source
has_playlist = "playlist" in source
if not has_url and not has_path and not has_playlist:
raise ValueError(
f"Source {index} (video): Missing required field 'url', 'path', or 'playlist'"
)
def _validate_epub_source(self, source: dict[str, Any], index: int):
"""Validate EPUB source configuration."""
if "path" not in source:
raise ValueError(f"Source {index} (epub): Missing required field 'path'")
epub_path = source["path"]
if not Path(epub_path).exists():
logger.warning(f"Source {index} (epub): File not found: {epub_path}")
def _validate_jupyter_source(self, source: dict[str, Any], index: int):
"""Validate Jupyter Notebook source configuration."""
if "path" not in source:
raise ValueError(f"Source {index} (jupyter): Missing required field 'path'")
nb_path = source["path"]
if not Path(nb_path).exists():
logger.warning(f"Source {index} (jupyter): Path not found: {nb_path}")
def _validate_html_source(self, source: dict[str, Any], index: int):
"""Validate local HTML source configuration."""
if "path" not in source:
raise ValueError(f"Source {index} (html): Missing required field 'path'")
html_path = source["path"]
if not Path(html_path).exists():
logger.warning(f"Source {index} (html): Path not found: {html_path}")
def _validate_openapi_source(self, source: dict[str, Any], index: int):
"""Validate OpenAPI/Swagger source configuration."""
if "path" not in source and "url" not in source:
raise ValueError(f"Source {index} (openapi): Missing required field 'path' or 'url'")
if "path" in source and not Path(source["path"]).exists():
logger.warning(f"Source {index} (openapi): File not found: {source['path']}")
def _validate_asciidoc_source(self, source: dict[str, Any], index: int):
"""Validate AsciiDoc source configuration."""
if "path" not in source:
raise ValueError(f"Source {index} (asciidoc): Missing required field 'path'")
adoc_path = source["path"]
if not Path(adoc_path).exists():
logger.warning(f"Source {index} (asciidoc): Path not found: {adoc_path}")
def _validate_pptx_source(self, source: dict[str, Any], index: int):
"""Validate PowerPoint source configuration."""
if "path" not in source:
raise ValueError(f"Source {index} (pptx): Missing required field 'path'")
pptx_path = source["path"]
if not Path(pptx_path).exists():
logger.warning(f"Source {index} (pptx): File not found: {pptx_path}")
def _validate_confluence_source(self, source: dict[str, Any], index: int):
"""Validate Confluence source configuration."""
has_url = "url" in source or "base_url" in source
has_path = "path" in source
if not has_url and not has_path:
raise ValueError(
f"Source {index} (confluence): Missing required field 'url'/'base_url' "
f"(for API) or 'path' (for export)"
)
if has_url and "space_key" not in source and "path" not in source:
logger.warning(f"Source {index} (confluence): No 'space_key' specified for API mode")
def _validate_notion_source(self, source: dict[str, Any], index: int):
"""Validate Notion source configuration."""
has_url = "url" in source or "database_id" in source or "page_id" in source
has_path = "path" in source
if not has_url and not has_path:
raise ValueError(
f"Source {index} (notion): Missing required field 'url'/'database_id'/'page_id' "
f"(for API) or 'path' (for export)"
)
def _validate_rss_source(self, source: dict[str, Any], index: int):
"""Validate RSS/Atom feed source configuration."""
if "url" not in source and "path" not in source:
raise ValueError(f"Source {index} (rss): Missing required field 'url' or 'path'")
def _validate_manpage_source(self, source: dict[str, Any], index: int):
"""Validate man page source configuration."""
if "path" not in source and "names" not in source:
raise ValueError(f"Source {index} (manpage): Missing required field 'path' or 'names'")
if "path" in source and not Path(source["path"]).exists():
logger.warning(f"Source {index} (manpage): Path not found: {source['path']}")
def _validate_chat_source(self, source: dict[str, Any], index: int):
"""Validate Slack/Discord chat source configuration."""
has_path = "path" in source
has_api = "token" in source or "webhook_url" in source
has_channel = "channel" in source or "channel_id" in source
if not has_path and not has_api:
raise ValueError(
f"Source {index} (chat): Missing required field 'path' (for export) "
f"or 'token' (for API)"
)
if has_api and not has_channel:
logger.warning(
f"Source {index} (chat): No 'channel' or 'channel_id' specified for API mode"
)
def get_sources_by_type(self, source_type: str) -> list[dict[str, Any]]:
"""
Get all sources of a specific type.
Args:
source_type: 'documentation', 'github', 'pdf', or 'local'
source_type: Any valid source type string
Returns:
List of sources matching the type

File diff suppressed because it is too large Load Diff

View File

@@ -140,6 +140,26 @@ class CreateCommand:
return self._route_video()
elif self.source_info.type == "config":
return self._route_config()
elif self.source_info.type == "jupyter":
return self._route_generic("jupyter_scraper", "--notebook")
elif self.source_info.type == "html":
return self._route_generic("html_scraper", "--html-path")
elif self.source_info.type == "openapi":
return self._route_generic("openapi_scraper", "--spec")
elif self.source_info.type == "asciidoc":
return self._route_generic("asciidoc_scraper", "--asciidoc-path")
elif self.source_info.type == "pptx":
return self._route_generic("pptx_scraper", "--pptx")
elif self.source_info.type == "rss":
return self._route_generic("rss_scraper", "--feed-path")
elif self.source_info.type == "manpage":
return self._route_generic("man_scraper", "--man-path")
elif self.source_info.type == "confluence":
return self._route_generic("confluence_scraper", "--export-path")
elif self.source_info.type == "notion":
return self._route_generic("notion_scraper", "--export-path")
elif self.source_info.type == "chat":
return self._route_generic("chat_scraper", "--export-path")
else:
logger.error(f"Unknown source type: {self.source_info.type}")
return 1
@@ -485,6 +505,40 @@ class CreateCommand:
finally:
sys.argv = original_argv
def _route_generic(self, module_name: str, file_flag: str) -> int:
    """Dispatch to a scraper module that follows the standard CLI pattern.

    The new source types (jupyter, html, openapi, asciidoc, pptx, rss,
    manpage, confluence, notion, chat) all work the same way: import the
    scraper module, assemble ``sys.argv`` as ``[module, flag, path, ...]``
    plus the common arguments, then invoke its ``main()``.

    Args:
        module_name: Module under ``skill_seekers.cli`` (e.g. "jupyter_scraper")
        file_flag: CLI flag carrying the source file (e.g. "--notebook")

    Returns:
        Exit code from the scraper's ``main()``
    """
    import importlib

    scraper = importlib.import_module(f"skill_seekers.cli.{module_name}")

    call_argv = [module_name]
    src_path = self.source_info.parsed.get("file_path", "")
    if src_path:
        call_argv += [file_flag, src_path]
    self._add_common_args(call_argv)

    logger.debug(f"Calling {module_name} with argv: {call_argv}")

    saved_argv = sys.argv
    sys.argv = call_argv
    try:
        return scraper.main()
    finally:
        # Always restore the process-wide argv, even if the scraper raises.
        sys.argv = saved_argv
def _add_common_args(self, argv: list[str]) -> None:
"""Add truly universal arguments to argv list.

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -15,7 +15,17 @@ Commands:
word Extract from Word (.docx) file
epub Extract from EPUB e-book (.epub)
video Extract from video (YouTube or local)
unified Multi-source scraping (docs + GitHub + PDF)
jupyter Extract from Jupyter Notebook (.ipynb)
html Extract from local HTML files
openapi Extract from OpenAPI/Swagger spec
asciidoc Extract from AsciiDoc documents (.adoc)
pptx Extract from PowerPoint (.pptx)
rss Extract from RSS/Atom feeds
manpage Extract from man pages
confluence Extract from Confluence wiki
notion Extract from Notion pages
chat Extract from Slack/Discord chat exports
unified Multi-source scraping (docs + GitHub + PDF + more)
analyze Analyze local codebase and extract code knowledge
enhance AI-powered enhancement (auto: API or LOCAL mode)
enhance-status Check enhancement status (for background/daemon modes)
@@ -70,6 +80,17 @@ COMMAND_MODULES = {
"quality": "skill_seekers.cli.quality_metrics",
"workflows": "skill_seekers.cli.workflows_command",
"sync-config": "skill_seekers.cli.sync_config",
# New source types (v3.2.0+)
"jupyter": "skill_seekers.cli.jupyter_scraper",
"html": "skill_seekers.cli.html_scraper",
"openapi": "skill_seekers.cli.openapi_scraper",
"asciidoc": "skill_seekers.cli.asciidoc_scraper",
"pptx": "skill_seekers.cli.pptx_scraper",
"rss": "skill_seekers.cli.rss_scraper",
"manpage": "skill_seekers.cli.man_scraper",
"confluence": "skill_seekers.cli.confluence_scraper",
"notion": "skill_seekers.cli.notion_scraper",
"chat": "skill_seekers.cli.chat_scraper",
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -33,6 +33,18 @@ from .quality_parser import QualityParser
from .workflows_parser import WorkflowsParser
from .sync_config_parser import SyncConfigParser
# New source type parsers (v3.2.0+)
from .jupyter_parser import JupyterParser
from .html_parser import HtmlParser
from .openapi_parser import OpenAPIParser
from .asciidoc_parser import AsciiDocParser
from .pptx_parser import PptxParser
from .rss_parser import RssParser
from .manpage_parser import ManPageParser
from .confluence_parser import ConfluenceParser
from .notion_parser import NotionParser
from .chat_parser import ChatParser
# Registry of all parsers (in order of usage frequency)
PARSERS = [
CreateParser(), # NEW: Unified create command (placed first for prominence)
@@ -60,6 +72,17 @@ PARSERS = [
QualityParser(),
WorkflowsParser(),
SyncConfigParser(),
# New source types (v3.2.0+)
JupyterParser(),
HtmlParser(),
OpenAPIParser(),
AsciiDocParser(),
PptxParser(),
RssParser(),
ManPageParser(),
ConfluenceParser(),
NotionParser(),
ChatParser(),
]

View File

@@ -0,0 +1,32 @@
"""AsciiDoc subcommand parser.
Uses shared argument definitions from arguments.asciidoc to ensure
consistency with the standalone asciidoc_scraper module.
"""
from .base import SubcommandParser
from skill_seekers.cli.arguments.asciidoc import add_asciidoc_arguments
class AsciiDocParser(SubcommandParser):
    """Registers the ``asciidoc`` subcommand on the unified CLI.

    Argument definitions are shared with the standalone scraper via
    ``arguments.asciidoc`` so the two entry points never drift apart.
    """

    @property
    def name(self) -> str:
        # Token the user types: `skill-seekers asciidoc ...`
        return "asciidoc"

    @property
    def help(self) -> str:
        # One-liner shown in the top-level `--help` command list.
        return "Extract from AsciiDoc documents (.adoc)"

    @property
    def description(self) -> str:
        # Longer text shown in `skill-seekers asciidoc --help`.
        return "Extract content from AsciiDoc documents (.adoc) and generate skill"

    def add_arguments(self, parser):
        """Attach every asciidoc argument from the shared definitions module."""
        add_asciidoc_arguments(parser)

View File

@@ -0,0 +1,32 @@
"""Chat subcommand parser.
Uses shared argument definitions from arguments.chat to ensure
consistency with the standalone chat_scraper module.
"""
from .base import SubcommandParser
from skill_seekers.cli.arguments.chat import add_chat_arguments
class ChatParser(SubcommandParser):
    """Registers the ``chat`` subcommand on the unified CLI.

    Argument definitions are shared with the standalone scraper via
    ``arguments.chat`` so the two entry points never drift apart.
    """

    @property
    def name(self) -> str:
        # Token the user types: `skill-seekers chat ...`
        return "chat"

    @property
    def help(self) -> str:
        # One-liner shown in the top-level `--help` command list.
        return "Extract from Slack/Discord chat exports"

    @property
    def description(self) -> str:
        # Longer text shown in `skill-seekers chat --help`.
        return "Extract content from Slack/Discord chat exports and generate skill"

    def add_arguments(self, parser):
        """Attach every chat argument from the shared definitions module."""
        add_chat_arguments(parser)

View File

@@ -0,0 +1,32 @@
"""Confluence subcommand parser.
Uses shared argument definitions from arguments.confluence to ensure
consistency with the standalone confluence_scraper module.
"""
from .base import SubcommandParser
from skill_seekers.cli.arguments.confluence import add_confluence_arguments
class ConfluenceParser(SubcommandParser):
    """Registers the ``confluence`` subcommand on the unified CLI.

    Argument definitions are shared with the standalone scraper via
    ``arguments.confluence`` so the two entry points never drift apart.
    """

    @property
    def name(self) -> str:
        # Token the user types: `skill-seekers confluence ...`
        return "confluence"

    @property
    def help(self) -> str:
        # One-liner shown in the top-level `--help` command list.
        return "Extract from Confluence wiki"

    @property
    def description(self) -> str:
        # Longer text shown in `skill-seekers confluence --help`.
        return "Extract content from Confluence wiki and generate skill"

    def add_arguments(self, parser):
        """Attach every confluence argument from the shared definitions module."""
        add_confluence_arguments(parser)

View File

@@ -0,0 +1,32 @@
"""HTML subcommand parser.
Uses shared argument definitions from arguments.html to ensure
consistency with the standalone html_scraper module.
"""
from .base import SubcommandParser
from skill_seekers.cli.arguments.html import add_html_arguments
class HtmlParser(SubcommandParser):
    """Registers the ``html`` subcommand on the unified CLI.

    Argument definitions are shared with the standalone scraper via
    ``arguments.html`` so the two entry points never drift apart.
    """

    @property
    def name(self) -> str:
        # Token the user types: `skill-seekers html ...`
        return "html"

    @property
    def help(self) -> str:
        # One-liner shown in the top-level `--help` command list.
        return "Extract from local HTML files (.html/.htm)"

    @property
    def description(self) -> str:
        # Longer text shown in `skill-seekers html --help`.
        return "Extract content from local HTML files (.html/.htm) and generate skill"

    def add_arguments(self, parser):
        """Attach every html argument from the shared definitions module."""
        add_html_arguments(parser)

View File

@@ -0,0 +1,32 @@
"""Jupyter Notebook subcommand parser.
Uses shared argument definitions from arguments.jupyter to ensure
consistency with the standalone jupyter_scraper module.
"""
from .base import SubcommandParser
from skill_seekers.cli.arguments.jupyter import add_jupyter_arguments
class JupyterParser(SubcommandParser):
    """Registers the ``jupyter`` subcommand on the unified CLI.

    Argument definitions are shared with the standalone scraper via
    ``arguments.jupyter`` so the two entry points never drift apart.
    """

    @property
    def name(self) -> str:
        # Token the user types: `skill-seekers jupyter ...`
        return "jupyter"

    @property
    def help(self) -> str:
        # One-liner shown in the top-level `--help` command list.
        return "Extract from Jupyter Notebook (.ipynb)"

    @property
    def description(self) -> str:
        # Longer text shown in `skill-seekers jupyter --help`.
        return "Extract content from Jupyter Notebook (.ipynb) and generate skill"

    def add_arguments(self, parser):
        """Attach every jupyter argument from the shared definitions module."""
        add_jupyter_arguments(parser)

View File

@@ -0,0 +1,32 @@
"""Man page subcommand parser.
Uses shared argument definitions from arguments.manpage to ensure
consistency with the standalone man_scraper module.
"""
from .base import SubcommandParser
from skill_seekers.cli.arguments.manpage import add_manpage_arguments
class ManPageParser(SubcommandParser):
    """Registers the ``manpage`` subcommand on the unified CLI.

    Argument definitions are shared with the standalone scraper via
    ``arguments.manpage`` so the two entry points never drift apart.
    """

    @property
    def name(self) -> str:
        # Token the user types: `skill-seekers manpage ...`
        return "manpage"

    @property
    def help(self) -> str:
        # One-liner shown in the top-level `--help` command list.
        return "Extract from man pages"

    @property
    def description(self) -> str:
        # Longer text shown in `skill-seekers manpage --help`.
        return "Extract content from man pages and generate skill"

    def add_arguments(self, parser):
        """Attach every manpage argument from the shared definitions module."""
        add_manpage_arguments(parser)

View File

@@ -0,0 +1,32 @@
"""Notion subcommand parser.
Uses shared argument definitions from arguments.notion to ensure
consistency with the standalone notion_scraper module.
"""
from .base import SubcommandParser
from skill_seekers.cli.arguments.notion import add_notion_arguments
class NotionParser(SubcommandParser):
    """Registers the ``notion`` subcommand on the unified CLI.

    Argument definitions are shared with the standalone scraper via
    ``arguments.notion`` so the two entry points never drift apart.
    """

    @property
    def name(self) -> str:
        # Token the user types: `skill-seekers notion ...`
        return "notion"

    @property
    def help(self) -> str:
        # One-liner shown in the top-level `--help` command list.
        return "Extract from Notion pages"

    @property
    def description(self) -> str:
        # Longer text shown in `skill-seekers notion --help`.
        return "Extract content from Notion pages and generate skill"

    def add_arguments(self, parser):
        """Attach every notion argument from the shared definitions module."""
        add_notion_arguments(parser)

View File

@@ -0,0 +1,32 @@
"""OpenAPI subcommand parser.
Uses shared argument definitions from arguments.openapi to ensure
consistency with the standalone openapi_scraper module.
"""
from .base import SubcommandParser
from skill_seekers.cli.arguments.openapi import add_openapi_arguments
class OpenAPIParser(SubcommandParser):
    """Registers the ``openapi`` subcommand on the unified CLI.

    Argument definitions are shared with the standalone scraper via
    ``arguments.openapi`` so the two entry points never drift apart.
    """

    @property
    def name(self) -> str:
        # Token the user types: `skill-seekers openapi ...`
        return "openapi"

    @property
    def help(self) -> str:
        # One-liner shown in the top-level `--help` command list.
        return "Extract from OpenAPI/Swagger spec"

    @property
    def description(self) -> str:
        # Longer text shown in `skill-seekers openapi --help`.
        return "Extract content from OpenAPI/Swagger spec and generate skill"

    def add_arguments(self, parser):
        """Attach every openapi argument from the shared definitions module."""
        add_openapi_arguments(parser)

View File

@@ -0,0 +1,32 @@
"""PPTX subcommand parser.
Uses shared argument definitions from arguments.pptx to ensure
consistency with the standalone pptx_scraper module.
"""
from .base import SubcommandParser
from skill_seekers.cli.arguments.pptx import add_pptx_arguments
class PptxParser(SubcommandParser):
    """Registers the ``pptx`` subcommand on the unified CLI.

    Argument definitions are shared with the standalone scraper via
    ``arguments.pptx`` so the two entry points never drift apart.
    """

    @property
    def name(self) -> str:
        # Token the user types: `skill-seekers pptx ...`
        return "pptx"

    @property
    def help(self) -> str:
        # One-liner shown in the top-level `--help` command list.
        return "Extract from PowerPoint presentations (.pptx)"

    @property
    def description(self) -> str:
        # Longer text shown in `skill-seekers pptx --help`.
        return "Extract content from PowerPoint presentations (.pptx) and generate skill"

    def add_arguments(self, parser):
        """Attach every pptx argument from the shared definitions module."""
        add_pptx_arguments(parser)

View File

@@ -0,0 +1,32 @@
"""RSS subcommand parser.
Uses shared argument definitions from arguments.rss to ensure
consistency with the standalone rss_scraper module.
"""
from .base import SubcommandParser
from skill_seekers.cli.arguments.rss import add_rss_arguments
class RssParser(SubcommandParser):
    """Registers the ``rss`` subcommand on the unified CLI.

    Argument definitions are shared with the standalone scraper via
    ``arguments.rss`` so the two entry points never drift apart.
    """

    @property
    def name(self) -> str:
        # Token the user types: `skill-seekers rss ...`
        return "rss"

    @property
    def help(self) -> str:
        # One-liner shown in the top-level `--help` command list.
        return "Extract from RSS/Atom feeds"

    @property
    def description(self) -> str:
        # Longer text shown in `skill-seekers rss --help`.
        return "Extract content from RSS/Atom feeds and generate skill"

    def add_arguments(self, parser):
        """Attach every rss argument from the shared definitions module."""
        add_rss_arguments(parser)

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1,7 +1,12 @@
"""Source type detection for unified create command.
Auto-detects whether a source is a web URL, GitHub repository,
local directory, PDF file, or config file based on patterns.
Auto-detects source type from user input — supports web URLs, GitHub repos,
local directories, and 14+ file types (PDF, DOCX, EPUB, IPYNB, HTML, YAML/OpenAPI,
AsciiDoc, PPTX, RSS/Atom, man pages, video files, and config JSON).
Note: Confluence, Notion, and Slack/Discord chat sources are API/export-based
and cannot be auto-detected from a single argument. Use their dedicated
subcommands (``skill-seekers confluence``, ``notion``, ``chat``) instead.
"""
import os
@@ -66,11 +71,49 @@ class SourceDetector:
if source.endswith(".epub"):
return cls._detect_epub(source)
if source.endswith(".ipynb"):
return cls._detect_jupyter(source)
if source.lower().endswith((".html", ".htm")):
return cls._detect_html(source)
if source.endswith(".pptx"):
return cls._detect_pptx(source)
if source.lower().endswith((".adoc", ".asciidoc")):
return cls._detect_asciidoc(source)
# Man page file extensions (.1 through .8, .man)
# Only match if the basename looks like a man page (e.g., "git.1", not "log.1")
# Require basename without the extension to be a plausible command name
if source.lower().endswith(".man"):
return cls._detect_manpage(source)
MAN_SECTION_EXTENSIONS = (".1", ".2", ".3", ".4", ".5", ".6", ".7", ".8")
if source.lower().endswith(MAN_SECTION_EXTENSIONS):
# Heuristic: man pages have a simple basename (no dots before extension)
# e.g., "git.1" is a man page, "access.log.1" is not
basename_no_ext = os.path.splitext(os.path.basename(source))[0]
if "." not in basename_no_ext:
return cls._detect_manpage(source)
# Video file extensions
VIDEO_EXTENSIONS = (".mp4", ".mkv", ".avi", ".mov", ".webm", ".flv", ".wmv")
if source.lower().endswith(VIDEO_EXTENSIONS):
return cls._detect_video_file(source)
# RSS/Atom feed file extensions (only .rss and .atom — .xml is too generic)
if source.lower().endswith((".rss", ".atom")):
return cls._detect_rss(source)
# OpenAPI/Swagger spec detection (YAML files with OpenAPI content)
# Sniff file content for 'openapi:' or 'swagger:' keys before committing
if (
source.lower().endswith((".yaml", ".yml"))
and os.path.isfile(source)
and cls._looks_like_openapi(source)
):
return cls._detect_openapi(source)
# 2. Video URL detection (before directory check)
video_url_info = cls._detect_video_url(source)
if video_url_info:
@@ -97,15 +140,22 @@ class SourceDetector:
raise ValueError(
f"Cannot determine source type for: {source}\n\n"
"Examples:\n"
" Web: skill-seekers create https://docs.react.dev/\n"
" GitHub: skill-seekers create facebook/react\n"
" Local: skill-seekers create ./my-project\n"
" PDF: skill-seekers create tutorial.pdf\n"
" DOCX: skill-seekers create document.docx\n"
" EPUB: skill-seekers create ebook.epub\n"
" Video: skill-seekers create https://youtube.com/watch?v=...\n"
" Video: skill-seekers create recording.mp4\n"
" Config: skill-seekers create configs/react.json"
" Web: skill-seekers create https://docs.react.dev/\n"
" GitHub: skill-seekers create facebook/react\n"
" Local: skill-seekers create ./my-project\n"
" PDF: skill-seekers create tutorial.pdf\n"
" DOCX: skill-seekers create document.docx\n"
" EPUB: skill-seekers create ebook.epub\n"
" Jupyter: skill-seekers create notebook.ipynb\n"
" HTML: skill-seekers create page.html\n"
" OpenAPI: skill-seekers create openapi.yaml\n"
" AsciiDoc: skill-seekers create document.adoc\n"
" PowerPoint: skill-seekers create presentation.pptx\n"
" RSS: skill-seekers create feed.rss\n"
" Man page: skill-seekers create command.1\n"
" Video: skill-seekers create https://youtube.com/watch?v=...\n"
" Video: skill-seekers create recording.mp4\n"
" Config: skill-seekers create configs/react.json"
)
@classmethod
@@ -140,6 +190,90 @@ class SourceDetector:
type="epub", parsed={"file_path": source}, suggested_name=name, raw_input=source
)
@classmethod
def _detect_jupyter(cls, source: str) -> SourceInfo:
    """Build a SourceInfo for a Jupyter Notebook (.ipynb) path."""
    stem = os.path.splitext(os.path.basename(source))[0]
    return SourceInfo(
        type="jupyter", parsed={"file_path": source}, suggested_name=stem, raw_input=source
    )
@classmethod
def _detect_html(cls, source: str) -> SourceInfo:
    """Build a SourceInfo for a local HTML (.html/.htm) path."""
    base = os.path.basename(source)
    stem, _ = os.path.splitext(base)
    return SourceInfo(
        type="html", parsed={"file_path": source}, suggested_name=stem, raw_input=source
    )
@classmethod
def _detect_pptx(cls, source: str) -> SourceInfo:
    """Build a SourceInfo for a PowerPoint (.pptx) path."""
    stem = os.path.splitext(os.path.basename(source))[0]
    return SourceInfo(
        type="pptx", parsed={"file_path": source}, suggested_name=stem, raw_input=source
    )
@classmethod
def _detect_asciidoc(cls, source: str) -> SourceInfo:
    """Build a SourceInfo for an AsciiDoc (.adoc/.asciidoc) path."""
    base = os.path.basename(source)
    stem, _ = os.path.splitext(base)
    return SourceInfo(
        type="asciidoc", parsed={"file_path": source}, suggested_name=stem, raw_input=source
    )
@classmethod
def _detect_manpage(cls, source: str) -> SourceInfo:
    """Build a SourceInfo for a man page file (e.g. ``git.1``)."""
    stem = os.path.splitext(os.path.basename(source))[0]
    return SourceInfo(
        type="manpage", parsed={"file_path": source}, suggested_name=stem, raw_input=source
    )
@classmethod
def _detect_rss(cls, source: str) -> SourceInfo:
    """Build a SourceInfo for an RSS/Atom feed file (.rss/.atom)."""
    base = os.path.basename(source)
    stem, _ = os.path.splitext(base)
    return SourceInfo(
        type="rss", parsed={"file_path": source}, suggested_name=stem, raw_input=source
    )
@classmethod
def _looks_like_openapi(cls, source: str) -> bool:
"""Check if a YAML/JSON file looks like an OpenAPI or Swagger spec.
Reads the first few lines to look for 'openapi:' or 'swagger:' keys.
Args:
source: Path to the file
Returns:
True if the file appears to be an OpenAPI/Swagger spec
"""
try:
with open(source, encoding="utf-8", errors="replace") as f:
# Read first 20 lines — the openapi/swagger key is always near the top
for _ in range(20):
line = f.readline()
if not line:
break
stripped = line.strip().lower()
if stripped.startswith("openapi:") or stripped.startswith("swagger:"):
return True
if stripped.startswith('"openapi"') or stripped.startswith('"swagger"'):
return True
except OSError:
pass
return False
@classmethod
def _detect_openapi(cls, source: str) -> SourceInfo:
    """Build a SourceInfo for an OpenAPI/Swagger spec file."""
    stem = os.path.splitext(os.path.basename(source))[0]
    return SourceInfo(
        type="openapi", parsed={"file_path": source}, suggested_name=stem, raw_input=source
    )
@classmethod
def _detect_video_file(cls, source: str) -> SourceInfo:
"""Detect local video file source."""
@@ -312,5 +446,19 @@ class SourceDetector:
if not os.path.isfile(config_path):
raise ValueError(f"Path is not a file: {config_path}")
# For web and github, validation happens during scraping
# (URL accessibility, repo existence)
elif source_info.type in ("jupyter", "html", "pptx", "asciidoc", "manpage", "openapi"):
file_path = source_info.parsed.get("file_path", "")
if file_path:
type_label = source_info.type.upper()
if not os.path.exists(file_path):
raise ValueError(f"{type_label} file does not exist: {file_path}")
if not os.path.isfile(file_path) and not os.path.isdir(file_path):
raise ValueError(f"Path is not a file or directory: {file_path}")
elif source_info.type == "rss":
file_path = source_info.parsed.get("file_path", "")
if file_path and not os.path.exists(file_path):
raise ValueError(f"RSS/Atom file does not exist: {file_path}")
# For web, github, confluence, notion, chat, rss (URL), validation happens
# during scraping (URL accessibility, API auth, etc.)

View File

@@ -76,6 +76,17 @@ class UnifiedScraper:
"word": [], # List of word sources
"video": [], # List of video sources
"local": [], # List of local sources (docs or code)
"epub": [], # List of epub sources
"jupyter": [], # List of Jupyter notebook sources
"html": [], # List of local HTML sources
"openapi": [], # List of OpenAPI/Swagger spec sources
"asciidoc": [], # List of AsciiDoc sources
"pptx": [], # List of PowerPoint sources
"confluence": [], # List of Confluence wiki sources
"notion": [], # List of Notion page sources
"rss": [], # List of RSS/Atom feed sources
"manpage": [], # List of man page sources
"chat": [], # List of Slack/Discord chat sources
}
# Track source index for unique naming (multi-source support)
@@ -86,6 +97,17 @@ class UnifiedScraper:
"word": 0,
"video": 0,
"local": 0,
"epub": 0,
"jupyter": 0,
"html": 0,
"openapi": 0,
"asciidoc": 0,
"pptx": 0,
"confluence": 0,
"notion": 0,
"rss": 0,
"manpage": 0,
"chat": 0,
}
# Output paths - cleaner organization
@@ -166,6 +188,28 @@ class UnifiedScraper:
self._scrape_video(source)
elif source_type == "local":
self._scrape_local(source)
elif source_type == "epub":
self._scrape_epub(source)
elif source_type == "jupyter":
self._scrape_jupyter(source)
elif source_type == "html":
self._scrape_html(source)
elif source_type == "openapi":
self._scrape_openapi(source)
elif source_type == "asciidoc":
self._scrape_asciidoc(source)
elif source_type == "pptx":
self._scrape_pptx(source)
elif source_type == "confluence":
self._scrape_confluence(source)
elif source_type == "notion":
self._scrape_notion(source)
elif source_type == "rss":
self._scrape_rss(source)
elif source_type == "manpage":
self._scrape_manpage(source)
elif source_type == "chat":
self._scrape_chat(source)
else:
logger.warning(f"Unknown source type: {source_type}")
except Exception as e:
@@ -571,6 +615,7 @@ class UnifiedScraper:
{
"docx_path": docx_path,
"docx_id": docx_id,
"word_id": docx_id, # Alias for generic reference generation
"idx": idx,
"data": word_data,
"data_file": cache_word_data,
@@ -788,6 +833,595 @@ class UnifiedScraper:
logger.debug(f"Traceback: {traceback.format_exc()}")
raise
# ------------------------------------------------------------------
# New source type handlers (v3.2.0+)
# ------------------------------------------------------------------
def _scrape_epub(self, source: dict[str, Any]):
    """Extract an EPUB e-book (.epub) into the unified pipeline.

    Runs the optional EPUB converter, caches its JSON payload under the
    unified data directory, records the result in
    ``self.scraped_data["epub"]``, and best-effort builds a standalone
    SKILL.md. Missing optional dependencies log an error and skip the
    source instead of aborting the whole run.
    """
    try:
        from skill_seekers.cli.epub_scraper import EpubToSkillConverter
    except ImportError:
        logger.error(
            "EPUB scraper dependencies not installed.\n"
            " Install with: pip install skill-seekers[epub]"
        )
        return

    # Per-type counter keeps names unique across multiple epub sources.
    seq = self._source_counters["epub"]
    self._source_counters["epub"] = seq + 1

    epub_path = source["path"]
    epub_id = os.path.splitext(os.path.basename(epub_path))[0]

    logger.info(f"Scraping EPUB: {source['path']}")
    converter = EpubToSkillConverter(
        {
            "name": f"{self.name}_epub_{seq}_{epub_id}",
            "epub_path": source["path"],
            "description": source.get("description", f"{epub_id} e-book"),
        }
    )
    converter.extract_epub()

    with open(converter.data_file, encoding="utf-8") as fh:
        payload = json.load(fh)
    cache_epub_data = os.path.join(self.data_dir, f"epub_data_{seq}_{epub_id}.json")
    shutil.copy(converter.data_file, cache_epub_data)

    self.scraped_data["epub"].append(
        {
            "epub_path": epub_path,
            "epub_id": epub_id,
            "idx": seq,
            "data": payload,
            "data_file": cache_epub_data,
        }
    )

    # Standalone skill build is best-effort; unified merge still proceeds.
    try:
        converter.build_skill()
        logger.info("✅ EPUB: Standalone SKILL.md created")
    except Exception as e:
        logger.warning(f"⚠️ Failed to build standalone EPUB SKILL.md: {e}")

    logger.info(f"✅ EPUB: {len(payload.get('chapters', []))} chapters extracted")
def _scrape_jupyter(self, source: dict[str, Any]):
    """Extract a Jupyter Notebook (.ipynb) into the unified pipeline.

    Runs the optional Jupyter converter, caches its JSON payload under
    the unified data directory, records the result in
    ``self.scraped_data["jupyter"]``, and best-effort builds a standalone
    SKILL.md.
    """
    try:
        from skill_seekers.cli.jupyter_scraper import JupyterToSkillConverter
    except ImportError:
        logger.error(
            "Jupyter scraper dependencies not installed.\n"
            " Install with: pip install skill-seekers[jupyter]"
        )
        return

    # Per-type counter keeps names unique across multiple notebook sources.
    seq = self._source_counters["jupyter"]
    self._source_counters["jupyter"] = seq + 1

    nb_path = source["path"]
    nb_id = os.path.splitext(os.path.basename(nb_path))[0]

    logger.info(f"Scraping Jupyter Notebook: {source['path']}")
    converter = JupyterToSkillConverter(
        {
            "name": f"{self.name}_jupyter_{seq}_{nb_id}",
            "notebook_path": source["path"],
            "description": source.get("description", f"{nb_id} notebook"),
        }
    )
    converter.extract_notebook()

    with open(converter.data_file, encoding="utf-8") as fh:
        payload = json.load(fh)
    cache_nb_data = os.path.join(self.data_dir, f"jupyter_data_{seq}_{nb_id}.json")
    shutil.copy(converter.data_file, cache_nb_data)

    self.scraped_data["jupyter"].append(
        {
            "notebook_path": nb_path,
            "notebook_id": nb_id,
            "idx": seq,
            "data": payload,
            "data_file": cache_nb_data,
        }
    )

    # Standalone skill build is best-effort; unified merge still proceeds.
    try:
        converter.build_skill()
        logger.info("✅ Jupyter: Standalone SKILL.md created")
    except Exception as e:
        logger.warning(f"⚠️ Failed to build standalone Jupyter SKILL.md: {e}")

    logger.info(f"✅ Jupyter: {len(payload.get('cells', []))} cells extracted")
def _scrape_html(self, source: dict[str, Any]):
    """Extract local HTML file(s) into the unified pipeline.

    Runs the HTML converter, caches its JSON payload under the unified
    data directory, records the result in ``self.scraped_data["html"]``,
    and best-effort builds a standalone SKILL.md.
    """
    try:
        from skill_seekers.cli.html_scraper import HtmlToSkillConverter
    except ImportError:
        logger.error("html_scraper.py not found")
        return

    # Per-type counter keeps names unique across multiple html sources.
    seq = self._source_counters["html"]
    self._source_counters["html"] = seq + 1

    html_path = source["path"]
    # Strip a trailing slash so a directory path still yields a usable id.
    html_id = os.path.splitext(os.path.basename(html_path.rstrip("/")))[0]

    logger.info(f"Scraping local HTML: {source['path']}")
    converter = HtmlToSkillConverter(
        {
            "name": f"{self.name}_html_{seq}_{html_id}",
            "html_path": source["path"],
            "description": source.get("description", f"{html_id} HTML content"),
        }
    )
    converter.extract_html()

    with open(converter.data_file, encoding="utf-8") as fh:
        payload = json.load(fh)
    cache_html_data = os.path.join(self.data_dir, f"html_data_{seq}_{html_id}.json")
    shutil.copy(converter.data_file, cache_html_data)

    self.scraped_data["html"].append(
        {
            "html_path": html_path,
            "html_id": html_id,
            "idx": seq,
            "data": payload,
            "data_file": cache_html_data,
        }
    )

    # Standalone skill build is best-effort; unified merge still proceeds.
    try:
        converter.build_skill()
        logger.info("✅ HTML: Standalone SKILL.md created")
    except Exception as e:
        logger.warning(f"⚠️ Failed to build standalone HTML SKILL.md: {e}")

    logger.info(f"✅ HTML: {len(payload.get('pages', []))} pages extracted")
def _scrape_openapi(self, source: dict[str, Any]):
    """Scrape OpenAPI/Swagger specification.

    Accepts either a local ``path`` or a remote ``url`` in the source
    config; both are forwarded to the converter.

    Args:
        source: Source config; recognized keys are path, url, description.
    """
    try:
        from skill_seekers.cli.openapi_scraper import OpenAPIToSkillConverter
    except ImportError:
        logger.error("openapi_scraper.py not found")
        return
    idx = self._source_counters["openapi"]
    self._source_counters["openapi"] += 1
    spec_path = source.get("path", source.get("url", ""))
    # Derive a stable id from the last path component; a path/URL ending in
    # "/" (or with no basename) would otherwise yield an empty id and cache
    # filenames like "openapi_data_0_.json", so fall back to a positional id.
    spec_id = os.path.splitext(os.path.basename(spec_path))[0] if spec_path else ""
    if not spec_id:
        spec_id = f"spec_{idx}"
    openapi_config = {
        "name": f"{self.name}_openapi_{idx}_{spec_id}",
        "spec_path": source.get("path"),
        "spec_url": source.get("url"),
        "description": source.get("description", f"{spec_id} API spec"),
    }
    logger.info(f"Scraping OpenAPI spec: {spec_path}")
    converter = OpenAPIToSkillConverter(openapi_config)
    converter.extract_spec()
    api_data_file = converter.data_file
    # Cache the extracted JSON alongside the other unified-scrape artifacts.
    with open(api_data_file, encoding="utf-8") as f:
        api_data = json.load(f)
    cache_api_data = os.path.join(self.data_dir, f"openapi_data_{idx}_{spec_id}.json")
    shutil.copy(api_data_file, cache_api_data)
    self.scraped_data["openapi"].append(
        {
            "spec_path": spec_path,
            "spec_id": spec_id,
            "idx": idx,
            "data": api_data,
            "data_file": cache_api_data,
        }
    )
    # Standalone SKILL.md is best-effort; the unified builder merges later.
    try:
        converter.build_skill()
        logger.info("✅ OpenAPI: Standalone SKILL.md created")
    except Exception as e:
        logger.warning(f"⚠️ Failed to build standalone OpenAPI SKILL.md: {e}")
    logger.info(f"✅ OpenAPI: {len(api_data.get('endpoints', []))} endpoints extracted")
def _scrape_asciidoc(self, source: dict[str, Any]):
    """Scrape AsciiDoc document(s) into the unified pipeline."""
    try:
        from skill_seekers.cli.asciidoc_scraper import AsciiDocToSkillConverter
    except ImportError:
        logger.error(
            "AsciiDoc scraper dependencies not installed.\n"
            " Install with: pip install skill-seekers[asciidoc]"
        )
        return

    # Per-type running index keeps repeated AsciiDoc sources distinct.
    idx = self._source_counters["asciidoc"]
    self._source_counters["asciidoc"] += 1

    adoc_path = source["path"]
    adoc_id = os.path.splitext(os.path.basename(adoc_path.rstrip("/")))[0]

    logger.info(f"Scraping AsciiDoc: {source['path']}")
    converter = AsciiDocToSkillConverter(
        {
            "name": f"{self.name}_asciidoc_{idx}_{adoc_id}",
            "asciidoc_path": source["path"],
            "description": source.get("description", f"{adoc_id} AsciiDoc content"),
        }
    )
    converter.extract_asciidoc()

    # Cache the extracted JSON alongside the other unified-scrape artifacts.
    with open(converter.data_file, encoding="utf-8") as fh:
        adoc_data = json.load(fh)
    cache_adoc_data = os.path.join(self.data_dir, f"asciidoc_data_{idx}_{adoc_id}.json")
    shutil.copy(converter.data_file, cache_adoc_data)

    self.scraped_data["asciidoc"].append(
        {
            "asciidoc_path": adoc_path,
            "asciidoc_id": adoc_id,
            "idx": idx,
            "data": adoc_data,
            "data_file": cache_adoc_data,
        }
    )

    # Standalone SKILL.md is best-effort; the unified builder merges later.
    try:
        converter.build_skill()
        logger.info("✅ AsciiDoc: Standalone SKILL.md created")
    except Exception as e:
        logger.warning(f"⚠️ Failed to build standalone AsciiDoc SKILL.md: {e}")
    logger.info(f"✅ AsciiDoc: {len(adoc_data.get('sections', []))} sections extracted")
def _scrape_pptx(self, source: dict[str, Any]):
    """Scrape a PowerPoint presentation (.pptx) into the unified pipeline."""
    try:
        from skill_seekers.cli.pptx_scraper import PptxToSkillConverter
    except ImportError:
        logger.error(
            "PowerPoint scraper dependencies not installed.\n"
            " Install with: pip install skill-seekers[pptx]"
        )
        return

    # Per-type running index keeps repeated presentations distinct.
    idx = self._source_counters["pptx"]
    self._source_counters["pptx"] += 1

    pptx_path = source["path"]
    pptx_id = os.path.splitext(os.path.basename(pptx_path))[0]

    logger.info(f"Scraping PowerPoint: {source['path']}")
    converter = PptxToSkillConverter(
        {
            "name": f"{self.name}_pptx_{idx}_{pptx_id}",
            "pptx_path": source["path"],
            "description": source.get("description", f"{pptx_id} presentation"),
        }
    )
    converter.extract_pptx()

    # Cache the extracted JSON alongside the other unified-scrape artifacts.
    with open(converter.data_file, encoding="utf-8") as fh:
        pptx_data = json.load(fh)
    cache_pptx_data = os.path.join(self.data_dir, f"pptx_data_{idx}_{pptx_id}.json")
    shutil.copy(converter.data_file, cache_pptx_data)

    self.scraped_data["pptx"].append(
        {
            "pptx_path": pptx_path,
            "pptx_id": pptx_id,
            "idx": idx,
            "data": pptx_data,
            "data_file": cache_pptx_data,
        }
    )

    # Standalone SKILL.md is best-effort; the unified builder merges later.
    try:
        converter.build_skill()
        logger.info("✅ PowerPoint: Standalone SKILL.md created")
    except Exception as e:
        logger.warning(f"⚠️ Failed to build standalone PowerPoint SKILL.md: {e}")
    logger.info(f"✅ PowerPoint: {len(pptx_data.get('slides', []))} slides extracted")
def _scrape_confluence(self, source: dict[str, Any]):
    """Scrape Confluence wiki (API or exported HTML/XML).

    Args:
        source: Source config; recognized keys include base_url/url,
            space_key, path (export dir), username, token, description,
            max_pages.
    """
    try:
        from skill_seekers.cli.confluence_scraper import ConfluenceToSkillConverter
    except ImportError:
        logger.error(
            "Confluence scraper dependencies not installed.\n"
            " Install with: pip install skill-seekers[confluence]"
        )
        return
    idx = self._source_counters["confluence"]
    self._source_counters["confluence"] += 1
    # Use `or`-chaining (not .get() defaults) so a config that carries an
    # explicit None or "" for space_key/path still falls through to the
    # positional id instead of producing names like "..._None".
    source_id = source.get("space_key") or source.get("path") or f"confluence_{idx}"
    if isinstance(source_id, str) and "/" in source_id:
        # Export paths: keep only the last path component as the id.
        source_id = os.path.basename(source_id.rstrip("/"))
    conf_config = {
        "name": f"{self.name}_confluence_{idx}_{source_id}",
        "base_url": source.get("base_url", source.get("url")),
        "space_key": source.get("space_key"),
        "export_path": source.get("path"),
        "username": source.get("username"),
        "token": source.get("token"),
        "description": source.get("description", f"{source_id} Confluence content"),
        "max_pages": source.get("max_pages", 500),
    }
    logger.info(f"Scraping Confluence: {source_id}")
    converter = ConfluenceToSkillConverter(conf_config)
    converter.extract_confluence()
    conf_data_file = converter.data_file
    # Cache the extracted JSON alongside the other unified-scrape artifacts.
    with open(conf_data_file, encoding="utf-8") as f:
        conf_data = json.load(f)
    cache_conf_data = os.path.join(self.data_dir, f"confluence_data_{idx}_{source_id}.json")
    shutil.copy(conf_data_file, cache_conf_data)
    self.scraped_data["confluence"].append(
        {
            "source_id": source_id,
            "idx": idx,
            "data": conf_data,
            "data_file": cache_conf_data,
        }
    )
    # Standalone SKILL.md is best-effort; the unified builder merges later.
    try:
        converter.build_skill()
        logger.info("✅ Confluence: Standalone SKILL.md created")
    except Exception as e:
        logger.warning(f"⚠️ Failed to build standalone Confluence SKILL.md: {e}")
    logger.info(f"✅ Confluence: {len(conf_data.get('pages', []))} pages extracted")
def _scrape_notion(self, source: dict[str, Any]):
    """Scrape Notion pages (API or exported Markdown).

    Args:
        source: Source config; recognized keys include database_id,
            page_id, path (export dir), token, description, max_pages.
    """
    try:
        from skill_seekers.cli.notion_scraper import NotionToSkillConverter
    except ImportError:
        logger.error(
            "Notion scraper dependencies not installed.\n"
            " Install with: pip install skill-seekers[notion]"
        )
        return
    idx = self._source_counters["notion"]
    self._source_counters["notion"] += 1
    # Use `or`-chaining (not nested .get() defaults) so a config that
    # carries an explicit None or "" for database_id/page_id/path still
    # falls through to the positional id instead of yielding "..._None".
    source_id = (
        source.get("database_id")
        or source.get("page_id")
        or source.get("path")
        or f"notion_{idx}"
    )
    if isinstance(source_id, str) and "/" in source_id:
        # Export paths: keep only the last path component as the id.
        source_id = os.path.basename(source_id.rstrip("/"))
    notion_config = {
        "name": f"{self.name}_notion_{idx}_{source_id}",
        "database_id": source.get("database_id"),
        "page_id": source.get("page_id"),
        "export_path": source.get("path"),
        "token": source.get("token"),
        "description": source.get("description", f"{source_id} Notion content"),
        "max_pages": source.get("max_pages", 500),
    }
    logger.info(f"Scraping Notion: {source_id}")
    converter = NotionToSkillConverter(notion_config)
    converter.extract_notion()
    notion_data_file = converter.data_file
    # Cache the extracted JSON alongside the other unified-scrape artifacts.
    with open(notion_data_file, encoding="utf-8") as f:
        notion_data = json.load(f)
    cache_notion_data = os.path.join(self.data_dir, f"notion_data_{idx}_{source_id}.json")
    shutil.copy(notion_data_file, cache_notion_data)
    self.scraped_data["notion"].append(
        {
            "source_id": source_id,
            "idx": idx,
            "data": notion_data,
            "data_file": cache_notion_data,
        }
    )
    # Standalone SKILL.md is best-effort; the unified builder merges later.
    try:
        converter.build_skill()
        logger.info("✅ Notion: Standalone SKILL.md created")
    except Exception as e:
        logger.warning(f"⚠️ Failed to build standalone Notion SKILL.md: {e}")
    logger.info(f"✅ Notion: {len(notion_data.get('pages', []))} pages extracted")
def _scrape_rss(self, source: dict[str, Any]):
    """Scrape RSS/Atom feed (with optional full article scraping).

    Args:
        source: Source config; recognized keys include url, path,
            follow_links, max_articles, description.
    """
    try:
        from skill_seekers.cli.rss_scraper import RssToSkillConverter
    except ImportError:
        logger.error(
            "RSS scraper dependencies not installed.\n"
            " Install with: pip install skill-seekers[rss]"
        )
        return
    idx = self._source_counters["rss"]
    self._source_counters["rss"] += 1
    feed_url = source.get("url", source.get("path", ""))
    # Derive a stable feed id from the last URL component; rstrip("/") so
    # "https://example.com/feed/" yields "feed" instead of "", and fall
    # back to a positional id when nothing usable remains.
    feed_id = feed_url.rstrip("/").split("/")[-1].split(".")[0] if feed_url else ""
    if not feed_id:
        feed_id = f"feed_{idx}"
    rss_config = {
        "name": f"{self.name}_rss_{idx}_{feed_id}",
        "feed_url": source.get("url"),
        "feed_path": source.get("path"),
        "follow_links": source.get("follow_links", True),
        "max_articles": source.get("max_articles", 50),
        "description": source.get("description", f"{feed_id} RSS/Atom feed"),
    }
    logger.info(f"Scraping RSS/Atom feed: {feed_url}")
    converter = RssToSkillConverter(rss_config)
    converter.extract_feed()
    rss_data_file = converter.data_file
    # Cache the extracted JSON alongside the other unified-scrape artifacts.
    with open(rss_data_file, encoding="utf-8") as f:
        rss_data = json.load(f)
    cache_rss_data = os.path.join(self.data_dir, f"rss_data_{idx}_{feed_id}.json")
    shutil.copy(rss_data_file, cache_rss_data)
    self.scraped_data["rss"].append(
        {
            "feed_url": feed_url,
            "feed_id": feed_id,
            "idx": idx,
            "data": rss_data,
            "data_file": cache_rss_data,
        }
    )
    # Standalone SKILL.md is best-effort; the unified builder merges later.
    try:
        converter.build_skill()
        logger.info("✅ RSS: Standalone SKILL.md created")
    except Exception as e:
        logger.warning(f"⚠️ Failed to build standalone RSS SKILL.md: {e}")
    logger.info(f"✅ RSS: {len(rss_data.get('articles', []))} articles extracted")
def _scrape_manpage(self, source: dict[str, Any]):
    """Scrape man page(s).

    Args:
        source: Source config; recognized keys include names (list of man
            page names), path (directory of man files), sections,
            description.
    """
    try:
        from skill_seekers.cli.man_scraper import ManPageToSkillConverter
    except ImportError:
        logger.error("man_scraper.py not found")
        return
    idx = self._source_counters["manpage"]
    self._source_counters["manpage"] += 1
    man_names = source.get("names", [])
    man_path = source.get("path", "")
    man_id = man_names[0] if man_names else os.path.basename(man_path.rstrip("/"))
    if not man_id:
        # Neither names nor path yielded a usable id (e.g. both empty, or a
        # bare "/" path) — fall back to a positional id so skill names and
        # cache filenames stay well-formed.
        man_id = f"manpage_{idx}"
    man_config = {
        "name": f"{self.name}_manpage_{idx}_{man_id}",
        "man_names": man_names,
        "man_path": man_path,
        "sections": source.get("sections", []),
        "description": source.get("description", f"{man_id} man pages"),
    }
    logger.info(f"Scraping man pages: {man_id}")
    converter = ManPageToSkillConverter(man_config)
    converter.extract_manpages()
    man_data_file = converter.data_file
    # Cache the extracted JSON alongside the other unified-scrape artifacts.
    with open(man_data_file, encoding="utf-8") as f:
        man_data = json.load(f)
    cache_man_data = os.path.join(self.data_dir, f"manpage_data_{idx}_{man_id}.json")
    shutil.copy(man_data_file, cache_man_data)
    self.scraped_data["manpage"].append(
        {
            "man_id": man_id,
            "idx": idx,
            "data": man_data,
            "data_file": cache_man_data,
        }
    )
    # Standalone SKILL.md is best-effort; the unified builder merges later.
    try:
        converter.build_skill()
        logger.info("✅ Man pages: Standalone SKILL.md created")
    except Exception as e:
        logger.warning(f"⚠️ Failed to build standalone man page SKILL.md: {e}")
    logger.info(f"✅ Man pages: {len(man_data.get('pages', []))} man pages extracted")
def _scrape_chat(self, source: dict[str, Any]):
    """Scrape a Slack/Discord chat export (or API) into the unified pipeline."""
    try:
        from skill_seekers.cli.chat_scraper import ChatToSkillConverter
    except ImportError:
        logger.error(
            "Chat scraper dependencies not installed.\n"
            " Install with: pip install skill-seekers[chat]"
        )
        return

    # Per-type running index keeps repeated chat sources distinct.
    idx = self._source_counters["chat"]
    self._source_counters["chat"] += 1

    export_path = source.get("path", "")
    channel = source.get("channel", source.get("channel_id", ""))
    platform = source.get("platform", "slack")
    # Prefer the channel, then the export directory name, then a positional id.
    chat_id = channel or os.path.basename(export_path.rstrip("/")) or f"chat_{idx}"

    logger.info(f"Scraping chat: {chat_id}")
    converter = ChatToSkillConverter(
        {
            "name": f"{self.name}_chat_{idx}_{chat_id}",
            "export_path": source.get("path"),
            "platform": platform,
            "token": source.get("token"),
            "channel": channel,
            "max_messages": source.get("max_messages", 10000),
            "description": source.get("description", f"{chat_id} chat export"),
        }
    )
    converter.extract_chat()

    # Cache the extracted JSON alongside the other unified-scrape artifacts.
    with open(converter.data_file, encoding="utf-8") as fh:
        chat_data = json.load(fh)
    cache_chat_data = os.path.join(self.data_dir, f"chat_data_{idx}_{chat_id}.json")
    shutil.copy(converter.data_file, cache_chat_data)

    self.scraped_data["chat"].append(
        {
            "chat_id": chat_id,
            "platform": platform,
            "idx": idx,
            "data": chat_data,
            "data_file": cache_chat_data,
        }
    )

    # Standalone SKILL.md is best-effort; the unified builder merges later.
    try:
        converter.build_skill()
        logger.info("✅ Chat: Standalone SKILL.md created")
    except Exception as e:
        logger.warning(f"⚠️ Failed to build standalone chat SKILL.md: {e}")
    logger.info(f"✅ Chat: {len(chat_data.get('messages', []))} messages extracted")
def _load_json(self, file_path: Path) -> dict:
"""
Load JSON file safely.
@@ -1297,14 +1931,33 @@ Examples:
if args.dry_run:
logger.info("🔍 DRY RUN MODE - Preview only, no scraping will occur")
logger.info(f"\nWould scrape {len(scraper.config.get('sources', []))} sources:")
# Source type display config: type -> (label, key for detail)
_SOURCE_DISPLAY = {
"documentation": ("Documentation", "base_url"),
"github": ("GitHub", "repo"),
"pdf": ("PDF", "path"),
"word": ("Word", "path"),
"epub": ("EPUB", "path"),
"video": ("Video", "url"),
"local": ("Local Codebase", "path"),
"jupyter": ("Jupyter Notebook", "path"),
"html": ("HTML", "path"),
"openapi": ("OpenAPI Spec", "path"),
"asciidoc": ("AsciiDoc", "path"),
"pptx": ("PowerPoint", "path"),
"confluence": ("Confluence", "base_url"),
"notion": ("Notion", "page_id"),
"rss": ("RSS/Atom Feed", "url"),
"manpage": ("Man Page", "names"),
"chat": ("Chat Export", "path"),
}
for idx, source in enumerate(scraper.config.get("sources", []), 1):
source_type = source.get("type", "unknown")
if source_type == "documentation":
logger.info(f" {idx}. Documentation: {source.get('base_url', 'N/A')}")
elif source_type == "github":
logger.info(f" {idx}. GitHub: {source.get('repo', 'N/A')}")
elif source_type == "pdf":
logger.info(f" {idx}. PDF: {source.get('pdf_path', 'N/A')}")
label, key = _SOURCE_DISPLAY.get(source_type, (source_type.title(), "path"))
detail = source.get(key, "N/A")
if isinstance(detail, list):
detail = ", ".join(str(d) for d in detail)
logger.info(f" {idx}. {label}: {detail}")
logger.info(f"\nOutput directory: {scraper.output_dir}")
logger.info(f"Merge mode: {scraper.merge_mode}")
return

View File

@@ -136,6 +136,44 @@ class UnifiedSkillBuilder:
skill_mds["pdf"] = "\n\n---\n\n".join(pdf_sources)
logger.debug(f"Combined {len(pdf_sources)} PDF SKILL.md files")
# Load additional source types using generic glob pattern
# Each source type uses: {name}_{type}_{idx}_*/ or {name}_{type}_*/
_extra_types = [
"word",
"epub",
"video",
"jupyter",
"html",
"openapi",
"asciidoc",
"pptx",
"confluence",
"notion",
"rss",
"manpage",
"chat",
]
for source_type in _extra_types:
type_sources = []
for type_dir in sources_dir.glob(f"{self.name}_{source_type}_*"):
type_skill_path = type_dir / "SKILL.md"
if type_skill_path.exists():
try:
content = type_skill_path.read_text(encoding="utf-8")
type_sources.append(content)
logger.debug(
f"Loaded {source_type} SKILL.md from {type_dir.name} "
f"({len(content)} chars)"
)
except OSError as e:
logger.warning(
f"Failed to read {source_type} SKILL.md from {type_dir.name}: {e}"
)
if type_sources:
skill_mds[source_type] = "\n\n---\n\n".join(type_sources)
logger.debug(f"Combined {len(type_sources)} {source_type} SKILL.md files")
logger.info(f"Loaded {len(skill_mds)} source SKILL.md files")
return skill_mds
@@ -477,6 +515,18 @@ This skill synthesizes knowledge from multiple sources:
logger.info("Using PDF SKILL.md as-is")
content = skill_mds["pdf"]
# Generic merge for additional source types not covered by pairwise methods
if not content and skill_mds:
# At least one source SKILL.md exists but not docs/github/pdf
logger.info(f"Generic merge for source types: {list(skill_mds.keys())}")
content = self._generic_merge(skill_mds)
elif content and len(skill_mds) > (int(has_docs) + int(has_github) + int(has_pdf)):
# Pairwise synthesis handled the core types; append additional sources
extra_types = set(skill_mds.keys()) - {"documentation", "github", "pdf"}
if extra_types:
logger.info(f"Appending additional sources: {extra_types}")
content = self._append_extra_sources(content, skill_mds, extra_types)
# Fallback: generate minimal SKILL.md (legacy behavior)
if not content:
logger.warning("No source SKILL.md files found, generating minimal SKILL.md (legacy)")
@@ -574,6 +624,165 @@ This skill synthesizes knowledge from multiple sources:
return "\n".join(lines)
# ------------------------------------------------------------------
# Generic merge system for any combination of source types (v3.2.0+)
# ------------------------------------------------------------------
# Human-readable labels for source types.
# Consumers use .get(type, type.title()) so any type missing from this
# table still gets a reasonable attribution heading.
_SOURCE_LABELS: dict[str, str] = {
    "documentation": "Documentation",
    "github": "GitHub Repository",
    "pdf": "PDF Document",
    "word": "Word Document",
    "epub": "EPUB E-book",
    "video": "Video",
    "local": "Local Codebase",
    "jupyter": "Jupyter Notebook",
    "html": "HTML Document",
    "openapi": "OpenAPI/Swagger Spec",
    "asciidoc": "AsciiDoc Document",
    "pptx": "PowerPoint Presentation",
    "confluence": "Confluence Wiki",
    "notion": "Notion Page",
    "rss": "RSS/Atom Feed",
    "manpage": "Man Page",
    "chat": "Chat Export",
}
def _generic_merge(self, skill_mds: dict[str, str]) -> str:
    """Merge any combination of source SKILL.md files into one document.

    Each source file is split into its sections; sections that share a
    name are emitted once, with per-source attribution whenever more than
    one source contributes. The pairwise synthesis paths (docs+github,
    docs+pdf, ...) are handled elsewhere — this covers every other
    combination generically.

    Args:
        skill_mds: Mapping of source type to SKILL.md content.

    Returns:
        The merged SKILL.md content as a single string.
    """
    skill_name = self.name.lower().replace("_", "-").replace(" ", "-")[:64]
    # Claude skill descriptions are capped at 1024 characters.
    desc = self.description[:1024]

    # Per-source section breakdown, keeping each file's own ordering.
    all_sections = {
        stype: self._parse_skill_md_sections(text)
        for stype, text in skill_mds.items()
    }

    # Unique section names in first-seen order across all sources
    # (earlier sources take priority for ordering).
    ordered_sections = list(
        dict.fromkeys(
            name for sections in all_sections.values() for name in sections
        )
    )

    source_labels = ", ".join(
        self._SOURCE_LABELS.get(t, t.title()) for t in skill_mds
    )
    lines = [
        "---",
        f"name: {skill_name}",
        f"description: {desc}",
        "---",
        "",
        f"# {self.name.replace('_', ' ').title()}",
        "",
        f"{self.description}",
        "",
        f"*Merged from: {source_labels}*",
        "",
    ]

    for section_name in ordered_sections:
        contributors = [
            (stype, sections[section_name])
            for stype, sections in all_sections.items()
            if section_name in sections
        ]
        lines.extend([f"## {section_name}", ""])
        if len(contributors) == 1:
            # Only one source has this section — attribute it once up front.
            stype, body = contributors[0]
            label = self._SOURCE_LABELS.get(stype, stype.title())
            lines.extend([f"*From {label}*", "", body, ""])
        else:
            # Several sources share the section — attribute each chunk.
            for stype, body in contributors:
                label = self._SOURCE_LABELS.get(stype, stype.title())
                lines.extend([f"### From {label}", "", body, ""])

    lines.extend(
        ["---", "", "*Generated by Skill Seeker's unified multi-source scraper*"]
    )
    return "\n".join(lines)
def _append_extra_sources(
    self,
    base_content: str,
    skill_mds: dict[str, str],
    extra_types: set[str],
) -> str:
    """Append additional source content to an existing synthesized SKILL.md.

    Used when the core docs+github+pdf synthesis has run, but there are
    additional source types (epub, jupyter, etc.) that need to be included.

    Args:
        base_content: Already-synthesized SKILL.md content
        skill_mds: All source SKILL.md files
        extra_types: Set of extra source type keys to append

    Returns:
        Extended SKILL.md content
    """
    lines = base_content.split("\n")

    # Locate the end of a YAML frontmatter block, if present. The reverse
    # scan below must not match the frontmatter's closing "---": when the
    # document has no trailing separator, matching it would splice the
    # extra content inside the frontmatter and corrupt it.
    frontmatter_end = 0
    if lines and lines[0].strip() == "---":
        for i in range(1, len(lines)):
            if lines[i].strip() == "---":
                frontmatter_end = i
                break

    # Insert before the final separator (---) past the frontmatter,
    # or at end of file when there is none.
    insertion_index = len(lines)
    for i in range(len(lines) - 1, frontmatter_end, -1):
        if lines[i].strip() == "---":
            insertion_index = i
            break

    # Build the extra content: one "## <Label> Content" block per type,
    # with that source's sections nested underneath.
    extra_lines = [""]
    for source_type in sorted(extra_types):
        if source_type not in skill_mds:
            continue
        label = self._SOURCE_LABELS.get(source_type, source_type.title())
        sections = self._parse_skill_md_sections(skill_mds[source_type])
        extra_lines.append(f"## {label} Content")
        extra_lines.append("")
        for section_name, content in sections.items():
            extra_lines.append(f"### {section_name}")
            extra_lines.append("")
            extra_lines.append(content)
            extra_lines.append("")

    lines[insertion_index:insertion_index] = extra_lines
    return "\n".join(lines)
def _generate_minimal_skill_md(self) -> str:
"""Generate minimal SKILL.md (legacy fallback behavior).
@@ -597,18 +806,42 @@ This skill combines knowledge from multiple sources:
"""
# Source type display keys: type -> (label, primary_key, extra_keys)
_source_detail_map = {
"documentation": ("Documentation", "base_url", [("Pages", "max_pages", "unlimited")]),
"github": (
"GitHub Repository",
"repo",
[("Code Analysis", "code_analysis_depth", "surface"), ("Issues", "max_issues", 0)],
),
"pdf": ("PDF Document", "path", []),
"word": ("Word Document", "path", []),
"epub": ("EPUB E-book", "path", []),
"video": ("Video", "url", []),
"local": ("Local Codebase", "path", [("Analysis Depth", "analysis_depth", "surface")]),
"jupyter": ("Jupyter Notebook", "path", []),
"html": ("HTML Document", "path", []),
"openapi": ("OpenAPI Spec", "path", []),
"asciidoc": ("AsciiDoc Document", "path", []),
"pptx": ("PowerPoint", "path", []),
"confluence": ("Confluence Wiki", "base_url", []),
"notion": ("Notion Page", "page_id", []),
"rss": ("RSS/Atom Feed", "url", []),
"manpage": ("Man Page", "names", []),
"chat": ("Chat Export", "path", []),
}
# List sources
for source in self.config.get("sources", []):
source_type = source["type"]
if source_type == "documentation":
content += f"- ✅ **Documentation**: {source.get('base_url', 'N/A')}\n"
content += f" - Pages: {source.get('max_pages', 'unlimited')}\n"
elif source_type == "github":
content += f"- ✅ **GitHub Repository**: {source.get('repo', 'N/A')}\n"
content += f" - Code Analysis: {source.get('code_analysis_depth', 'surface')}\n"
content += f" - Issues: {source.get('max_issues', 0)}\n"
elif source_type == "pdf":
content += f"- ✅ **PDF Document**: {source.get('path', 'N/A')}\n"
display = _source_detail_map.get(source_type, (source_type.title(), "path", []))
label, primary_key, extras = display
primary_val = source.get(primary_key, "N/A")
if isinstance(primary_val, list):
primary_val = ", ".join(str(v) for v in primary_val)
content += f"- ✅ **{label}**: {primary_val}\n"
for extra_label, extra_key, extra_default in extras:
content += f" - {extra_label}: {source.get(extra_key, extra_default)}\n"
# C3.x Architecture & Code Analysis section (if available)
github_data = self.scraped_data.get("github", {})
@@ -796,6 +1029,27 @@ This skill combines knowledge from multiple sources:
if pdf_list:
self._generate_pdf_references(pdf_list)
# Generate references for all additional source types
_extra_source_types = [
"word",
"epub",
"video",
"jupyter",
"html",
"openapi",
"asciidoc",
"pptx",
"confluence",
"notion",
"rss",
"manpage",
"chat",
]
for source_type in _extra_source_types:
source_list = self.scraped_data.get(source_type, [])
if source_list:
self._generate_generic_references(source_type, source_list)
# Generate merged API reference if available
if self.merged_data:
self._generate_merged_api_reference()
@@ -977,6 +1231,63 @@ This skill combines knowledge from multiple sources:
logger.info(f"Created PDF references ({len(pdf_list)} sources)")
def _generate_generic_references(self, source_type: str, source_list: list[dict]):
    """Generate references for any source type using a generic approach.

    Creates a references/<source_type>/ directory with an index.md and
    copies any cached data files from the source list.

    Args:
        source_type: The source type key (e.g., 'epub', 'jupyter')
        source_list: List of scraped source dicts for this type
    """
    if not source_list:
        return
    # Hoisted out of the per-source loop: import once per call, not once
    # per source.
    import contextlib

    label = self._SOURCE_LABELS.get(source_type, source_type.title())
    type_dir = os.path.join(self.skill_dir, "references", source_type)
    os.makedirs(type_dir, exist_ok=True)
    # Create index
    index_path = os.path.join(type_dir, "index.md")
    with open(index_path, "w", encoding="utf-8") as f:
        f.write(f"# {label} References\n\n")
        f.write(f"Reference from {len(source_list)} {label} source(s).\n\n")
        for i, source_data in enumerate(source_list):
            # Each scraper stores its identifier under a type-specific key;
            # probe the known ones and fall back to a positional id.
            source_id = (
                source_data.get("source_id")
                or source_data.get(f"{source_type}_id")
                or source_data.get("notebook_id")
                or source_data.get("spec_id")
                or source_data.get("feed_id")
                or source_data.get("man_id")
                or source_data.get("chat_id")
                or f"source_{i}"
            )
            f.write(f"## {source_id}\n\n")
            # Write summary of extracted data (string-valued fields only).
            data = source_data.get("data", {})
            if isinstance(data, dict):
                for key in ["title", "description", "metadata"]:
                    if key in data:
                        val = data[key]
                        if isinstance(val, str) and val:
                            f.write(f"**{key.title()}:** {val}\n\n")
            # Best-effort copy of the cached JSON next to the index.
            data_file = source_data.get("data_file")
            if data_file and os.path.isfile(data_file):
                dest = os.path.join(type_dir, f"{source_id}_data.json")
                with contextlib.suppress(OSError):
                    shutil.copy(data_file, dest)
    logger.info(f"Created {label} references ({len(source_list)} sources)")
def _generate_merged_api_reference(self):
"""Generate merged API reference file."""
api_dir = os.path.join(self.skill_dir, "references", "api")