feat: add 10 new skill source types (17 total) with full pipeline integration
Add Jupyter Notebook, Local HTML, OpenAPI/Swagger, AsciiDoc, PowerPoint, RSS/Atom, Man Pages, Confluence, Notion, and Slack/Discord Chat as new skill source types.

Each type is fully integrated across:
- Standalone CLI commands (skill-seekers <type>)
- Auto-detection via 'skill-seekers create' (file extension + content sniffing)
- Unified multi-source configs (scraped_data, dispatch, config validation)
- Unified skill builder (generic merge + source-attributed synthesis)
- MCP server (scrape_generic tool with per-type flag mapping)
- pyproject.toml (entry points, optional deps, [all] group)

Also fixes: EPUB unified pipeline gap, missing word/video config validators, OpenAPI yaml import guard, MCP flag mismatch for all 10 types, stale docstrings, and adds 77 integration tests + complex-merge workflow.

50 files changed, +20,201 lines
This commit is contained in:
@@ -140,6 +140,26 @@ class CreateCommand:
|
||||
return self._route_video()
|
||||
elif self.source_info.type == "config":
|
||||
return self._route_config()
|
||||
elif self.source_info.type == "jupyter":
|
||||
return self._route_generic("jupyter_scraper", "--notebook")
|
||||
elif self.source_info.type == "html":
|
||||
return self._route_generic("html_scraper", "--html-path")
|
||||
elif self.source_info.type == "openapi":
|
||||
return self._route_generic("openapi_scraper", "--spec")
|
||||
elif self.source_info.type == "asciidoc":
|
||||
return self._route_generic("asciidoc_scraper", "--asciidoc-path")
|
||||
elif self.source_info.type == "pptx":
|
||||
return self._route_generic("pptx_scraper", "--pptx")
|
||||
elif self.source_info.type == "rss":
|
||||
return self._route_generic("rss_scraper", "--feed-path")
|
||||
elif self.source_info.type == "manpage":
|
||||
return self._route_generic("man_scraper", "--man-path")
|
||||
elif self.source_info.type == "confluence":
|
||||
return self._route_generic("confluence_scraper", "--export-path")
|
||||
elif self.source_info.type == "notion":
|
||||
return self._route_generic("notion_scraper", "--export-path")
|
||||
elif self.source_info.type == "chat":
|
||||
return self._route_generic("chat_scraper", "--export-path")
|
||||
else:
|
||||
logger.error(f"Unknown source type: {self.source_info.type}")
|
||||
return 1
|
||||
@@ -485,6 +505,40 @@ class CreateCommand:
|
||||
finally:
|
||||
sys.argv = original_argv
|
||||
|
||||
def _route_generic(self, module_name: str, file_flag: str) -> int:
    """Route to a scraper module that takes its source via a single flag.

    Shared by the source types that all follow one calling convention
    (jupyter, html, openapi, asciidoc, pptx, rss, manpage, confluence,
    notion, chat): import ``skill_seekers.cli.<module_name>``, build an
    argv of the form ``[module_name, file_flag, <file_path>, ...common]``,
    and invoke the module's ``main()`` with ``sys.argv`` temporarily
    replaced by that list.

    Args:
        module_name: Module name under ``skill_seekers.cli``
            (e.g., "jupyter_scraper"). Also used as argv[0].
        file_flag: CLI flag that introduces the source path
            (e.g., "--notebook").

    Returns:
        Whatever the scraper's ``main()`` returns (its exit code).
    """
    from importlib import import_module

    # Import first so a missing scraper module fails before any argv work.
    scraper = import_module(f"skill_seekers.cli.{module_name}")

    # The file flag is only passed when the parsed source carries a
    # non-empty "file_path"; otherwise argv starts with just the name.
    source_path = self.source_info.parsed.get("file_path", "")
    scraper_argv = (
        [module_name, file_flag, source_path] if source_path else [module_name]
    )
    self._add_common_args(scraper_argv)

    logger.debug(f"Calling {module_name} with argv: {scraper_argv}")

    # The scraper reads its arguments from sys.argv, so swap it in for the
    # duration of the call and always restore the caller's argv afterwards.
    saved_argv = sys.argv
    sys.argv = scraper_argv
    try:
        return scraper.main()
    finally:
        sys.argv = saved_argv
|
||||
|
||||
def _add_common_args(self, argv: list[str]) -> None:
|
||||
"""Add truly universal arguments to argv list.
|
||||
|
||||
|
||||
Reference in New Issue
Block a user