feat: add 10 new skill source types (17 total) with full pipeline integration
Add Jupyter Notebook, Local HTML, OpenAPI/Swagger, AsciiDoc, PowerPoint, RSS/Atom, Man Pages, Confluence, Notion, and Slack/Discord Chat as new skill source types. Each type is fully integrated across: - Standalone CLI commands (skill-seekers <type>) - Auto-detection via 'skill-seekers create' (file extension + content sniffing) - Unified multi-source configs (scraped_data, dispatch, config validation) - Unified skill builder (generic merge + source-attributed synthesis) - MCP server (scrape_generic tool with per-type flag mapping) - pyproject.toml (entry points, optional deps, [all] group) Also fixes: EPUB unified pipeline gap, missing word/video config validators, OpenAPI yaml import guard, MCP flag mismatch for all 10 types, stale docstrings, and adds 77 integration tests + complex-merge workflow. 50 files changed, +20,201 lines
This commit is contained in:
@@ -76,6 +76,17 @@ class UnifiedScraper:
|
||||
"word": [], # List of word sources
|
||||
"video": [], # List of video sources
|
||||
"local": [], # List of local sources (docs or code)
|
||||
"epub": [], # List of epub sources
|
||||
"jupyter": [], # List of Jupyter notebook sources
|
||||
"html": [], # List of local HTML sources
|
||||
"openapi": [], # List of OpenAPI/Swagger spec sources
|
||||
"asciidoc": [], # List of AsciiDoc sources
|
||||
"pptx": [], # List of PowerPoint sources
|
||||
"confluence": [], # List of Confluence wiki sources
|
||||
"notion": [], # List of Notion page sources
|
||||
"rss": [], # List of RSS/Atom feed sources
|
||||
"manpage": [], # List of man page sources
|
||||
"chat": [], # List of Slack/Discord chat sources
|
||||
}
|
||||
|
||||
# Track source index for unique naming (multi-source support)
|
||||
@@ -86,6 +97,17 @@ class UnifiedScraper:
|
||||
"word": 0,
|
||||
"video": 0,
|
||||
"local": 0,
|
||||
"epub": 0,
|
||||
"jupyter": 0,
|
||||
"html": 0,
|
||||
"openapi": 0,
|
||||
"asciidoc": 0,
|
||||
"pptx": 0,
|
||||
"confluence": 0,
|
||||
"notion": 0,
|
||||
"rss": 0,
|
||||
"manpage": 0,
|
||||
"chat": 0,
|
||||
}
|
||||
|
||||
# Output paths - cleaner organization
|
||||
@@ -166,6 +188,28 @@ class UnifiedScraper:
|
||||
self._scrape_video(source)
|
||||
elif source_type == "local":
|
||||
self._scrape_local(source)
|
||||
elif source_type == "epub":
|
||||
self._scrape_epub(source)
|
||||
elif source_type == "jupyter":
|
||||
self._scrape_jupyter(source)
|
||||
elif source_type == "html":
|
||||
self._scrape_html(source)
|
||||
elif source_type == "openapi":
|
||||
self._scrape_openapi(source)
|
||||
elif source_type == "asciidoc":
|
||||
self._scrape_asciidoc(source)
|
||||
elif source_type == "pptx":
|
||||
self._scrape_pptx(source)
|
||||
elif source_type == "confluence":
|
||||
self._scrape_confluence(source)
|
||||
elif source_type == "notion":
|
||||
self._scrape_notion(source)
|
||||
elif source_type == "rss":
|
||||
self._scrape_rss(source)
|
||||
elif source_type == "manpage":
|
||||
self._scrape_manpage(source)
|
||||
elif source_type == "chat":
|
||||
self._scrape_chat(source)
|
||||
else:
|
||||
logger.warning(f"Unknown source type: {source_type}")
|
||||
except Exception as e:
|
||||
@@ -571,6 +615,7 @@ class UnifiedScraper:
|
||||
{
|
||||
"docx_path": docx_path,
|
||||
"docx_id": docx_id,
|
||||
"word_id": docx_id, # Alias for generic reference generation
|
||||
"idx": idx,
|
||||
"data": word_data,
|
||||
"data_file": cache_word_data,
|
||||
@@ -788,6 +833,595 @@ class UnifiedScraper:
|
||||
logger.debug(f"Traceback: {traceback.format_exc()}")
|
||||
raise
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# New source type handlers (v3.2.0+)
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _scrape_epub(self, source: dict[str, Any]):
    """Scrape EPUB e-book (.epub)."""
    try:
        from skill_seekers.cli.epub_scraper import EpubToSkillConverter
    except ImportError:
        logger.error(
            "EPUB scraper dependencies not installed.\n"
            " Install with: pip install skill-seekers[epub]"
        )
        return

    # Claim the next per-type index so repeated epub sources get unique names.
    source_idx = self._source_counters["epub"]
    self._source_counters["epub"] = source_idx + 1

    book_path = source["path"]
    book_id = os.path.splitext(os.path.basename(book_path))[0]

    logger.info(f"Scraping EPUB: {source['path']}")
    converter = EpubToSkillConverter(
        {
            "name": f"{self.name}_epub_{source_idx}_{book_id}",
            "epub_path": source["path"],
            "description": source.get("description", f"{book_id} e-book"),
        }
    )
    converter.extract_epub()

    # Load the converter's JSON output and snapshot it into our cache dir.
    with open(converter.data_file, encoding="utf-8") as fh:
        book_data = json.load(fh)
    cached_copy = os.path.join(self.data_dir, f"epub_data_{source_idx}_{book_id}.json")
    shutil.copy(converter.data_file, cached_copy)

    self.scraped_data["epub"].append(
        {
            "epub_path": book_path,
            "epub_id": book_id,
            "idx": source_idx,
            "data": book_data,
            "data_file": cached_copy,
        }
    )

    # Standalone SKILL.md is best-effort; the unified build proceeds either way.
    try:
        converter.build_skill()
        logger.info("✅ EPUB: Standalone SKILL.md created")
    except Exception as e:
        logger.warning(f"⚠️ Failed to build standalone EPUB SKILL.md: {e}")

    logger.info(f"✅ EPUB: {len(book_data.get('chapters', []))} chapters extracted")
|
||||
|
||||
def _scrape_jupyter(self, source: dict[str, Any]):
    """Scrape Jupyter Notebook (.ipynb)."""
    try:
        from skill_seekers.cli.jupyter_scraper import JupyterToSkillConverter
    except ImportError:
        logger.error(
            "Jupyter scraper dependencies not installed.\n"
            " Install with: pip install skill-seekers[jupyter]"
        )
        return

    # Per-type counter keeps generated names unique across multiple sources.
    source_idx = self._source_counters["jupyter"]
    self._source_counters["jupyter"] = source_idx + 1

    notebook_path = source["path"]
    notebook_id = os.path.splitext(os.path.basename(notebook_path))[0]

    logger.info(f"Scraping Jupyter Notebook: {source['path']}")
    converter = JupyterToSkillConverter(
        {
            "name": f"{self.name}_jupyter_{source_idx}_{notebook_id}",
            "notebook_path": source["path"],
            "description": source.get("description", f"{notebook_id} notebook"),
        }
    )
    converter.extract_notebook()

    # Read the converter's JSON output, then keep a copy in our cache dir.
    with open(converter.data_file, encoding="utf-8") as fh:
        notebook_data = json.load(fh)
    cached_copy = os.path.join(
        self.data_dir, f"jupyter_data_{source_idx}_{notebook_id}.json"
    )
    shutil.copy(converter.data_file, cached_copy)

    self.scraped_data["jupyter"].append(
        {
            "notebook_path": notebook_path,
            "notebook_id": notebook_id,
            "idx": source_idx,
            "data": notebook_data,
            "data_file": cached_copy,
        }
    )

    # Standalone SKILL.md is best-effort only.
    try:
        converter.build_skill()
        logger.info("✅ Jupyter: Standalone SKILL.md created")
    except Exception as e:
        logger.warning(f"⚠️ Failed to build standalone Jupyter SKILL.md: {e}")

    logger.info(f"✅ Jupyter: {len(notebook_data.get('cells', []))} cells extracted")
|
||||
|
||||
def _scrape_html(self, source: dict[str, Any]):
    """Scrape local HTML file(s)."""
    try:
        from skill_seekers.cli.html_scraper import HtmlToSkillConverter
    except ImportError:
        logger.error("html_scraper.py not found")
        return

    source_idx = self._source_counters["html"]
    self._source_counters["html"] = source_idx + 1

    page_path = source["path"]
    # rstrip tolerates a trailing slash on directory sources.
    page_id = os.path.splitext(os.path.basename(page_path.rstrip("/")))[0]

    logger.info(f"Scraping local HTML: {source['path']}")
    converter = HtmlToSkillConverter(
        {
            "name": f"{self.name}_html_{source_idx}_{page_id}",
            "html_path": source["path"],
            "description": source.get("description", f"{page_id} HTML content"),
        }
    )
    converter.extract_html()

    # Load the converter's JSON output and snapshot it into our cache dir.
    with open(converter.data_file, encoding="utf-8") as fh:
        page_data = json.load(fh)
    cached_copy = os.path.join(self.data_dir, f"html_data_{source_idx}_{page_id}.json")
    shutil.copy(converter.data_file, cached_copy)

    self.scraped_data["html"].append(
        {
            "html_path": page_path,
            "html_id": page_id,
            "idx": source_idx,
            "data": page_data,
            "data_file": cached_copy,
        }
    )

    # Standalone SKILL.md is best-effort only.
    try:
        converter.build_skill()
        logger.info("✅ HTML: Standalone SKILL.md created")
    except Exception as e:
        logger.warning(f"⚠️ Failed to build standalone HTML SKILL.md: {e}")

    logger.info(f"✅ HTML: {len(page_data.get('pages', []))} pages extracted")
|
||||
|
||||
def _scrape_openapi(self, source: dict[str, Any]):
    """Scrape OpenAPI/Swagger specification.

    Accepts either a local ``path`` or a remote ``url`` in *source*;
    records the parsed spec under ``self.scraped_data["openapi"]`` and
    builds a best-effort standalone SKILL.md.
    """
    try:
        from skill_seekers.cli.openapi_scraper import OpenAPIToSkillConverter
    except ImportError:
        logger.error("openapi_scraper.py not found")
        return

    idx = self._source_counters["openapi"]
    self._source_counters["openapi"] += 1

    # Prefer a local path, else a URL. Using `or` (rather than dict-get
    # defaults) also covers a "path" key present with an explicit None.
    spec_path = source.get("path") or source.get("url") or ""
    # Derive a stable id from the basename. A trailing "/" (directory-style
    # URL) would otherwise yield an empty id and a malformed generated
    # skill name, so fall back to spec_{idx}.
    spec_id = (
        os.path.splitext(os.path.basename(spec_path.rstrip("/")))[0]
        if spec_path
        else ""
    ) or f"spec_{idx}"

    openapi_config = {
        "name": f"{self.name}_openapi_{idx}_{spec_id}",
        "spec_path": source.get("path"),
        "spec_url": source.get("url"),
        "description": source.get("description", f"{spec_id} API spec"),
    }

    logger.info(f"Scraping OpenAPI spec: {spec_path}")
    converter = OpenAPIToSkillConverter(openapi_config)
    converter.extract_spec()

    api_data_file = converter.data_file
    with open(api_data_file, encoding="utf-8") as f:
        api_data = json.load(f)

    # Snapshot the converter output into our cache directory.
    cache_api_data = os.path.join(self.data_dir, f"openapi_data_{idx}_{spec_id}.json")
    shutil.copy(api_data_file, cache_api_data)

    self.scraped_data["openapi"].append(
        {
            "spec_path": spec_path,
            "spec_id": spec_id,
            "idx": idx,
            "data": api_data,
            "data_file": cache_api_data,
        }
    )

    # Standalone SKILL.md is best-effort; the unified build continues on failure.
    try:
        converter.build_skill()
        logger.info("✅ OpenAPI: Standalone SKILL.md created")
    except Exception as e:
        logger.warning(f"⚠️ Failed to build standalone OpenAPI SKILL.md: {e}")

    logger.info(f"✅ OpenAPI: {len(api_data.get('endpoints', []))} endpoints extracted")
|
||||
|
||||
def _scrape_asciidoc(self, source: dict[str, Any]):
    """Scrape AsciiDoc document(s)."""
    try:
        from skill_seekers.cli.asciidoc_scraper import AsciiDocToSkillConverter
    except ImportError:
        logger.error(
            "AsciiDoc scraper dependencies not installed.\n"
            " Install with: pip install skill-seekers[asciidoc]"
        )
        return

    source_idx = self._source_counters["asciidoc"]
    self._source_counters["asciidoc"] = source_idx + 1

    doc_path = source["path"]
    # rstrip tolerates a trailing slash on directory sources.
    doc_id = os.path.splitext(os.path.basename(doc_path.rstrip("/")))[0]

    logger.info(f"Scraping AsciiDoc: {source['path']}")
    converter = AsciiDocToSkillConverter(
        {
            "name": f"{self.name}_asciidoc_{source_idx}_{doc_id}",
            "asciidoc_path": source["path"],
            "description": source.get("description", f"{doc_id} AsciiDoc content"),
        }
    )
    converter.extract_asciidoc()

    # Load the converter's JSON output and snapshot it into our cache dir.
    with open(converter.data_file, encoding="utf-8") as fh:
        doc_data = json.load(fh)
    cached_copy = os.path.join(
        self.data_dir, f"asciidoc_data_{source_idx}_{doc_id}.json"
    )
    shutil.copy(converter.data_file, cached_copy)

    self.scraped_data["asciidoc"].append(
        {
            "asciidoc_path": doc_path,
            "asciidoc_id": doc_id,
            "idx": source_idx,
            "data": doc_data,
            "data_file": cached_copy,
        }
    )

    # Standalone SKILL.md is best-effort only.
    try:
        converter.build_skill()
        logger.info("✅ AsciiDoc: Standalone SKILL.md created")
    except Exception as e:
        logger.warning(f"⚠️ Failed to build standalone AsciiDoc SKILL.md: {e}")

    logger.info(f"✅ AsciiDoc: {len(doc_data.get('sections', []))} sections extracted")
|
||||
|
||||
def _scrape_pptx(self, source: dict[str, Any]):
    """Scrape PowerPoint presentation (.pptx)."""
    try:
        from skill_seekers.cli.pptx_scraper import PptxToSkillConverter
    except ImportError:
        logger.error(
            "PowerPoint scraper dependencies not installed.\n"
            " Install with: pip install skill-seekers[pptx]"
        )
        return

    source_idx = self._source_counters["pptx"]
    self._source_counters["pptx"] = source_idx + 1

    deck_path = source["path"]
    deck_id = os.path.splitext(os.path.basename(deck_path))[0]

    logger.info(f"Scraping PowerPoint: {source['path']}")
    converter = PptxToSkillConverter(
        {
            "name": f"{self.name}_pptx_{source_idx}_{deck_id}",
            "pptx_path": source["path"],
            "description": source.get("description", f"{deck_id} presentation"),
        }
    )
    converter.extract_pptx()

    # Load the converter's JSON output and snapshot it into our cache dir.
    with open(converter.data_file, encoding="utf-8") as fh:
        deck_data = json.load(fh)
    cached_copy = os.path.join(self.data_dir, f"pptx_data_{source_idx}_{deck_id}.json")
    shutil.copy(converter.data_file, cached_copy)

    self.scraped_data["pptx"].append(
        {
            "pptx_path": deck_path,
            "pptx_id": deck_id,
            "idx": source_idx,
            "data": deck_data,
            "data_file": cached_copy,
        }
    )

    # Standalone SKILL.md is best-effort only.
    try:
        converter.build_skill()
        logger.info("✅ PowerPoint: Standalone SKILL.md created")
    except Exception as e:
        logger.warning(f"⚠️ Failed to build standalone PowerPoint SKILL.md: {e}")

    logger.info(f"✅ PowerPoint: {len(deck_data.get('slides', []))} slides extracted")
|
||||
|
||||
def _scrape_confluence(self, source: dict[str, Any]):
    """Scrape Confluence wiki (API or exported HTML/XML)."""
    try:
        from skill_seekers.cli.confluence_scraper import ConfluenceToSkillConverter
    except ImportError:
        logger.error(
            "Confluence scraper dependencies not installed.\n"
            " Install with: pip install skill-seekers[confluence]"
        )
        return

    source_idx = self._source_counters["confluence"]
    self._source_counters["confluence"] = source_idx + 1

    # Prefer the space key as the identifier, then the export path; strip
    # any directory components so the id stays filename-safe.
    wiki_id = source.get("space_key", source.get("path", f"confluence_{source_idx}"))
    if isinstance(wiki_id, str) and "/" in wiki_id:
        wiki_id = os.path.basename(wiki_id.rstrip("/"))

    conf_config = {
        "name": f"{self.name}_confluence_{source_idx}_{wiki_id}",
        "base_url": source.get("base_url", source.get("url")),
        "space_key": source.get("space_key"),
        "export_path": source.get("path"),
        "username": source.get("username"),
        "token": source.get("token"),
        "description": source.get("description", f"{wiki_id} Confluence content"),
        "max_pages": source.get("max_pages", 500),
    }

    logger.info(f"Scraping Confluence: {wiki_id}")
    converter = ConfluenceToSkillConverter(conf_config)
    converter.extract_confluence()

    # Load the converter's JSON output and snapshot it into our cache dir.
    with open(converter.data_file, encoding="utf-8") as fh:
        wiki_data = json.load(fh)
    cached_copy = os.path.join(
        self.data_dir, f"confluence_data_{source_idx}_{wiki_id}.json"
    )
    shutil.copy(converter.data_file, cached_copy)

    self.scraped_data["confluence"].append(
        {
            "source_id": wiki_id,
            "idx": source_idx,
            "data": wiki_data,
            "data_file": cached_copy,
        }
    )

    # Standalone SKILL.md is best-effort only.
    try:
        converter.build_skill()
        logger.info("✅ Confluence: Standalone SKILL.md created")
    except Exception as e:
        logger.warning(f"⚠️ Failed to build standalone Confluence SKILL.md: {e}")

    logger.info(f"✅ Confluence: {len(wiki_data.get('pages', []))} pages extracted")
|
||||
|
||||
def _scrape_notion(self, source: dict[str, Any]):
    """Scrape Notion pages (API or exported Markdown)."""
    try:
        from skill_seekers.cli.notion_scraper import NotionToSkillConverter
    except ImportError:
        logger.error(
            "Notion scraper dependencies not installed.\n"
            " Install with: pip install skill-seekers[notion]"
        )
        return

    source_idx = self._source_counters["notion"]
    self._source_counters["notion"] = source_idx + 1

    # Identifier preference: database id, then page id, then export path;
    # strip directory components so the id stays filename-safe.
    page_ref = source.get(
        "database_id", source.get("page_id", source.get("path", f"notion_{source_idx}"))
    )
    if isinstance(page_ref, str) and "/" in page_ref:
        page_ref = os.path.basename(page_ref.rstrip("/"))

    notion_config = {
        "name": f"{self.name}_notion_{source_idx}_{page_ref}",
        "database_id": source.get("database_id"),
        "page_id": source.get("page_id"),
        "export_path": source.get("path"),
        "token": source.get("token"),
        "description": source.get("description", f"{page_ref} Notion content"),
        "max_pages": source.get("max_pages", 500),
    }

    logger.info(f"Scraping Notion: {page_ref}")
    converter = NotionToSkillConverter(notion_config)
    converter.extract_notion()

    # Load the converter's JSON output and snapshot it into our cache dir.
    with open(converter.data_file, encoding="utf-8") as fh:
        page_data = json.load(fh)
    cached_copy = os.path.join(
        self.data_dir, f"notion_data_{source_idx}_{page_ref}.json"
    )
    shutil.copy(converter.data_file, cached_copy)

    self.scraped_data["notion"].append(
        {
            "source_id": page_ref,
            "idx": source_idx,
            "data": page_data,
            "data_file": cached_copy,
        }
    )

    # Standalone SKILL.md is best-effort only.
    try:
        converter.build_skill()
        logger.info("✅ Notion: Standalone SKILL.md created")
    except Exception as e:
        logger.warning(f"⚠️ Failed to build standalone Notion SKILL.md: {e}")

    logger.info(f"✅ Notion: {len(page_data.get('pages', []))} pages extracted")
|
||||
|
||||
def _scrape_rss(self, source: dict[str, Any]):
    """Scrape RSS/Atom feed (with optional full article scraping).

    *source* may provide a remote ``url`` or a local ``path``; parsed
    articles are recorded under ``self.scraped_data["rss"]`` and a
    best-effort standalone SKILL.md is built.
    """
    try:
        from skill_seekers.cli.rss_scraper import RssToSkillConverter
    except ImportError:
        logger.error(
            "RSS scraper dependencies not installed.\n"
            " Install with: pip install skill-seekers[rss]"
        )
        return

    idx = self._source_counters["rss"]
    self._source_counters["rss"] += 1

    # `or` fallbacks (rather than dict-get defaults) also tolerate keys
    # present with an explicit None value.
    feed_url = source.get("url") or source.get("path") or ""
    # Derive a readable id from the last path segment, dropping any query
    # string and extension. A trailing "/" would otherwise produce an empty
    # id (and a malformed generated skill name), so fall back to feed_{idx}.
    last_segment = feed_url.rstrip("/").rsplit("/", 1)[-1]
    feed_id = last_segment.split("?", 1)[0].split(".", 1)[0] or f"feed_{idx}"

    rss_config = {
        "name": f"{self.name}_rss_{idx}_{feed_id}",
        "feed_url": source.get("url"),
        "feed_path": source.get("path"),
        "follow_links": source.get("follow_links", True),
        "max_articles": source.get("max_articles", 50),
        "description": source.get("description", f"{feed_id} RSS/Atom feed"),
    }

    logger.info(f"Scraping RSS/Atom feed: {feed_url}")
    converter = RssToSkillConverter(rss_config)
    converter.extract_feed()

    rss_data_file = converter.data_file
    with open(rss_data_file, encoding="utf-8") as f:
        rss_data = json.load(f)

    # Snapshot the converter output into our cache directory.
    cache_rss_data = os.path.join(self.data_dir, f"rss_data_{idx}_{feed_id}.json")
    shutil.copy(rss_data_file, cache_rss_data)

    self.scraped_data["rss"].append(
        {
            "feed_url": feed_url,
            "feed_id": feed_id,
            "idx": idx,
            "data": rss_data,
            "data_file": cache_rss_data,
        }
    )

    # Standalone SKILL.md is best-effort; the unified build continues on failure.
    try:
        converter.build_skill()
        logger.info("✅ RSS: Standalone SKILL.md created")
    except Exception as e:
        logger.warning(f"⚠️ Failed to build standalone RSS SKILL.md: {e}")

    logger.info(f"✅ RSS: {len(rss_data.get('articles', []))} articles extracted")
|
||||
|
||||
def _scrape_manpage(self, source: dict[str, Any]):
    """Scrape man page(s).

    *source* may provide explicit page ``names`` and/or a ``path`` to man
    page files; parsed pages are recorded under
    ``self.scraped_data["manpage"]`` and a best-effort standalone SKILL.md
    is built.
    """
    try:
        from skill_seekers.cli.man_scraper import ManPageToSkillConverter
    except ImportError:
        logger.error("man_scraper.py not found")
        return

    idx = self._source_counters["manpage"]
    self._source_counters["manpage"] += 1

    # `or` fallbacks also tolerate keys present with an explicit None value.
    man_names = source.get("names") or []
    man_path = source.get("path") or ""
    # If neither names nor a usable path is given, the derived id would be
    # empty (breaking the generated skill name), so fall back to manpage_{idx}.
    man_id = (
        man_names[0] if man_names else os.path.basename(man_path.rstrip("/"))
    ) or f"manpage_{idx}"

    man_config = {
        "name": f"{self.name}_manpage_{idx}_{man_id}",
        "man_names": man_names,
        "man_path": man_path,
        "sections": source.get("sections", []),
        "description": source.get("description", f"{man_id} man pages"),
    }

    logger.info(f"Scraping man pages: {man_id}")
    converter = ManPageToSkillConverter(man_config)
    converter.extract_manpages()

    man_data_file = converter.data_file
    with open(man_data_file, encoding="utf-8") as f:
        man_data = json.load(f)

    # Snapshot the converter output into our cache directory.
    cache_man_data = os.path.join(self.data_dir, f"manpage_data_{idx}_{man_id}.json")
    shutil.copy(man_data_file, cache_man_data)

    self.scraped_data["manpage"].append(
        {
            "man_id": man_id,
            "idx": idx,
            "data": man_data,
            "data_file": cache_man_data,
        }
    )

    # Standalone SKILL.md is best-effort; the unified build continues on failure.
    try:
        converter.build_skill()
        logger.info("✅ Man pages: Standalone SKILL.md created")
    except Exception as e:
        logger.warning(f"⚠️ Failed to build standalone man page SKILL.md: {e}")

    logger.info(f"✅ Man pages: {len(man_data.get('pages', []))} man pages extracted")
|
||||
|
||||
def _scrape_chat(self, source: dict[str, Any]):
    """Scrape Slack/Discord chat export or API."""
    try:
        from skill_seekers.cli.chat_scraper import ChatToSkillConverter
    except ImportError:
        logger.error(
            "Chat scraper dependencies not installed.\n"
            " Install with: pip install skill-seekers[chat]"
        )
        return

    source_idx = self._source_counters["chat"]
    self._source_counters["chat"] = source_idx + 1

    dump_path = source.get("path", "")
    target_channel = source.get("channel", source.get("channel_id", ""))
    # Identifier preference: channel name, export basename, then a fallback.
    conversation_id = (
        target_channel
        or os.path.basename(dump_path.rstrip("/"))
        or f"chat_{source_idx}"
    )

    chat_config = {
        "name": f"{self.name}_chat_{source_idx}_{conversation_id}",
        "export_path": source.get("path"),
        "platform": source.get("platform", "slack"),
        "token": source.get("token"),
        "channel": target_channel,
        "max_messages": source.get("max_messages", 10000),
        "description": source.get("description", f"{conversation_id} chat export"),
    }

    logger.info(f"Scraping chat: {conversation_id}")
    converter = ChatToSkillConverter(chat_config)
    converter.extract_chat()

    # Load the converter's JSON output and snapshot it into our cache dir.
    with open(converter.data_file, encoding="utf-8") as fh:
        transcript = json.load(fh)
    cached_copy = os.path.join(
        self.data_dir, f"chat_data_{source_idx}_{conversation_id}.json"
    )
    shutil.copy(converter.data_file, cached_copy)

    self.scraped_data["chat"].append(
        {
            "chat_id": conversation_id,
            "platform": source.get("platform", "slack"),
            "idx": source_idx,
            "data": transcript,
            "data_file": cached_copy,
        }
    )

    # Standalone SKILL.md is best-effort only.
    try:
        converter.build_skill()
        logger.info("✅ Chat: Standalone SKILL.md created")
    except Exception as e:
        logger.warning(f"⚠️ Failed to build standalone chat SKILL.md: {e}")

    logger.info(f"✅ Chat: {len(transcript.get('messages', []))} messages extracted")
|
||||
|
||||
def _load_json(self, file_path: Path) -> dict:
|
||||
"""
|
||||
Load JSON file safely.
|
||||
@@ -1297,14 +1931,33 @@ Examples:
|
||||
if args.dry_run:
|
||||
logger.info("🔍 DRY RUN MODE - Preview only, no scraping will occur")
|
||||
logger.info(f"\nWould scrape {len(scraper.config.get('sources', []))} sources:")
|
||||
# Source type display config: type -> (label, key for detail)
|
||||
_SOURCE_DISPLAY = {
|
||||
"documentation": ("Documentation", "base_url"),
|
||||
"github": ("GitHub", "repo"),
|
||||
"pdf": ("PDF", "path"),
|
||||
"word": ("Word", "path"),
|
||||
"epub": ("EPUB", "path"),
|
||||
"video": ("Video", "url"),
|
||||
"local": ("Local Codebase", "path"),
|
||||
"jupyter": ("Jupyter Notebook", "path"),
|
||||
"html": ("HTML", "path"),
|
||||
"openapi": ("OpenAPI Spec", "path"),
|
||||
"asciidoc": ("AsciiDoc", "path"),
|
||||
"pptx": ("PowerPoint", "path"),
|
||||
"confluence": ("Confluence", "base_url"),
|
||||
"notion": ("Notion", "page_id"),
|
||||
"rss": ("RSS/Atom Feed", "url"),
|
||||
"manpage": ("Man Page", "names"),
|
||||
"chat": ("Chat Export", "path"),
|
||||
}
|
||||
for idx, source in enumerate(scraper.config.get("sources", []), 1):
|
||||
source_type = source.get("type", "unknown")
|
||||
if source_type == "documentation":
|
||||
logger.info(f" {idx}. Documentation: {source.get('base_url', 'N/A')}")
|
||||
elif source_type == "github":
|
||||
logger.info(f" {idx}. GitHub: {source.get('repo', 'N/A')}")
|
||||
elif source_type == "pdf":
|
||||
logger.info(f" {idx}. PDF: {source.get('pdf_path', 'N/A')}")
|
||||
label, key = _SOURCE_DISPLAY.get(source_type, (source_type.title(), "path"))
|
||||
detail = source.get(key, "N/A")
|
||||
if isinstance(detail, list):
|
||||
detail = ", ".join(str(d) for d in detail)
|
||||
logger.info(f" {idx}. {label}: {detail}")
|
||||
logger.info(f"\nOutput directory: {scraper.output_dir}")
|
||||
logger.info(f"Merge mode: {scraper.merge_mode}")
|
||||
return
|
||||
|
||||
Reference in New Issue
Block a user