From 37cb30745597217f65786189629c4294b286fb70 Mon Sep 17 00:00:00 2001 From: yusyus Date: Sun, 15 Mar 2026 15:56:04 +0300 Subject: [PATCH] docs: update all documentation for 17 source types MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update 32 documentation files across English and Chinese (zh-CN) docs to reflect the 10 new source types added in the previous commit. Updated files: - README.md, README.zh-CN.md — taglines, feature lists, examples, install extras - docs/reference/ — CLI_REFERENCE, FEATURE_MATRIX, MCP_REFERENCE, CONFIG_FORMAT, API_REFERENCE - docs/features/ — UNIFIED_SCRAPING with generic merge docs - docs/advanced/ — multi-source guide, MCP server guide - docs/getting-started/ — installation extras, quick-start examples - docs/user-guide/ — core-concepts, scraping, packaging, workflows (complex-merge) - docs/ — FAQ, TROUBLESHOOTING, BEST_PRACTICES, ARCHITECTURE, UNIFIED_PARSERS, README - Root — BULLETPROOF_QUICKSTART, CONTRIBUTING, ROADMAP - docs/zh-CN/ — Chinese translations for all of the above 32 files changed, +3,016 lines, -245 lines --- BULLETPROOF_QUICKSTART.md | 22 ++ CONTRIBUTING.md | 51 ++- README.md | 57 ++- README.zh-CN.md | 30 +- ROADMAP.md | 74 ++-- docs/ARCHITECTURE.md | 12 +- docs/BEST_PRACTICES.md | 47 +++ docs/FAQ.md | 133 ++++++- docs/README.md | 28 +- docs/TROUBLESHOOTING.md | 181 +++++++++ docs/advanced/mcp-server.md | 38 +- docs/advanced/multi-source.md | 256 +++++++++++- docs/architecture/UNIFIED_PARSERS.md | 57 ++- docs/features/UNIFIED_SCRAPING.md | 244 +++++++++++- docs/getting-started/01-installation.md | 9 +- docs/getting-started/02-quick-start.md | 57 ++- docs/reference/API_REFERENCE.md | 66 +++- docs/reference/CLI_REFERENCE.md | 368 +++++++++++++++++- docs/reference/CONFIG_FORMAT.md | 214 +++++++++- docs/reference/FEATURE_MATRIX.md | 83 +++- docs/reference/MCP_REFERENCE.md | 82 +++- docs/user-guide/01-core-concepts.md | 162 +++++++- docs/user-guide/02-scraping.md | 285 
+++++++++++++- docs/user-guide/04-packaging.md | 2 +- docs/user-guide/05-workflows.md | 51 ++- docs/zh-CN/README.md | 31 +- docs/zh-CN/advanced/mcp-server.md | 102 +++-- docs/zh-CN/getting-started/01-installation.md | 8 +- docs/zh-CN/getting-started/02-quick-start.md | 56 ++- docs/zh-CN/reference/CLI_REFERENCE.md | 326 +++++++++++++++- docs/zh-CN/reference/FEATURE_MATRIX.md | 45 ++- docs/zh-CN/reference/MCP_REFERENCE.md | 74 +++- 32 files changed, 3011 insertions(+), 240 deletions(-) diff --git a/BULLETPROOF_QUICKSTART.md b/BULLETPROOF_QUICKSTART.md index 126c1c1..dc3d52f 100644 --- a/BULLETPROOF_QUICKSTART.md +++ b/BULLETPROOF_QUICKSTART.md @@ -405,6 +405,28 @@ skill-seekers scrape --config configs/vue.json --max-pages 50 skill-seekers scrape --config configs/django.json --max-pages 50 ``` +### Try Other Source Types (17 Supported!) + +```bash +# Auto-detect source type with the `create` command +skill-seekers create https://docs.example.com # Documentation +skill-seekers create facebook/react # GitHub repo +skill-seekers create manual.pdf # PDF +skill-seekers create report.docx # Word document +skill-seekers create book.epub # EPUB book +skill-seekers create analysis.ipynb # Jupyter Notebook +skill-seekers create spec.yaml # OpenAPI/Swagger spec +skill-seekers create slides.pptx # PowerPoint + +# Or use specific subcommands +skill-seekers video https://youtube.com/watch?v=abc # Video +skill-seekers confluence --space DOCS # Confluence wiki +skill-seekers notion --database DB_ID # Notion +skill-seekers rss https://blog.example.com/feed.xml # RSS feed +skill-seekers manpage grep.1 # Man page +skill-seekers chat --platform slack --export-dir ./export # Slack/Discord +``` + ### Create Custom Skills ```bash diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 306f47f..8902d36 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -441,21 +441,46 @@ def test_config_validation_with_missing_fields(): ``` Skill_Seekers/ -├── cli/ # CLI tools -│ ├── doc_scraper.py # Main 
scraper -│ ├── package_skill.py # Packager -│ ├── upload_skill.py # Uploader -│ └── utils.py # Shared utilities -├── mcp/ # MCP server -│ ├── server.py # MCP implementation -│ └── requirements.txt # MCP dependencies -├── configs/ # Framework configs -├── docs/ # Documentation -├── tests/ # Test suite -└── .github/ # GitHub config - └── workflows/ # CI/CD workflows +├── src/skill_seekers/ # Main package (src/ layout) +│ ├── cli/ # CLI commands and entry points +│ │ ├── main.py # Unified CLI entry (COMMAND_MODULES dict) +│ │ ├── source_detector.py # Auto-detects source type +│ │ ├── create_command.py # Unified `create` command routing +│ │ ├── config_validator.py # VALID_SOURCE_TYPES set +│ │ ├── unified_scraper.py # Multi-source orchestrator +│ │ ├── unified_skill_builder.py # Pairwise synthesis + generic merge +│ │ ├── doc_scraper.py # Documentation (web) +│ │ ├── github_scraper.py # GitHub repos +│ │ ├── pdf_scraper.py # PDF files +│ │ ├── word_scraper.py # Word (.docx) +│ │ ├── epub_scraper.py # EPUB books +│ │ ├── video_scraper.py # Video (YouTube, Vimeo, local) +│ │ ├── codebase_scraper.py # Local codebases +│ │ ├── jupyter_scraper.py # Jupyter Notebooks +│ │ ├── html_scraper.py # Local HTML files +│ │ ├── openapi_scraper.py # OpenAPI/Swagger specs +│ │ ├── asciidoc_scraper.py # AsciiDoc files +│ │ ├── pptx_scraper.py # PowerPoint files +│ │ ├── rss_scraper.py # RSS/Atom feeds +│ │ ├── manpage_scraper.py # Man pages +│ │ ├── confluence_scraper.py # Confluence wikis +│ │ ├── notion_scraper.py # Notion pages +│ │ ├── chat_scraper.py # Slack/Discord exports +│ │ ├── adaptors/ # Platform adaptors (Strategy pattern) +│ │ ├── arguments/ # CLI argument definitions (one per source) +│ │ ├── parsers/ # Subcommand parsers (one per source) +│ │ └── storage/ # Cloud storage adaptors +│ ├── mcp/ # MCP server + tools +│ └── sync/ # Sync monitoring +├── configs/ # Preset JSON scraping configs +├── docs/ # Documentation +├── tests/ # 115+ test files (pytest) +└── .github/ # 
GitHub config + └── workflows/ # CI/CD workflows ``` +**Scraper pattern (17 source types):** Each source type has `cli/<type>_scraper.py` (with `<Type>ToSkillConverter` class + `main()`), `arguments/<type>.py`, and `parsers/<type>_parser.py`. Register new types in: `parsers/__init__.py` PARSERS list, `main.py` COMMAND_MODULES dict, `config_validator.py` VALID_SOURCE_TYPES set. + --- ## Release Process diff --git a/README.md b/README.md index a7a4b43..b8f59d6 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@ English | [简体中文](https://github.com/yusufkaraaslan/Skill_Seekers/blob/ma [![Twitter Follow](https://img.shields.io/twitter/follow/_yUSyUS_?style=social)](https://x.com/_yUSyUS_) [![GitHub Repo stars](https://img.shields.io/github/stars/yusufkaraaslan/Skill_Seekers?style=social)](https://github.com/yusufkaraaslan/Skill_Seekers) -**🧠 The data layer for AI systems.** Skill Seekers turns any documentation, GitHub repo, PDF, or video into structured knowledge assets—ready to power AI Skills (Claude, Gemini, OpenAI), RAG pipelines (LangChain, LlamaIndex, Pinecone), and AI coding assistants (Cursor, Windsurf, Cline) in minutes, not hours. +**🧠 The data layer for AI systems.** Skill Seekers turns documentation sites, GitHub repos, PDFs, videos, notebooks, wikis, and 10+ more source types into structured knowledge assets—ready to power AI Skills (Claude, Gemini, OpenAI), RAG pipelines (LangChain, LlamaIndex, Pinecone), and AI coding assistants (Cursor, Windsurf, Cline) in minutes, not hours. > 🌐 **[Visit SkillSeekersWeb.com](https://skillseekersweb.com/)** - Browse 24+ preset configs, share your configs, and access complete documentation!
@@ -63,7 +63,7 @@ skill-seekers package output/react --target cursor # → .cursorrules - 🎯 **AI Skill quality** — 500+ line SKILL.md files with examples, patterns, and guides - 📊 **RAG-ready chunks** — Smart chunking preserves code blocks and maintains context - 🎬 **Videos** — Extract code, transcripts, and structured knowledge from YouTube and local videos -- 🔄 **Multi-source** — Combine docs + GitHub + PDFs + videos into one knowledge asset +- 🔄 **Multi-source** — Combine 17 source types (docs, GitHub, PDFs, videos, notebooks, wikis, and more) into one knowledge asset - 🌐 **One prep, every target** — Export the same asset to 16 platforms without re-scraping - ✅ **Battle-tested** — 2,540+ tests, 24+ framework presets, production-ready @@ -82,7 +82,7 @@ skill-seekers package output/django --target claude **That's it!** You now have `output/django-claude.zip` ready to use. -### Other Sources +### Other Sources (17 Supported) ```bash # GitHub repository @@ -94,10 +94,46 @@ skill-seekers create ./my-project # PDF document skill-seekers create manual.pdf +# Word document +skill-seekers create report.docx + +# EPUB e-book +skill-seekers create book.epub + +# Jupyter Notebook +skill-seekers create notebook.ipynb + +# OpenAPI spec +skill-seekers create openapi.yaml + +# PowerPoint presentation +skill-seekers create presentation.pptx + +# AsciiDoc document +skill-seekers create guide.adoc + +# Local HTML file +skill-seekers create page.html + +# RSS/Atom feed +skill-seekers create feed.rss + +# Man page +skill-seekers create curl.1 + # Video (YouTube, Vimeo, or local file — requires skill-seekers[video]) skill-seekers video --url https://www.youtube.com/watch?v=... --name mytutorial # First time? Auto-install GPU-aware visual deps: skill-seekers video --setup + +# Confluence wiki +skill-seekers confluence --space TEAM --name wiki + +# Notion pages +skill-seekers notion --database-id ... 
--name docs + +# Slack/Discord chat export +skill-seekers chat --export-dir ./slack-export --name team-chat ``` ### Export Everywhere @@ -111,7 +147,7 @@ done ## What is Skill Seekers? -Skill Seekers is the **data layer for AI systems**. It transforms documentation websites, GitHub repositories, PDF files, and videos into structured knowledge assets for every AI target: +Skill Seekers is the **data layer for AI systems**. It transforms 17 source types—documentation websites, GitHub repositories, PDFs, videos, Jupyter Notebooks, Word/EPUB/AsciiDoc documents, OpenAPI specs, PowerPoint presentations, RSS feeds, man pages, Confluence wikis, Notion pages, Slack/Discord exports, and more—into structured knowledge assets for every AI target: | Use Case | What you get | Examples | |----------|-------------|---------| @@ -137,7 +173,7 @@ Skill Seekers is the **data layer for AI systems**. It transforms documentation Instead of spending days on manual preprocessing, Skill Seekers: -1. **Ingests** — docs, GitHub repos, local codebases, PDFs, videos +1. **Ingests** — docs, GitHub repos, local codebases, PDFs, videos, notebooks, wikis, and 10+ more source types 2. **Analyzes** — deep AST parsing, pattern detection, API extraction 3. **Structures** — categorized reference files with metadata 4. **Enhances** — AI-powered SKILL.md generation (Claude, Gemini, or local) @@ -610,6 +646,13 @@ skill-seekers-setup | `pip install skill-seekers[mcp]` | + MCP server for Claude Code, Cursor, etc. 
| | `pip install skill-seekers[video]` | + YouTube/Vimeo transcript & metadata extraction | | `pip install skill-seekers[video-full]` | + Whisper transcription & visual frame extraction | +| `pip install skill-seekers[jupyter]` | + Jupyter Notebook support | +| `pip install skill-seekers[pptx]` | + PowerPoint support | +| `pip install skill-seekers[confluence]` | + Confluence wiki support | +| `pip install skill-seekers[notion]` | + Notion pages support | +| `pip install skill-seekers[rss]` | + RSS/Atom feed support | +| `pip install skill-seekers[chat]` | + Slack/Discord chat export support | +| `pip install skill-seekers[asciidoc]` | + AsciiDoc document support | | `pip install skill-seekers[all]` | Everything enabled | > **Video visual deps (GPU-aware):** After installing `skill-seekers[video-full]`, run @@ -655,10 +698,10 @@ skill-seekers install --config react --dry-run ## 📊 Feature Matrix -Skill Seekers supports **4 LLM platforms** and **6 skill modes** with full feature parity. +Skill Seekers supports **4 LLM platforms**, **17 source types**, and full feature parity across all targets. **Platforms:** Claude AI, Google Gemini, OpenAI ChatGPT, Generic Markdown -**Skill Modes:** Documentation, GitHub, PDF, Video, Unified Multi-Source, Local Repository +**Source Types:** Documentation websites, GitHub repos, PDFs, Word (.docx), EPUB, Video, Local codebases, Jupyter Notebooks, Local HTML, OpenAPI/Swagger, AsciiDoc, PowerPoint (.pptx), RSS/Atom feeds, Man pages, Confluence wikis, Notion pages, Slack/Discord chat exports See [Complete Feature Matrix](docs/FEATURE_MATRIX.md) for detailed platform and feature support. 
diff --git a/README.zh-CN.md b/README.zh-CN.md index bccbf43..b7629d6 100644 --- a/README.zh-CN.md +++ b/README.zh-CN.md @@ -23,7 +23,7 @@ [![关注 Twitter](https://img.shields.io/twitter/follow/_yUSyUS_?style=social)](https://x.com/_yUSyUS_) [![GitHub Stars](https://img.shields.io/github/stars/yusufkaraaslan/Skill_Seekers?style=social)](https://github.com/yusufkaraaslan/Skill_Seekers) -**🧠 AI 系统的数据层。** Skill Seekers 将任何文档、GitHub 仓库、PDF 或视频转换为结构化知识资产——可在几分钟内为 AI 技能(Claude、Gemini、OpenAI)、RAG 流水线(LangChain、LlamaIndex、Pinecone)和 AI 编程助手(Cursor、Windsurf、Cline)提供支持。 +**🧠 AI 系统的数据层。** Skill Seekers 将文档网站、GitHub 仓库、PDF、视频、Jupyter 笔记本、Wiki 等 17 种以上来源类型转换为结构化知识资产——可在几分钟内为 AI 技能(Claude、Gemini、OpenAI)、RAG 流水线(LangChain、LlamaIndex、Pinecone)和 AI 编程助手(Cursor、Windsurf、Cline)提供支持。 > 🌐 **[访问 SkillSeekersWeb.com](https://skillseekersweb.com/)** - 浏览 24+ 个预设配置,分享您的配置,访问完整文档! @@ -66,7 +66,7 @@ skill-seekers package output/react --target cursor # → .cursorrules - ⚡ **快 99%** — 数天的手动数据准备 → 15–45 分钟 - 🎯 **AI 技能质量** — 500+ 行的 SKILL.md 文件,包含示例、模式和指南 - 📊 **RAG 就绪的分块** — 智能分块保留代码块并维护上下文 -- 🔄 **多源支持** — 将文档 + GitHub + PDF 合并为一个知识资产 +- 🔄 **17 种来源类型** — 将文档 + GitHub + PDF + 视频 + 笔记本 + Wiki 等合并为一个知识资产 - 🌐 **一次准备,导出所有目标** — 无需重新抓取即可导出到 16 个平台 - 🎬 **视频** — 从 YouTube 和本地视频提取代码、字幕和结构化知识 - ✅ **久经考验** — 2,540+ 测试,24+ 框架预设,生产就绪 @@ -81,6 +81,13 @@ skill-seekers create https://docs.django.com/ # 文档网站 skill-seekers create django/django # GitHub 仓库 skill-seekers create ./my-codebase # 本地项目 skill-seekers create manual.pdf # PDF 文件 +skill-seekers create manual.docx # Word 文档 +skill-seekers create book.epub # EPUB 电子书 +skill-seekers create notebook.ipynb # Jupyter 笔记本 +skill-seekers create page.html # 本地 HTML +skill-seekers create api-spec.yaml # OpenAPI/Swagger 规范 +skill-seekers create guide.adoc # AsciiDoc 文档 +skill-seekers create slides.pptx # PowerPoint 演示文稿 # 视频(YouTube、Vimeo 或本地文件 — 需要 skill-seekers[video]) skill-seekers video --url https://www.youtube.com/watch?v=... 
--name mytutorial @@ -100,7 +107,7 @@ skill-seekers package output/django --target cursor # Cursor IDE 上下文 ## 什么是 Skill Seekers? -Skill Seekers 是 **AI 系统的数据层**,将文档网站、GitHub 仓库、PDF 文件和视频转换为适用于所有 AI 目标的结构化知识资产: +Skill Seekers 是 **AI 系统的数据层**,将 17 种来源类型——文档网站、GitHub 仓库、PDF、视频、Jupyter 笔记本、Word/EPUB/AsciiDoc 文档、OpenAPI/Swagger 规范、PowerPoint 演示文稿、RSS/Atom 订阅源、Man 手册页、Confluence 维基、Notion 页面、Slack/Discord 聊天记录等——转换为适用于所有 AI 目标的结构化知识资产: | 使用场景 | 获得的内容 | 示例 | |---------|-----------|------| @@ -111,7 +118,7 @@ Skill Seekers 是 **AI 系统的数据层**,将文档网站、GitHub 仓库、 Skill Seekers 通过以下步骤代替数天的手动预处理工作: -1. **采集** — 文档、GitHub 仓库、本地代码库、PDF、视频 +1. **采集** — 文档、GitHub 仓库、本地代码库、PDF、视频、Jupyter 笔记本、Wiki 等 17 种以上来源类型 2. **分析** — 深度 AST 解析、模式检测、API 提取 3. **结构化** — 带元数据的分类参考文件 4. **增强** — AI 驱动的 SKILL.md 生成(Claude、Gemini 或本地) @@ -526,6 +533,10 @@ skill-seekers-setup | `pip install skill-seekers[mcp]` | + MCP 服务器 | | `pip install skill-seekers[video]` | + YouTube/Vimeo 字幕和元数据提取 | | `pip install skill-seekers[video-full]` | + Whisper 转录和视觉帧提取 | +| `pip install skill-seekers[jupyter]` | + Jupyter 笔记本提取 | +| `pip install skill-seekers[ocr]` | + OCR 支持(PDF 扫描件、视觉帧) | +| `pip install skill-seekers[confluence]` | + Confluence 维基支持 | +| `pip install skill-seekers[notion]` | + Notion 页面支持 | | `pip install skill-seekers[all]` | 全部功能 | > **视频视觉依赖(GPU 感知):** 安装 `skill-seekers[video-full]` 后,运行 @@ -565,9 +576,10 @@ skill-seekers install --config react --dry-run ## 📊 功能矩阵 -Skill Seekers 支持 **4 个 LLM 平台**和 **5 种技能模式**,功能完全对等。 +Skill Seekers 支持 **4 个 LLM 平台**、**17 种来源类型**和 **5 种技能模式**,功能完全对等。 **平台:** Claude AI、Google Gemini、OpenAI ChatGPT、通用 Markdown +**来源类型:** 文档网站、GitHub 仓库、PDF、Word、EPUB、视频、本地代码库、Jupyter 笔记本、本地 HTML、OpenAPI/Swagger 规范、AsciiDoc 文档、PowerPoint 演示文稿、RSS/Atom 订阅源、Man 手册页、Confluence 维基、Notion 页面、Slack/Discord 聊天记录 **技能模式:** 文档、GitHub、PDF、统一多源、本地仓库 完整信息请查看 [完整功能矩阵](docs/FEATURE_MATRIX.md)。 @@ -830,7 +842,7 @@ skill-seekers install-agent output/react/ --agent cursor --dry-run --- -## 🔌 MCP 集成(26 个工具) 
+## 🔌 MCP 集成(27 个工具) Skill Seekers 提供 MCP 服务器,可在 Claude Code、Cursor、Windsurf、VS Code + Cline 或 IntelliJ IDEA 中使用。 @@ -845,12 +857,14 @@ python -m skill_seekers.mcp.server_fastmcp --transport http --port 8765 ./setup_mcp.sh ``` -**所有 26 个工具:** +**所有 27 个工具:** - **核心(9 个):** `list_configs`、`generate_config`、`validate_config`、`estimate_pages`、`scrape_docs`、`package_skill`、`upload_skill`、`enhance_skill`、`install_skill` -- **扩展(10 个):** `scrape_github`、`scrape_pdf`、`unified_scrape`、`merge_sources`、`detect_conflicts`、`add_config_source`、`fetch_config`、`list_config_sources`、`remove_config_source`、`split_config` +- **扩展(11 个):** `scrape_github`、`scrape_pdf`、`scrape_generic`、`unified_scrape`、`merge_sources`、`detect_conflicts`、`add_config_source`、`fetch_config`、`list_config_sources`、`remove_config_source`、`split_config` - **向量数据库(4 个):** `export_to_chroma`、`export_to_weaviate`、`export_to_faiss`、`export_to_qdrant` - **云存储(3 个):** `cloud_upload`、`cloud_download`、`cloud_list` +> `scrape_generic` 支持 10 种新来源类型:Jupyter 笔记本、本地 HTML、OpenAPI/Swagger 规范、AsciiDoc 文档、PowerPoint 演示文稿、RSS/Atom 订阅源、Man 手册页、Confluence 维基、Notion 页面、Slack/Discord 聊天记录。 + **完整指南:** [docs/MCP_SETUP.md](docs/MCP_SETUP.md) --- diff --git a/ROADMAP.md b/ROADMAP.md index bbe4511..2e6fd5d 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -4,16 +4,14 @@ Transform Skill Seekers into the easiest way to create Claude AI skills from **a --- -## 🎯 Current Status: v3.1.0-dev ✅ +## 🎯 Current Status: v3.2.0 ✅ -**Latest Release:** v3.0.0 (February 10, 2026) | In Development: v3.1.0-dev +**Latest Release:** v3.2.0 (March 2026) **What Works:** -- ✅ Documentation scraping (HTML websites with llms.txt support) -- ✅ GitHub repository scraping with C3.x codebase analysis -- ✅ PDF extraction with OCR and image support -- ✅ Unified multi-source scraping (docs + GitHub + PDF) -- ✅ 26 MCP tools fully functional +- ✅ **17 source types** — documentation, GitHub, PDF, video, Word, EPUB, Jupyter, local HTML, OpenAPI, AsciiDoc, PowerPoint, 
RSS/Atom, man pages, Confluence, Notion, Slack/Discord, local codebase +- ✅ Unified multi-source scraping with generic merge for any source combination +- ✅ 26+ MCP tools fully functional - ✅ Multi-platform support (16 platforms: Claude, Gemini, OpenAI, LangChain, LlamaIndex, Haystack, ChromaDB, FAISS, Weaviate, Qdrant, Cursor, Windsurf, Cline, Continue.dev, Pinecone, Markdown) - ✅ Auto-upload to all platforms - ✅ 24 preset configs (including 7 unified configs) @@ -21,12 +19,15 @@ Transform Skill Seekers into the easiest way to create Claude AI skills from **a - ✅ C3.x codebase analysis suite (C3.1-C3.10) - ✅ Bootstrap skill feature - self-hosting capability - ✅ 1,880+ tests passing -- ✅ Unified `create` command with auto-detection and progressive help +- ✅ Unified `create` command with auto-detection for all 17 source types - ✅ Enhancement workflow presets (5 bundled: default, minimal, security-focus, architecture-comprehensive, api-documentation) - ✅ Cloud storage integration (S3, GCS, Azure) +- ✅ Source auto-detection via `source_detector.py` -**Recent Improvements (v3.1.0-dev):** -- ✅ **Unified CLI**: `create` command auto-detects web/GitHub/local/PDF sources +**Recent Improvements (v3.2.0):** +- ✅ **10 new source types**: Jupyter, local HTML, OpenAPI, AsciiDoc, PowerPoint, RSS/Atom, man pages, Confluence, Notion, Slack/Discord +- ✅ **Generic merge system**: `_generic_merge()` in `unified_skill_builder.py` handles arbitrary source combinations +- ✅ **Unified CLI**: `create` command auto-detects all 17 source types +- ✅ **Workflow Presets**: YAML-based enhancement presets with CLI management +- ✅ **Progressive Disclosure**: Default help shows 13 universal flags, detailed help per source +- ✅ **Bug Fixes**: Markdown parser h1 filtering, paragraph length filtering @@ -105,26 +106,38 @@ Small tasks that build community features incrementally ### 🛠️ **Category B: New Input Formats** Add support for non-HTML documentation sources -#### B1: PDF
Documentation Support -- [ ] **Task B1.1:** Research PDF parsing libraries -- [ ] **Task B1.2:** Create simple PDF text extractor (POC) -- [ ] **Task B1.3:** Add PDF page detection and chunking -- [ ] **Task B1.4:** Extract code blocks from PDFs -- [ ] **Task B1.5:** Add PDF image extraction -- [ ] **Task B1.6:** Create `pdf_scraper.py` CLI tool -- [ ] **Task B1.7:** Add MCP tool `scrape_pdf` -- [ ] **Task B1.8:** Create PDF config format +#### B1: PDF Documentation Support ✅ **COMPLETE (v3.0.0)** +- [x] **Task B1.1:** Research PDF parsing libraries ✅ +- [x] **Task B1.2:** Create simple PDF text extractor (POC) ✅ +- [x] **Task B1.3:** Add PDF page detection and chunking ✅ +- [x] **Task B1.4:** Extract code blocks from PDFs ✅ +- [x] **Task B1.5:** Add PDF image extraction ✅ +- [x] **Task B1.6:** Create `pdf_scraper.py` CLI tool ✅ +- [x] **Task B1.7:** Add MCP tool `scrape_pdf` ✅ +- [x] **Task B1.8:** Create PDF config format ✅ -**Start Small:** Pick B1.1 first (research only) - -#### B2: Microsoft Word (.docx) Support -- [ ] **Task B2.1-B2.7:** Word document parsing and scraping +#### B2: Microsoft Word (.docx) Support ✅ **COMPLETE (v3.2.0)** +- [x] **Task B2.1-B2.7:** Word document parsing and scraping ✅ #### B3: Excel/Spreadsheet (.xlsx) Support - [ ] **Task B3.1-B3.6:** Spreadsheet parsing and API extraction -#### B4: Markdown Files Support -- [ ] **Task B4.1-B4.6:** Local markdown directory scraping +#### B4: Markdown Files Support ✅ **COMPLETE (v3.1.0)** +- [x] **Task B4.1-B4.6:** Local markdown directory scraping ✅ + +#### B5: Additional Source Types ✅ **COMPLETE (v3.2.0)** +- [x] **EPUB** - `epub_scraper.py` ✅ +- [x] **Video** - `video_scraper.py` (YouTube, Vimeo, local files) ✅ +- [x] **Jupyter Notebook** - `jupyter_scraper.py` ✅ +- [x] **Local HTML** - `html_scraper.py` ✅ +- [x] **OpenAPI/Swagger** - `openapi_scraper.py` ✅ +- [x] **AsciiDoc** - `asciidoc_scraper.py` ✅ +- [x] **PowerPoint** - `pptx_scraper.py` ✅ +- [x] **RSS/Atom** - `rss_scraper.py` ✅ +- 
[x] **Man pages** - `manpage_scraper.py` ✅ +- [x] **Confluence** - `confluence_scraper.py` ✅ +- [x] **Notion** - `notion_scraper.py` ✅ +- [x] **Slack/Discord** - `chat_scraper.py` ✅ --- @@ -347,13 +360,14 @@ Small standalone tools that add value ## 📈 Metrics & Goals -### Current State (v2.6.0) ✅ +### Current State (v3.2.0) ✅ +- ✅ 17 source types supported - ✅ 24 preset configs (14 official + 10 test/examples) -- ✅ 700+ tests (excellent coverage) -- ✅ 18 MCP tools +- ✅ 1,880+ tests (excellent coverage) +- ✅ 26+ MCP tools - ✅ 4 platform adaptors (Claude, Gemini, OpenAI, Markdown) - ✅ C3.x codebase analysis suite complete -- ✅ Multi-source synthesis with conflict detection +- ✅ Multi-source synthesis with generic merge for any combination ### Goals for v2.7-v2.9 - 🎯 Professional website live @@ -433,7 +447,7 @@ See [CONTRIBUTING.md](CONTRIBUTING.md) for detailed guidelines. --- -**Last Updated:** January 14, 2026 +**Last Updated:** March 15, 2026 **Philosophy:** Small steps → Consistent progress → Compound results **Together, we're building the future of documentation-to-AI skill conversion!** 🚀 diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index dca3bd7..e6cd527 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -1,6 +1,6 @@ # Documentation Architecture -> **How Skill Seekers documentation is organized** +> **How Skill Seekers documentation is organized (v3.2.0 - 17 source types)** --- @@ -83,7 +83,7 @@ docs/ **Files:** - `01-core-concepts.md` - How it works -- `02-scraping.md` - All scraping options +- `02-scraping.md` - All 17 source types (docs, GitHub, PDF, video, Word, EPUB, Jupyter, HTML, OpenAPI, AsciiDoc, PPTX, RSS, man pages, Confluence, Notion, Slack/Discord, local codebase) - `03-enhancement.md` - AI enhancement - `04-packaging.md` - Platform export - `05-workflows.md` - Workflow presets @@ -102,10 +102,10 @@ docs/ - Always accurate **Files:** -- `CLI_REFERENCE.md` - All 20 CLI commands -- `MCP_REFERENCE.md` - 26 MCP tools -- 
`CONFIG_FORMAT.md` - JSON schema -- `ENVIRONMENT_VARIABLES.md` - All env vars +- `CLI_REFERENCE.md` - All CLI commands (including 17 source-type subcommands) +- `MCP_REFERENCE.md` - 26+ MCP tools +- `CONFIG_FORMAT.md` - JSON schema (covers all 17 source types) +- `ENVIRONMENT_VARIABLES.md` - All env vars (including Confluence, Notion, Slack tokens) --- diff --git a/docs/BEST_PRACTICES.md b/docs/BEST_PRACTICES.md index b38382b..7f4595e 100644 --- a/docs/BEST_PRACTICES.md +++ b/docs/BEST_PRACTICES.md @@ -434,6 +434,53 @@ That's it! Follow these practices and your skills will work better with Claude. --- +## 8. Tips for Specific Source Types + +Skill Seekers supports **17 source types**. Here are tips for getting the best results from each category: + +### Documentation (Web) +- Always test CSS selectors before large scrapes: `skill-seekers scrape --max-pages 3 --verbose` +- Use `--async` for large sites (2-3x faster) + +### GitHub Repos +- Use `--analysis-depth c3x` for deep analysis (patterns, tests, architecture) +- Set `GITHUB_TOKEN` to avoid rate limits + +### PDFs & Office Documents (PDF, Word, EPUB, PPTX) +- Use `--enable-ocr` for scanned PDFs +- For Word/PPTX, embedded images are extracted automatically; add `--extract-images` for PDFs +- EPUB works best with DRM-free files + +### Video +- Run `skill-seekers video --setup` first to install GPU-optimized dependencies +- YouTube and Vimeo URLs are auto-detected; local video files also work + +### Jupyter Notebooks +- Ensure notebooks are saved (unsaved cell outputs won't be captured) +- Both code cells and markdown cells are extracted + +### OpenAPI/Swagger Specs +- Both YAML and JSON specs are supported (OpenAPI 3.x and Swagger 2.0) +- Endpoints, schemas, and examples are parsed into structured API reference + +### AsciiDoc & Man Pages +- AsciiDoc requires `asciidoctor` (install via your package manager or gem) +- Man pages in sections `.1` through `.8` are supported + +### RSS/Atom Feeds +- Useful for 
converting blog posts and changelogs into skills +- Set `--max-items` to limit how many entries are extracted + +### Confluence & Notion +- API mode requires authentication tokens (see FAQ for setup) +- Export directory mode works offline with HTML/Markdown exports + +### Slack & Discord +- Use official export tools (Slack Workspace Export, DiscordChatExporter) +- Specify `--platform slack` or `--platform discord` explicitly + +--- + ## See Also - [Enhancement Guide](features/ENHANCEMENT.md) - AI-powered SKILL.md improvement diff --git a/docs/FAQ.md b/docs/FAQ.md index da25a1e..d5a47ed 100644 --- a/docs/FAQ.md +++ b/docs/FAQ.md @@ -1,7 +1,7 @@ # Frequently Asked Questions (FAQ) -**Version:** 3.1.0-dev -**Last Updated:** 2026-02-18 +**Version:** 3.2.0 +**Last Updated:** 2026-03-15 --- @@ -9,13 +9,17 @@ ### What is Skill Seekers? -Skill Seekers is a Python tool that converts documentation websites, GitHub repositories, and PDF files into AI-ready formats for 16+ platforms: LLM platforms (Claude, Gemini, OpenAI), RAG frameworks (LangChain, LlamaIndex, Haystack), vector databases (ChromaDB, FAISS, Weaviate, Qdrant, Pinecone), and AI coding assistants (Cursor, Windsurf, Cline, Continue.dev). +Skill Seekers is a Python tool that converts 17 source types — documentation websites, GitHub repos, PDFs, videos, Word docs, EPUB books, Jupyter notebooks, local HTML files, OpenAPI specs, AsciiDoc, PowerPoint, RSS/Atom feeds, man pages, Confluence wikis, Notion pages, Slack/Discord exports, and local codebases — into AI-ready formats for 16+ platforms: LLM platforms (Claude, Gemini, OpenAI), RAG frameworks (LangChain, LlamaIndex, Haystack), vector databases (ChromaDB, FAISS, Weaviate, Qdrant, Pinecone), and AI coding assistants (Cursor, Windsurf, Cline, Continue.dev). 
**Use Cases:** - Create custom documentation skills for your favorite frameworks - Analyze GitHub repositories and extract code patterns - Convert PDF manuals into searchable AI skills -- Combine multiple sources (docs + code + PDFs) into unified skills +- Import knowledge from Confluence, Notion, or Slack/Discord +- Extract content from videos (YouTube, Vimeo, local files) +- Convert Jupyter notebooks, EPUB books, or PowerPoint slides into skills +- Parse OpenAPI/Swagger specs into API reference skills +- Combine multiple sources (docs + code + PDFs + more) into unified skills ### Which platforms are supported? @@ -77,12 +81,43 @@ The `--setup` command auto-detects your GPU vendor (NVIDIA CUDA, AMD ROCm, or CP - **AMD:** Uses `rocminfo` to find ROCm version → installs matching ROCm PyTorch - **CPU-only:** Installs lightweight CPU-only PyTorch +### What source types are supported? + +Skill Seekers supports **17 source types**: + +| # | Source Type | CLI Command | Auto-Detection | +|---|------------|-------------|----------------| +| 1 | Documentation (web) | `scrape` / `create ` | HTTP/HTTPS URLs | +| 2 | GitHub repo | `github` / `create owner/repo` | `owner/repo` or github.com URLs | +| 3 | PDF | `pdf` / `create file.pdf` | `.pdf` extension | +| 4 | Word (.docx) | `word` / `create file.docx` | `.docx` extension | +| 5 | EPUB | `epub` / `create file.epub` | `.epub` extension | +| 6 | Video | `video` / `create ` | YouTube/Vimeo URLs, video extensions | +| 7 | Local codebase | `analyze` / `create ./path` | Directory paths | +| 8 | Jupyter Notebook | `jupyter` / `create file.ipynb` | `.ipynb` extension | +| 9 | Local HTML | `html` / `create file.html` | `.html`/`.htm` extensions | +| 10 | OpenAPI/Swagger | `openapi` / `create spec.yaml` | `.yaml`/`.yml` with OpenAPI content | +| 11 | AsciiDoc | `asciidoc` / `create file.adoc` | `.adoc`/`.asciidoc` extensions | +| 12 | PowerPoint | `pptx` / `create file.pptx` | `.pptx` extension | +| 13 | RSS/Atom | `rss` / `create 
feed.rss` | `.rss`/`.atom` extensions | +| 14 | Man pages | `manpage` / `create cmd.1` | `.1`-`.8`/`.man` extensions | +| 15 | Confluence | `confluence` | API or export directory | +| 16 | Notion | `notion` | API or export directory | +| 17 | Slack/Discord | `chat` | Export directory or API | + +The `create` command auto-detects the source type from your input, so you often don't need to specify a subcommand. + ### How long does it take to create a skill? **Typical Times:** - Documentation scraping: 5-45 minutes (depends on size) - GitHub analysis: 1-5 minutes (basic) or 20-60 minutes (C3.x deep analysis) - PDF extraction: 30 seconds - 5 minutes +- Video extraction: 2-10 minutes (depends on length and visual analysis) +- Word/EPUB/PPTX: 10-60 seconds +- Jupyter notebook: 10-30 seconds +- OpenAPI spec: 5-15 seconds +- Confluence/Notion import: 1-5 minutes (depends on space size) - AI enhancement: 30-60 seconds (LOCAL or API mode) - Total workflow: 10-60 minutes @@ -214,6 +249,92 @@ skill-seekers pdf scanned.pdf --enable-ocr skill-seekers pdf document.pdf --extract-images --extract-tables ``` +### How do I scrape a Jupyter Notebook? + +```bash +# Extract cells, outputs, and markdown from a notebook +skill-seekers jupyter analysis.ipynb --name data-analysis + +# Or use auto-detection +skill-seekers create analysis.ipynb +``` + +Jupyter extraction preserves code cells, markdown cells, and cell outputs. It works with `.ipynb` files from JupyterLab, Google Colab, and other notebook environments. + +### How do I import from Confluence or Notion? 
+ +**Confluence:** +```bash +# From Confluence Cloud API +export CONFLUENCE_URL=https://yourorg.atlassian.net +export CONFLUENCE_TOKEN=your-api-token +export CONFLUENCE_EMAIL=your-email@example.com +skill-seekers confluence --space MYSPACE --name my-wiki + +# From a Confluence HTML/XML export directory +skill-seekers confluence --export-dir ./confluence-export --name my-wiki +``` + +**Notion:** +```bash +# From Notion API +export NOTION_TOKEN=secret_... +skill-seekers notion --database DATABASE_ID --name my-notes + +# From a Notion HTML/Markdown export directory +skill-seekers notion --export-dir ./notion-export --name my-notes +``` + +### How do I convert Word, EPUB, or PowerPoint files? + +```bash +# Word document +skill-seekers word report.docx --name quarterly-report + +# EPUB book +skill-seekers epub handbook.epub --name dev-handbook + +# PowerPoint presentation +skill-seekers pptx slides.pptx --name training-deck + +# Or use auto-detection for any of them +skill-seekers create report.docx +skill-seekers create handbook.epub +skill-seekers create slides.pptx +``` + +### How do I parse an OpenAPI/Swagger spec? + +```bash +# From a local YAML/JSON file +skill-seekers openapi api-spec.yaml --name my-api + +# Auto-detection works too +skill-seekers create api-spec.yaml +``` + +OpenAPI extraction parses endpoints, schemas, parameters, and examples into a structured API reference skill. + +### How do I extract content from RSS feeds or man pages? + +```bash +# RSS/Atom feed +skill-seekers rss https://blog.example.com/feed.xml --name blog-feed + +# Man page +skill-seekers manpage grep.1 --name grep-manual +``` + +### How do I import from Slack or Discord? + +```bash +# From a Slack export directory +skill-seekers chat --platform slack --export-dir ./slack-export --name team-knowledge + +# From a Discord export directory +skill-seekers chat --platform discord --export-dir ./discord-export --name server-archive +``` + ### Can I combine multiple sources? Yes! 
Unified multi-source scraping: @@ -704,6 +825,6 @@ Yes! --- -**Version:** 3.1.0-dev -**Last Updated:** 2026-02-18 +**Version:** 3.2.0 +**Last Updated:** 2026-03-15 **Questions? Ask on [GitHub Discussions](https://github.com/yusufkaraaslan/Skill_Seekers/discussions)** diff --git a/docs/README.md b/docs/README.md index bee11a0..a656cbc 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,12 +1,12 @@ # Skill Seekers Documentation -> **Complete documentation for Skill Seekers v3.1.0** +> **Complete documentation for Skill Seekers v3.2.0** --- ## Welcome! -This is the official documentation for **Skill Seekers** - the universal tool for converting documentation, code, and PDFs into AI-ready skills. +This is the official documentation for **Skill Seekers** - the universal tool for converting **17 source types** (documentation sites, GitHub repos, PDFs, videos, Word docs, EPUB books, Jupyter notebooks, local HTML, OpenAPI specs, AsciiDoc, PowerPoint, RSS/Atom feeds, man pages, Confluence, Notion, Slack/Discord, and local codebases) into AI-ready skills for 16+ platforms. --- @@ -70,6 +70,12 @@ skill-seekers package output/django --target claude ### Common Commands ```bash +# Auto-detect any source type +skill-seekers create https://docs.django.com/ +skill-seekers create facebook/react +skill-seekers create manual.pdf +skill-seekers create notebook.ipynb + # Scrape documentation skill-seekers scrape --config react @@ -79,6 +85,19 @@ skill-seekers github --repo facebook/react # Extract PDF skill-seekers pdf manual.pdf --name docs +# Convert other formats +skill-seekers word report.docx --name report +skill-seekers epub book.epub --name handbook +skill-seekers jupyter analysis.ipynb --name analysis +skill-seekers openapi spec.yaml --name my-api +skill-seekers pptx slides.pptx --name deck +skill-seekers video https://youtube.com/watch?v=... 
--name tutorial
+
+# Import from platforms
+skill-seekers confluence --space DOCS --name wiki
+skill-seekers notion --database DB_ID --name notes
+skill-seekers chat --platform slack --export-dir ./export
+
 # Analyze local code
 skill-seekers analyze --directory ./my-project
 
@@ -163,8 +182,9 @@ For Cursor, Windsurf, Cline:
 
 ## Version Information
 
-- **Current Version:** 3.1.0
-- **Last Updated:** 2026-02-16
+- **Current Version:** 3.2.0
+- **Last Updated:** 2026-03-15
+- **Source Types:** 17
 - **Python Required:** 3.10+
 
 ---
diff --git a/docs/TROUBLESHOOTING.md b/docs/TROUBLESHOOTING.md
index 704ef81..8285769 100644
--- a/docs/TROUBLESHOOTING.md
+++ b/docs/TROUBLESHOOTING.md
@@ -14,6 +14,7 @@ Comprehensive guide for diagnosing and resolving common issues with Skill Seeker
 - [Storage Issues](#storage-issues)
 - [Network Issues](#network-issues)
 - [General Debug Techniques](#general-debug-techniques)
+- [Source-Type-Specific Issues](#source-type-specific-issues)
 
 ## Installation Issues
 
@@ -893,6 +894,181 @@ If you're still experiencing issues:
 - Steps to reproduce
 - Diagnostic information (see above)
 
+## Source-Type-Specific Issues
+
+### Issue: Missing Optional Dependencies for New Source Types
+
+**Symptoms:**
+```
+ModuleNotFoundError: No module named 'ebooklib'
+ModuleNotFoundError: No module named 'docx'
+ModuleNotFoundError: No module named 'pptx'
+ImportError: Missing dependency for jupyter extraction
+```
+
+**Solutions:**
+
+```bash
+# Install all optional dependencies at once
+pip install skill-seekers[all]
+
+# Or install per source type
+pip install python-docx # Word (.docx) support (imported as 'docx')
+pip install ebooklib # EPUB support
+pip install python-pptx # PowerPoint (.pptx) support (imported as 'pptx')
+pip install nbformat nbconvert # Jupyter Notebook support
+pip install pyyaml jsonschema # OpenAPI/Swagger support
+gem install asciidoctor # AsciiDoc support (Ruby gem, or install system asciidoctor)
+pip install feedparser # RSS/Atom feed support
+sudo apt install groff # Man page support 
(system package) + +# Video support (GPU-aware) +skill-seekers video --setup +``` + +### Issue: Confluence API Authentication Fails + +**Symptoms:** +``` +401 Unauthorized: Confluence API rejected credentials +Error: CONFLUENCE_TOKEN not found +``` + +**Solutions:** + +```bash +# Set Confluence Cloud credentials +export CONFLUENCE_URL=https://yourorg.atlassian.net +export CONFLUENCE_EMAIL=your-email@example.com +export CONFLUENCE_TOKEN=your-api-token + +# Generate API token at: +# https://id.atlassian.com/manage-profile/security/api-tokens + +# Test connection +skill-seekers confluence --space MYSPACE --dry-run + +# For Confluence Server/Data Center, use personal access token: +export CONFLUENCE_TOKEN=your-pat +``` + +### Issue: Notion API Authentication Fails + +**Symptoms:** +``` +401 Unauthorized: Notion API rejected credentials +Error: NOTION_TOKEN not found +``` + +**Solutions:** + +```bash +# Set Notion integration token +export NOTION_TOKEN=secret_... + +# Create an integration at: +# https://www.notion.so/my-integrations + +# IMPORTANT: Share the target database/page with your integration +# (click "..." 
menu on page → "Add connections" → select your integration) + +# Test connection +skill-seekers notion --database DATABASE_ID --dry-run +``` + +### Issue: Jupyter Notebook Extraction Fails + +**Symptoms:** +``` +Error: Cannot read notebook format +nbformat.reader.NotJSONError +``` + +**Solutions:** + +```bash +# Ensure notebook is valid JSON +python -c "import json; json.load(open('notebook.ipynb'))" + +# Install required deps +pip install nbformat nbconvert + +# Try with explicit format version +skill-seekers jupyter notebook.ipynb --nbformat 4 +``` + +### Issue: OpenAPI Spec Parsing Fails + +**Symptoms:** +``` +Error: Not a valid OpenAPI specification +Error: Missing 'openapi' or 'swagger' field +``` + +**Solutions:** + +```bash +# Validate your spec first +pip install openapi-spec-validator +python -c " +from openapi_spec_validator import validate +validate({'openapi': '3.0.0', ...}) +" + +# Ensure the file has the 'openapi' or 'swagger' top-level key +# Supported: OpenAPI 3.x and Swagger 2.0 + +# For remote specs +skill-seekers openapi https://api.example.com/openapi.json --name my-api +``` + +### Issue: EPUB Extraction Produces Empty Output + +**Symptoms:** +``` +Warning: No content found in EPUB +0 chapters extracted +``` + +**Solutions:** + +```bash +# Check EPUB is valid +pip install epubcheck +epubcheck book.epub + +# Try with different content extraction +skill-seekers epub book.epub --extract-images --verbose + +# Some DRM-protected EPUBs cannot be extracted +# Ensure your EPUB is DRM-free +``` + +### Issue: Slack/Discord Export Not Recognized + +**Symptoms:** +``` +Error: Cannot detect chat platform from export directory +Error: No messages found in export +``` + +**Solutions:** + +```bash +# Specify platform explicitly +skill-seekers chat --platform slack --export-dir ./slack-export +skill-seekers chat --platform discord --export-dir ./discord-export + +# For Slack: Export from Workspace Settings → Import/Export +# For Discord: Use DiscordChatExporter 
or similar tool + +# Check export directory structure +ls ./slack-export/ +# Should contain: channels/, users.json, etc. +``` + +--- + ## Common Error Messages Reference | Error | Cause | Solution | @@ -907,6 +1083,11 @@ If you're still experiencing issues: | `MemoryError` | Out of memory | Reduce batch size | | `PermissionError` | Access denied | Check file permissions | | `FileNotFoundError` | Missing file | Verify file path | +| `No module named 'ebooklib'` | EPUB dep missing | `pip install ebooklib` | +| `No module named 'python-docx'` | Word dep missing | `pip install python-docx` | +| `No module named 'python-pptx'` | PPTX dep missing | `pip install python-pptx` | +| `CONFLUENCE_TOKEN not found` | Confluence auth missing | Set env vars (see above) | +| `NOTION_TOKEN not found` | Notion auth missing | Set env vars (see above) | --- diff --git a/docs/advanced/mcp-server.md b/docs/advanced/mcp-server.md index c471fe7..5894bc0 100644 --- a/docs/advanced/mcp-server.md +++ b/docs/advanced/mcp-server.md @@ -1,6 +1,6 @@ # MCP Server Setup Guide -> **Skill Seekers v3.1.0** +> **Skill Seekers v3.2.0** > **Integrate with AI agents via Model Context Protocol** --- @@ -143,7 +143,7 @@ skill-seekers-mcp --transport http --port 8765 ## Available Tools -26 tools organized by category: +27 tools organized by category: ### Core Tools (9) - `list_configs` - List presets @@ -156,9 +156,10 @@ skill-seekers-mcp --transport http --port 8765 - `enhance_skill` - AI enhancement - `install_skill` - Complete workflow -### Extended Tools (9) +### Extended Tools (10) - `scrape_github` - GitHub repo - `scrape_pdf` - PDF extraction +- `scrape_generic` - Generic scraper for 10 new source types (see below) - `scrape_codebase` - Local code - `unified_scrape` - Multi-source - `detect_patterns` - Pattern detection @@ -180,6 +181,37 @@ skill-seekers-mcp --transport http --port 8765 - `export_to_faiss` - `export_to_qdrant` +### scrape_generic Tool + +The `scrape_generic` tool is the generic entry 
point for 10 new source types added in v3.2.0. It delegates to the appropriate CLI scraper module. + +**Supported source types:** `jupyter`, `html`, `openapi`, `asciidoc`, `pptx`, `rss`, `manpage`, `confluence`, `notion`, `chat` + +**Parameters:** + +| Name | Type | Required | Description | +|------|------|----------|-------------| +| `source_type` | string | Yes | One of the 10 supported source types | +| `name` | string | Yes | Skill name for the output | +| `path` | string | No | File or directory path (for file-based sources) | +| `url` | string | No | URL (for URL-based sources like confluence, notion, rss) | + +**Usage examples:** + +``` +"Scrape the Jupyter notebook analysis.ipynb" +→ scrape_generic(source_type="jupyter", name="analysis", path="analysis.ipynb") + +"Extract content from the API spec" +→ scrape_generic(source_type="openapi", name="my-api", path="api-spec.yaml") + +"Process the PowerPoint slides" +→ scrape_generic(source_type="pptx", name="slides", path="presentation.pptx") + +"Scrape the Confluence wiki" +→ scrape_generic(source_type="confluence", name="wiki", url="https://wiki.example.com") +``` + See [MCP Reference](../reference/MCP_REFERENCE.md) for full details. --- diff --git a/docs/advanced/multi-source.md b/docs/advanced/multi-source.md index f6f819a..e5156b1 100644 --- a/docs/advanced/multi-source.md +++ b/docs/advanced/multi-source.md @@ -1,28 +1,34 @@ # Multi-Source Scraping Guide -> **Skill Seekers v3.1.0** -> **Combine documentation, code, and PDFs into one skill** +> **Skill Seekers v3.2.0** +> **Combine 17 source types into one unified skill** --- ## What is Multi-Source Scraping? -Combine multiple sources into a single, comprehensive skill: +Combine multiple sources into a single, comprehensive skill. 
Skill Seekers supports **17 source types** that can be freely mixed and matched: ``` ┌──────────────┐ -│ Documentation │──┐ -│ (Web docs) │ │ -└──────────────┘ │ - │ -┌──────────────┐ │ ┌──────────────────┐ -│ GitHub Repo │──┼────▶│ Unified Skill │ -│ (Source code)│ │ │ (Single source │ -└──────────────┘ │ │ of truth) │ - │ └──────────────────┘ -┌──────────────┐ │ -│ PDF Manual │──┘ -│ (Reference) │ +│ Documentation│──┐ +│ (Web docs) │ │ +├──────────────┤ │ +│ GitHub Repo │ │ +│ (Source code) │ │ +├──────────────┤ │ ┌──────────────────┐ +│ PDF / Word / │ │ │ Unified Skill │ +│ EPUB / PPTX │──┼────▶│ (Single source │ +├──────────────┤ │ │ of truth) │ +│ Video / │ │ └──────────────────┘ +│ Jupyter / HTML│ │ +├──────────────┤ │ +│ OpenAPI / │ │ +│ AsciiDoc / │ │ +│ RSS / Man │ │ +├──────────────┤ │ +│ Confluence / │──┘ +│ Notion / Chat│ └──────────────┘ ``` @@ -38,6 +44,14 @@ Combine multiple sources into a single, comprehensive skill: | Product + API | Docs + OpenAPI spec | Usage + reference | | Legacy + Current | PDF + Web docs | Complete history | | Internal + External | Local code + Public docs | Full context | +| Data Science Project | Jupyter + GitHub + Docs | Code + notebooks + docs | +| Enterprise Wiki | Confluence + GitHub + Video | Wiki + code + tutorials | +| API-First Product | OpenAPI + Docs + Jupyter | Spec + docs + examples | +| CLI Tool | Man pages + GitHub + AsciiDoc | Reference + code + docs | +| Team Knowledge | Notion + Slack/Discord + Docs | Notes + discussions + docs | +| Book + Code | EPUB + GitHub + PDF | Theory + implementation | +| Presentations + Code | PowerPoint + GitHub + Docs | Slides + code + reference | +| Content Feed | RSS/Atom + Docs + GitHub | Updates + docs + code | ### Benefits @@ -75,9 +89,9 @@ Combine multiple sources into a single, comprehensive skill: --- -## Source Types +## Source Types (17 Supported) -### 1. Documentation +### 1. 
Documentation (Web) ```json { @@ -127,6 +141,139 @@ Combine multiple sources into a single, comprehensive skill: } ``` +### 5. Word Document (.docx) + +```json +{ + "type": "word", + "name": "product-spec", + "path": "docs/specification.docx" +} +``` + +### 6. Video (YouTube/Vimeo/Local) + +```json +{ + "type": "video", + "name": "tutorial-video", + "url": "https://www.youtube.com/watch?v=example", + "language": "en" +} +``` + +### 7. EPUB + +```json +{ + "type": "epub", + "name": "programming-book", + "path": "books/python-guide.epub" +} +``` + +### 8. Jupyter Notebook + +```json +{ + "type": "jupyter", + "name": "analysis-notebooks", + "path": "notebooks/data-analysis.ipynb" +} +``` + +### 9. Local HTML + +```json +{ + "type": "html", + "name": "exported-docs", + "path": "exports/documentation.html" +} +``` + +### 10. OpenAPI/Swagger + +```json +{ + "type": "openapi", + "name": "api-spec", + "path": "specs/openapi.yaml" +} +``` + +### 11. AsciiDoc + +```json +{ + "type": "asciidoc", + "name": "technical-docs", + "path": "docs/manual.adoc" +} +``` + +### 12. PowerPoint (.pptx) + +```json +{ + "type": "pptx", + "name": "architecture-deck", + "path": "presentations/architecture.pptx" +} +``` + +### 13. RSS/Atom Feed + +```json +{ + "type": "rss", + "name": "release-feed", + "url": "https://blog.example.com/releases.xml" +} +``` + +### 14. Man Pages + +```json +{ + "type": "manpage", + "name": "cli-reference", + "path": "man/mytool.1" +} +``` + +### 15. Confluence + +```json +{ + "type": "confluence", + "name": "team-wiki", + "base_url": "https://company.atlassian.net/wiki", + "space_key": "ENGINEERING" +} +``` + +### 16. Notion + +```json +{ + "type": "notion", + "name": "project-docs", + "workspace": "my-workspace", + "root_page_id": "abc123def456" +} +``` + +### 17. 
Slack/Discord (Chat) + +```json +{ + "type": "chat", + "name": "team-discussions", + "path": "exports/slack-export/" +} +``` + --- ## Complete Example @@ -240,6 +387,21 @@ Uses defined rules for merging: skill-seekers unified --config my-config.json --merge-mode rule-based ``` +### Generic Merge System + +When combining source types beyond the standard docs+github+pdf trio, the **generic merge system** (`_generic_merge()` in `unified_skill_builder.py`) handles any combination automatically. It uses pairwise synthesis for known combos (docs+github, docs+pdf, github+pdf) and falls back to a generic merging strategy for all other source type combinations. + +### AI-Powered Multi-Source Merging + +For complex multi-source projects, use the `complex-merge.yaml` workflow preset to apply AI-powered merging: + +```bash +skill-seekers unified --config my-config.json \ + --enhance-workflow complex-merge +``` + +This workflow uses Claude to intelligently reconcile content from disparate source types, resolving conflicts and creating coherent cross-references between sources that would otherwise be difficult to merge deterministically. 
+ --- ## Conflict Detection @@ -319,7 +481,9 @@ output/react-complete/ "sources": [ {"type": "docs", "name": "official-docs"}, {"type": "github", "name": "source-code"}, - {"type": "pdf", "name": "legacy-reference"} + {"type": "pdf", "name": "legacy-reference"}, + {"type": "openapi", "name": "api-spec"}, + {"type": "confluence", "name": "team-wiki"} ] } ``` @@ -406,14 +570,40 @@ skill-seekers unified --config my-config.json --merge-mode rule-based } ``` -### API + Documentation +### Docs + OpenAPI Spec ```json { "name": "stripe-complete", "sources": [ {"type": "docs", "base_url": "https://stripe.com/docs"}, - {"type": "pdf", "pdf_path": "stripe-api-reference.pdf"} + {"type": "openapi", "path": "specs/stripe-openapi.yaml"} + ] +} +``` + +### Code + Jupyter Notebooks + +```json +{ + "name": "ml-project", + "sources": [ + {"type": "github", "repo": "org/ml-pipeline"}, + {"type": "jupyter", "path": "notebooks/training.ipynb"}, + {"type": "jupyter", "path": "notebooks/evaluation.ipynb"} + ] +} +``` + +### Confluence + GitHub + +```json +{ + "name": "internal-platform", + "sources": [ + {"type": "confluence", "base_url": "https://company.atlassian.net/wiki", "space_key": "PLATFORM"}, + {"type": "github", "repo": "company/platform-core"}, + {"type": "openapi", "path": "specs/platform-api.yaml"} ] } ``` @@ -430,6 +620,32 @@ skill-seekers unified --config my-config.json --merge-mode rule-based } ``` +### CLI Tool (Man Pages + GitHub + AsciiDoc) + +```json +{ + "name": "mytool-complete", + "sources": [ + {"type": "manpage", "path": "man/mytool.1"}, + {"type": "github", "repo": "org/mytool"}, + {"type": "asciidoc", "path": "docs/user-guide.adoc"} + ] +} +``` + +### Team Knowledge (Notion + Chat + Video) + +```json +{ + "name": "onboarding-knowledge", + "sources": [ + {"type": "notion", "workspace": "engineering", "root_page_id": "abc123"}, + {"type": "chat", "path": "exports/slack-engineering/"}, + {"type": "video", "url": "https://www.youtube.com/playlist?list=PLonboarding"} 
+ ] +} +``` + --- ## See Also diff --git a/docs/architecture/UNIFIED_PARSERS.md b/docs/architecture/UNIFIED_PARSERS.md index 3f9870b..66fb6cb 100644 --- a/docs/architecture/UNIFIED_PARSERS.md +++ b/docs/architecture/UNIFIED_PARSERS.md @@ -2,12 +2,12 @@ ## Overview -The Unified Document Parser system provides a standardized interface for extracting structured content from multiple document formats (RST, Markdown, PDF). It replaces format-specific extraction logic with a common data model and extensible parser framework. +The Unified Document Parser system provides a standardized interface for extracting structured content from multiple document formats. As of v3.2.0, the system supports **17 source types** through registered parsers and scraper modules. It replaces format-specific extraction logic with a common data model and extensible parser framework. ## Architecture Goals 1. **Standardization**: All parsers output the same `Document` structure -2. **Extensibility**: Easy to add new formats (HTML, AsciiDoc, etc.) +2. **Extensibility**: Easy to add new formats via the scraper pattern (17 source types and growing) 3. **Quality**: Built-in quality scoring for extracted content 4. **Backward Compatibility**: Legacy parsers remain functional during migration @@ -163,9 +163,45 @@ class ParseResult: - Images and links - Frontmatter (YAML metadata) -#### PDF Parser (Future) +#### PDF Parser -**Status**: Not yet migrated to unified structure +**File**: `src/skill_seekers/cli/pdf_scraper.py` + +**Status**: Integrated. Extracts text, tables, images, and code blocks from PDF files. Supports OCR for scanned documents. 
+ +#### Additional Registered Parsers (v3.2.0) + +The following source types each have a dedicated scraper module registered in `parsers/__init__.py` (PARSERS list), `main.py` (COMMAND_MODULES dict), and `config_validator.py` (VALID_SOURCE_TYPES set): + +| # | Source Type | Scraper Module | Parser Registration | +|---|------------|---------------|---------------------| +| 1 | Documentation (web) | `doc_scraper.py` | `documentation` | +| 2 | GitHub repo | `github_scraper.py` | `github` | +| 3 | PDF | `pdf_scraper.py` | `pdf` | +| 4 | Word (.docx) | `word_scraper.py` | `word` | +| 5 | EPUB | `epub_scraper.py` | `epub` | +| 6 | Video | `video_scraper.py` | `video` | +| 7 | Local codebase | `codebase_scraper.py` | `local` | +| 8 | Jupyter Notebook | `jupyter_scraper.py` | `jupyter` | +| 9 | Local HTML | `html_scraper.py` | `html` | +| 10 | OpenAPI/Swagger | `openapi_scraper.py` | `openapi` | +| 11 | AsciiDoc | `asciidoc_scraper.py` | `asciidoc` | +| 12 | PowerPoint | `pptx_scraper.py` | `pptx` | +| 13 | RSS/Atom | `rss_scraper.py` | `rss` | +| 14 | Man pages | `manpage_scraper.py` | `manpage` | +| 15 | Confluence | `confluence_scraper.py` | `confluence` | +| 16 | Notion | `notion_scraper.py` | `notion` | +| 17 | Slack/Discord | `chat_scraper.py` | `chat` | + +Each scraper follows the same pattern: a `ToSkillConverter` class with a `main()` function, registered in three places (see [CONTRIBUTING.md](../../CONTRIBUTING.md) for the full scraper pattern). + +#### Generic Merge System + +**File**: `src/skill_seekers/cli/unified_skill_builder.py` + +The `unified_skill_builder.py` handles multi-source merging: +- **Pairwise synthesis**: Optimized merge for common combos (docs+github, docs+pdf, github+pdf) +- **Generic merge** (`_generic_merge()`): Handles all other source type combinations (e.g., docs+jupyter+confluence) by normalizing each source's `scraped_data` into a common structure and merging sections ### 4. 
Quality Scoring Layer @@ -387,13 +423,12 @@ The enhanced `extract_rst_structure()` function: ## Future Enhancements -1. **PDF Parser**: Migrate to unified structure -2. **HTML Parser**: Add for web documentation -3. **Caching Layer**: Redis/disk cache for parsed docs -4. **Streaming**: Parse large files incrementally -5. **Validation**: JSON Schema validation for output +1. **Caching Layer**: Redis/disk cache for parsed docs +2. **Streaming**: Parse large files incrementally +3. **Validation**: JSON Schema validation for output +4. **Additional formats**: As new source types are added, they follow the same parser registration pattern --- -**Last Updated**: 2026-02-15 -**Version**: 1.0.0 +**Last Updated**: 2026-03-15 +**Version**: 2.0.0 (updated for 17 source types) diff --git a/docs/features/UNIFIED_SCRAPING.md b/docs/features/UNIFIED_SCRAPING.md index f2f0747..55148c2 100644 --- a/docs/features/UNIFIED_SCRAPING.md +++ b/docs/features/UNIFIED_SCRAPING.md @@ -1,20 +1,21 @@ # Unified Multi-Source Scraping -**Version:** 2.0 (Feature complete as of October 2025) +**Version:** 3.2.0 (17 source types supported) ## Overview -Unified multi-source scraping allows you to combine knowledge from multiple sources into a single comprehensive Claude skill. Instead of choosing between documentation, GitHub repositories, or PDF manuals, you can now extract and intelligently merge information from all of them. +Unified multi-source scraping allows you to combine knowledge from multiple sources into a single comprehensive skill. Instead of choosing between documentation, GitHub repositories, PDF manuals, or any of the 17 supported source types, you can extract and intelligently merge information from all of them. ## Why Unified Scraping? **The Problem**: Documentation and code often drift apart over time. Official docs might be outdated, missing features that exist in code, or documenting features that have been removed. 
Separately scraping docs and code creates two incomplete skills. **The Solution**: Unified scraping: -- Extracts information from multiple sources (documentation, GitHub, PDFs) +- Extracts information from **17 source types** (documentation, GitHub, PDFs, videos, Word docs, EPUB, Jupyter notebooks, local HTML, OpenAPI specs, AsciiDoc, PowerPoint, RSS/Atom feeds, man pages, Confluence, Notion, Slack/Discord, and local codebases) - **Detects conflicts** between documentation and actual code implementation - **Intelligently merges** conflicting information with transparency -- **Highlights discrepancies** with inline warnings (⚠️) +- **Generic merge system** combines any combination of source types via pairwise synthesis +- **Highlights discrepancies** with inline warnings - Creates a single, comprehensive skill that shows the complete picture ## Quick Start @@ -53,9 +54,9 @@ python3 cli/unified_scraper.py --config configs/react_unified.json ``` The tool will: -1. ✅ **Phase 1**: Scrape all sources (docs + GitHub + PDF + local) +1. ✅ **Phase 1**: Scrape all sources (any of the 17 supported types) 2. ✅ **Phase 2**: Detect conflicts between sources -3. ✅ **Phase 3**: Merge conflicts intelligently +3. ✅ **Phase 3**: Merge conflicts intelligently (pairwise synthesis or generic merge) 4. ✅ **Phase 4**: Build unified skill with conflict transparency 5. ✅ **Phase 5**: Apply enhancement workflows (optional) @@ -76,13 +77,35 @@ python3 cli/package_skill.py output/react/ "merge_mode": "rule-based|claude-enhanced", "sources": [ { - "type": "documentation|github|pdf", + "type": "", ...source-specific fields... 
} ] } ``` +#### Supported Source Types + +| Type | Config `type` Value | Description | +|------|-------------------|-------------| +| Documentation (web) | `documentation` | Web documentation sites | +| GitHub repo | `github` | GitHub repository analysis | +| PDF | `pdf` | PDF document extraction | +| Local codebase | `local` | Local directory analysis | +| Word (.docx) | `word` | Word document extraction | +| Video | `video` | YouTube/Vimeo/local video transcription | +| EPUB | `epub` | EPUB ebook extraction | +| Jupyter Notebook | `jupyter` | `.ipynb` notebook extraction | +| Local HTML | `html` | Local HTML file extraction | +| OpenAPI/Swagger | `openapi` | OpenAPI/Swagger spec parsing | +| AsciiDoc | `asciidoc` | AsciiDoc document extraction | +| PowerPoint | `pptx` | PowerPoint presentation extraction | +| RSS/Atom | `rss` | RSS/Atom feed extraction | +| Man pages | `manpage` | Unix man page extraction | +| Confluence | `confluence` | Atlassian Confluence wiki extraction | +| Notion | `notion` | Notion workspace extraction | +| Slack/Discord | `chat` | Chat export extraction | + ### Documentation Source ```json @@ -145,6 +168,126 @@ python3 cli/package_skill.py output/react/ } ``` +### Video Source + +```json +{ + "type": "video", + "url": "https://www.youtube.com/watch?v=dQw4w9WgXcQ", + "language": "en" +} +``` + +### Word Document Source + +```json +{ + "type": "word", + "path": "/path/to/document.docx" +} +``` + +### EPUB Source + +```json +{ + "type": "epub", + "path": "/path/to/book.epub" +} +``` + +### Jupyter Notebook Source + +```json +{ + "type": "jupyter", + "path": "/path/to/notebook.ipynb" +} +``` + +### Local HTML Source + +```json +{ + "type": "html", + "path": "/path/to/page.html" +} +``` + +### OpenAPI/Swagger Source + +```json +{ + "type": "openapi", + "path": "/path/to/openapi.yaml" +} +``` + +### AsciiDoc Source + +```json +{ + "type": "asciidoc", + "path": "/path/to/document.adoc" +} +``` + +### PowerPoint Source + +```json +{ + "type": 
"pptx", + "path": "/path/to/presentation.pptx" +} +``` + +### RSS/Atom Feed Source + +```json +{ + "type": "rss", + "url": "https://blog.example.com/feed.xml" +} +``` + +### Man Page Source + +```json +{ + "type": "manpage", + "path": "/path/to/command.1" +} +``` + +### Confluence Source + +```json +{ + "type": "confluence", + "base_url": "https://company.atlassian.net/wiki", + "space_key": "DOCS" +} +``` + +### Notion Source + +```json +{ + "type": "notion", + "workspace": "my-workspace", + "root_page_id": "abc123" +} +``` + +### Slack/Discord Chat Source + +```json +{ + "type": "chat", + "path": "/path/to/export/" +} +``` + ## Conflict Detection The unified scraper automatically detects 4 types of conflicts: @@ -257,6 +400,14 @@ output/skill-name/ │ │ └── releases.md │ ├── pdf/ # PDF references (if applicable) │ │ └── index.md +│ ├── video/ # Video transcripts (if applicable) +│ │ └── index.md +│ ├── openapi/ # OpenAPI spec (if applicable) +│ │ └── index.md +│ ├── jupyter/ # Notebook content (if applicable) +│ │ └── index.md +│ ├── / # Other source type references +│ │ └── index.md │ ├── api/ # Merged API reference │ │ └── merged_api.md │ └── conflicts.md # Detailed conflict report @@ -380,7 +531,61 @@ useEffect(callback: () => void | (() => void), deps?: readonly any[]) } ``` -### Example 3: Mixed Sources (Docs + GitHub + PDF) +### Example 3: API Project (Docs + OpenAPI + Jupyter) + +```json +{ + "name": "my-api", + "description": "Complete API knowledge with spec and notebooks", + "merge_mode": "rule-based", + "sources": [ + { + "type": "documentation", + "base_url": "https://api.example.com/docs/", + "extract_api": true, + "max_pages": 100 + }, + { + "type": "openapi", + "path": "specs/openapi.yaml" + }, + { + "type": "jupyter", + "path": "notebooks/api-examples.ipynb" + } + ] +} +``` + +### Example 4: Enterprise Knowledge (Confluence + GitHub + Video) + +```json +{ + "name": "internal-platform", + "description": "Internal platform knowledge from all sources", 
+ "merge_mode": "claude-enhanced", + "sources": [ + { + "type": "confluence", + "base_url": "https://company.atlassian.net/wiki", + "space_key": "PLATFORM" + }, + { + "type": "github", + "repo": "company/platform", + "include_code": true, + "code_analysis_depth": "deep" + }, + { + "type": "video", + "url": "https://www.youtube.com/playlist?list=PLexample", + "language": "en" + } + ] +} +``` + +### Example 5: Mixed Sources (Docs + GitHub + PDF) ```json { @@ -590,6 +795,19 @@ UnifiedScraper.run() │ - GitHub → github_scraper │ │ - PDF → pdf_scraper │ │ - Local → codebase_scraper │ +│ - Video → video_scraper │ +│ - Word → word_scraper │ +│ - EPUB → epub_scraper │ +│ - Jupyter → jupyter_scraper │ +│ - HTML → html_scraper │ +│ - OpenAPI → openapi_scraper │ +│ - AsciiDoc → asciidoc_scraper │ +│ - PowerPoint → pptx_scraper │ +│ - RSS/Atom → rss_scraper │ +│ - Man pages → manpage_scraper │ +│ - Confluence → confluence_scraper │ +│ - Notion → notion_scraper │ +│ - Chat → chat_scraper │ └────────────────────────────────────┘ ↓ ┌────────────────────────────────────┐ @@ -601,6 +819,10 @@ UnifiedScraper.run() ↓ ┌────────────────────────────────────┐ │ Phase 3: Merge Sources │ +│ - Pairwise synthesis (docs+github │ +│ +pdf combos) │ +│ - Generic merge (_generic_merge) │ +│ for all other combinations │ │ - RuleBasedMerger (fast) │ │ - OR ClaudeEnhancedMerger (AI) │ │ - Create unified API reference │ @@ -703,6 +925,12 @@ For issues, questions, or suggestions: ## Changelog +**v3.2.0 (March 2026)**: 17 source types supported +- ✅ 13 new source types: Word, EPUB, Video, Jupyter, HTML, OpenAPI, AsciiDoc, PowerPoint, RSS/Atom, Man pages, Confluence, Notion, Slack/Discord +- ✅ Generic merge system (`_generic_merge()`) for combining any source type combination +- ✅ Pairwise synthesis for docs+github+pdf combos +- ✅ `complex-merge.yaml` workflow preset for AI-powered multi-source merging + **v3.1.0 (February 2026)**: Enhancement workflow support - ✅ Full workflow system integration (Phase 
5) - ✅ All workflow flags supported (--enhance-workflow, --enhance-stage, --var, --workflow-dry-run) diff --git a/docs/getting-started/01-installation.md b/docs/getting-started/01-installation.md index d6a1cc6..84ff3c6 100644 --- a/docs/getting-started/01-installation.md +++ b/docs/getting-started/01-installation.md @@ -1,6 +1,6 @@ # Installation Guide -> **Skill Seekers v3.1.0** +> **Skill Seekers v3.2.0** Get Skill Seekers installed and running in under 5 minutes. @@ -126,6 +126,13 @@ pip install skill-seekers[dev] | `embedding` | Embedding server | `pip install skill-seekers[embedding]` | | `video` | YouTube/video transcript extraction | `pip install skill-seekers[video]` | | `video-full` | + Whisper transcription, scene detection | `pip install skill-seekers[video-full]` | +| `jupyter` | Jupyter Notebook extraction | `pip install skill-seekers[jupyter]` | +| `asciidoc` | AsciiDoc document processing | `pip install skill-seekers[asciidoc]` | +| `pptx` | PowerPoint presentation extraction | `pip install skill-seekers[pptx]` | +| `rss` | RSS/Atom feed extraction | `pip install skill-seekers[rss]` | +| `confluence` | Confluence wiki extraction | `pip install skill-seekers[confluence]` | +| `notion` | Notion workspace extraction | `pip install skill-seekers[notion]` | +| `chat` | Slack/Discord export extraction | `pip install skill-seekers[chat]` | | `all-llms` | All LLM platforms | `pip install skill-seekers[all-llms]` | | `all` | Everything | `pip install skill-seekers[all]` | | `dev` | Development tools | `pip install skill-seekers[dev]` | diff --git a/docs/getting-started/02-quick-start.md b/docs/getting-started/02-quick-start.md index 85f53a0..c31a73a 100644 --- a/docs/getting-started/02-quick-start.md +++ b/docs/getting-started/02-quick-start.md @@ -1,6 +1,6 @@ # Quick Start Guide -> **Skill Seekers v3.1.0** +> **Skill Seekers v3.2.0** > **Create your first skill in 3 commands** --- @@ -32,6 +32,19 @@ The `create` command auto-detects your source: | **GitHub 
Repo** | `skill-seekers create facebook/react` |
 | **Local Code** | `skill-seekers create ./my-project` |
 | **PDF File** | `skill-seekers create manual.pdf` |
+| **Word Document** | `skill-seekers create report.docx` |
+| **EPUB Book** | `skill-seekers create book.epub` |
+| **Video** | `skill-seekers create https://youtube.com/watch?v=...` |
+| **Jupyter Notebook** | `skill-seekers create analysis.ipynb` |
+| **Local HTML** | `skill-seekers create page.html` |
+| **OpenAPI Spec** | `skill-seekers create api-spec.yaml` |
+| **AsciiDoc** | `skill-seekers create guide.adoc` |
+| **PowerPoint** | `skill-seekers create slides.pptx` |
+| **RSS/Atom Feed** | `skill-seekers create feed.rss` |
+| **Man Page** | `skill-seekers create grep.1` |
+| **Confluence** | `skill-seekers confluence --space DEV` |
+| **Notion** | `skill-seekers notion --database abc123` |
+| **Slack/Discord** | `skill-seekers chat --export-dir slack-export/` |
 | **Config File** | `skill-seekers create configs/custom.json` |
 
 ---
@@ -87,6 +100,48 @@ skill-seekers create paper.pdf --name research
 skill-seekers package output/research --target claude
 ```
 
+### Video
+
+```bash
+# YouTube video transcript
+skill-seekers create https://www.youtube.com/watch?v=dQw4w9WgXcQ --name tutorial
+skill-seekers package output/tutorial --target claude
+```
+
+### Jupyter Notebook
+
+```bash
+# Data science notebook
+skill-seekers create analysis.ipynb --name ml-analysis
+skill-seekers package output/ml-analysis --target claude
+```
+
+### PowerPoint / Word / EPUB
+
+```bash
+# PowerPoint slides
+skill-seekers create presentation.pptx --name quarterly-review
+
+# Word document
+skill-seekers create spec.docx --name api-spec
+
+# EPUB book
+skill-seekers create rust-book.epub --name rust-guide
+```
+
+### Confluence / Notion / Slack
+
+```bash
+# Confluence wiki space
+skill-seekers confluence --space DEV --name team-docs
+
+# Notion workspace
+skill-seekers notion --database abc123 --name product-wiki
+
+# Slack/Discord 
export +skill-seekers chat --export-path slack-export/ --name team-chat +``` + +--- + +## Common Options diff --git a/docs/reference/API_REFERENCE.md index 4ba6d39..cf29ea3 100644 --- a/docs/reference/API_REFERENCE.md +++ b/docs/reference/API_REFERENCE.md @@ -1,7 +1,7 @@ # API Reference - Programmatic Usage -**Version:** 3.1.0-dev -**Last Updated:** 2026-02-18 +**Version:** 3.2.0 +**Last Updated:** 2026-03-15 **Status:** ✅ Production Ready --- @@ -217,7 +217,7 @@ skill_path = scrape_pdf( ### 4. Unified Multi-Source Scraping API -Combine multiple sources (docs + GitHub + PDF) into a single unified skill. +Combine multiple sources (any of 17 supported types) into a single unified skill. #### Unified Scraping @@ -552,27 +552,47 @@ Skill Seekers uses JSON configuration files to define scraping behavior. ### Unified Config Schema (Multi-Source) +Supports all 17 source types: `documentation`, `github`, `pdf`, `local`, `word`, `video`, `epub`, `jupyter`, `html`, `openapi`, `asciidoc`, `pptx`, `rss`, `manpage`, `confluence`, `notion`, `chat`. 
+ ```json { "name": "framework-unified", "description": "Complete framework documentation", - "sources": { - "documentation": { - "type": "docs", + "merge_mode": "rule-based", + "sources": [ + { + "type": "documentation", "base_url": "https://docs.example.com/", "selectors": { "main_content": "article" } }, - "github": { + { "type": "github", - "repo_url": "https://github.com/org/repo", - "analysis_depth": "c3x" + "repo": "org/repo", + "include_code": true, + "code_analysis_depth": "deep" }, - "pdf": { + { "type": "pdf", - "pdf_path": "manual.pdf", - "enable_ocr": true + "path": "manual.pdf" + }, + { + "type": "openapi", + "path": "specs/openapi.yaml" + }, + { + "type": "video", + "url": "https://www.youtube.com/watch?v=example" + }, + { + "type": "jupyter", + "path": "notebooks/examples.ipynb" + }, + { + "type": "confluence", + "base_url": "https://company.atlassian.net/wiki", + "space_key": "DOCS" } - }, + ], "conflict_resolution": "prefer_code", "merge_strategy": "smart" } @@ -961,7 +981,21 @@ monitor_enhancement('output/react/', watch=True) | **Documentation Scraping** | `doc_scraper` | Extract from docs websites | | **GitHub Analysis** | `github_scraper` | Analyze code repositories | | **PDF Extraction** | `pdf_scraper` | Extract from PDF files | -| **Unified Scraping** | `unified_scraper` | Multi-source scraping | +| **Word Extraction** | `word_scraper` | Extract from .docx files | +| **EPUB Extraction** | `epub_scraper` | Extract from .epub files | +| **Video Transcription** | `video_scraper` | Extract from YouTube/Vimeo/local videos | +| **Jupyter Extraction** | `jupyter_scraper` | Extract from .ipynb notebooks | +| **HTML Extraction** | `html_scraper` | Extract from local HTML files | +| **OpenAPI Parsing** | `openapi_scraper` | Parse OpenAPI/Swagger specs | +| **AsciiDoc Extraction** | `asciidoc_scraper` | Extract from .adoc files | +| **PowerPoint Extraction** | `pptx_scraper` | Extract from .pptx files | +| **RSS/Atom Extraction** | `rss_scraper` | 
Extract from RSS/Atom feeds | +| **Man Page Extraction** | `manpage_scraper` | Extract from Unix man pages | +| **Confluence Extraction** | `confluence_scraper` | Extract from Confluence wikis | +| **Notion Extraction** | `notion_scraper` | Extract from Notion workspaces | +| **Chat Extraction** | `chat_scraper` | Extract from Slack/Discord exports | +| **Local Codebase Analysis** | `codebase_scraper` | Analyze local directories | +| **Unified Scraping** | `unified_scraper` | Multi-source scraping (17 types) | | **Skill Packaging** | `adaptors` | Package for LLM platforms | | **Skill Upload** | `adaptors` | Upload to platforms | | **AI Enhancement** | `adaptors` | Improve skill quality | @@ -979,6 +1013,6 @@ monitor_enhancement('output/react/', watch=True) --- -**Version:** 3.1.0-dev -**Last Updated:** 2026-02-18 +**Version:** 3.2.0 +**Last Updated:** 2026-03-15 **Status:** ✅ Production Ready diff --git a/docs/reference/CLI_REFERENCE.md b/docs/reference/CLI_REFERENCE.md index fb11b29..f8abe76 100644 --- a/docs/reference/CLI_REFERENCE.md +++ b/docs/reference/CLI_REFERENCE.md @@ -1,8 +1,8 @@ # CLI Reference - Skill Seekers -> **Version:** 3.1.2 -> **Last Updated:** 2026-02-23 -> **Complete reference for all 20 CLI commands** +> **Version:** 3.2.0 +> **Last Updated:** 2026-03-15 +> **Complete reference for all 30 CLI commands** --- @@ -14,19 +14,29 @@ - [Environment Variables](#environment-variables) - [Command Reference](#command-reference) - [analyze](#analyze) - Analyze local codebase + - [asciidoc](#asciidoc) - Extract from AsciiDoc files + - [chat](#chat) - Extract from Slack/Discord - [config](#config) - Configuration wizard + - [confluence](#confluence) - Extract from Confluence - [create](#create) - Create skill (auto-detects source) - [enhance](#enhance) - AI enhancement (local mode) - [enhance-status](#enhance-status) - Monitor enhancement - [estimate](#estimate) - Estimate page counts - [github](#github) - Scrape GitHub repository + - [html](#html) - 
Extract from local HTML files - [install](#install) - One-command complete workflow - [install-agent](#install-agent) - Install to AI agent + - [jupyter](#jupyter) - Extract from Jupyter notebooks + - [manpage](#manpage) - Extract from man pages - [multilang](#multilang) - Multi-language docs + - [notion](#notion) - Extract from Notion + - [openapi](#openapi) - Extract from OpenAPI/Swagger specs - [package](#package) - Package skill for platform - [pdf](#pdf) - Extract from PDF + - [pptx](#pptx) - Extract from PowerPoint files - [quality](#quality) - Quality scoring - [resume](#resume) - Resume interrupted jobs + - [rss](#rss) - Extract from RSS/Atom feeds - [scrape](#scrape) - Scrape documentation - [stream](#stream) - Stream large files - [unified](#unified) - Multi-source scraping @@ -42,7 +52,7 @@ ## Overview -Skill Seekers provides a unified CLI for converting documentation, GitHub repositories, PDFs, and local codebases into AI-ready skills. +Skill Seekers provides a unified CLI for converting documentation, GitHub repositories, PDFs, videos, notebooks, wikis, and 17 total source types into AI-ready skills for 16+ LLM platforms and RAG pipelines. ### Installation @@ -172,6 +182,74 @@ skill-seekers analyze --directory ./my-project --skip-dependency-graph --skip-pa --- +### asciidoc + +Extract content from AsciiDoc files and generate skill. + +**Purpose:** Convert `.adoc` / `.asciidoc` documentation into AI-ready skills. 
+ +**Syntax:** +```bash +skill-seekers asciidoc [options] +``` + +**Key Flags:** + +| Flag | Description | +|------|-------------| +| `--asciidoc-path PATH` | Path to AsciiDoc file or directory | +| `-n, --name` | Skill name | +| `--from-json FILE` | Build from extracted JSON | +| `--enhance-level` | AI enhancement (default: 0) | +| `--dry-run` | Preview without executing | + +**Examples:** + +```bash +# Single file +skill-seekers asciidoc --asciidoc-path guide.adoc --name my-guide + +# Directory of AsciiDoc files +skill-seekers asciidoc --asciidoc-path ./docs/ --name project-docs +``` + +--- + +### chat + +Extract knowledge from Slack or Discord chat exports. + +**Purpose:** Convert chat history into searchable AI-ready skills. + +**Syntax:** +```bash +skill-seekers chat [options] +``` + +**Key Flags:** + +| Flag | Description | +|------|-------------| +| `--export-path PATH` | Path to chat export directory or file | +| `--platform {slack,discord}` | Chat platform (default: slack) | +| `--token TOKEN` | API token for authentication | +| `--channel CHANNEL` | Channel name or ID to extract from | +| `--max-messages N` | Max messages to extract (default: 10000) | +| `-n, --name` | Skill name | +| `--dry-run` | Preview without executing | + +**Examples:** + +```bash +# From Slack export +skill-seekers chat --export-path ./slack-export/ --name team-knowledge + +# From Discord via API +skill-seekers chat --platform discord --token $DISCORD_TOKEN --channel general --name discord-docs +``` + +--- + ### config Interactive configuration wizard for API keys and settings. @@ -210,6 +288,43 @@ skill-seekers config --test --- +### confluence + +Extract content from Confluence wikis. + +**Purpose:** Convert Confluence spaces into AI-ready skills via API or HTML export. 
+ +**Syntax:** +```bash +skill-seekers confluence [options] +``` + +**Key Flags:** + +| Flag | Description | +|------|-------------| +| `--base-url URL` | Confluence instance base URL | +| `--space-key KEY` | Confluence space key | +| `--export-path PATH` | Path to Confluence HTML/XML export directory | +| `--username USER` | Confluence username | +| `--token TOKEN` | Confluence API token | +| `--max-pages N` | Max pages to extract (default: 500) | +| `-n, --name` | Skill name | +| `--dry-run` | Preview without executing | + +**Examples:** + +```bash +# Via API +skill-seekers confluence --base-url https://wiki.example.com --space-key DEV \ + --username user@example.com --token $CONFLUENCE_TOKEN --name dev-wiki + +# From export +skill-seekers confluence --export-path ./confluence-export/ --name team-docs +``` + +--- + ### create Create skill from any source. Auto-detects source type. @@ -234,6 +349,15 @@ skill-seekers create [source] [options] | `owner/repo` | GitHub | `facebook/react` | | `./path` | Local codebase | `./my-project` | | `*.pdf` | PDF | `manual.pdf` | +| `*.docx` | Word | `report.docx` | +| `*.epub` | EPUB | `book.epub` | +| `*.ipynb` | Jupyter Notebook | `analysis.ipynb` | +| `*.html`/`*.htm` | Local HTML | `docs.html` | +| `*.yaml`/`*.yml` | OpenAPI/Swagger | `openapi.yaml` | +| `*.adoc`/`*.asciidoc` | AsciiDoc | `guide.adoc` | +| `*.pptx` | PowerPoint | `slides.pptx` | +| `*.rss`/`*.atom` | RSS/Atom feed | `feed.rss` | +| `*.1`-`*.8`/`*.man` | Man page | `grep.1` | | `*.json` | Config file | `config.json` | **Flags:** @@ -473,6 +597,39 @@ skill-seekers github --repo facebook/react --scrape-only --- +### html + +Extract content from local HTML files and generate skill. + +**Purpose:** Convert local HTML documentation into AI-ready skills (for offline/exported docs). 
+ +**Syntax:** +```bash +skill-seekers html [options] +``` + +**Key Flags:** + +| Flag | Description | +|------|-------------| +| `--html-path PATH` | Path to HTML file or directory | +| `-n, --name` | Skill name | +| `--from-json FILE` | Build from extracted JSON | +| `--enhance-level` | AI enhancement (default: 0) | +| `--dry-run` | Preview without executing | + +**Examples:** + +```bash +# Single HTML file +skill-seekers html --html-path docs/index.html --name my-docs + +# Directory of HTML files +skill-seekers html --html-path ./html-export/ --name exported-docs +``` + +--- + ### install One-command complete workflow: fetch → scrape → enhance → package → upload. @@ -558,6 +715,72 @@ skill-seekers install-agent output/react/ --agent cursor --force --- +### jupyter + +Extract content from Jupyter Notebook files and generate skill. + +**Purpose:** Convert `.ipynb` notebooks into AI-ready skills with code, markdown, and outputs. + +**Syntax:** +```bash +skill-seekers jupyter [options] +``` + +**Key Flags:** + +| Flag | Description | +|------|-------------| +| `--notebook PATH` | Path to .ipynb file or directory | +| `-n, --name` | Skill name | +| `--from-json FILE` | Build from extracted JSON | +| `--enhance-level` | AI enhancement (default: 0) | +| `--dry-run` | Preview without executing | + +**Examples:** + +```bash +# Single notebook +skill-seekers jupyter --notebook analysis.ipynb --name data-analysis + +# Directory of notebooks +skill-seekers jupyter --notebook ./notebooks/ --name ml-tutorials +``` + +--- + +### manpage + +Extract content from Unix/Linux man pages and generate skill. + +**Purpose:** Convert man pages into AI-ready reference skills. 
+ +**Syntax:** +```bash +skill-seekers manpage [options] +``` + +**Key Flags:** + +| Flag | Description | +|------|-------------| +| `--man-names NAMES` | Comma-separated man page names (e.g., `ls,grep,find`) | +| `--man-path PATH` | Path to directory containing man page files | +| `--sections SECTIONS` | Comma-separated section numbers (e.g., `1,3,8`) | +| `-n, --name` | Skill name | +| `--dry-run` | Preview without executing | + +**Examples:** + +```bash +# By name (system man pages) +skill-seekers manpage --man-names ls,grep,find,awk --name unix-essentials + +# From directory +skill-seekers manpage --man-path /usr/share/man/man1/ --sections 1 --name section1-cmds +``` + +--- + ### multilang Multi-language documentation support. @@ -590,6 +813,75 @@ skill-seekers multilang --config configs/docs.json --languages en,zh,es --- +### notion + +Extract content from Notion workspaces. + +**Purpose:** Convert Notion pages and databases into AI-ready skills via API or export. + +**Syntax:** +```bash +skill-seekers notion [options] +``` + +**Key Flags:** + +| Flag | Description | +|------|-------------| +| `--database-id ID` | Notion database ID to extract from | +| `--page-id ID` | Notion page ID to extract from | +| `--export-path PATH` | Path to Notion export directory | +| `--token TOKEN` | Notion integration token | +| `--max-pages N` | Max pages to extract (default: 500) | +| `-n, --name` | Skill name | +| `--dry-run` | Preview without executing | + +**Examples:** + +```bash +# Via API +skill-seekers notion --database-id abc123 --token $NOTION_TOKEN --name team-docs + +# From export +skill-seekers notion --export-path ./notion-export/ --name project-wiki +``` + +--- + +### openapi + +Extract content from OpenAPI/Swagger specifications and generate skill. + +**Purpose:** Convert API specs into AI-ready reference skills with endpoint documentation. 
+ +**Syntax:** +```bash +skill-seekers openapi [options] +``` + +**Key Flags:** + +| Flag | Description | +|------|-------------| +| `--spec PATH` | Path to OpenAPI/Swagger spec file | +| `--spec-url URL` | URL to OpenAPI/Swagger spec | +| `-n, --name` | Skill name | +| `--from-json FILE` | Build from extracted JSON | +| `--enhance-level` | AI enhancement (default: 0) | +| `--dry-run` | Preview without executing | + +**Examples:** + +```bash +# From local file +skill-seekers openapi --spec api/openapi.yaml --name my-api + +# From URL +skill-seekers openapi --spec-url https://petstore.swagger.io/v2/swagger.json --name petstore +``` + +--- + ### package Package skill directory into platform-specific format. @@ -713,6 +1005,39 @@ skill-seekers pdf --pdf manual.pdf --name test --dry-run --- +### pptx + +Extract content from PowerPoint files and generate skill. + +**Purpose:** Convert `.pptx` presentations into AI-ready skills. + +**Syntax:** +```bash +skill-seekers pptx [options] +``` + +**Key Flags:** + +| Flag | Description | +|------|-------------| +| `--pptx PATH` | Path to PowerPoint file (.pptx) | +| `-n, --name` | Skill name | +| `--from-json FILE` | Build from extracted JSON | +| `--enhance-level` | AI enhancement (default: 0) | +| `--dry-run` | Preview without executing | + +**Examples:** + +```bash +# Extract from presentation +skill-seekers pptx --pptx training-slides.pptx --name training-material + +# With enhancement +skill-seekers pptx --pptx architecture.pptx --name arch-overview --enhance-level 2 +``` + +--- + ### quality Analyze and score skill documentation quality. @@ -791,6 +1116,41 @@ skill-seekers resume --clean --- +### rss + +Extract content from RSS/Atom feeds and generate skill. + +**Purpose:** Convert blog feeds and news sources into AI-ready skills. 
+ +**Syntax:** +```bash +skill-seekers rss [options] +``` + +**Key Flags:** + +| Flag | Description | +|------|-------------| +| `--feed-url URL` | URL of the RSS/Atom feed | +| `--feed-path PATH` | Path to local RSS/Atom feed file | +| `--follow-links` | Follow article links for full content (default: true) | +| `--no-follow-links` | Use feed summary only | +| `--max-articles N` | Max articles to extract (default: 50) | +| `-n, --name` | Skill name | +| `--dry-run` | Preview without executing | + +**Examples:** + +```bash +# From URL +skill-seekers rss --feed-url https://blog.example.com/feed.xml --name blog-knowledge + +# From local file, summaries only +skill-seekers rss --feed-path ./feed.rss --no-follow-links --name feed-summaries +``` + +--- + ### scrape Scrape documentation website and generate skill. diff --git a/docs/reference/CONFIG_FORMAT.md b/docs/reference/CONFIG_FORMAT.md index 703cbf2..da30e17 100644 --- a/docs/reference/CONFIG_FORMAT.md +++ b/docs/reference/CONFIG_FORMAT.md @@ -1,8 +1,8 @@ # Config Format Reference - Skill Seekers -> **Version:** 3.1.4 -> **Last Updated:** 2026-02-26 -> **Complete JSON configuration specification** +> **Version:** 3.2.0 +> **Last Updated:** 2026-03-15 +> **Complete JSON configuration specification for 17 source types** --- @@ -14,6 +14,7 @@ - [GitHub Source](#github-source) - [PDF Source](#pdf-source) - [Local Source](#local-source) + - [Additional Source Types](#additional-source-types) - [Unified (Multi-Source) Config](#unified-multi-source-config) - [Common Fields](#common-fields) - [Selectors](#selectors) @@ -266,6 +267,158 @@ For analyzing local codebases. --- +### Additional Source Types + +The following 10 source types were added in v3.2.0. Each can be used as a standalone config or within a unified `sources` array. 
+ +#### Jupyter Notebook Source + +```json +{ + "name": "ml-tutorial", + "sources": [{ + "type": "jupyter", + "notebook_path": "notebooks/tutorial.ipynb" + }] +} +``` + +#### Local HTML Source + +```json +{ + "name": "offline-docs", + "sources": [{ + "type": "html", + "html_path": "./exported-docs/" + }] +} +``` + +#### OpenAPI/Swagger Source + +```json +{ + "name": "petstore-api", + "sources": [{ + "type": "openapi", + "spec_path": "api/openapi.yaml", + "spec_url": "https://petstore.swagger.io/v2/swagger.json" + }] +} +``` + +#### AsciiDoc Source + +```json +{ + "name": "project-guide", + "sources": [{ + "type": "asciidoc", + "asciidoc_path": "./docs/guide.adoc" + }] +} +``` + +#### PowerPoint Source + +```json +{ + "name": "training-slides", + "sources": [{ + "type": "pptx", + "pptx_path": "presentations/training.pptx" + }] +} +``` + +#### RSS/Atom Feed Source + +```json +{ + "name": "engineering-blog", + "sources": [{ + "type": "rss", + "feed_url": "https://engineering.example.com/feed.xml", + "follow_links": true, + "max_articles": 50 + }] +} +``` + +#### Man Page Source + +```json +{ + "name": "unix-tools", + "sources": [{ + "type": "manpage", + "man_names": "ls,grep,find,awk,sed", + "sections": "1,3" + }] +} +``` + +#### Confluence Source + +```json +{ + "name": "team-wiki", + "sources": [{ + "type": "confluence", + "base_url": "https://wiki.example.com", + "space_key": "DEV", + "username": "user@example.com", + "max_pages": 500 + }] +} +``` + +#### Notion Source + +```json +{ + "name": "product-docs", + "sources": [{ + "type": "notion", + "database_id": "abc123def456", + "max_pages": 500 + }] +} +``` + +#### Chat (Slack/Discord) Source + +```json +{ + "name": "team-knowledge", + "sources": [{ + "type": "chat", + "export_path": "./slack-export/", + "platform": "slack", + "channel": "engineering", + "max_messages": 10000 + }] +} +``` + +#### Additional Source Fields Reference + +| Source Type | Required Fields | Optional Fields | 
+|-------------|-----------------|-----------------| +| `jupyter` | `notebook_path` | — | +| `html` | `html_path` | — | +| `openapi` | `spec_path` or `spec_url` | — | +| `asciidoc` | `asciidoc_path` | — | +| `pptx` | `pptx_path` | — | +| `rss` | `feed_url` or `feed_path` | `follow_links`, `max_articles` | +| `manpage` | `man_names` or `man_path` | `sections` | +| `confluence` | `base_url` + `space_key` or `export_path` | `username`, `token`, `max_pages` | +| `notion` | `database_id` or `page_id` or `export_path` | `token`, `max_pages` | +| `chat` | `export_path` | `platform`, `token`, `channel`, `max_messages` | + +--- + ## Unified (Multi-Source) Config Combine multiple sources into one skill with conflict detection. @@ -380,14 +533,27 @@ Unified configs support defining enhancement workflows at the top level: #### Source Types in Unified Config -Each source in the `sources` array can be: +Each source in the `sources` array can be any of the 17 supported types: | Type | Required Fields | |------|-----------------| -| `docs` | `base_url` | +| `documentation` / `docs` | `base_url` | | `github` | `repo` | | `pdf` | `pdf_path` | +| `word` | `docx_path` | +| `epub` | `epub_path` | +| `video` | `url` or `video_path` | | `local` | `directory` | +| `jupyter` | `notebook_path` | +| `html` | `html_path` | +| `openapi` | `spec_path` or `spec_url` | +| `asciidoc` | `asciidoc_path` | +| `pptx` | `pptx_path` | +| `rss` | `feed_url` or `feed_path` | +| `manpage` | `man_names` or `man_path` | +| `confluence` | `base_url` + `space_key` or `export_path` | +| `notion` | `database_id` or `page_id` or `export_path` | +| `chat` | `export_path` | --- @@ -606,6 +772,44 @@ Control which URLs are included or excluded: } ``` +### Unified with New Source Types + +```json +{ + "name": "project-complete", + "description": "Full project knowledge from multiple source types", + "merge_mode": "claude-enhanced", + "sources": [ + { + "type": "docs", + "name": "project-docs", + "base_url": 
"https://docs.example.com/", + "max_pages": 200 + }, + { + "type": "github", + "name": "project-code", + "repo": "example/project" + }, + { + "type": "openapi", + "name": "project-api", + "spec_path": "api/openapi.yaml" + }, + { + "type": "confluence", + "name": "project-wiki", + "export_path": "./confluence-export/" + }, + { + "type": "jupyter", + "name": "project-notebooks", + "notebook_path": "./notebooks/" + } + ] +} +``` + ### Local Project ```json diff --git a/docs/reference/FEATURE_MATRIX.md b/docs/reference/FEATURE_MATRIX.md index d2e49fc..7036f34 100644 --- a/docs/reference/FEATURE_MATRIX.md +++ b/docs/reference/FEATURE_MATRIX.md @@ -13,28 +13,55 @@ Complete feature support across all platforms and skill modes. ## Skill Mode Support -| Mode | Description | Platforms | Example Configs | -|------|-------------|-----------|-----------------| -| **Documentation** | Scrape HTML docs | All 4 | react.json, django.json (14 total) | -| **GitHub** | Analyze repositories | All 4 | react_github.json, godot_github.json | -| **PDF** | Extract from PDFs | All 4 | example_pdf.json | -| **Unified** | Multi-source (docs+GitHub+PDF) | All 4 | react_unified.json (5 total) | -| **Local Repo** | Unlimited local analysis | All 4 | deck_deck_go_local.json | +| Mode | Description | Platforms | CLI Command | `create` Detection | +|------|-------------|-----------|-------------|-------------------| +| **Documentation** | Scrape HTML docs | All 4 | `scrape` | `https://...` URLs | +| **GitHub** | Analyze repositories | All 4 | `github` | `owner/repo` or github.com URLs | +| **PDF** | Extract from PDFs | All 4 | `pdf` | `.pdf` extension | +| **Word** | Extract from DOCX | All 4 | `word` | `.docx` extension | +| **EPUB** | Extract from EPUB | All 4 | `epub` | `.epub` extension | +| **Video** | Video transcription | All 4 | `video` | YouTube/Vimeo URLs, video extensions | +| **Local Repo** | Local codebase analysis | All 4 | `analyze` | Directory paths | +| **Jupyter** | Extract from 
notebooks | All 4 | `jupyter` | `.ipynb` extension | +| **HTML** | Extract local HTML files | All 4 | `html` | `.html`/`.htm` extension | +| **OpenAPI** | Extract API specs | All 4 | `openapi` | `.yaml`/`.yml` with OpenAPI content | +| **AsciiDoc** | Extract AsciiDoc files | All 4 | `asciidoc` | `.adoc`/`.asciidoc` extension | +| **PowerPoint** | Extract from PPTX | All 4 | `pptx` | `.pptx` extension | +| **RSS/Atom** | Extract from feeds | All 4 | `rss` | `.rss`/`.atom` extension | +| **Man Pages** | Extract man pages | All 4 | `manpage` | `.1`-`.8`/`.man` extension | +| **Confluence** | Extract from Confluence | All 4 | `confluence` | API or export directory | +| **Notion** | Extract from Notion | All 4 | `notion` | API or export directory | +| **Chat** | Extract Slack/Discord | All 4 | `chat` | Export directory or API | +| **Unified** | Multi-source combination | All 4 | `unified` | N/A (config-driven) | ## CLI Command Support -| Command | Platforms | Skill Modes | Multi-Platform Flag | -|---------|-----------|-------------|---------------------| -| `scrape` | All | Docs only | No (output is universal) | -| `github` | All | GitHub only | No (output is universal) | -| `pdf` | All | PDF only | No (output is universal) | -| `unified` | All | Unified only | No (output is universal) | -| `enhance` | Claude, Gemini, OpenAI | All | ✅ `--target` | -| `package` | All | All | ✅ `--target` | -| `upload` | Claude, Gemini, OpenAI | All | ✅ `--target` | -| `estimate` | All | Docs only | No (estimation is universal) | -| `install` | All | All | ✅ `--target` | -| `install-agent` | All | All | No (agent-specific paths) | +| Command | Platforms | Skill Modes | Multi-Platform Flag | Optional Deps | +|---------|-----------|-------------|---------------------|---------------| +| `scrape` | All | Docs only | No (output is universal) | None | +| `github` | All | GitHub only | No (output is universal) | None | +| `pdf` | All | PDF only | No (output is universal) | `[pdf]` | +| `word` | 
All | Word only | No (output is universal) | `[word]` | +| `epub` | All | EPUB only | No (output is universal) | `[epub]` | +| `video` | All | Video only | No (output is universal) | `[video]` | +| `analyze` | All | Local only | No (output is universal) | None | +| `jupyter` | All | Jupyter only | No (output is universal) | `[jupyter]` | +| `html` | All | HTML only | No (output is universal) | None | +| `openapi` | All | OpenAPI only | No (output is universal) | `[openapi]` | +| `asciidoc` | All | AsciiDoc only | No (output is universal) | `[asciidoc]` | +| `pptx` | All | PPTX only | No (output is universal) | `[pptx]` | +| `rss` | All | RSS only | No (output is universal) | `[rss]` | +| `manpage` | All | Man pages only | No (output is universal) | None | +| `confluence` | All | Confluence only | No (output is universal) | `[confluence]` | +| `notion` | All | Notion only | No (output is universal) | `[notion]` | +| `chat` | All | Chat only | No (output is universal) | `[chat]` | +| `unified` | All | Unified only | No (output is universal) | Varies by source | +| `enhance` | Claude, Gemini, OpenAI | All | ✅ `--target` | None | +| `package` | All | All | ✅ `--target` | None | +| `upload` | Claude, Gemini, OpenAI | All | ✅ `--target` | None | +| `estimate` | All | Docs only | No (estimation is universal) | None | +| `install` | All | All | ✅ `--target` | None | +| `install-agent` | All | All | No (agent-specific paths) | None | ## MCP Tool Support @@ -50,6 +77,7 @@ Complete feature support across all platforms and skill modes. 
| `scrape_docs` | All | Docs + Unified | No (output is universal) | | `scrape_github` | All | GitHub only | No (output is universal) | | `scrape_pdf` | All | PDF only | No (output is universal) | +| `scrape_generic` | All | 10 new types | No (output is universal) | | **Packaging Tools** | | `package_skill` | All | All | ✅ `target` parameter | | `upload_skill` | Claude, Gemini, OpenAI | All | ✅ `target` parameter | @@ -260,8 +288,21 @@ Before release, verify all combinations: - [ ] Docs → Markdown - [ ] GitHub → All platforms - [ ] PDF → All platforms -- [ ] Unified → All platforms +- [ ] Word → All platforms +- [ ] EPUB → All platforms +- [ ] Video → All platforms - [ ] Local Repo → All platforms +- [ ] Jupyter → All platforms +- [ ] HTML → All platforms +- [ ] OpenAPI → All platforms +- [ ] AsciiDoc → All platforms +- [ ] PPTX → All platforms +- [ ] RSS → All platforms +- [ ] Man Pages → All platforms +- [ ] Confluence → All platforms +- [ ] Notion → All platforms +- [ ] Chat → All platforms +- [ ] Unified → All platforms ## Platform-Specific Notes @@ -310,7 +351,7 @@ A: Yes! Enhancement adds platform-specific formatting: - OpenAI: Plain text assistant instructions **Q: Do all skill modes work with all platforms?** -A: Yes! All 5 skill modes (Docs, GitHub, PDF, Unified, Local Repo) work with all 4 platforms. +A: Yes! All 17 source types work with all 4 platforms (Claude, Gemini, OpenAI, Markdown). 
## See Also diff --git a/docs/reference/MCP_REFERENCE.md b/docs/reference/MCP_REFERENCE.md index ab9abf8..bde09eb 100644 --- a/docs/reference/MCP_REFERENCE.md +++ b/docs/reference/MCP_REFERENCE.md @@ -1,8 +1,8 @@ # MCP Reference - Skill Seekers -> **Version:** 3.1.0 -> **Last Updated:** 2026-02-16 -> **Complete reference for 26 MCP tools** +> **Version:** 3.2.0 +> **Last Updated:** 2026-03-15 +> **Complete reference for 27 MCP tools** --- @@ -79,7 +79,7 @@ Essential tools for basic skill creation workflow: | `enhance_skill` | AI enhancement | | `install_skill` | Complete workflow | -### Extended Tools (9) +### Extended Tools (10) Advanced scraping and analysis tools: @@ -88,6 +88,7 @@ Advanced scraping and analysis tools: | `scrape_github` | GitHub repository analysis | | `scrape_pdf` | PDF extraction | | `scrape_codebase` | Local codebase analysis | +| `scrape_generic` | Generic scraper for 10 new source types | | `unified_scrape` | Multi-source scraping | | `detect_patterns` | Pattern detection | | `extract_test_examples` | Extract usage examples from tests | @@ -642,6 +643,65 @@ Find discrepancies between documentation and code. --- +#### scrape_generic + +Scrape content from any of the 10 new source types. + +**Purpose:** A generic entry point that delegates to the appropriate CLI scraper module for: jupyter, html, openapi, asciidoc, pptx, confluence, notion, rss, manpage, chat. + +**Parameters:** + +| Name | Type | Required | Description | +|------|------|----------|-------------| +| `source_type` | string | Yes | One of: `jupyter`, `html`, `openapi`, `asciidoc`, `pptx`, `confluence`, `notion`, `rss`, `manpage`, `chat` | +| `name` | string | Yes | Skill name for the output | +| `path` | string | No | File or directory path (for file-based sources) | +| `url` | string | No | URL (for URL-based sources like confluence, notion, rss) | + +**Note:** Either `path` or `url` must be provided depending on the source type. 
+ +**Source Type → Input Mapping:** + +| Source Type | Typical Input | CLI Flag Used | +|-------------|--------------|---------------| +| `jupyter` | `path` | `--notebook` | +| `html` | `path` | `--html-path` | +| `openapi` | `path` | `--spec` | +| `asciidoc` | `path` | `--asciidoc-path` | +| `pptx` | `path` | `--pptx` | +| `manpage` | `path` | `--man-path` | +| `confluence` | `path` or `url` | `--export-path` / `--base-url` | +| `notion` | `path` or `url` | `--export-path` / `--database-id` | +| `rss` | `path` or `url` | `--feed-path` / `--feed-url` | +| `chat` | `path` | `--export-path` | + +**Returns:** Scraping results with file paths and statistics + +```json +{ + "skill_directory": "output/my-api/", + "source_type": "openapi", + "status": "success" +} +``` + +**Example:** +```python +# Natural language +"Scrape the OpenAPI spec at api/openapi.yaml" +"Extract content from my Jupyter notebook analysis.ipynb" +"Process the Confluence export in ./wiki-export/" +"Convert the PowerPoint slides.pptx into a skill" + +# Explicit tool call +scrape_generic(source_type="openapi", name="my-api", path="api/openapi.yaml") +scrape_generic(source_type="jupyter", name="ml-tutorial", path="notebooks/tutorial.ipynb") +scrape_generic(source_type="rss", name="blog", url="https://blog.example.com/feed.xml") +scrape_generic(source_type="confluence", name="wiki", path="./confluence-export/") +``` + +--- + ### Config Source Tools #### add_config_source @@ -1030,7 +1090,19 @@ Tools: `list_workflows` → `unified_scrape` → `enhance_skill` → `package_sk --- -### Pattern 5: Vector Database Export +### Pattern 5: New Source Type Scraping + +```python +# Natural language sequence: +"Scrape the OpenAPI spec at api/openapi.yaml" +"Package the output for Claude" +``` + +Tools: `scrape_generic` → `package_skill` + +--- + +### Pattern 6: Vector Database Export ```python # Natural language sequence: diff --git a/docs/user-guide/01-core-concepts.md b/docs/user-guide/01-core-concepts.md index 
bb94460..5826754 100644 --- a/docs/user-guide/01-core-concepts.md +++ b/docs/user-guide/01-core-concepts.md @@ -1,19 +1,20 @@ # Core Concepts -> **Skill Seekers v3.1.0** +> **Skill Seekers v3.2.0** > **Understanding how Skill Seekers works** --- ## Overview -Skill Seekers transforms documentation, code, and content into **structured knowledge assets** that AI systems can use effectively. +Skill Seekers transforms documentation, code, and content into **structured knowledge assets** that AI systems can use effectively. It supports **17 source types** including documentation sites, GitHub repos, PDFs, videos, notebooks, wikis, and more. ``` Raw Content → Skill Seekers → AI-Ready Skill ↓ ↓ - (docs, code, (SKILL.md + - PDFs, repos) references) + (docs, code, PDFs, (SKILL.md + + videos, notebooks, references) + wikis, feeds, etc.) ``` --- @@ -76,7 +77,7 @@ npm install my-framework ## Source Types -Skill Seekers works with four types of sources: +Skill Seekers works with **17 types of sources**: ### 1. Documentation Websites @@ -168,6 +169,157 @@ skill-seekers analyze --directory ./my-project --- +### 5. Word Documents + +**What:** Microsoft Word (.docx) files + +**Command:** +```bash +skill-seekers create report.docx +``` + +--- + +### 6. EPUB Books + +**What:** EPUB e-book files + +**Command:** +```bash +skill-seekers create book.epub +``` + +--- + +### 7. Videos + +**What:** YouTube, Vimeo, or local video files (transcripts + visual analysis) + +**Command:** +```bash +skill-seekers create https://www.youtube.com/watch?v=... +skill-seekers video --url https://www.youtube.com/watch?v=... +``` + +--- + +### 8. Jupyter Notebooks + +**What:** `.ipynb` notebook files with code, markdown, and outputs + +**Command:** +```bash +skill-seekers create analysis.ipynb +skill-seekers jupyter --notebook analysis.ipynb +``` + +--- + +### 9. 
Local HTML Files + +**What:** HTML/HTM files on disk + +**Command:** +```bash +skill-seekers create page.html +skill-seekers html --html-path page.html +``` + +--- + +### 10. OpenAPI/Swagger Specs + +**What:** OpenAPI YAML/JSON specifications + +**Command:** +```bash +skill-seekers create api-spec.yaml +skill-seekers openapi --spec api-spec.yaml +``` + +--- + +### 11. AsciiDoc + +**What:** AsciiDoc (.adoc, .asciidoc) files + +**Command:** +```bash +skill-seekers create guide.adoc +skill-seekers asciidoc --asciidoc-path guide.adoc +``` + +--- + +### 12. PowerPoint Presentations + +**What:** Microsoft PowerPoint (.pptx) files + +**Command:** +```bash +skill-seekers create slides.pptx +skill-seekers pptx --pptx slides.pptx +``` + +--- + +### 13. RSS/Atom Feeds + +**What:** RSS or Atom feed files + +**Command:** +```bash +skill-seekers create feed.rss +skill-seekers rss --feed-path feed.rss +``` + +--- + +### 14. Man Pages + +**What:** Unix manual pages (.1 through .8, .man) + +**Command:** +```bash +skill-seekers create grep.1 +skill-seekers manpage --man-path grep.1 +``` + +--- + +### 15. Confluence Wikis + +**What:** Atlassian Confluence spaces (via API or export) + +**Command:** +```bash +skill-seekers confluence --space-key DEV --base-url https://wiki.example.com +``` + +--- + +### 16. Notion Workspaces + +**What:** Notion pages and databases (via API or export) + +**Command:** +```bash +skill-seekers notion --database-id abc123 +``` + +--- + +### 17. 
Slack/Discord Chat + +**What:** Chat platform exports or API access + +**Command:** +```bash +skill-seekers chat --export-path slack-export/ +``` + +--- + ## The Workflow ### Phase 1: Ingest diff --git a/docs/user-guide/02-scraping.md b/docs/user-guide/02-scraping.md index 37f436d..1de7c29 100644 --- a/docs/user-guide/02-scraping.md +++ b/docs/user-guide/02-scraping.md @@ -1,13 +1,13 @@ # Scraping Guide -> **Skill Seekers v3.1.4** +> **Skill Seekers v3.2.0** > **Complete guide to all scraping options** --- ## Overview -Skill Seekers can extract knowledge from four types of sources: +Skill Seekers can extract knowledge from **17 types of sources**: | Source | Command | Best For | |--------|---------|----------| @@ -15,6 +15,19 @@ Skill Seekers can extract knowledge from **17 types of sources**: | **GitHub** | `create <owner/repo>` | Source code, issues, releases | | **PDF** | `create <file.pdf>` | Manuals, papers, reports | | **Local** | `create <./path>` | Your projects, internal code | +| **Word** | `create <file.docx>` | Reports, specifications | +| **EPUB** | `create <file.epub>` | E-books, long-form docs | +| **Video** | `create <video-url>` | Tutorials, presentations | +| **Jupyter** | `create <file.ipynb>` | Data science, experiments | +| **Local HTML** | `create <file.html>` | Offline docs, saved pages | +| **OpenAPI** | `create <spec.yaml>` | API specs, Swagger docs | +| **AsciiDoc** | `create <file.adoc>` | Technical documentation | +| **PowerPoint** | `create <file.pptx>` | Slide decks, presentations | +| **RSS/Atom** | `create <feed.rss>` | Blog feeds, news sources | +| **Man Pages** | `create <page.1>` | Unix command documentation | +| **Confluence** | `confluence` | Team wikis, knowledge bases | +| **Notion** | `notion` | Workspace docs, databases | +| **Slack/Discord** | `chat` | Chat history, discussions | --- @@ -280,6 +293,274 @@ skill-seekers analyze --directory ./my-project \ --- +## Video Extraction + +### Basic Usage + +```bash +# YouTube video +skill-seekers create https://www.youtube.com/watch?v=dQw4w9WgXcQ + +# Local video file +skill-seekers create presentation.mp4 + +# 
With explicit command +skill-seekers video --url https://www.youtube.com/watch?v=... +``` + +### Visual Analysis + +```bash +# Install full video support (includes Whisper + scene detection) +pip install skill-seekers[video-full] +skill-seekers video --setup # auto-detect GPU and install PyTorch + +# Extract with visual analysis +skill-seekers video --url <video-url> --visual +``` + +**Requirements:** +```bash +pip install skill-seekers[video] # Transcript only +pip install skill-seekers[video-full] # + Whisper, scene detection +``` + +--- + +## Word Document Extraction + +### Basic Usage + +```bash +# Extract from .docx +skill-seekers create report.docx --name project-report + +# With explicit command +skill-seekers word --docx report.docx +``` + +**Handles:** Text, tables, headings, images, embedded metadata. + +--- + +## EPUB Extraction + +### Basic Usage + +```bash +# Extract from .epub +skill-seekers create programming-guide.epub --name guide + +# With explicit command +skill-seekers epub --epub programming-guide.epub +``` + +**Handles:** Chapters, metadata, table of contents, embedded images. + +--- + +## Jupyter Notebook Extraction + +### Basic Usage + +```bash +# Extract from .ipynb +skill-seekers create analysis.ipynb --name data-analysis + +# With explicit command +skill-seekers jupyter --notebook analysis.ipynb +``` + +**Requirements:** +```bash +pip install skill-seekers[jupyter] +``` + +**Extracts:** Markdown cells, code cells, cell outputs, execution order. + +--- + +## Local HTML Extraction + +### Basic Usage + +```bash +# Extract from .html +skill-seekers create docs.html --name offline-docs + +# With explicit command +skill-seekers html --html-path docs.html +``` + +**Handles:** Full HTML parsing, text extraction, link resolution. 
+ +--- + +## OpenAPI/Swagger Extraction + +### Basic Usage + +```bash +# Extract from OpenAPI spec +skill-seekers create api-spec.yaml --name my-api + +# With explicit command +skill-seekers openapi --spec api-spec.yaml +``` + +**Extracts:** Endpoints, request/response schemas, authentication info, examples. + +--- + +## AsciiDoc Extraction + +### Basic Usage + +```bash +# Extract from .adoc +skill-seekers create guide.adoc --name dev-guide + +# With explicit command +skill-seekers asciidoc --asciidoc-path guide.adoc +``` + +**Requirements:** +```bash +pip install skill-seekers[asciidoc] +``` + +**Handles:** Sections, code blocks, tables, cross-references, includes. + +--- + +## PowerPoint Extraction + +### Basic Usage + +```bash +# Extract from .pptx +skill-seekers create slides.pptx --name presentation + +# With explicit command +skill-seekers pptx --pptx slides.pptx +``` + +**Requirements:** +```bash +pip install skill-seekers[pptx] +``` + +**Extracts:** Slide text, speaker notes, images, tables, slide order. + +--- + +## RSS/Atom Feed Extraction + +### Basic Usage + +```bash +# Extract from RSS feed +skill-seekers create blog.rss --name blog-archive + +# Atom feed +skill-seekers create updates.atom --name updates + +# With explicit command +skill-seekers rss --feed-path blog.rss +``` + +**Requirements:** +```bash +pip install skill-seekers[rss] +``` + +**Extracts:** Articles, titles, dates, authors, categories. + +--- + +## Man Page Extraction + +### Basic Usage + +```bash +# Extract from man page +skill-seekers create curl.1 --name curl-manual + +# With explicit command +skill-seekers manpage --man-path curl.1 +``` + +**Handles:** Sections (NAME, SYNOPSIS, DESCRIPTION, OPTIONS, etc.), formatting. 
+ +--- + +## Confluence Wiki Extraction + +### Basic Usage + +```bash +# From Confluence API +skill-seekers confluence \ + --base-url https://wiki.example.com \ + --space-key DEV \ + --name team-docs + +# From Confluence export directory +skill-seekers confluence --export-path ./confluence-export/ +``` + +**Requirements:** +```bash +pip install skill-seekers[confluence] +``` + +**Extracts:** Pages, page trees, attachments, labels, spaces. + +--- + +## Notion Extraction + +### Basic Usage + +```bash +# From Notion API +export NOTION_API_KEY=secret_... +skill-seekers notion --database-id abc123 --name product-wiki + +# From Notion export directory +skill-seekers notion --export-path ./notion-export/ +``` + +**Requirements:** +```bash +pip install skill-seekers[notion] +``` + +**Extracts:** Pages, databases, blocks, properties, relations. + +--- + +## Slack/Discord Chat Extraction + +### Basic Usage + +```bash +# From Slack export +skill-seekers chat --export-path slack-export/ --name team-discussions + +# From Discord export +skill-seekers chat --export-path discord-export/ --name server-archive +``` + +**Requirements:** +```bash +pip install skill-seekers[chat] +``` + +**Extracts:** Messages, threads, channels, reactions, attachments. 
+ +--- + ## Common Scraping Patterns ### Pattern 1: Test First diff --git a/docs/user-guide/04-packaging.md b/docs/user-guide/04-packaging.md index 0f58bc7..ab067df 100644 --- a/docs/user-guide/04-packaging.md +++ b/docs/user-guide/04-packaging.md @@ -1,6 +1,6 @@ # Packaging Guide -> **Skill Seekers v3.1.0** +> **Skill Seekers v3.2.0** > **Export skills to AI platforms and vector databases** --- diff --git a/docs/user-guide/05-workflows.md b/docs/user-guide/05-workflows.md index c03cac6..a455a27 100644 --- a/docs/user-guide/05-workflows.md +++ b/docs/user-guide/05-workflows.md @@ -1,6 +1,6 @@ # Workflows Guide -> **Skill Seekers v3.1.0** +> **Skill Seekers v3.2.0** > **Enhancement workflow presets for specialized analysis** --- @@ -21,7 +21,7 @@ Basic Skill ──▶ Workflow: Security-Focus ──▶ Security-Enhanced Skill ## Built-in Presets -Skill Seekers includes 5 built-in workflow presets: +Skill Seekers includes 6 built-in workflow presets: | Preset | Stages | Best For | |--------|--------|----------| @@ -30,6 +30,7 @@ Skill Seekers includes 5 built-in workflow presets: | `security-focus` | 4 | Security analysis | | `architecture-comprehensive` | 7 | Deep architecture review | | `api-documentation` | 3 | API documentation focus | +| `complex-merge` | 3 | Merging multiple source types into a unified skill | --- @@ -233,6 +234,36 @@ skill-seekers create https://api.example.com/docs \ --- +### Complex-Merge Workflow + +**Stages:** 3 +**Purpose:** Merging multiple heterogeneous sources into a unified, coherent skill + +```yaml +stages: + - name: source-alignment + prompt: Align and deduplicate content from different source types... + + - name: cross-reference + prompt: Build cross-references between sources... + + - name: unified-synthesis + prompt: Synthesize a unified narrative from all sources... 
+``` + +**Use for:** +- Multi-source unified configs (docs + GitHub + PDF + video) +- Combining documentation with chat history or wiki pages +- Any skill built from 3+ different source types + +**Example:** +```bash +skill-seekers unified --config configs/multi-source.json \ + --enhance-workflow complex-merge +``` + +--- + ## Chaining Multiple Workflows Apply multiple workflows sequentially: @@ -532,7 +563,7 @@ skill-seekers create \ ## Workflow Support Across All Scrapers -Workflows are supported by **all 5 scrapers** in Skill Seekers: +Workflows are supported by **all 17 source types** in Skill Seekers: | Scraper | Command | Workflow Support | |---------|---------|------------------| @@ -540,6 +571,19 @@ Workflows are supported by **all 5 scrapers** in Skill Seekers: | GitHub | `github` | ✅ Full support | | Local Codebase | `analyze` | ✅ Full support | | PDF | `pdf` | ✅ Full support | +| Word | `word` | ✅ Full support | +| EPUB | `epub` | ✅ Full support | +| Video | `video` | ✅ Full support | +| Jupyter Notebook | `jupyter` | ✅ Full support | +| Local HTML | `html` | ✅ Full support | +| OpenAPI/Swagger | `openapi` | ✅ Full support | +| AsciiDoc | `asciidoc` | ✅ Full support | +| PowerPoint | `pptx` | ✅ Full support | +| RSS/Atom | `rss` | ✅ Full support | +| Man Pages | `manpage` | ✅ Full support | +| Confluence | `confluence` | ✅ Full support | +| Notion | `notion` | ✅ Full support | +| Slack/Discord | `chat` | ✅ Full support | | Unified/Multi-Source | `unified` | ✅ Full support | | Create (Auto-detect) | `create` | ✅ Full support | @@ -609,6 +653,7 @@ skill-seekers unified config.json --enhance-workflow api-documentation | **Security-Focus** | Security-sensitive projects | | **Architecture** | Large frameworks, systems | | **API-Docs** | API frameworks, libraries | +| **Complex-Merge** | Multi-source skills (3+ source types) | | **Custom** | Specialized domains | | **Chaining** | Multiple perspectives needed | diff --git a/docs/zh-CN/README.md 
b/docs/zh-CN/README.md index bee11a0..4fd123c 100644 --- a/docs/zh-CN/README.md +++ b/docs/zh-CN/README.md @@ -1,12 +1,12 @@ # Skill Seekers Documentation -> **Complete documentation for Skill Seekers v3.1.0** +> **Complete documentation for Skill Seekers v3.2.0** --- ## Welcome! -This is the official documentation for **Skill Seekers** - the universal tool for converting documentation, code, and PDFs into AI-ready skills. +This is the official documentation for **Skill Seekers** - the universal tool for converting 17 source types (documentation, code, PDFs, videos, notebooks, wikis, and more) into AI-ready skills. --- @@ -36,8 +36,9 @@ Explore our **User Guides**: Look up specific information: -- [CLI Reference](reference/CLI_REFERENCE.md) - All 20 commands -- [MCP Reference](reference/MCP_REFERENCE.md) - 26 MCP tools +- [CLI Reference](reference/CLI_REFERENCE.md) - All 30+ commands +- [MCP Reference](reference/MCP_REFERENCE.md) - 27 MCP tools +- [Feature Matrix](reference/FEATURE_MATRIX.md) - 17 source types × 4 platforms - [Config Format](reference/CONFIG_FORMAT.md) - JSON specification - [Environment Variables](reference/ENVIRONMENT_VARIABLES.md) - All env vars @@ -60,7 +61,7 @@ Power user features: # 1. Install pip install skill-seekers -# 2. Create skill +# 2. Create skill from any of 17 source types skill-seekers create https://docs.django.com/ # 3. 
Package for Claude @@ -82,6 +83,18 @@ skill-seekers pdf manual.pdf --name docs # Analyze local code skill-seekers analyze --directory ./my-project +# New source types (v3.2.0) +skill-seekers create notebook.ipynb # Jupyter Notebook +skill-seekers create page.html # Local HTML +skill-seekers create api-spec.yaml # OpenAPI/Swagger +skill-seekers create guide.adoc # AsciiDoc +skill-seekers create slides.pptx # PowerPoint +skill-seekers rss --feed-url https://blog.example.com/feed # RSS/Atom +skill-seekers manpage --man-path curl.1 # Man pages +skill-seekers confluence --space-key DEV # Confluence +skill-seekers notion --database-id abc123 # Notion +skill-seekers chat --export-path ./slack-export/ # Slack/Discord + # Enhance skill skill-seekers enhance output/my-skill/ @@ -119,8 +132,8 @@ docs/ │ └── 06-troubleshooting.md │ ├── reference/ # Technical reference -│ ├── CLI_REFERENCE.md # 20 commands -│ ├── MCP_REFERENCE.md # 26 MCP tools +│ ├── CLI_REFERENCE.md # 30+ commands +│ ├── MCP_REFERENCE.md # 27 MCP tools │ ├── CONFIG_FORMAT.md # JSON spec │ └── ENVIRONMENT_VARIABLES.md │ @@ -163,8 +176,8 @@ For Cursor, Windsurf, Cline: ## Version Information -- **Current Version:** 3.1.0 -- **Last Updated:** 2026-02-16 +- **Current Version:** 3.2.0 +- **Last Updated:** 2026-03-15 - **Python Required:** 3.10+ --- diff --git a/docs/zh-CN/advanced/mcp-server.md b/docs/zh-CN/advanced/mcp-server.md index c471fe7..d6cdcdb 100644 --- a/docs/zh-CN/advanced/mcp-server.md +++ b/docs/zh-CN/advanced/mcp-server.md @@ -1,7 +1,7 @@ # MCP Server Setup Guide -> **Skill Seekers v3.1.0** -> **Integrate with AI agents via Model Context Protocol** +> **Skill Seekers v3.2.0** +> **通过 Model Context Protocol 与 AI 代理集成** --- @@ -141,46 +141,78 @@ skill-seekers-mcp --transport http --port 8765 --- -## Available Tools +## 可用工具 -26 tools organized by category: +27 个工具,按类别组织: -### Core Tools (9) -- `list_configs` - List presets -- `generate_config` - Create config from URL -- `validate_config` - Check 
config -- `estimate_pages` - Page estimation -- `scrape_docs` - Scrape documentation -- `package_skill` - Package skill -- `upload_skill` - Upload to platform -- `enhance_skill` - AI enhancement -- `install_skill` - Complete workflow +### 核心工具(9 个) +- `list_configs` - 列出预设 +- `generate_config` - 从 URL 创建配置 +- `validate_config` - 检查配置 +- `estimate_pages` - 页面估算 +- `scrape_docs` - 抓取文档 +- `package_skill` - 打包技能 +- `upload_skill` - 上传到平台 +- `enhance_skill` - AI 增强 +- `install_skill` - 完整工作流 -### Extended Tools (9) -- `scrape_github` - GitHub repo -- `scrape_pdf` - PDF extraction -- `scrape_codebase` - Local code -- `unified_scrape` - Multi-source -- `detect_patterns` - Pattern detection -- `extract_test_examples` - Test examples -- `build_how_to_guides` - How-to guides -- `extract_config_patterns` - Config patterns -- `detect_conflicts` - Doc/code conflicts +### 扩展工具(10 个) +- `scrape_github` - GitHub 仓库 +- `scrape_pdf` - PDF 提取 +- `scrape_generic` - 10 种新来源类型的通用抓取器(见下文) +- `scrape_codebase` - 本地代码 +- `unified_scrape` - 多源抓取 +- `detect_patterns` - 模式检测 +- `extract_test_examples` - 测试示例 +- `build_how_to_guides` - 操作指南 +- `extract_config_patterns` - 配置模式 +- `detect_conflicts` - 文档/代码冲突 -### Config Sources (5) -- `add_config_source` - Register git source -- `list_config_sources` - List sources -- `remove_config_source` - Remove source -- `fetch_config` - Fetch configs -- `submit_config` - Submit configs +### 配置源(5 个) +- `add_config_source` - 注册 Git 源 +- `list_config_sources` - 列出源 +- `remove_config_source` - 删除源 +- `fetch_config` - 获取配置 +- `submit_config` - 提交配置 -### Vector DB (4) +### 向量数据库(4 个) - `export_to_weaviate` - `export_to_chroma` - `export_to_faiss` - `export_to_qdrant` -See [MCP Reference](../reference/MCP_REFERENCE.md) for full details. 
+### scrape_generic 工具 + +`scrape_generic` 是 v3.2.0 新增的 10 种来源类型的通用入口。它将请求委托给相应的 CLI 抓取器模块。 + +**支持的来源类型:** `jupyter`(Jupyter 笔记本)、`html`(本地 HTML)、`openapi`(OpenAPI/Swagger 规范)、`asciidoc`(AsciiDoc 文档)、`pptx`(PowerPoint 演示文稿)、`rss`(RSS/Atom 订阅源)、`manpage`(Man 手册页)、`confluence`(Confluence 维基)、`notion`(Notion 页面)、`chat`(Slack/Discord 聊天记录) + +**参数:** + +| 名称 | 类型 | 必需 | 描述 | +|------|------|------|------| +| `source_type` | string | 是 | 10 种支持的来源类型之一 | +| `name` | string | 是 | 输出的技能名称 | +| `path` | string | 否 | 文件或目录路径(用于基于文件的来源) | +| `url` | string | 否 | URL(用于 confluence、notion、rss 等基于 URL 的来源) | + +**使用示例:** + +``` +"抓取 Jupyter 笔记本 analysis.ipynb" +→ scrape_generic(source_type="jupyter", name="analysis", path="analysis.ipynb") + +"提取 API 规范内容" +→ scrape_generic(source_type="openapi", name="my-api", path="api-spec.yaml") + +"处理 PowerPoint 演示文稿" +→ scrape_generic(source_type="pptx", name="slides", path="presentation.pptx") + +"抓取 Confluence 维基" +→ scrape_generic(source_type="confluence", name="wiki", url="https://wiki.example.com") +``` + +详见 [MCP 参考文档](../reference/MCP_REFERENCE.md)。 --- @@ -317,6 +349,6 @@ skill-seekers-mcp --port 8766 ## See Also -- [MCP Reference](../reference/MCP_REFERENCE.md) - Complete tool reference -- [MCP Tools Deep Dive](mcp-tools.md) - Advanced usage -- [MCP Protocol](https://modelcontextprotocol.io/) - Official MCP docs +- [MCP 参考文档](../reference/MCP_REFERENCE.md) - 完整工具参考 +- [MCP 工具深入](mcp-tools.md) - 高级用法 +- [MCP 协议](https://modelcontextprotocol.io/) - 官方 MCP 文档 diff --git a/docs/zh-CN/getting-started/01-installation.md b/docs/zh-CN/getting-started/01-installation.md index 184334d..19709f6 100644 --- a/docs/zh-CN/getting-started/01-installation.md +++ b/docs/zh-CN/getting-started/01-installation.md @@ -1,6 +1,6 @@ # Installation Guide -> **Skill Seekers v3.1.0** +> **Skill Seekers v3.2.0** Get Skill Seekers installed and running in under 5 minutes. 
@@ -116,6 +116,12 @@ pip install skill-seekers[dev] | `gemini` | Google Gemini support | `pip install skill-seekers[gemini]` | | `openai` | OpenAI ChatGPT support | `pip install skill-seekers[openai]` | | `mcp` | MCP server | `pip install skill-seekers[mcp]` | +| `video` | YouTube/Vimeo subtitles & metadata | `pip install skill-seekers[video]` | +| `video-full` | + Whisper transcription & visual frames | `pip install skill-seekers[video-full]` | +| `jupyter` | Jupyter Notebook extraction | `pip install skill-seekers[jupyter]` | +| `ocr` | OCR support (scanned PDFs, visual frames) | `pip install skill-seekers[ocr]` | +| `confluence` | Confluence wiki support | `pip install skill-seekers[confluence]` | +| `notion` | Notion pages support | `pip install skill-seekers[notion]` | | `chroma` | ChromaDB export | `pip install skill-seekers[chroma]` | | `weaviate` | Weaviate export | `pip install skill-seekers[weaviate]` | | `qdrant` | Qdrant export | `pip install skill-seekers[qdrant]` | diff --git a/docs/zh-CN/getting-started/02-quick-start.md b/docs/zh-CN/getting-started/02-quick-start.md index 85f53a0..9bff84d 100644 --- a/docs/zh-CN/getting-started/02-quick-start.md +++ b/docs/zh-CN/getting-started/02-quick-start.md @@ -1,6 +1,6 @@ # Quick Start Guide -> **Skill Seekers v3.1.0** +> **Skill Seekers v3.2.0** > **Create your first skill in 3 commands** --- @@ -24,7 +24,7 @@ skill-seekers package output/django --target claude ## What You Can Create From -The `create` command auto-detects your source: +The `create` command auto-detects your source (17 source types supported): | Source Type | Example Command | |-------------|-----------------| @@ -32,6 +32,15 @@ The `create` command auto-detects your source: | **GitHub Repo** | `skill-seekers create facebook/react` | | **Local Code** | `skill-seekers create ./my-project` | | **PDF File** | `skill-seekers create manual.pdf` | +| **Word Document** | `skill-seekers create report.docx` | +| **EPUB Book** | `skill-seekers create 
book.epub` | +| **Jupyter Notebook** | `skill-seekers create analysis.ipynb` | +| **Local HTML** | `skill-seekers create page.html` | +| **OpenAPI/Swagger** | `skill-seekers create api-spec.yaml` | +| **AsciiDoc** | `skill-seekers create guide.adoc` | +| **PowerPoint** | `skill-seekers create slides.pptx` | +| **RSS/Atom Feed** | `skill-seekers create feed.rss` | +| **Man Page** | `skill-seekers create curl.1` | | **Config File** | `skill-seekers create configs/custom.json` | --- @@ -87,6 +96,49 @@ skill-seekers create paper.pdf --name research skill-seekers package output/research --target claude ``` +### Jupyter Notebook + +```bash +# Data analysis notebook +skill-seekers create analysis.ipynb --name data-analysis +skill-seekers package output/data-analysis --target claude +``` + +### OpenAPI/Swagger Spec + +```bash +# API specification +skill-seekers create api-spec.yaml --name my-api +skill-seekers package output/my-api --target claude +``` + +### PowerPoint Presentation + +```bash +# Slide deck +skill-seekers create slides.pptx --name presentation +skill-seekers package output/presentation --target claude +``` + +### Other Source Types + +```bash +# Confluence wiki +skill-seekers confluence --space-key DEV --name team-wiki + +# Notion pages +skill-seekers notion --database-id abc123 --name my-notes + +# RSS/Atom feed +skill-seekers rss --feed-url https://blog.example.com/feed --name blog + +# Man pages +skill-seekers manpage --man-path curl.1 --name curl-docs + +# Slack/Discord export +skill-seekers chat --export-path ./slack-export/ --name team-chat +``` + --- ## Common Options diff --git a/docs/zh-CN/reference/CLI_REFERENCE.md b/docs/zh-CN/reference/CLI_REFERENCE.md index 269dc51..9f291e1 100644 --- a/docs/zh-CN/reference/CLI_REFERENCE.md +++ b/docs/zh-CN/reference/CLI_REFERENCE.md @@ -1,8 +1,8 @@ # CLI Reference - Skill Seekers -> **Version:** 3.1.0 -> **Last Updated:** 2026-02-16 -> **Complete reference for all 20 CLI commands** +> **Version:** 3.2.0 +> 
**Last Updated:** 2026-03-15 +> **Complete reference for all 30+ CLI commands** --- @@ -32,6 +32,19 @@ - [unified](#unified) - Multi-source scraping - [update](#update) - Incremental updates - [upload](#upload) - Upload to platform + - [video](#video) - Extract from video + - [word](#word) - Extract from Word document + - [epub](#epub) - Extract from EPUB + - [jupyter](#jupyter) - Extract from Jupyter Notebook + - [html](#html) - Extract from local HTML + - [openapi](#openapi) - Extract from OpenAPI/Swagger spec + - [asciidoc](#asciidoc) - Extract from AsciiDoc + - [pptx](#pptx) - Extract from PowerPoint + - [rss](#rss) - Extract from RSS/Atom feed + - [manpage](#manpage) - Extract from man page + - [confluence](#confluence) - Extract from Confluence wiki + - [notion](#notion) - Extract from Notion pages + - [chat](#chat) - Extract from Slack/Discord export - [workflows](#workflows) - Manage workflow presets - [Common Workflows](#common-workflows) - [Exit Codes](#exit-codes) @@ -41,7 +54,7 @@ ## Overview -Skill Seekers provides a unified CLI for converting documentation, GitHub repositories, PDFs, and local codebases into AI-ready skills. +Skill Seekers provides a unified CLI for converting 17 source types—documentation, GitHub repositories, PDFs, videos, notebooks, wikis, and more—into AI-ready skills. 
### Installation @@ -218,6 +231,15 @@ skill-seekers create [source] [options] | `owner/repo` | GitHub | `facebook/react` | | `./path` | Local codebase | `./my-project` | | `*.pdf` | PDF | `manual.pdf` | +| `*.docx` | Word Document | `report.docx` | +| `*.epub` | EPUB | `book.epub` | +| `*.ipynb` | Jupyter Notebook | `analysis.ipynb` | +| `*.html` / `*.htm` | Local HTML | `page.html` | +| `*.yaml` / `*.yml` (OpenAPI) | OpenAPI/Swagger | `api-spec.yaml` | +| `*.adoc` / `*.asciidoc` | AsciiDoc | `guide.adoc` | +| `*.pptx` | PowerPoint | `slides.pptx` | +| `*.rss` / `*.atom` | RSS/Atom Feed | `feed.rss` | +| `*.1`–`*.8` / `*.man` | Man Page | `curl.1` | | `*.json` | Config file | `config.json` | **Flags:** @@ -998,6 +1020,302 @@ skill-seekers upload output/react-weaviate.zip --target weaviate \ --- +### video + +Extract content from YouTube, Vimeo, or local video files. + +**Syntax:** +```bash +skill-seekers video [options] +``` + +**Flags:** + +| Short | Long | Default | Description | +|-------|------|---------|-------------| +| | `--url` | | YouTube/Vimeo URL | +| | `--video-file` | | Local video file path | +| | `--playlist` | | YouTube playlist URL | +| `-n` | `--name` | auto | Skill name | +| | `--visual` | | Enable visual frame analysis | +| | `--enhance-level` | 2 | AI enhancement (0-3) | +| | `--start-time` | | Start time (seconds or MM:SS or HH:MM:SS) | +| | `--end-time` | | End time | +| | `--setup` | | Auto-detect GPU and install visual dependencies | + +**Examples:** + +```bash +# YouTube video +skill-seekers video --url https://www.youtube.com/watch?v=... --name tutorial + +# Local video with visual analysis +skill-seekers video --video-file recording.mp4 --name recording --visual + +# Setup GPU-aware dependencies +skill-seekers video --setup +``` + +--- + +### word + +Extract content from Word (.docx) documents. 
+ +**Syntax:** +```bash +skill-seekers word --docx FILE [options] +``` + +**Examples:** + +```bash +skill-seekers word --docx report.docx --name report +# Or via create: +skill-seekers create report.docx +``` + +--- + +### epub + +Extract content from EPUB e-books. + +**Syntax:** +```bash +skill-seekers epub --epub FILE [options] +``` + +**Examples:** + +```bash +skill-seekers epub --epub book.epub --name book +# Or via create: +skill-seekers create book.epub +``` + +--- + +### jupyter + +Extract content from Jupyter Notebooks (.ipynb). + +**Syntax:** +```bash +skill-seekers jupyter --notebook FILE [options] +``` + +**Examples:** + +```bash +skill-seekers jupyter --notebook analysis.ipynb --name data-analysis +# Or via create: +skill-seekers create analysis.ipynb +``` + +--- + +### html + +Extract content from local HTML files. + +**Syntax:** +```bash +skill-seekers html --html-path FILE [options] +``` + +**Examples:** + +```bash +skill-seekers html --html-path docs/index.html --name local-docs +# Or via create: +skill-seekers create page.html +``` + +--- + +### openapi + +Extract API documentation from OpenAPI/Swagger specifications. + +**Syntax:** +```bash +skill-seekers openapi --spec FILE [options] +``` + +**Examples:** + +```bash +skill-seekers openapi --spec api-spec.yaml --name my-api +# Or via create: +skill-seekers create api-spec.yaml +``` + +--- + +### asciidoc + +Extract content from AsciiDoc files. + +**Syntax:** +```bash +skill-seekers asciidoc --asciidoc-path FILE [options] +``` + +**Examples:** + +```bash +skill-seekers asciidoc --asciidoc-path guide.adoc --name guide +# Or via create: +skill-seekers create guide.adoc +``` + +--- + +### pptx + +Extract content from PowerPoint (.pptx) presentations. 
+ +**Syntax:** +```bash +skill-seekers pptx --pptx FILE [options] +``` + +**Examples:** + +```bash +skill-seekers pptx --pptx slides.pptx --name presentation +# Or via create: +skill-seekers create slides.pptx +``` + +--- + +### rss + +Extract content from RSS/Atom feeds. + +**Syntax:** +```bash +skill-seekers rss [options] +``` + +**Flags:** + +| Short | Long | Description | +|-------|------|-------------| +| | `--feed-url` | RSS/Atom feed URL | +| | `--feed-path` | Local RSS/Atom file path | +| `-n` | `--name` | Skill name | + +**Examples:** + +```bash +skill-seekers rss --feed-url https://blog.example.com/feed --name blog +skill-seekers rss --feed-path feed.rss --name feed +# Or via create: +skill-seekers create feed.rss +``` + +--- + +### manpage + +Extract content from Unix man pages. + +**Syntax:** +```bash +skill-seekers manpage --man-path FILE [options] +``` + +**Examples:** + +```bash +skill-seekers manpage --man-path curl.1 --name curl-docs +# Or via create: +skill-seekers create curl.1 +``` + +--- + +### confluence + +Extract content from Confluence wikis. + +**Syntax:** +```bash +skill-seekers confluence [options] +``` + +**Flags:** + +| Short | Long | Description | +|-------|------|-------------| +| | `--space-key` | Confluence space key | +| | `--base-url` | Confluence base URL | +| | `--export-path` | Path to Confluence export directory | +| `-n` | `--name` | Skill name | + +**Examples:** + +```bash +# From Confluence API +skill-seekers confluence --space-key DEV --base-url https://wiki.example.com --name team-wiki + +# From Confluence export +skill-seekers confluence --export-path ./confluence-export/ --name wiki +``` + +--- + +### notion + +Extract content from Notion pages and databases. 
+ +**Syntax:** +```bash +skill-seekers notion [options] +``` + +**Flags:** + +| Short | Long | Description | +|-------|------|-------------| +| | `--database-id` | Notion database ID | +| | `--page-id` | Notion page ID | +| | `--export-path` | Path to Notion export directory | +| `-n` | `--name` | Skill name | + +**Examples:** + +```bash +# From Notion API +skill-seekers notion --database-id abc123 --name my-notes + +# From Notion export +skill-seekers notion --export-path ./notion-export/ --name notes +``` + +--- + +### chat + +Extract content from Slack/Discord chat exports. + +**Syntax:** +```bash +skill-seekers chat --export-path DIR [options] +``` + +**Examples:** + +```bash +skill-seekers chat --export-path ./slack-export/ --name team-chat +skill-seekers chat --export-path ./discord-export/ --name server-archive +``` + +--- + ### workflows Manage enhancement workflow presets. diff --git a/docs/zh-CN/reference/FEATURE_MATRIX.md b/docs/zh-CN/reference/FEATURE_MATRIX.md index d2e49fc..889c1d8 100644 --- a/docs/zh-CN/reference/FEATURE_MATRIX.md +++ b/docs/zh-CN/reference/FEATURE_MATRIX.md @@ -11,6 +11,28 @@ Complete feature support across all platforms and skill modes. 
| **OpenAI ChatGPT** | ZIP | ✅ Assistants API | ✅ GPT-4o | OPENAI_API_KEY | | **Generic Markdown** | ZIP | ❌ Manual | ❌ None | None | +## Source Type Support (17 Types) + +| Source Type | CLI Command | Platforms | Detection | +|-------------|------------|-----------|-----------| +| **Documentation (web)** | `scrape` / `create <url>` | All 4 | HTTP/HTTPS URLs | +| **GitHub repo** | `github` / `create owner/repo` | All 4 | `owner/repo` or github.com URLs | +| **PDF** | `pdf` / `create file.pdf` | All 4 | `.pdf` extension | +| **Word (.docx)** | `word` / `create file.docx` | All 4 | `.docx` extension | +| **EPUB** | `epub` / `create file.epub` | All 4 | `.epub` extension | +| **Video** | `video` / `create <url>` | All 4 | YouTube/Vimeo URLs, video extensions | +| **Local codebase** | `analyze` / `create ./path` | All 4 | Directory paths | +| **Jupyter Notebook** | `jupyter` / `create file.ipynb` | All 4 | `.ipynb` extension | +| **Local HTML** | `html` / `create file.html` | All 4 | `.html`/`.htm` extensions | +| **OpenAPI/Swagger** | `openapi` / `create spec.yaml` | All 4 | `.yaml`/`.yml` with OpenAPI content | +| **AsciiDoc** | `asciidoc` / `create file.adoc` | All 4 | `.adoc`/`.asciidoc` extensions | +| **PowerPoint** | `pptx` / `create file.pptx` | All 4 | `.pptx` extension | +| **RSS/Atom** | `rss` / `create feed.rss` | All 4 | `.rss`/`.atom` extensions | +| **Man pages** | `manpage` / `create cmd.1` | All 4 | `.1`–`.8`/`.man` extensions | +| **Confluence** | `confluence` | All 4 | API or export directory | +| **Notion** | `notion` | All 4 | API or export directory | +| **Slack/Discord** | `chat` | All 4 | Export directory or API | + ## Skill Mode Support | Mode | Description | Platforms | Example Configs | @@ -18,17 +40,31 @@ Complete feature support across all platforms and skill modes.
| **Documentation** | Scrape HTML docs | All 4 | react.json, django.json (14 total) | | **GitHub** | Analyze repositories | All 4 | react_github.json, godot_github.json | | **PDF** | Extract from PDFs | All 4 | example_pdf.json | -| **Unified** | Multi-source (docs+GitHub+PDF) | All 4 | react_unified.json (5 total) | +| **Unified** | Multi-source (docs+GitHub+PDF+more) | All 4 | react_unified.json (5 total) | | **Local Repo** | Unlimited local analysis | All 4 | deck_deck_go_local.json | ## CLI Command Support -| Command | Platforms | Skill Modes | Multi-Platform Flag | +| Command | Platforms | Source Types | Multi-Platform Flag | |---------|-----------|-------------|---------------------| | `scrape` | All | Docs only | No (output is universal) | | `github` | All | GitHub only | No (output is universal) | | `pdf` | All | PDF only | No (output is universal) | -| `unified` | All | Unified only | No (output is universal) | +| `word` | All | Word (.docx) only | No (output is universal) | +| `epub` | All | EPUB only | No (output is universal) | +| `video` | All | Video only | No (output is universal) | +| `jupyter` | All | Jupyter Notebook only | No (output is universal) | +| `html` | All | Local HTML only | No (output is universal) | +| `openapi` | All | OpenAPI/Swagger only | No (output is universal) | +| `asciidoc` | All | AsciiDoc only | No (output is universal) | +| `pptx` | All | PowerPoint only | No (output is universal) | +| `rss` | All | RSS/Atom only | No (output is universal) | +| `manpage` | All | Man pages only | No (output is universal) | +| `confluence` | All | Confluence only | No (output is universal) | +| `notion` | All | Notion only | No (output is universal) | +| `chat` | All | Slack/Discord only | No (output is universal) | +| `unified` | All | Multi-source | No (output is universal) | +| `create` | All | Auto-detects all 17 | No (output is universal) | | `enhance` | Claude, Gemini, OpenAI | All | ✅ `--target` | | `package` | All | All | ✅ 
`--target` | | `upload` | Claude, Gemini, OpenAI | All | ✅ `--target` | @@ -50,6 +86,7 @@ Complete feature support across all platforms and skill modes. | `scrape_docs` | All | Docs + Unified | No (output is universal) | | `scrape_github` | All | GitHub only | No (output is universal) | | `scrape_pdf` | All | PDF only | No (output is universal) | +| `scrape_generic` | All | 10 new source types | No (output is universal) | | **Packaging Tools** | | `package_skill` | All | All | ✅ `target` parameter | | `upload_skill` | Claude, Gemini, OpenAI | All | ✅ `target` parameter | @@ -310,7 +347,7 @@ A: Yes! Enhancement adds platform-specific formatting: - OpenAI: Plain text assistant instructions **Q: Do all skill modes work with all platforms?** -A: Yes! All 5 skill modes (Docs, GitHub, PDF, Unified, Local Repo) work with all 4 platforms. +A: Yes! All 17 source types and all 5 skill modes (Docs, GitHub, PDF, Unified, Local Repo) work with all 4 platforms. ## See Also diff --git a/docs/zh-CN/reference/MCP_REFERENCE.md b/docs/zh-CN/reference/MCP_REFERENCE.md index ab9abf8..65416f7 100644 --- a/docs/zh-CN/reference/MCP_REFERENCE.md +++ b/docs/zh-CN/reference/MCP_REFERENCE.md @@ -1,8 +1,8 @@ # MCP Reference - Skill Seekers -> **Version:** 3.1.0 -> **Last Updated:** 2026-02-16 -> **Complete reference for 26 MCP tools** +> **Version:** 3.2.0 +> **Last Updated:** 2026-03-15 +> **Complete reference for 27 MCP tools** --- @@ -79,7 +79,7 @@ Essential tools for basic skill creation workflow: | `enhance_skill` | AI enhancement | | `install_skill` | Complete workflow | -### Extended Tools (9) +### Extended Tools (10) Advanced scraping and analysis tools: @@ -87,6 +87,7 @@ Advanced scraping and analysis tools: |------|---------| | `scrape_github` | GitHub repository analysis | | `scrape_pdf` | PDF extraction | +| `scrape_generic` | Generic scraper for 10 new source types (jupyter, html, openapi, asciidoc, pptx, rss, manpage, confluence, notion, chat) | | `scrape_codebase` | Local 
codebase analysis | | `unified_scrape` | Multi-source scraping | | `detect_patterns` | Pattern detection | @@ -642,6 +643,56 @@ Find discrepancies between documentation and code. --- +#### scrape_generic + +Generic scraper for new source types. Supports 10 source types that were added in v3.2.0. + +**Parameters:** + +| Name | Type | Required | Description | +|------|------|----------|-------------| +| `source_type` | string | Yes | One of: `jupyter`, `html`, `openapi`, `asciidoc`, `pptx`, `confluence`, `notion`, `rss`, `manpage`, `chat` | +| `name` | string | Yes | Skill name for the output | +| `path` | string | No | File or directory path (for file-based sources) | +| `url` | string | No | URL (for URL-based sources like confluence, notion, rss) | + +**Supported Source Types:** + +| Source Type | Description | Input | +|-------------|-------------|-------| +| `jupyter` | Jupyter Notebook (.ipynb) | `path` | +| `html` | Local HTML files | `path` | +| `openapi` | OpenAPI/Swagger specification | `path` | +| `asciidoc` | AsciiDoc documents | `path` | +| `pptx` | PowerPoint presentations | `path` | +| `rss` | RSS/Atom feeds | `url` or `path` | +| `manpage` | Unix man pages | `path` | +| `confluence` | Confluence wiki | `url` or `path` | +| `notion` | Notion pages/databases | `url` or `path` | +| `chat` | Slack/Discord exports | `path` | + +**Returns:** Scraping results + +```json +{ + "source_type": "jupyter", + "skill_directory": "output/analysis/", + "status": "success" +} +``` + +**Example:** +```python +# Natural language +"Scrape the Jupyter notebook analysis.ipynb" +"Extract content from slides.pptx" +"Process the OpenAPI spec at api-spec.yaml" +"Scrape the Confluence wiki at https://wiki.example.com" +"Extract content from the RSS feed" +``` + +--- + ### Config Source Tools #### add_config_source @@ -1030,7 +1081,20 @@ Tools: `list_workflows` → `unified_scrape` → `enhance_skill` → `package_sk --- -### Pattern 5: Vector Database Export +### Pattern 5: Generic 
Source Types + +```python +# Natural language sequence: +"Scrape the Jupyter notebook analysis.ipynb" +"Enhance the output/analysis skill" +"Package it for Claude" +``` + +Tools: `scrape_generic` → `enhance_skill` → `package_skill` + +--- + +### Pattern 6: Vector Database Export ```python # Natural language sequence: