diff --git a/AGENTS.md b/AGENTS.md
index d0ae247..d26c952 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -1,866 +1,171 @@
 # AGENTS.md - Skill Seekers
 
-Essential guidance for AI coding agents working with the Skill Seekers codebase.
+Concise reference for AI coding agents. Skill Seekers is a Python CLI tool (v3.2.0) that converts documentation sites, GitHub repos, PDFs, videos, notebooks, wikis, and more into AI-ready skills for 16+ LLM platforms and RAG pipelines.
 
----
-
-## Project Overview
-
-**Skill Seekers** is a Python CLI tool that converts documentation websites, GitHub repositories, PDF files, and videos into AI-ready skills for LLM platforms and RAG (Retrieval-Augmented Generation) pipelines. It serves as the universal preprocessing layer for AI systems.
-
-### Key Facts
-
-| Attribute | Value |
-|-----------|-------|
-| **Current Version** | 3.1.3 |
-| **Python Version** | 3.10+ (tested on 3.10, 3.11, 3.12, 3.13) |
-| **License** | MIT |
-| **Package Name** | `skill-seekers` (PyPI) |
-| **Source Files** | 182 Python files |
-| **Test Files** | 105+ test files |
-| **Website** | https://skillseekersweb.com/ |
-| **Repository** | https://github.com/yusufkaraaslan/Skill_Seekers |
-
-### Supported Target Platforms
-
-| Platform | Format | Use Case |
-|----------|--------|----------|
-| **Claude AI** | ZIP + YAML | Claude Code skills |
-| **Google Gemini** | tar.gz | Gemini skills |
-| **OpenAI ChatGPT** | ZIP + Vector Store | Custom GPTs |
-| **LangChain** | Documents | QA chains, agents, retrievers |
-| **LlamaIndex** | TextNodes | Query engines, chat engines |
-| **Haystack** | Documents | Enterprise RAG pipelines |
-| **Pinecone** | Ready for upsert | Production vector search |
-| **Weaviate** | Vector objects | Vector database |
-| **Qdrant** | Points | Vector database |
-| **Chroma** | Documents | Local vector database |
-| **FAISS** | Index files | Local similarity search |
-| **Cursor IDE** | .cursorrules | AI coding assistant rules |
-| **Windsurf** | .windsurfrules | AI coding rules |
-| **Cline** | .clinerules + MCP | VS Code extension |
-| **Continue.dev** | HTTP context | Universal IDE support |
-| **Generic Markdown** | ZIP | Universal export |
-
-### Core Workflow
-
-1. **Scrape Phase** - Crawl documentation/GitHub/PDF/video sources
-2. **Build Phase** - Organize content into categorized references
-3. **Enhancement Phase** - AI-powered quality improvements (optional)
-4. **Package Phase** - Create platform-specific packages
-5. **Upload Phase** - Auto-upload to target platform (optional)
-
----
-
-## Project Structure
-
-```
-/mnt/1ece809a-2821-4f10-aecb-fcdf34760c0b/Git/Skill_Seekers/
-├── src/skill_seekers/              # Main source code (src/ layout)
-│   ├── cli/                        # CLI tools and commands (~70 modules)
-│   │   ├── adaptors/               # Platform adaptors (Strategy pattern)
-│   │   │   ├── base.py             # Abstract base class (SkillAdaptor)
-│   │   │   ├── claude.py           # Claude AI adaptor
-│   │   │   ├── gemini.py           # Google Gemini adaptor
-│   │   │   ├── openai.py           # OpenAI ChatGPT adaptor
-│   │   │   ├── markdown.py         # Generic Markdown adaptor
-│   │   │   ├── chroma.py           # Chroma vector DB adaptor
-│   │   │   ├── faiss_helpers.py    # FAISS index adaptor
-│   │   │   ├── haystack.py         # Haystack RAG adaptor
-│   │   │   ├── langchain.py        # LangChain adaptor
-│   │   │   ├── llama_index.py      # LlamaIndex adaptor
-│   │   │   ├── qdrant.py           # Qdrant vector DB adaptor
-│   │   │   ├── weaviate.py         # Weaviate vector DB adaptor
-│   │   │   └── streaming_adaptor.py # Streaming output adaptor
-│   │   ├── arguments/              # CLI argument definitions
-│   │   ├── parsers/                # Argument parsers
-│   │   │   └── extractors/         # Content extractors
-│   │   ├── presets/                # Preset configuration management
-│   │   ├── storage/                # Cloud storage adaptors
-│   │   ├── main.py                 # Unified CLI entry point
-│   │   ├── create_command.py       # Unified create command
-│   │   ├── doc_scraper.py          # Documentation scraper
-│   │   ├── github_scraper.py       # GitHub repository scraper
-│   │   ├── pdf_scraper.py          # PDF extraction
-│   │   ├── word_scraper.py         # Word document scraper
-│   │   ├── video_scraper.py        # Video extraction
-│   │   ├── video_setup.py          # GPU detection & dependency installation
-│   │   ├── unified_scraper.py      # Multi-source scraping
-│   │   ├── codebase_scraper.py     # Local codebase analysis
-│   │   ├── enhance_command.py      # AI enhancement command
-│   │   ├── enhance_skill_local.py  # AI enhancement (local mode)
-│   │   ├── package_skill.py        # Skill packager
-│   │   ├── upload_skill.py         # Upload to platforms
-│   │   ├── cloud_storage_cli.py    # Cloud storage CLI
-│   │   ├── benchmark_cli.py        # Benchmarking CLI
-│   │   ├── sync_cli.py             # Sync monitoring CLI
-│   │   └── workflows_command.py    # Workflow management CLI
-│   ├── mcp/                        # MCP server integration
-│   │   ├── server_fastmcp.py       # FastMCP server (~708 lines)
-│   │   ├── server_legacy.py        # Legacy server implementation
-│   │   ├── server.py               # Server entry point
-│   │   ├── agent_detector.py       # AI agent detection
-│   │   ├── git_repo.py             # Git repository operations
-│   │   ├── source_manager.py       # Config source management
-│   │   └── tools/                  # MCP tool implementations
-│   │       ├── config_tools.py     # Configuration tools
-│   │       ├── packaging_tools.py  # Packaging tools
-│   │       ├── scraping_tools.py   # Scraping tools
-│   │       ├── source_tools.py     # Source management tools
-│   │       ├── splitting_tools.py  # Config splitting tools
-│   │       ├── vector_db_tools.py  # Vector database tools
-│   │       └── workflow_tools.py   # Workflow management tools
-│   ├── sync/                       # Sync monitoring module
-│   │   ├── detector.py             # Change detection
-│   │   ├── models.py               # Data models (Pydantic)
-│   │   ├── monitor.py              # Monitoring logic
-│   │   └── notifier.py             # Notification system
-│   ├── benchmark/                  # Benchmarking framework
-│   │   ├── framework.py            # Benchmark framework
-│   │   ├── models.py               # Benchmark models
-│   │   └── runner.py               # Benchmark runner
-│   ├── embedding/                  # Embedding server
-│   │   ├── server.py               # FastAPI embedding server
-│   │   ├── generator.py            # Embedding generation
-│   │   ├── cache.py                # Embedding cache
-│   │   └── models.py               # Embedding models
-│   ├── workflows/                  # YAML workflow presets (66 presets)
-│   ├── _version.py                 # Version information (reads from pyproject.toml)
-│   └── __init__.py                 # Package init
-├── tests/                          # Test suite (105+ test files)
-├── configs/                        # Preset configuration files
-├── docs/                           # Documentation (80+ markdown files)
-│   ├── integrations/               # Platform integration guides
-│   ├── guides/                     # User guides
-│   ├── reference/                  # API reference
-│   ├── features/                   # Feature documentation
-│   ├── blog/                       # Blog posts
-│   └── roadmap/                    # Roadmap documents
-├── examples/                       # Usage examples
-├── .github/workflows/              # CI/CD workflows
-├── pyproject.toml                  # Main project configuration
-├── requirements.txt                # Pinned dependencies
-├── mypy.ini                        # MyPy type checker configuration
-├── Dockerfile                      # Main Docker image (multi-stage)
-├── Dockerfile.mcp                  # MCP server Docker image
-└── docker-compose.yml              # Full stack deployment
-```
-
----
-
-## Build and Development Commands
-
-### Prerequisites
-
-- Python 3.10 or higher
-- pip or uv package manager
-- Git (for GitHub scraping features)
-
-### Setup (REQUIRED before any development)
+## Setup
 
 ```bash
-# Install in editable mode (REQUIRED for tests due to src/ layout)
+# REQUIRED before running tests (src/ layout — tests fail without this)
 pip install -e .
-
-# Install with all platform dependencies
-pip install -e ".[all-llms]"
-
-# Install with all optional dependencies
-pip install -e ".[all]"
-
-# Install specific platforms only
-pip install -e ".[gemini]"    # Google Gemini support
-pip install -e ".[openai]"    # OpenAI ChatGPT support
-pip install -e ".[mcp]"       # MCP server dependencies
-pip install -e ".[s3]"        # AWS S3 support
-pip install -e ".[gcs]"       # Google Cloud Storage
-pip install -e ".[azure]"     # Azure Blob Storage
-pip install -e ".[embedding]" # Embedding server support
-pip install -e ".[rag-upload]" # Vector DB upload support
-
-# Install dev dependencies (using dependency-groups)
+# With dev tools
 pip install -e ".[dev]"
+# With all optional deps
+pip install -e ".[all]"
 ```
 
-**CRITICAL:** The project uses a `src/` layout. Tests WILL FAIL unless you install with `pip install -e .` first.
-
-### Building
+## Build / Test / Lint Commands
 
 ```bash
-# Build package using uv (recommended)
-uv build
-
-# Or using standard build
-python -m build
-
-# Publish to PyPI
-uv publish
-```
-
-### Docker
-
-```bash
-# Build Docker image
-docker build -t skill-seekers .
-
-# Run with docker-compose (includes vector databases)
-docker-compose up -d
-
-# Run MCP server only
-docker-compose up -d mcp-server
-
-# View logs
-docker-compose logs -f mcp-server
-```
-
----
-
-## Testing Instructions
-
-### Running Tests
-
-**CRITICAL:** Never skip tests - all tests must pass before commits.
-
-```bash
-# All tests (must run pip install -e . first!)
+# Run ALL tests (never skip tests — all must pass before commits)
 pytest tests/ -v
 
-# Specific test file
+# Run a single test file
 pytest tests/test_scraper_features.py -v
-pytest tests/test_mcp_fastmcp.py -v
-pytest tests/test_cloud_storage.py -v
 
-# With coverage
-pytest tests/ --cov=src/skill_seekers --cov-report=term --cov-report=html
-
-# Single test
+# Run a single test function
 pytest tests/test_scraper_features.py::test_detect_language -v
 
-# E2E tests
-pytest tests/test_e2e_three_stream_pipeline.py -v
+# Run a single test method inside a test class (file::Class::method)
+pytest tests/test_adaptors/test_claude_adaptor.py::TestClaudeAdaptor::test_package -v
 
-# Skip slow tests
-pytest tests/ -v -m "not slow"
-
-# Run only integration tests
-pytest tests/ -v -m integration
-
-# Run only specific marker
+# Skip slow/integration tests
 pytest tests/ -v -m "not slow and not integration"
-```
 
-### Test Architecture
+# With coverage
+pytest tests/ --cov=src/skill_seekers --cov-report=term
 
-- **105+ test files** covering all features
-- **CI Matrix:** Ubuntu + macOS, Python 3.10-3.12
-- Test markers defined in `pyproject.toml`:
-
-| Marker | Description |
-|--------|-------------|
-| `slow` | Tests taking >5 seconds |
-| `integration` | Requires external services (APIs) |
-| `e2e` | End-to-end tests (resource-intensive) |
-| `venv` | Requires virtual environment setup |
-| `bootstrap` | Bootstrap skill specific |
-| `benchmark` | Performance benchmark tests |
-
-### Test Configuration
-
-From `pyproject.toml`:
-```toml
-[tool.pytest.ini_options]
-testpaths = ["tests"]
-python_files = ["test_*.py"]
-addopts = "-v --tb=short --strict-markers"
-asyncio_mode = "auto"
-asyncio_default_fixture_loop_scope = "function"
-```
-
-The `conftest.py` file checks that the package is installed before running tests.
-
----
-
-## Code Style Guidelines
-
-### Linting and Formatting
-
-```bash
-# Run ruff linter
+# Lint (ruff)
 ruff check src/ tests/
-
-# Run ruff formatter check
-ruff format --check src/ tests/
-
-# Auto-fix issues
 ruff check src/ tests/ --fix
+
+# Format (ruff)
+ruff format --check src/ tests/
 ruff format src/ tests/
 
-# Run mypy type checker
+# Type check (mypy)
 mypy src/skill_seekers --show-error-codes --pretty
 ```
 
-### Style Rules (from pyproject.toml)
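+- **Test markers:** `slow`, `integration`, `e2e`, `venv`, `bootstrap`, `benchmark` (declared in `pyproject.toml`)
+- **Async tests:** mark with `@pytest.mark.asyncio` for clarity; `asyncio_mode` is `auto`, so unmarked `async def` tests are collected and run as well.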
 
+## Code Style
+
+### Formatting Rules (ruff — from pyproject.toml)
 - **Line length:** 100 characters
 - **Target Python:** 3.10+
-- **Enabled rules:** E, W, F, I, B, C4, UP, ARG, SIM
-- **Ignored rules:** E501, F541, ARG002, B007, I001, SIM114
-- **Import sorting:** isort style with `skill_seekers` as first-party
+- **Enabled lint rules:** E, W, F, I, B, C4, UP, ARG, SIM
+- **Ignored rules:** E501 (line length handled by formatter), F541 (f-string style), ARG002 (unused method args for interface compliance), B007 (intentional unused loop vars), I001 (import order is not lint-enforced; note that `ruff format` does not sort imports — run `ruff check --select I --fix` to sort them), SIM114 (readability preference)
 
-### MyPy Configuration (from pyproject.toml)
+### Imports
+- Sort with isort (via ruff); `skill_seekers` is first-party
+- Standard library → third-party → first-party, separated by blank lines
+- Use `from __future__ import annotations` only if needed for forward refs
+- Guard optional imports with try/except ImportError (see `adaptors/__init__.py` pattern)
 
-```toml
-[tool.mypy]
-python_version = "3.10"
-warn_return_any = true
-warn_unused_configs = true
-disallow_untyped_defs = false
-disallow_incomplete_defs = false
-check_untyped_defs = true
-ignore_missing_imports = true
-show_error_codes = true
-pretty = true
+### Naming Conventions
+- **Files:** `snake_case.py`
+- **Classes:** `PascalCase` (e.g., `SkillAdaptor`, `ClaudeAdaptor`)
+- **Functions/methods:** `snake_case`
+- **Constants:** `UPPER_CASE` (e.g., `ADAPTORS`, `DEFAULT_CHUNK_TOKENS`)
+- **Private:** prefix with `_`
+
+### Type Hints
+- Gradual typing — add hints where practical, not enforced everywhere
+- Use modern syntax: `str | None` not `Optional[str]`, `list[str]` not `List[str]`
+- MyPy config: `disallow_untyped_defs = false`, `check_untyped_defs = true`, `ignore_missing_imports = true`
+
+### Docstrings
+- Module-level docstring on every file (triple-quoted, describes purpose)
+- Google-style or standard docstrings for public functions/classes
+- Include `Args:`, `Returns:`, `Raises:` sections where useful
+
+### Error Handling
+- Use specific exceptions, never bare `except:`
+- Provide helpful error messages with context (see `get_adaptor()` in `adaptors/__init__.py`)
+- Use `raise ValueError(...)` for invalid arguments, `raise RuntimeError(...)` for state errors
+- Guard optional dependency imports with try/except and give clear install instructions on failure
+
+### Suppressing Lint Warnings
+- Use inline `# noqa: XXXX` comments (e.g., `# noqa: F401` for re-exports, `# noqa: ARG001` for required but unused params)
+
+## Supported Source Types (17)
+
+| Type | CLI Command | Config Type | Detection |
+|------|------------|-------------|-----------|
+| Documentation (web) | `scrape` / `create <url>` | `documentation` | HTTP/HTTPS URLs |
+| GitHub repo | `github` / `create owner/repo` | `github` | `owner/repo` or github.com URLs |
+| PDF | `pdf` / `create file.pdf` | `pdf` | `.pdf` extension |
+| Word (.docx) | `word` / `create file.docx` | `word` | `.docx` extension |
+| EPUB | `epub` / `create file.epub` | `epub` | `.epub` extension |
+| Video | `video` / `create <url/file>` | `video` | YouTube/Vimeo URLs, video extensions |
+| Local codebase | `analyze` / `create ./path` | `local` | Directory paths |
+| Jupyter Notebook | `jupyter` / `create file.ipynb` | `jupyter` | `.ipynb` extension |
+| Local HTML | `html` / `create file.html` | `html` | `.html`/`.htm` extensions |
+| OpenAPI/Swagger | `openapi` / `create spec.yaml` | `openapi` | `.yaml`/`.yml` with OpenAPI content |
+| AsciiDoc | `asciidoc` / `create file.adoc` | `asciidoc` | `.adoc`/`.asciidoc` extensions |
+| PowerPoint | `pptx` / `create file.pptx` | `pptx` | `.pptx` extension |
+| RSS/Atom | `rss` / `create feed.rss` | `rss` | `.rss`/`.atom` extensions |
+| Man pages | `manpage` / `create cmd.1` | `manpage` | `.1`-`.8`/`.man` extensions |
+| Confluence | `confluence` | `confluence` | API or export directory |
+| Notion | `notion` | `notion` | API or export directory |
+| Slack/Discord | `chat` | `chat` | Export directory or API |
+
+## Project Layout
+
+```
+src/skill_seekers/           # Main package (src/ layout)
+  cli/                       # CLI commands and entry points
+    adaptors/                # Platform adaptors (Strategy pattern, inherit SkillAdaptor)
+    arguments/               # CLI argument definitions (one per source type)
+    parsers/                 # Subcommand parsers (one per source type)
+    storage/                 # Cloud storage (inherit BaseStorageAdaptor)
+    main.py                  # Unified CLI entry point (COMMAND_MODULES dict)
+    source_detector.py       # Auto-detects source type from user input
+    create_command.py        # Unified `create` command routing
+    config_validator.py      # VALID_SOURCE_TYPES set + per-type validation
+    unified_scraper.py       # Multi-source orchestrator (scraped_data + dispatch)
+    unified_skill_builder.py # Pairwise synthesis + generic merge
+  mcp/                       # MCP server (FastMCP + legacy)
+    tools/                   # MCP tool implementations by category
+  sync/                      # Sync monitoring (Pydantic models)
+  benchmark/                 # Benchmarking framework
+  embedding/                 # FastAPI embedding server
+  workflows/                 # 67 YAML workflow presets (includes complex-merge.yaml)
+  _version.py                # Reads version from pyproject.toml
+tests/                       # 115+ test files (pytest)
+configs/                     # Preset JSON scraping configs
+docs/                        # 80+ markdown doc files
 ```
 
-### Code Conventions
+## Key Patterns
 
-1. **Use type hints** where practical (gradual typing approach)
-2. **Docstrings:** Use Google-style or standard docstrings
-3. **Error handling:** Use specific exceptions, provide helpful messages
-4. **Async code:** Use `asyncio`, mark tests with `@pytest.mark.asyncio`
-5. **File naming:** Use snake_case for all Python files
-6. **Class naming:** Use PascalCase for classes
-7. **Function naming:** Use snake_case for functions and methods
-8. **Constants:** Use UPPER_CASE for module-level constants
+**Adaptor (Strategy) pattern** — all platform logic in `cli/adaptors/`. Inherit `SkillAdaptor`, implement `format_skill_md()`, `package()`, `upload()`. Register in `adaptors/__init__.py` ADAPTORS dict.
 
----
+**Scraper pattern** — each source type has three modules under `cli/`: `cli/<type>_scraper.py` (with `<Type>ToSkillConverter` class + `main()`), `cli/arguments/<type>.py`, and `cli/parsers/<type>_parser.py`. Register it in the `PARSERS` list (`cli/parsers/__init__.py`), the `COMMAND_MODULES` dict (`cli/main.py`), and the `VALID_SOURCE_TYPES` set (`cli/config_validator.py`).
 
-## Architecture Patterns
+**Unified pipeline** — `unified_scraper.py` dispatches to per-type `_scrape_<type>()` methods. `unified_skill_builder.py` uses pairwise synthesis for docs+github+pdf combos and `_generic_merge()` for all other combinations.
 
-### Platform Adaptor Pattern (Strategy Pattern)
+**MCP tools** — grouped in `mcp/tools/` by category. `scrape_generic_tool` handles all new source types.
 
-All platform-specific logic is encapsulated in adaptors:
-
-```python
-from skill_seekers.cli.adaptors import get_adaptor
-
-# Get platform-specific adaptor
-adaptor = get_adaptor('gemini')  # or 'claude', 'openai', 'langchain', etc.
-
-# Package skill
-adaptor.package(skill_dir='output/react/', output_path='output/')
-
-# Upload to platform
-adaptor.upload(
-    package_path='output/react-gemini.tar.gz',
-    api_key=os.getenv('GOOGLE_API_KEY')
-)
-```
-
-Each adaptor inherits from `SkillAdaptor` base class and implements:
-- `format_skill_md()` - Format SKILL.md content
-- `package()` - Create platform-specific package
-- `upload()` - Upload to platform API
-- `validate_api_key()` - Validate API key format
-- `supports_enhancement()` - Whether AI enhancement is supported
-
-### CLI Architecture (Git-style)
-
-Entry point: `src/skill_seekers/cli/main.py`
-
-The CLI uses subcommands that delegate to existing modules:
-
-```bash
-# skill-seekers scrape --config react.json
-# Transforms to: doc_scraper.main() with modified sys.argv
-```
-
-**Available subcommands:**
-- `create` - Unified create command
-- `config` - Configuration wizard
-- `scrape` - Documentation scraping
-- `github` - GitHub repository scraping
-- `pdf` - PDF extraction
-- `word` - Word document extraction
-- `video` - Video extraction (YouTube or local). Use `--setup` to auto-detect GPU and install visual deps.
-- `unified` - Multi-source scraping
-- `analyze` / `codebase` - Local codebase analysis
-- `enhance` - AI enhancement
-- `package` - Package skill for target platform
-- `upload` - Upload to platform
-- `cloud` - Cloud storage operations
-- `sync` - Sync monitoring
-- `benchmark` - Performance benchmarking
-- `embed` - Embedding server
-- `install` / `install-agent` - Complete workflow
-- `stream` - Streaming ingestion
-- `update` - Incremental updates
-- `multilang` - Multi-language support
-- `quality` - Quality metrics
-- `resume` - Resume interrupted jobs
-- `estimate` - Estimate page counts
-- `workflows` - Workflow management
-
-### MCP Server Architecture
-
-Two implementations:
-- `server_fastmcp.py` - Modern, decorator-based (recommended, ~708 lines)
-- `server_legacy.py` - Legacy implementation
-
-Tools are organized by category:
-- Config tools (3 tools): generate_config, list_configs, validate_config
-- Scraping tools (10 tools): estimate_pages, scrape_docs, scrape_github, scrape_pdf, scrape_video (supports `setup` parameter for GPU detection and visual dep installation), scrape_codebase, detect_patterns, extract_test_examples, build_how_to_guides, extract_config_patterns
-- Packaging tools (4 tools): package_skill, upload_skill, enhance_skill, install_skill
-- Source tools (5 tools): fetch_config, submit_config, add_config_source, list_config_sources, remove_config_source
-- Splitting tools (2 tools): split_config, generate_router
-- Vector Database tools (4 tools): export_to_weaviate, export_to_chroma, export_to_faiss, export_to_qdrant
-- Workflow tools (5 tools): list_workflows, get_workflow, create_workflow, update_workflow, delete_workflow
-
-**Running MCP Server:**
-```bash
-# Stdio transport (default)
-python -m skill_seekers.mcp.server_fastmcp
-
-# HTTP transport
-python -m skill_seekers.mcp.server_fastmcp --http --port 8765
-```
-
-### Cloud Storage Architecture
-
-Abstract base class pattern for cloud providers:
-- `base_storage.py` - Defines `BaseStorageAdaptor` interface
-- `s3_storage.py` - AWS S3 implementation
-- `gcs_storage.py` - Google Cloud Storage implementation
-- `azure_storage.py` - Azure Blob Storage implementation
-
-### Sync Monitoring Architecture
-
-Pydantic-based models in `src/skill_seekers/sync/`:
-- `models.py` - Data models (SyncConfig, ChangeReport, SyncState)
-- `detector.py` - Change detection logic
-- `monitor.py` - Monitoring daemon
-- `notifier.py` - Notification system (webhook, email, slack)
-
----
+**CLI subcommands** — git-style in `cli/main.py`. Each delegates to a module's `main()` function.
 
 ## Git Workflow
 
-### Branch Structure
+- **`main`** — production, protected
+- **`development`** — default PR target, active dev
+- Feature branches created from `development`
 
-```
-main (production)
-  ↑
-  │ (only maintainer merges)
-  │
-development (integration) ← default branch for PRs
-  ↑
-  │ (all contributor PRs go here)
-  │
-feature branches
-```
-
-- **`main`** - Production, always stable, protected
-- **`development`** - Active development, default for PRs
-- **Feature branches** - Your work, created from `development`
-
-### Creating a Feature Branch
+## Pre-commit Checklist
 
 ```bash
-# 1. Checkout development
-git checkout development
-git pull upstream development
-
-# 2. Create feature branch
-git checkout -b my-feature
-
-# 3. Make changes, commit, push
-git add .
-git commit -m "Add my feature"
-git push origin my-feature
-
-# 4. Create PR targeting 'development' branch
-```
-
----
-
-## CI/CD Configuration
-
-### GitHub Actions Workflows
-
-All workflows are in `.github/workflows/`:
-
-**`tests.yml`:**
-- Runs on: push/PR to `main` and `development`
-- Lint job: Ruff + MyPy
-- Test matrix: Ubuntu + macOS, Python 3.10-3.12
-- Coverage: Uploads to Codecov
-
-**`release.yml`:**
-- Triggered on version tags (`v*`)
-- Builds and publishes to PyPI using `uv`
-- Creates GitHub release with changelog
-
-**`docker-publish.yml`:**
-- Builds and publishes Docker images
-- Multi-architecture support (linux/amd64, linux/arm64)
-
-**`vector-db-export.yml`:**
-- Tests vector database exports
-
-**`scheduled-updates.yml`:**
-- Scheduled sync monitoring
-
-**`quality-metrics.yml`:**
-- Quality metrics tracking
-
-**`test-vector-dbs.yml`:**
-- Vector database integration tests
-
-### Pre-commit Checks (Manual)
-
-```bash
-# Before committing, run:
 ruff check src/ tests/
 ruff format --check src/ tests/
-pytest tests/ -v -x  # Stop on first failure
+pytest tests/ -v -x   # stop on first failure
 ```
 
----
+Never commit API keys. Use env vars: `ANTHROPIC_API_KEY`, `GOOGLE_API_KEY`, `OPENAI_API_KEY`, `GITHUB_TOKEN`.
 
-## Security Considerations
+## CI
 
-### API Keys and Secrets
-
-1. **Never commit API keys** to the repository
-2. **Use environment variables:**
-   - `ANTHROPIC_API_KEY` - Claude AI
-   - `GOOGLE_API_KEY` - Google Gemini
-   - `OPENAI_API_KEY` - OpenAI
-   - `GITHUB_TOKEN` - GitHub API
-   - `AWS_ACCESS_KEY_ID` / `AWS_SECRET_ACCESS_KEY` - AWS S3
-   - `GOOGLE_APPLICATION_CREDENTIALS` - GCS
-   - `AZURE_STORAGE_CONNECTION_STRING` - Azure
-3. **Configuration storage:**
-   - Stored at `~/.config/skill-seekers/config.json`
-   - Permissions: 600 (owner read/write only)
-
-### Rate Limit Handling
-
-- GitHub API has rate limits (5000 requests/hour for authenticated)
-- The tool has built-in rate limit handling with retry logic
-- Use `--non-interactive` flag for CI/CD environments
-
-### Custom API Endpoints
-
-Support for Claude-compatible APIs:
-
-```bash
-export ANTHROPIC_API_KEY=your-custom-api-key
-export ANTHROPIC_BASE_URL=https://custom-endpoint.com/v1
-```
-
----
-
-## Common Development Tasks
-
-### Adding a New CLI Command
-
-1. Create module in `src/skill_seekers/cli/my_command.py`
-2. Implement `main()` function with argument parsing
-3. Add entry point in `pyproject.toml`:
-   ```toml
-   [project.scripts]
-   skill-seekers-my-command = "skill_seekers.cli.my_command:main"
-   ```
-4. Add subcommand handler in `src/skill_seekers/cli/main.py`
-5. Add argument parser in `src/skill_seekers/cli/parsers/`
-6. Add tests in `tests/test_my_command.py`
-
-### Adding a New Platform Adaptor
-
-1. Create `src/skill_seekers/cli/adaptors/my_platform.py`
-2. Inherit from `SkillAdaptor` base class
-3. Implement required methods: `package()`, `upload()`, `format_skill_md()`
-4. Register in `src/skill_seekers/cli/adaptors/__init__.py`
-5. Add optional dependencies in `pyproject.toml`
-6. Add tests in `tests/test_adaptors/`
-
-### Adding an MCP Tool
-
-1. Implement tool logic in `src/skill_seekers/mcp/tools/category_tools.py`
-2. Register in `src/skill_seekers/mcp/server_fastmcp.py`
-3. Add test in `tests/test_mcp_fastmcp.py`
-
-### Adding Cloud Storage Provider
-
-1. Create module in `src/skill_seekers/cli/storage/my_storage.py`
-2. Inherit from `BaseStorageAdaptor` base class
-3. Implement required methods: `upload_file()`, `download_file()`, `list_files()`, `delete_file()`
-4. Register in `src/skill_seekers/cli/storage/__init__.py`
-5. Add optional dependencies in `pyproject.toml`
-
----
-
-## Documentation
-
-### Project Documentation (New Structure - v3.1.0+)
-
-**Entry Points:**
-- **README.md** - Main project documentation with navigation
-- **docs/README.md** - Documentation hub
-- **AGENTS.md** - This file, for AI coding agents
-
-**Getting Started (for new users):**
-- `docs/getting-started/01-installation.md` - Installation guide
-- `docs/getting-started/02-quick-start.md` - 3 commands to first skill
-- `docs/getting-started/03-your-first-skill.md` - Complete walkthrough
-- `docs/getting-started/04-next-steps.md` - Where to go from here
-
-**User Guides (common tasks):**
-- `docs/user-guide/01-core-concepts.md` - How Skill Seekers works
-- `docs/user-guide/02-scraping.md` - All scraping options
-- `docs/user-guide/03-enhancement.md` - AI enhancement explained
-- `docs/user-guide/04-packaging.md` - Export to platforms
-- `docs/user-guide/05-workflows.md` - Enhancement workflows
-- `docs/user-guide/06-troubleshooting.md` - Common issues
-
-**Reference (technical details):**
-- `docs/reference/CLI_REFERENCE.md` - Complete command reference (20 commands)
-- `docs/reference/MCP_REFERENCE.md` - MCP tools reference (33 tools)
-- `docs/reference/CONFIG_FORMAT.md` - JSON configuration specification
-- `docs/reference/ENVIRONMENT_VARIABLES.md` - All environment variables
-
-**Advanced (power user topics):**
-- `docs/advanced/mcp-server.md` - MCP server setup
-- `docs/advanced/mcp-tools.md` - Advanced MCP usage
-- `docs/advanced/custom-workflows.md` - Creating custom workflows
-- `docs/advanced/multi-source.md` - Multi-source scraping
-
-### Configuration Documentation
-
-Preset configs are in `configs/` directory:
-- `godot.json` / `godot_unified.json` - Godot Engine
-- `blender.json` / `blender-unified.json` - Blender Engine
-- `claude-code.json` - Claude Code
-- `httpx_comprehensive.json` - HTTPX library
-- `medusa-mercurjs.json` - Medusa/MercurJS
-- `astrovalley_unified.json` - Astrovalley
-- `react.json` - React documentation
-- `configs/integrations/` - Integration-specific configs
-
----
-
-## Key Dependencies
-
-### Core Dependencies (Required)
-
-| Package | Version | Purpose |
-|---------|---------|---------|
-| `requests` | >=2.32.5 | HTTP requests |
-| `beautifulsoup4` | >=4.14.2 | HTML parsing |
-| `PyGithub` | >=2.5.0 | GitHub API |
-| `GitPython` | >=3.1.40 | Git operations |
-| `httpx` | >=0.28.1 | Async HTTP |
-| `anthropic` | >=0.76.0 | Claude AI API |
-| `PyMuPDF` | >=1.24.14 | PDF processing |
-| `Pillow` | >=11.0.0 | Image processing |
-| `pytesseract` | >=0.3.13 | OCR |
-| `pydantic` | >=2.12.3 | Data validation |
-| `pydantic-settings` | >=2.11.0 | Settings management |
-| `click` | >=8.3.0 | CLI framework |
-| `Pygments` | >=2.19.2 | Syntax highlighting |
-| `pathspec` | >=0.12.1 | Path matching |
-| `networkx` | >=3.0 | Graph operations |
-| `schedule` | >=1.2.0 | Scheduled tasks |
-| `python-dotenv` | >=1.1.1 | Environment variables |
-| `jsonschema` | >=4.25.1 | JSON validation |
-| `PyYAML` | >=6.0 | YAML parsing |
-| `langchain` | >=1.2.10 | LangChain integration |
-| `llama-index` | >=0.14.15 | LlamaIndex integration |
-
-### Optional Dependencies
-
-| Feature | Package | Install Command |
-|---------|---------|-----------------|
-| MCP Server | `mcp>=1.25,<2` | `pip install -e ".[mcp]"` |
-| Google Gemini | `google-generativeai>=0.8.0` | `pip install -e ".[gemini]"` |
-| OpenAI | `openai>=1.0.0` | `pip install -e ".[openai]"` |
-| AWS S3 | `boto3>=1.34.0` | `pip install -e ".[s3]"` |
-| Google Cloud Storage | `google-cloud-storage>=2.10.0` | `pip install -e ".[gcs]"` |
-| Azure Blob Storage | `azure-storage-blob>=12.19.0` | `pip install -e ".[azure]"` |
-| Word Documents | `mammoth>=1.6.0`, `python-docx>=1.1.0` | `pip install -e ".[docx]"` |
-| Video (lightweight) | `yt-dlp>=2024.12.0`, `youtube-transcript-api>=1.2.0` | `pip install -e ".[video]"` |
-| Video (full) | +`faster-whisper`, `scenedetect`, `opencv-python-headless` (`easyocr` now installed via `--setup`) | `pip install -e ".[video-full]"` |
-| Video (GPU setup) | Auto-detects GPU, installs PyTorch + easyocr + all visual deps | `skill-seekers video --setup` |
-| Chroma DB | `chromadb>=0.4.0` | `pip install -e ".[chroma]"` |
-| Weaviate | `weaviate-client>=3.25.0` | `pip install -e ".[weaviate]"` |
-| Pinecone | `pinecone>=5.0.0` | `pip install -e ".[pinecone]"` |
-| Embedding Server | `fastapi>=0.109.0`, `uvicorn>=0.27.0`, `sentence-transformers>=2.3.0` | `pip install -e ".[embedding]"` |
-
-### Dev Dependencies (in dependency-groups)
-
-| Package | Version | Purpose |
-|---------|---------|---------|
-| `pytest` | >=8.4.2 | Testing framework |
-| `pytest-asyncio` | >=0.24.0 | Async test support |
-| `pytest-cov` | >=7.0.0 | Coverage |
-| `coverage` | >=7.11.0 | Coverage reporting |
-| `ruff` | >=0.14.13 | Linting/formatting |
-| `mypy` | >=1.19.1 | Type checking |
-| `psutil` | >=5.9.0 | Process utilities for testing |
-| `numpy` | >=1.24.0 | Numerical operations |
-| `starlette` | >=0.31.0 | HTTP transport testing |
-| `httpx` | >=0.24.0 | HTTP client for testing |
-| `boto3` | >=1.26.0 | AWS S3 testing |
-| `google-cloud-storage` | >=2.10.0 | GCS testing |
-| `azure-storage-blob` | >=12.17.0 | Azure testing |
-
----
-
-## Troubleshooting
-
-### Common Issues
-
-**ImportError: No module named 'skill_seekers'**
-- Solution: Run `pip install -e .`
-
-**Tests failing with "package not installed"**
-- Solution: Ensure you ran `pip install -e .` in the correct virtual environment
-
-**MCP server import errors**
-- Solution: Install with `pip install -e ".[mcp]"`
-
-**Type checking failures**
-- MyPy is configured to be lenient (gradual typing)
-- Focus on critical paths, not full coverage
-
-**Docker build failures**
-- Ensure you have BuildKit enabled: `DOCKER_BUILDKIT=1`
-- Check that all submodules are initialized: `git submodule update --init`
-
-**Rate limit errors from GitHub**
-- Set `GITHUB_TOKEN` environment variable for authenticated requests
-- Improves rate limit from 60 to 5000 requests/hour
-
-### Getting Help
-
-- Check **TROUBLESHOOTING.md** for detailed solutions
-- Review **docs/FAQ.md** for common questions
-- Visit https://skillseekersweb.com/ for documentation
-- Open an issue on GitHub with:
-  - Clear title and description
-  - Steps to reproduce
-  - Expected vs actual behavior
-  - Environment details (OS, Python version)
-  - Error messages and stack traces
-
----
-
-## Environment Variables Reference
-
-| Variable | Purpose | Required For |
-|----------|---------|--------------|
-| `ANTHROPIC_API_KEY` | Claude AI API access | Claude enhancement/upload |
-| `GOOGLE_API_KEY` | Google Gemini API access | Gemini enhancement/upload |
-| `OPENAI_API_KEY` | OpenAI API access | OpenAI enhancement/upload |
-| `GITHUB_TOKEN` | GitHub API authentication | GitHub scraping (recommended) |
-| `AWS_ACCESS_KEY_ID` | AWS S3 authentication | S3 cloud storage |
-| `AWS_SECRET_ACCESS_KEY` | AWS S3 authentication | S3 cloud storage |
-| `GOOGLE_APPLICATION_CREDENTIALS` | GCS authentication path | GCS cloud storage |
-| `AZURE_STORAGE_CONNECTION_STRING` | Azure Blob authentication | Azure cloud storage |
-| `ANTHROPIC_BASE_URL` | Custom Claude endpoint | Custom API endpoints |
-| `SKILL_SEEKERS_HOME` | Data directory path | Docker/runtime |
-| `SKILL_SEEKERS_OUTPUT` | Output directory path | Docker/runtime |
-
----
-
-## Version Management
-
-The version is defined in `pyproject.toml` and dynamically read by `src/skill_seekers/_version.py`:
-
-```python
-# _version.py reads from pyproject.toml
-__version__ = get_version()  # Returns version from pyproject.toml
-```
-
-**To update version:**
-1. Edit `version` in `pyproject.toml`
-2. The `_version.py` file will automatically pick up the new version
-
----
-
-## Configuration File Format
-
-Skill Seekers uses JSON configuration files to define scraping targets. Example structure:
-
-```json
-{
-  "name": "godot",
-  "description": "Godot Engine documentation",
-  "merge_mode": "claude-enhanced",
-  "sources": [
-    {
-      "type": "documentation",
-      "base_url": "https://docs.godotengine.org/en/stable/",
-      "extract_api": true,
-      "selectors": {
-        "main_content": "div[role='main']",
-        "title": "title",
-        "code_blocks": "pre"
-      },
-      "url_patterns": {
-        "include": [],
-        "exclude": ["/search.html", "/_static/"]
-      },
-      "categories": {
-        "getting_started": ["introduction", "getting_started"],
-        "scripting": ["scripting", "gdscript"]
-      },
-      "rate_limit": 0.5,
-      "max_pages": 500
-    },
-    {
-      "type": "github",
-      "repo": "godotengine/godot",
-      "enable_codebase_analysis": true,
-      "code_analysis_depth": "deep",
-      "fetch_issues": true,
-      "max_issues": 100
-    }
-  ]
-}
-```
-
----
-
-## Workflow Presets
-
-Skill Seekers includes 66 YAML workflow presets for AI enhancement in `src/skill_seekers/workflows/`:
-
-**Built-in presets:**
-- `default.yaml` - Standard enhancement workflow
-- `minimal.yaml` - Fast, minimal enhancement
-- `security-focus.yaml` - Security-focused review
-- `architecture-comprehensive.yaml` - Deep architecture analysis
-- `api-documentation.yaml` - API documentation focus
-- And 61 more specialized presets...
-
-**Usage:**
-```bash
-# Apply a preset
-skill-seekers create ./my-project --enhance-workflow security-focus
-
-# Chain multiple presets
-skill-seekers create ./my-project --enhance-workflow security-focus --enhance-workflow minimal
-
-# Manage presets
-skill-seekers workflows list
-skill-seekers workflows show security-focus
-skill-seekers workflows copy security-focus
-```
-
----
-
-*This document is maintained for AI coding agents. For human contributors, see README.md and CONTRIBUTING.md.*
-
-*Last updated: 2026-03-01*
+GitHub Actions (`.github/workflows/tests.yml`): ruff + mypy lint job, then pytest matrix (Ubuntu + macOS, Python 3.10-3.12) with Codecov upload.
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 220d09e..407d485 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,77 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
 
 ### Added
+
+#### 10 New Skill Source Types (17 total)
+
+Skill Seekers now supports 17 source types — up from 7. Every new type is fully integrated into the CLI (`skill-seekers <type>`), `create` command auto-detection, unified multi-source configs, config validation, the MCP server, and the skill builder.
+
+- **Jupyter Notebook** — `skill-seekers jupyter --notebook file.ipynb` or `skill-seekers create file.ipynb`
+  - Extracts markdown cells, code cells with outputs, kernel metadata, imports, and language detection
+  - Handles single files and directories of notebooks; filters `.ipynb_checkpoints`
+  - Optional dependency: `pip install "skill-seekers[jupyter]"` (nbformat)
+  - Entry point: `skill-seekers-jupyter`
+
+- **Local HTML** — `skill-seekers html --html-path file.html` or `skill-seekers create file.html`
+  - Parses HTML using BeautifulSoup with smart main content detection (`<article>`, `<main>`, `.content`, largest div)
+  - Extracts headings, code blocks, tables (to markdown), images, links; converts inline HTML to markdown
+  - Handles single files and directories; supports `.html`, `.htm`, `.xhtml` extensions
+  - No extra dependencies (BeautifulSoup is a core dep)
+
+- **OpenAPI/Swagger** — `skill-seekers openapi --spec spec.yaml` or `skill-seekers create spec.yaml`
+  - Parses OpenAPI 3.0/3.1 and Swagger 2.0 specs from YAML or JSON (local files or URLs via `--spec-url`)
+  - Extracts endpoints, parameters, request/response schemas, security schemes, tags
+  - Resolves `$ref` references with circular reference protection; handles `allOf`/`oneOf`/`anyOf`
+  - Groups endpoints by tags; generates comprehensive API reference markdown
+  - Source detection sniffs YAML file content for `openapi:` or `swagger:` keys (avoids false positives on non-API YAML files)
+  - No extra install needed: PyYAML is already a core dependency (an import guard was added for safety); pyproject.toml defines no `[openapi]` extra
+
+- **AsciiDoc** — `skill-seekers asciidoc --asciidoc-path file.adoc` or `skill-seekers create file.adoc`
+  - Regex-based parser (no external library required) with optional `asciidoc` library support
+  - Extracts headings (= through =====), `[source,lang]` code blocks, `|===` tables, admonitions (NOTE/TIP/WARNING/IMPORTANT/CAUTION), and `include::` directives
+  - Converts AsciiDoc formatting to markdown; handles single files and directories
+  - Optional dependency: `pip install "skill-seekers[asciidoc]"` (asciidoc library for advanced rendering)
+
+- **PowerPoint (.pptx)** — `skill-seekers pptx --pptx file.pptx` or `skill-seekers create file.pptx`
+  - Extracts slide text, speaker notes, tables, images (with alt text), and grouped shapes
+  - Detects code blocks by monospace font analysis (30+ font families)
+  - Groups slides into sections by layout type; handles single files and directories
+  - Optional dependency: `pip install "skill-seekers[pptx]"` (python-pptx)
+
+- **RSS/Atom Feeds** — `skill-seekers rss --feed-url <url>` / `--feed-path file.rss` or `skill-seekers create feed.rss`
+  - Parses RSS 2.0, RSS 1.0, and Atom feeds via feedparser
+  - Optionally follows article links (`--follow-links`, default on) to scrape full page content using BeautifulSoup
+  - Extracts article titles, summaries, authors, dates, categories; configurable `--max-articles` (default 50)
+  - Source detection matches `.rss` and `.atom` extensions (`.xml` excluded to avoid false positives)
+  - Optional dependency: `pip install "skill-seekers[rss]"` (feedparser)
+
+- **Man Pages** — `skill-seekers manpage --man-names git,curl` / `--man-path dir/` or `skill-seekers create git.1`
+  - Extracts man pages by running `man` command via subprocess or reading `.1`–`.8`/`.man` files directly
+  - Handles gzip/bzip2/xz compressed man files; strips troff/groff formatting (backspace overstriking, macros, font escapes)
+  - Parses structured sections (NAME, SYNOPSIS, DESCRIPTION, OPTIONS, EXAMPLES, SEE ALSO)
+  - Source detection uses basename heuristic to avoid false positives on log rotation files (e.g., `access.log.1`)
+  - No external dependencies (stdlib only)
+
+- **Confluence** — `skill-seekers confluence --base-url <url> --space-key <key>` or `--export-path dir/`
+  - API mode: fetches pages from Confluence REST API with pagination (`atlassian-python-api`)
+  - Export mode: parses Confluence HTML/XML export directories
+  - Extracts page content, code/panel/info/warning macros, page hierarchy, tables
+  - Optional dependency: `pip install "skill-seekers[confluence]"` (atlassian-python-api)
+
+- **Notion** — `skill-seekers notion --database-id <id>` / `--page-id <id>` or `--export-path dir/`
+  - API mode: fetches pages via Notion API with support for 20+ block types (paragraph, heading, code, callout, toggle, table, etc.)
+  - Export mode: parses Notion Markdown/CSV export directories
+  - Extracts rich text with annotations (bold, italic, code, links), 16+ property types for database entries
+  - Optional dependency: `pip install "skill-seekers[notion]"` (notion-client)
+
+- **Slack/Discord Chat** — `skill-seekers chat --export-path dir/` or `--token <token> --channel <channel>`
+  - Slack: parses workspace JSON exports or fetches via Slack Web API (`slack_sdk`)
+  - Discord: parses DiscordChatExporter JSON or fetches via Discord HTTP API
+  - Extracts messages, code snippets (fenced blocks), shared URLs, threads, reactions, attachments
+  - Generates per-channel summaries and topic categorization
+  - Optional dependency: `pip install "skill-seekers[chat]"` (slack-sdk)
+
+#### EPUB Unified Pipeline Integration
 - **EPUB (.epub) input support** via `skill-seekers create book.epub` or `skill-seekers epub --epub book.epub`
   - Extracts chapters, metadata (Dublin Core), code blocks, images, and tables from EPUB 2 and EPUB 3 files
   - DRM detection with clear error messages (Adobe ADEPT, Apple FairPlay, Readium LCP)
@@ -16,6 +87,61 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   - `--help-epub` flag for EPUB-specific help
   - Optional dependency: `pip install "skill-seekers[epub]"` (ebooklib)
   - 107 tests across 14 test classes
+- **EPUB added to unified scraper** — `_scrape_epub()` method, `scraped_data["epub"]`, config validation (`_validate_epub_source`), and dry-run display. Previously EPUB worked standalone but was missing from multi-source configs.
+
+#### Unified Skill Builder — Generic Merge System
+- **`_generic_merge()`** — Priority-based section merge for any combination of source types not covered by existing pairwise synthesis (docs+github, docs+pdf, etc.). Produces YAML frontmatter + source-attributed sections.
+- **`_append_extra_sources()`** — Appends additional source type content (e.g., Jupyter + PPTX) to pairwise-synthesized SKILL.md.
+- **`_generate_generic_references()`** — Generates `references/<type>/index.md` for any source type, with ID resolution fallback chain.
+- **`_SOURCE_LABELS`** dict — Human-readable labels for all 17 source types used in merge attribution.
+
+#### Config Validator Expansion
+- **17 source types in `VALID_SOURCE_TYPES`** — All new types plus `word` and `video` now have per-type validation methods.
+- **`_validate_word_source()`** — Validates `path` field for Word documents (was previously missing).
+- **`_validate_video_source()`** — Validates `url`, `path`, or `playlist` field for video sources (was previously missing).
+- **11 new `_validate_*_source()` methods** — One for each of the 10 new source types plus EPUB (`_validate_epub_source`), each with appropriate required-field checks.
+
+#### Source Detection Improvements
+- **7 new file extension detections** in `SourceDetector.detect()` — `.ipynb`, `.html`/`.htm`, `.pptx`, `.adoc`/`.asciidoc`, `.rss`/`.atom`, `.1`–`.8`/`.man`, `.yaml`/`.yml` (with content sniffing)
+- **`_looks_like_openapi()`** — Content sniffing for YAML files: only classifies as OpenAPI if the file contains `openapi:` or `swagger:` key in first 20 lines (prevents false positives on docker-compose, Ansible, Kubernetes manifests, etc.)
+- **Man page basename heuristic** — `.1`–`.8` extensions only detected as man pages if the basename has no dots (e.g., `git.1` matches but `access.log.1` does not)
+- **`.xml` excluded from RSS detection** — Too generic; only `.rss` and `.atom` trigger RSS detection
+
+#### MCP Server Integration
+- **`scrape_generic` tool** — New MCP tool handles all 10 new source types via subprocess with per-type flag mapping
+- **`_PATH_FLAGS` / `_URL_FLAGS` dicts** — Correct flag routing for each source type (e.g., jupyter→`--notebook`, html→`--html-path`, rss→`--feed-url`)
+- **`GENERIC_SOURCE_TYPES` tuple** — Lists all 10 new types for validation
+- **Config validation display** — `validate_config` tool now shows source details for all new types
+- **Tool count updated** — 33 → 34 tools (scraping tools 10 → 11)
+
+#### CLI Wiring
+- **10 new CLI subcommands** — `jupyter`, `html`, `openapi`, `asciidoc`, `pptx`, `rss`, `manpage`, `confluence`, `notion`, `chat` in `COMMAND_MODULES`
+- **10 new argument modules** — `arguments/{jupyter,html,openapi,asciidoc,pptx,rss,manpage,confluence,notion,chat}.py` with per-type `*_ARGUMENTS` dicts
+- **10 new parser modules** — `parsers/{jupyter,html,openapi,asciidoc,pptx,rss,manpage,confluence,notion,chat}_parser.py` with `SubcommandParser` implementations
+- **`create` command routing** — `_route_generic()` method for all new types with correct module names and CLI flags
+- **10 new entry points** in pyproject.toml — `skill-seekers-{jupyter,html,openapi,asciidoc,pptx,rss,manpage,confluence,notion,chat}`
+- **7 new optional dependency groups** in pyproject.toml — `[jupyter]`, `[asciidoc]`, `[pptx]`, `[confluence]`, `[notion]`, `[rss]`, `[chat]`
+- **`[all]` group updated** — Includes all 7 new optional dependencies
+
+#### Workflow & Documentation
+- **`complex-merge.yaml`** — New 7-stage AI-powered workflow for complex multi-source merging (source inventory → cross-reference → conflict detection → priority merge → gap analysis → synthesis → quality check)
+- **AGENTS.md rewritten** — Updated with all 17 source types, scraper pattern docs, project layout, and key pattern documentation
+- **77 new integration tests** in `test_new_source_types.py` — Source detection, config validation, generic merge, CLI wiring, and create command routing
+
+### Fixed
+- **Config validator missing `word` and `video` dispatch** — `_validate_source()` had no `elif` branches for `word` or `video` types, silently skipping validation. Added dispatch entries and `_validate_word_source()` / `_validate_video_source()` methods.
+- **`openapi_scraper.py` unconditional `import yaml`** — Would crash at import time if pyyaml not installed. Added `try/except ImportError` guard with `YAML_AVAILABLE` flag and `_check_yaml_deps()` helper.
+- **`asciidoc_scraper.py` missing standard arguments** — `main()` manually defined args instead of using `add_asciidoc_arguments()`. Refactored to use shared argument definitions + added enhancement workflow integration.
+- **`pptx_scraper.py` missing standard arguments** — Same issue. Refactored to use `add_pptx_arguments()`.
+- **`chat_scraper.py` missing standard arguments** — Same issue. Refactored to use `add_chat_arguments()`.
+- **`notion_scraper.py` missing `run_workflows` call** — `--enhance-workflow` flags were silently ignored. Added workflow runner integration.
+- **`openapi_scraper.py` return type `None`** — `main()` returned `None` instead of `int`. Fixed to `return 0` on success, matching all other scrapers.
+- **MCP `scrape_generic_tool` flag mismatch** — Was passing `--path`/`--url` as generic flags, but every scraper expects its own flag name (e.g., `--notebook`, `--html-path`, `--spec`). All 10 source types would have failed at runtime. Fixed with per-type `_PATH_FLAGS` and `_URL_FLAGS` mappings.
+- **Word scraper `docx_id` key mismatch** — Unified scraper data dict used `docx_id` but generic reference generation looked for `word_id`. Added `word_id` alias.
+- **`main.py` docstring stale** — Missing all 10 new commands. Updated to list all 27 commands.
+- **`source_detector.py` module docstring stale** — Described only 5 source types. Updated to describe 14+ detected types.
+- **`manpage_parser.py` docstring referenced wrong file** — Said `manpage_scraper.py` but actual file is `man_scraper.py`. Fixed.
+- **Parser registry test count** — Updated expected count from 25 to 35 for 10 new parsers.
 
 ## [3.2.0] - 2026-03-01
 
diff --git a/pyproject.toml b/pyproject.toml
index 5b10fed..962392b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -168,6 +168,35 @@ all-cloud = [
     "azure-storage-blob>=12.19.0",
 ]
 
+# New source type dependencies (v3.2.0+)
+jupyter = [
+    "nbformat>=5.9.0",
+]
+
+asciidoc = [
+    "asciidoc>=10.0.0",
+]
+
+pptx = [
+    "python-pptx>=0.6.21",
+]
+
+confluence = [
+    "atlassian-python-api>=3.41.0",
+]
+
+notion = [
+    "notion-client>=2.0.0",
+]
+
+rss = [
+    "feedparser>=6.0.0",
+]
+
+chat = [
+    "slack-sdk>=3.27.0",
+]
+
 # Embedding server support
 embedding = [
     "fastapi>=0.109.0",
@@ -204,6 +233,14 @@ all = [
     "sentence-transformers>=2.3.0",
     "numpy>=1.24.0",
     "voyageai>=0.2.0",
+    # New source types (v3.2.0+)
+    "nbformat>=5.9.0",
+    "asciidoc>=10.0.0",
+    "python-pptx>=0.6.21",
+    "atlassian-python-api>=3.41.0",
+    "notion-client>=2.0.0",
+    "feedparser>=6.0.0",
+    "slack-sdk>=3.27.0",
 ]
 
 [project.urls]
@@ -253,6 +290,18 @@ skill-seekers-quality = "skill_seekers.cli.quality_metrics:main"
 skill-seekers-workflows = "skill_seekers.cli.workflows_command:main"
 skill-seekers-sync-config = "skill_seekers.cli.sync_config:main"
 
+# New source type entry points (v3.2.0+)
+skill-seekers-jupyter = "skill_seekers.cli.jupyter_scraper:main"
+skill-seekers-html = "skill_seekers.cli.html_scraper:main"
+skill-seekers-openapi = "skill_seekers.cli.openapi_scraper:main"
+skill-seekers-asciidoc = "skill_seekers.cli.asciidoc_scraper:main"
+skill-seekers-pptx = "skill_seekers.cli.pptx_scraper:main"
+skill-seekers-rss = "skill_seekers.cli.rss_scraper:main"
+skill-seekers-manpage = "skill_seekers.cli.man_scraper:main"
+skill-seekers-confluence = "skill_seekers.cli.confluence_scraper:main"
+skill-seekers-notion = "skill_seekers.cli.notion_scraper:main"
+skill-seekers-chat = "skill_seekers.cli.chat_scraper:main"
+
 [tool.setuptools]
 package-dir = {"" = "src"}
 
diff --git a/src/skill_seekers/cli/arguments/asciidoc.py b/src/skill_seekers/cli/arguments/asciidoc.py
new file mode 100644
index 0000000..2ea6e30
--- /dev/null
+++ b/src/skill_seekers/cli/arguments/asciidoc.py
@@ -0,0 +1,68 @@
+"""AsciiDoc command argument definitions.
+
+This module defines ALL arguments for the asciidoc command in ONE place.
+Both asciidoc_scraper.py (standalone) and parsers/asciidoc_parser.py (unified CLI)
+import and use these definitions.
+
+Shared arguments (name, description, output, enhance-level, api-key,
+dry-run, verbose, quiet, workflow args) come from common.py / workflow.py
+via ``add_all_standard_arguments()``.
+"""
+
+import argparse
+from typing import Any
+
+from .common import add_all_standard_arguments
+
+# AsciiDoc-specific argument definitions as data structure
+# NOTE: Shared args (name, description, output, enhance_level, api_key, dry_run,
+#       verbose, quiet, workflow args) are registered by add_all_standard_arguments().
+ASCIIDOC_ARGUMENTS: dict[str, dict[str, Any]] = {
+    "asciidoc_path": {
+        "flags": ("--asciidoc-path",),
+        "kwargs": {
+            "type": str,
+            "help": "Path to AsciiDoc file or directory containing .adoc files",
+            "metavar": "PATH",
+        },
+    },
+    "from_json": {
+        "flags": ("--from-json",),
+        "kwargs": {
+            "type": str,
+            "help": "Build skill from extracted JSON",
+            "metavar": "FILE",
+        },
+    },
+}
+
+
+def add_asciidoc_arguments(parser: argparse.ArgumentParser) -> None:
+    """Add all asciidoc command arguments to a parser.
+
+    Registers shared args (name, description, output, enhance-level, api-key,
+    dry-run, verbose, quiet, workflow args) via add_all_standard_arguments(),
+    then adds AsciiDoc-specific args on top.
+
+    The default for --enhance-level is overridden to 0 (disabled) for AsciiDoc.
+    """
+    # Shared universal args first
+    add_all_standard_arguments(parser)
+
+    # Override enhance-level default to 0 for AsciiDoc
+    for action in parser._actions:
+        if hasattr(action, "dest") and action.dest == "enhance_level":
+            action.default = 0
+            action.help = (
+                "AI enhancement level (auto-detects API vs LOCAL mode): "
+                "0=disabled (default for AsciiDoc), 1=SKILL.md only, "
+                "2=+architecture/config, 3=full enhancement. "
+                "Mode selection: uses API if ANTHROPIC_API_KEY is set, "
+                "otherwise LOCAL (Claude Code)"
+            )
+
+    # AsciiDoc-specific args
+    for arg_name, arg_def in ASCIIDOC_ARGUMENTS.items():
+        flags = arg_def["flags"]
+        kwargs = arg_def["kwargs"]
+        parser.add_argument(*flags, **kwargs)
diff --git a/src/skill_seekers/cli/arguments/chat.py b/src/skill_seekers/cli/arguments/chat.py
new file mode 100644
index 0000000..563f162
--- /dev/null
+++ b/src/skill_seekers/cli/arguments/chat.py
@@ -0,0 +1,102 @@
+"""Chat command argument definitions.
+
+This module defines ALL arguments for the chat command in ONE place.
+Both chat_scraper.py (standalone) and parsers/chat_parser.py (unified CLI)
+import and use these definitions.
+
+Shared arguments (name, description, output, enhance-level, api-key,
+dry-run, verbose, quiet, workflow args) come from common.py / workflow.py
+via ``add_all_standard_arguments()``.
+"""
+
+import argparse
+from typing import Any
+
+from .common import add_all_standard_arguments
+
+# Chat-specific argument definitions as data structure
+# NOTE: Shared args (name, description, output, enhance_level, api_key, dry_run,
+#       verbose, quiet, workflow args) are registered by add_all_standard_arguments().
+CHAT_ARGUMENTS: dict[str, dict[str, Any]] = {
+    "export_path": {
+        "flags": ("--export-path",),
+        "kwargs": {
+            "type": str,
+            "help": "Path to chat export directory or file",
+            "metavar": "PATH",
+        },
+    },
+    "platform": {
+        "flags": ("--platform",),
+        "kwargs": {
+            "type": str,
+            "choices": ["slack", "discord"],
+            "default": "slack",
+            "help": "Chat platform type (default: slack)",
+        },
+    },
+    "token": {
+        "flags": ("--token",),
+        "kwargs": {
+            "type": str,
+            "help": "API token for chat platform authentication",
+            "metavar": "TOKEN",
+        },
+    },
+    "channel": {
+        "flags": ("--channel",),
+        "kwargs": {
+            "type": str,
+            "help": "Channel name or ID to extract from",
+            "metavar": "CHANNEL",
+        },
+    },
+    "max_messages": {
+        "flags": ("--max-messages",),
+        "kwargs": {
+            "type": int,
+            "default": 10000,
+            "help": "Maximum number of messages to extract (default: 10000)",
+            "metavar": "N",
+        },
+    },
+    "from_json": {
+        "flags": ("--from-json",),
+        "kwargs": {
+            "type": str,
+            "help": "Build skill from extracted JSON",
+            "metavar": "FILE",
+        },
+    },
+}
+
+
+def add_chat_arguments(parser: argparse.ArgumentParser) -> None:
+    """Add all chat command arguments to a parser.
+
+    Registers shared args (name, description, output, enhance-level, api-key,
+    dry-run, verbose, quiet, workflow args) via add_all_standard_arguments(),
+    then adds Chat-specific args on top.
+
+    The default for --enhance-level is overridden to 0 (disabled) for Chat.
+    """
+    # Shared universal args first
+    add_all_standard_arguments(parser)
+
+    # Override enhance-level default to 0 for Chat
+    for action in parser._actions:
+        if hasattr(action, "dest") and action.dest == "enhance_level":
+            action.default = 0
+            action.help = (
+                "AI enhancement level (auto-detects API vs LOCAL mode): "
+                "0=disabled (default for Chat), 1=SKILL.md only, "
+                "2=+architecture/config, 3=full enhancement. "
+                "Mode selection: uses API if ANTHROPIC_API_KEY is set, "
+                "otherwise LOCAL (Claude Code)"
+            )
+
+    # Chat-specific args
+    for arg_name, arg_def in CHAT_ARGUMENTS.items():
+        flags = arg_def["flags"]
+        kwargs = arg_def["kwargs"]
+        parser.add_argument(*flags, **kwargs)
diff --git a/src/skill_seekers/cli/arguments/confluence.py b/src/skill_seekers/cli/arguments/confluence.py
new file mode 100644
index 0000000..f65673c
--- /dev/null
+++ b/src/skill_seekers/cli/arguments/confluence.py
@@ -0,0 +1,109 @@
+"""Confluence command argument definitions.
+
+This module defines ALL arguments for the confluence command in ONE place.
+Both confluence_scraper.py (standalone) and parsers/confluence_parser.py (unified CLI)
+import and use these definitions.
+
+Shared arguments (name, description, output, enhance-level, api-key,
+dry-run, verbose, quiet, workflow args) come from common.py / workflow.py
+via ``add_all_standard_arguments()``.
+"""
+
+import argparse
+from typing import Any
+
+from .common import add_all_standard_arguments
+
+# Confluence-specific argument definitions as data structure
+# NOTE: Shared args (name, description, output, enhance_level, api_key, dry_run,
+#       verbose, quiet, workflow args) are registered by add_all_standard_arguments().
+CONFLUENCE_ARGUMENTS: dict[str, dict[str, Any]] = {
+    "base_url": {
+        "flags": ("--base-url",),
+        "kwargs": {
+            "type": str,
+            "help": "Confluence instance base URL",
+            "metavar": "URL",
+        },
+    },
+    "space_key": {
+        "flags": ("--space-key",),
+        "kwargs": {
+            "type": str,
+            "help": "Confluence space key to extract from",
+            "metavar": "KEY",
+        },
+    },
+    "export_path": {
+        "flags": ("--export-path",),
+        "kwargs": {
+            "type": str,
+            "help": "Path to Confluence HTML/XML export directory",
+            "metavar": "PATH",
+        },
+    },
+    "username": {
+        "flags": ("--username",),
+        "kwargs": {
+            "type": str,
+            "help": "Confluence username for API authentication",
+            "metavar": "USER",
+        },
+    },
+    "token": {
+        "flags": ("--token",),
+        "kwargs": {
+            "type": str,
+            "help": "Confluence API token for authentication",
+            "metavar": "TOKEN",
+        },
+    },
+    "max_pages": {
+        "flags": ("--max-pages",),
+        "kwargs": {
+            "type": int,
+            "default": 500,
+            "help": "Maximum number of pages to extract (default: 500)",
+            "metavar": "N",
+        },
+    },
+    "from_json": {
+        "flags": ("--from-json",),
+        "kwargs": {
+            "type": str,
+            "help": "Build skill from extracted JSON",
+            "metavar": "FILE",
+        },
+    },
+}
+
+
+def add_confluence_arguments(parser: argparse.ArgumentParser) -> None:
+    """Add all confluence command arguments to a parser.
+
+    Registers shared args (name, description, output, enhance-level, api-key,
+    dry-run, verbose, quiet, workflow args) via add_all_standard_arguments(),
+    then adds Confluence-specific args on top.
+
+    The default for --enhance-level is overridden to 0 (disabled) for Confluence.
+    """
+    # Shared universal args first
+    add_all_standard_arguments(parser)
+
+    # Override enhance-level default to 0 for Confluence
+    for action in parser._actions:
+        if hasattr(action, "dest") and action.dest == "enhance_level":
+            action.default = 0
+            action.help = (
+                "AI enhancement level (auto-detects API vs LOCAL mode): "
+                "0=disabled (default for Confluence), 1=SKILL.md only, "
+                "2=+architecture/config, 3=full enhancement. "
+                "Mode selection: uses API if ANTHROPIC_API_KEY is set, "
+                "otherwise LOCAL (Claude Code)"
+            )
+
+    # Confluence-specific args
+    for arg_name, arg_def in CONFLUENCE_ARGUMENTS.items():
+        flags = arg_def["flags"]
+        kwargs = arg_def["kwargs"]
+        parser.add_argument(*flags, **kwargs)
diff --git a/src/skill_seekers/cli/arguments/create.py b/src/skill_seekers/cli/arguments/create.py
index 094590a..6fb153f 100644
--- a/src/skill_seekers/cli/arguments/create.py
+++ b/src/skill_seekers/cli/arguments/create.py
@@ -549,6 +549,121 @@ CONFIG_ARGUMENTS: dict[str, dict[str, Any]] = {
     # For unified config files, use `skill-seekers unified --fresh` directly.
 }
 
+# New source type arguments (v3.2.0+)
+# These are minimal dicts since most flags are handled by each scraper's own argument module.
+# The create command only needs the primary input flag for routing.
+
+JUPYTER_ARGUMENTS: dict[str, dict[str, Any]] = {
+    "notebook": {  # input flag only; arguments/jupyter.py also declares --from-json
+        "flags": ("--notebook",),
+        "kwargs": {"type": str, "help": "Jupyter Notebook file path (.ipynb)", "metavar": "PATH"},
+    },
+}
+
+HTML_ARGUMENTS: dict[str, dict[str, Any]] = {
+    "html_path": {  # input flag only; arguments/html.py also declares --from-json
+        "flags": ("--html-path",),
+        "kwargs": {"type": str, "help": "Local HTML file or directory path", "metavar": "PATH"},
+    },
+}
+
+OPENAPI_ARGUMENTS: dict[str, dict[str, Any]] = {
+    "spec": {  # spec given as a local file path; same flag name as arguments/openapi.py
+        "flags": ("--spec",),
+        "kwargs": {"type": str, "help": "OpenAPI/Swagger spec file path", "metavar": "PATH"},
+    },
+    "spec_url": {  # spec given as a URL string; argparse dest: spec_url
+        "flags": ("--spec-url",),
+        "kwargs": {"type": str, "help": "OpenAPI/Swagger spec URL", "metavar": "URL"},
+    },
+}
+
+ASCIIDOC_ARGUMENTS: dict[str, dict[str, Any]] = {
+    "asciidoc_path": {  # accepts a single file or a directory; argparse dest: asciidoc_path
+        "flags": ("--asciidoc-path",),
+        "kwargs": {"type": str, "help": "AsciiDoc file or directory path", "metavar": "PATH"},
+    },
+}
+
+PPTX_ARGUMENTS: dict[str, dict[str, Any]] = {
+    "pptx": {  # input flag only; arguments/pptx.py also declares --from-json
+        "flags": ("--pptx",),
+        "kwargs": {"type": str, "help": "PowerPoint file path (.pptx)", "metavar": "PATH"},
+    },
+}
+
+RSS_ARGUMENTS: dict[str, dict[str, Any]] = {
+    "feed_url": {  # feed given as a URL string; argparse dest: feed_url
+        "flags": ("--feed-url",),
+        "kwargs": {"type": str, "help": "RSS/Atom feed URL", "metavar": "URL"},
+    },
+    "feed_path": {  # feed given as a local file; link/limit flags live in arguments/rss.py
+        "flags": ("--feed-path",),
+        "kwargs": {"type": str, "help": "RSS/Atom feed file path", "metavar": "PATH"},
+    },
+}
+
+MANPAGE_ARGUMENTS: dict[str, dict[str, Any]] = {
+    "man_names": {  # parsed as one comma-separated string; not split into a list here
+        "flags": ("--man-names",),
+        "kwargs": {
+            "type": str,
+            "help": "Comma-separated man page names (e.g., 'git,curl')",
+            "metavar": "NAMES",
+        },
+    },
+    "man_path": {  # arguments/manpage.py also declares --sections and --from-json
+        "flags": ("--man-path",),
+        "kwargs": {"type": str, "help": "Directory of man page files", "metavar": "PATH"},
+    },
+}
+
+CONFLUENCE_ARGUMENTS: dict[str, dict[str, Any]] = {
+    "conf_base_url": {  # NOTE(review): confirm flag names match arguments/confluence.py
+        "flags": ("--conf-base-url",),
+        "kwargs": {"type": str, "help": "Confluence base URL", "metavar": "URL"},
+    },
+    "space_key": {  # argparse dest: space_key
+        "flags": ("--space-key",),
+        "kwargs": {"type": str, "help": "Confluence space key", "metavar": "KEY"},
+    },
+    "conf_export_path": {  # export given as a local directory path
+        "flags": ("--conf-export-path",),
+        "kwargs": {"type": str, "help": "Confluence export directory", "metavar": "PATH"},
+    },
+}
+
+NOTION_ARGUMENTS: dict[str, dict[str, Any]] = {
+    "database_id": {  # same flag name as arguments/notion.py
+        "flags": ("--database-id",),
+        "kwargs": {"type": str, "help": "Notion database ID", "metavar": "ID"},
+    },
+    "page_id": {  # same flag name as arguments/notion.py
+        "flags": ("--page-id",),
+        "kwargs": {"type": str, "help": "Notion page ID", "metavar": "ID"},
+    },
+    "notion_export_path": {  # NOTE(review): arguments/notion.py names this --export-path
+        "flags": ("--notion-export-path",),
+        "kwargs": {"type": str, "help": "Notion export directory", "metavar": "PATH"},
+    },
+}
+
+CHAT_ARGUMENTS: dict[str, dict[str, Any]] = {
+    "chat_export_path": {  # export given as a local directory path
+        "flags": ("--chat-export-path",),
+        "kwargs": {"type": str, "help": "Slack/Discord export directory", "metavar": "PATH"},
+    },
+    "platform": {  # argparse rejects anything outside "choices"; omitted flag yields "slack"
+        "flags": ("--platform",),
+        "kwargs": {
+            "type": str,
+            "choices": ["slack", "discord"],
+            "default": "slack",
+            "help": "Chat platform (default: slack)",
+        },
+    },
+}
+
 # =============================================================================
 # TIER 3: ADVANCED/RARE ARGUMENTS
 # =============================================================================
@@ -613,6 +728,17 @@ def get_source_specific_arguments(source_type: str) -> dict[str, dict[str, Any]]
         "epub": EPUB_ARGUMENTS,
         "video": VIDEO_ARGUMENTS,
         "config": CONFIG_ARGUMENTS,
+        # New source types (v3.2.0+)
+        "jupyter": JUPYTER_ARGUMENTS,
+        "html": HTML_ARGUMENTS,
+        "openapi": OPENAPI_ARGUMENTS,
+        "asciidoc": ASCIIDOC_ARGUMENTS,
+        "pptx": PPTX_ARGUMENTS,
+        "rss": RSS_ARGUMENTS,
+        "manpage": MANPAGE_ARGUMENTS,
+        "confluence": CONFLUENCE_ARGUMENTS,
+        "notion": NOTION_ARGUMENTS,
+        "chat": CHAT_ARGUMENTS,
     }
     return source_args.get(source_type, {})
 
@@ -703,6 +829,24 @@ def add_create_arguments(parser: argparse.ArgumentParser, mode: str = "default")
         for arg_name, arg_def in CONFIG_ARGUMENTS.items():
             parser.add_argument(*arg_def["flags"], **arg_def["kwargs"])
 
+    # New source types (v3.2.0+)
+    _NEW_SOURCE_ARGS = {
+        "jupyter": JUPYTER_ARGUMENTS,
+        "html": HTML_ARGUMENTS,
+        "openapi": OPENAPI_ARGUMENTS,
+        "asciidoc": ASCIIDOC_ARGUMENTS,
+        "pptx": PPTX_ARGUMENTS,
+        "rss": RSS_ARGUMENTS,
+        "manpage": MANPAGE_ARGUMENTS,
+        "confluence": CONFLUENCE_ARGUMENTS,
+        "notion": NOTION_ARGUMENTS,
+        "chat": CHAT_ARGUMENTS,
+    }
+    for stype, sargs in _NEW_SOURCE_ARGS.items():
+        if mode in [stype, "all"]:
+            for arg_name, arg_def in sargs.items():
+                parser.add_argument(*arg_def["flags"], **arg_def["kwargs"])
+
     # Add advanced arguments if requested
     if mode in ["advanced", "all"]:
         for arg_name, arg_def in ADVANCED_ARGUMENTS.items():
diff --git a/src/skill_seekers/cli/arguments/html.py b/src/skill_seekers/cli/arguments/html.py
new file mode 100644
index 0000000..56ee554
--- /dev/null
+++ b/src/skill_seekers/cli/arguments/html.py
@@ -0,0 +1,68 @@
+"""HTML command argument definitions.
+
+This module defines ALL arguments for the html command in ONE place.
+Both html_scraper.py (standalone) and parsers/html_parser.py (unified CLI)
+import and use these definitions.
+
+Shared arguments (name, description, output, enhance-level, api-key,
+dry-run, verbose, quiet, workflow args) come from common.py / workflow.py
+via ``add_all_standard_arguments()``.
+"""
+
+import argparse
+from typing import Any
+
+from .common import add_all_standard_arguments
+
+# HTML-specific options as data: each entry holds the "flags" and "kwargs" that
+# add_html_arguments() unpacks into parser.add_argument(). Shared options (name,
+# output, enhance-level, ...) are registered by add_all_standard_arguments().
+HTML_ARGUMENTS: dict[str, dict[str, Any]] = {
+    "html_path": {  # input location; argparse dest: html_path
+        "flags": ("--html-path",),
+        "kwargs": {
+            "type": str,
+            "help": "Path to HTML file or directory containing HTML files",
+            "metavar": "PATH",
+        },
+    },
+    "from_json": {  # no default here: argparse yields None when the flag is omitted
+        "flags": ("--from-json",),
+        "kwargs": {
+            "type": str,
+            "help": "Build skill from extracted JSON",
+            "metavar": "FILE",
+        },
+    },
+}
+
+
+def add_html_arguments(parser: argparse.ArgumentParser) -> None:
+    """Register every option of the ``html`` command on ``parser``.
+
+    The shared options are installed first by ``add_all_standard_arguments()``;
+    the entries of ``HTML_ARGUMENTS`` are then added in dict order.
+    Any action whose dest is ``enhance_level`` gets its default reset to 0
+    and its help text replaced with an HTML-specific description.
+
+    The parser is modified in place; nothing is returned.
+    """
+    add_all_standard_arguments(parser)
+
+    # HTML keeps AI enhancement off (level 0) unless the user asks for it.
+    enhance_help = (
+        "AI enhancement level (auto-detects API vs LOCAL mode): "
+        "0=disabled (default for HTML), 1=SKILL.md only, "
+        "2=+architecture/config, 3=full enhancement. "
+        "Mode selection: uses API if ANTHROPIC_API_KEY is set, "
+        "otherwise LOCAL (Claude Code)"
+    )
+    for registered in parser._actions:
+        if getattr(registered, "dest", None) != "enhance_level":
+            continue
+        registered.default = 0
+        registered.help = enhance_help
+
+    # Source-specific options, declared as data in HTML_ARGUMENTS.
+    for spec in HTML_ARGUMENTS.values():
+        parser.add_argument(*spec["flags"], **spec["kwargs"])
diff --git a/src/skill_seekers/cli/arguments/jupyter.py b/src/skill_seekers/cli/arguments/jupyter.py
new file mode 100644
index 0000000..f4f0bbd
--- /dev/null
+++ b/src/skill_seekers/cli/arguments/jupyter.py
@@ -0,0 +1,68 @@
+"""Jupyter Notebook command argument definitions.
+
+This module defines ALL arguments for the jupyter command in ONE place.
+Both jupyter_scraper.py (standalone) and parsers/jupyter_parser.py (unified CLI)
+import and use these definitions.
+
+Shared arguments (name, description, output, enhance-level, api-key,
+dry-run, verbose, quiet, workflow args) come from common.py / workflow.py
+via ``add_all_standard_arguments()``.
+"""
+
+import argparse
+from typing import Any
+
+from .common import add_all_standard_arguments
+
+# Jupyter-specific options as data: each entry holds the "flags" and "kwargs" that
+# add_jupyter_arguments() unpacks into parser.add_argument(). Shared options (name,
+# output, enhance-level, ...) are registered by add_all_standard_arguments().
+JUPYTER_ARGUMENTS: dict[str, dict[str, Any]] = {
+    "notebook": {  # input location; argparse dest: notebook
+        "flags": ("--notebook",),
+        "kwargs": {
+            "type": str,
+            "help": "Path to .ipynb file or directory containing notebooks",
+            "metavar": "PATH",
+        },
+    },
+    "from_json": {  # no default here: argparse yields None when the flag is omitted
+        "flags": ("--from-json",),
+        "kwargs": {
+            "type": str,
+            "help": "Build skill from extracted JSON",
+            "metavar": "FILE",
+        },
+    },
+}
+
+
+def add_jupyter_arguments(parser: argparse.ArgumentParser) -> None:
+    """Register every option of the ``jupyter`` command on ``parser``.
+
+    The shared options are installed first by ``add_all_standard_arguments()``;
+    the entries of ``JUPYTER_ARGUMENTS`` are then added in dict order.
+    Any action whose dest is ``enhance_level`` gets its default reset to 0
+    and its help text replaced with a Jupyter-specific description.
+
+    The parser is modified in place; nothing is returned.
+    """
+    add_all_standard_arguments(parser)
+
+    # Jupyter keeps AI enhancement off (level 0) unless the user asks for it.
+    enhance_help = (
+        "AI enhancement level (auto-detects API vs LOCAL mode): "
+        "0=disabled (default for Jupyter), 1=SKILL.md only, "
+        "2=+architecture/config, 3=full enhancement. "
+        "Mode selection: uses API if ANTHROPIC_API_KEY is set, "
+        "otherwise LOCAL (Claude Code)"
+    )
+    for registered in parser._actions:
+        if getattr(registered, "dest", None) != "enhance_level":
+            continue
+        registered.default = 0
+        registered.help = enhance_help
+
+    # Source-specific options, declared as data in JUPYTER_ARGUMENTS.
+    for spec in JUPYTER_ARGUMENTS.values():
+        parser.add_argument(*spec["flags"], **spec["kwargs"])
diff --git a/src/skill_seekers/cli/arguments/manpage.py b/src/skill_seekers/cli/arguments/manpage.py
new file mode 100644
index 0000000..f867c35
--- /dev/null
+++ b/src/skill_seekers/cli/arguments/manpage.py
@@ -0,0 +1,84 @@
+"""Man page command argument definitions.
+
+This module defines ALL arguments for the manpage command in ONE place.
+Both manpage_scraper.py (standalone) and parsers/manpage_parser.py (unified CLI)
+import and use these definitions.
+
+Shared arguments (name, description, output, enhance-level, api-key,
+dry-run, verbose, quiet, workflow args) come from common.py / workflow.py
+via ``add_all_standard_arguments()``.
+"""
+
+import argparse
+from typing import Any
+
+from .common import add_all_standard_arguments
+
+# ManPage-specific options as data: each entry holds the "flags" and "kwargs" that
+# add_manpage_arguments() unpacks into parser.add_argument(). Shared options (name,
+# output, enhance-level, ...) are registered by add_all_standard_arguments().
+MANPAGE_ARGUMENTS: dict[str, dict[str, Any]] = {
+    "man_names": {  # parsed as one comma-separated string; not split into a list here
+        "flags": ("--man-names",),
+        "kwargs": {
+            "type": str,
+            "help": "Comma-separated list of man page names (e.g., 'ls,grep,find')",
+            "metavar": "NAMES",
+        },
+    },
+    "man_path": {  # input given as a directory of man page files
+        "flags": ("--man-path",),
+        "kwargs": {
+            "type": str,
+            "help": "Path to directory containing man page files",
+            "metavar": "PATH",
+        },
+    },
+    "sections": {  # also a raw comma-separated string (type str, no nargs)
+        "flags": ("--sections",),
+        "kwargs": {
+            "type": str,
+            "help": "Comma-separated section numbers to include (e.g., '1,3,8')",
+            "metavar": "SECTIONS",
+        },
+    },
+    "from_json": {  # no default here: argparse yields None when the flag is omitted
+        "flags": ("--from-json",),
+        "kwargs": {
+            "type": str,
+            "help": "Build skill from extracted JSON",
+            "metavar": "FILE",
+        },
+    },
+}
+
+
+def add_manpage_arguments(parser: argparse.ArgumentParser) -> None:
+    """Register every option of the ``manpage`` command on ``parser``.
+
+    The shared options are installed first by ``add_all_standard_arguments()``;
+    the entries of ``MANPAGE_ARGUMENTS`` are then added in dict order.
+    Any action whose dest is ``enhance_level`` gets its default reset to 0
+    and its help text replaced with a ManPage-specific description.
+
+    The parser is modified in place; nothing is returned.
+    """
+    add_all_standard_arguments(parser)
+
+    # ManPage keeps AI enhancement off (level 0) unless the user asks for it.
+    enhance_help = (
+        "AI enhancement level (auto-detects API vs LOCAL mode): "
+        "0=disabled (default for ManPage), 1=SKILL.md only, "
+        "2=+architecture/config, 3=full enhancement. "
+        "Mode selection: uses API if ANTHROPIC_API_KEY is set, "
+        "otherwise LOCAL (Claude Code)"
+    )
+    for registered in parser._actions:
+        if getattr(registered, "dest", None) != "enhance_level":
+            continue
+        registered.default = 0
+        registered.help = enhance_help
+
+    # Source-specific options, declared as data in MANPAGE_ARGUMENTS.
+    for spec in MANPAGE_ARGUMENTS.values():
+        parser.add_argument(*spec["flags"], **spec["kwargs"])
diff --git a/src/skill_seekers/cli/arguments/notion.py b/src/skill_seekers/cli/arguments/notion.py
new file mode 100644
index 0000000..b48f161
--- /dev/null
+++ b/src/skill_seekers/cli/arguments/notion.py
@@ -0,0 +1,101 @@
+"""Notion command argument definitions.
+
+This module defines ALL arguments for the notion command in ONE place.
+Both notion_scraper.py (standalone) and parsers/notion_parser.py (unified CLI)
+import and use these definitions.
+
+Shared arguments (name, description, output, enhance-level, api-key,
+dry-run, verbose, quiet, workflow args) come from common.py / workflow.py
+via ``add_all_standard_arguments()``.
+"""
+
+import argparse
+from typing import Any
+
+from .common import add_all_standard_arguments
+
+# Notion-specific options as data: each entry holds the "flags" and "kwargs" that
+# add_notion_arguments() unpacks into parser.add_argument(). Shared options (name,
+# output, enhance-level, ...) are registered by add_all_standard_arguments().
+NOTION_ARGUMENTS: dict[str, dict[str, Any]] = {
+    "database_id": {  # argparse dest: database_id
+        "flags": ("--database-id",),
+        "kwargs": {
+            "type": str,
+            "help": "Notion database ID to extract from",
+            "metavar": "ID",
+        },
+    },
+    "page_id": {  # argparse dest: page_id
+        "flags": ("--page-id",),
+        "kwargs": {
+            "type": str,
+            "help": "Notion page ID to extract from",
+            "metavar": "ID",
+        },
+    },
+    "export_path": {  # NOTE(review): arguments/create.py spells this --notion-export-path
+        "flags": ("--export-path",),
+        "kwargs": {
+            "type": str,
+            "help": "Path to Notion export directory",
+            "metavar": "PATH",
+        },
+    },
+    "token": {  # NOTE(review): a secret passed as a flag can show up in shell history
+        "flags": ("--token",),
+        "kwargs": {
+            "type": str,
+            "help": "Notion integration token for API authentication",
+            "metavar": "TOKEN",
+        },
+    },
+    "max_pages": {  # converted to int by argparse; 500 when the flag is omitted
+        "flags": ("--max-pages",),
+        "kwargs": {
+            "type": int,
+            "default": 500,
+            "help": "Maximum number of pages to extract (default: 500)",
+            "metavar": "N",
+        },
+    },
+    "from_json": {  # no default here: argparse yields None when the flag is omitted
+        "flags": ("--from-json",),
+        "kwargs": {
+            "type": str,
+            "help": "Build skill from extracted JSON",
+            "metavar": "FILE",
+        },
+    },
+}
+
+
+def add_notion_arguments(parser: argparse.ArgumentParser) -> None:
+    """Register every option of the ``notion`` command on ``parser``.
+
+    The shared options are installed first by ``add_all_standard_arguments()``;
+    the entries of ``NOTION_ARGUMENTS`` are then added in dict order.
+    Any action whose dest is ``enhance_level`` gets its default reset to 0
+    and its help text replaced with a Notion-specific description.
+
+    The parser is modified in place; nothing is returned.
+    """
+    add_all_standard_arguments(parser)
+
+    # Notion keeps AI enhancement off (level 0) unless the user asks for it.
+    enhance_help = (
+        "AI enhancement level (auto-detects API vs LOCAL mode): "
+        "0=disabled (default for Notion), 1=SKILL.md only, "
+        "2=+architecture/config, 3=full enhancement. "
+        "Mode selection: uses API if ANTHROPIC_API_KEY is set, "
+        "otherwise LOCAL (Claude Code)"
+    )
+    for registered in parser._actions:
+        if getattr(registered, "dest", None) != "enhance_level":
+            continue
+        registered.default = 0
+        registered.help = enhance_help
+
+    # Source-specific options, declared as data in NOTION_ARGUMENTS.
+    for spec in NOTION_ARGUMENTS.values():
+        parser.add_argument(*spec["flags"], **spec["kwargs"])
diff --git a/src/skill_seekers/cli/arguments/openapi.py b/src/skill_seekers/cli/arguments/openapi.py
new file mode 100644
index 0000000..ed0ffa5
--- /dev/null
+++ b/src/skill_seekers/cli/arguments/openapi.py
@@ -0,0 +1,76 @@
+"""OpenAPI command argument definitions.
+
+This module defines ALL arguments for the openapi command in ONE place.
+Both openapi_scraper.py (standalone) and parsers/openapi_parser.py (unified CLI)
+import and use these definitions.
+
+Shared arguments (name, description, output, enhance-level, api-key,
+dry-run, verbose, quiet, workflow args) come from common.py / workflow.py
+via ``add_all_standard_arguments()``.
+"""
+
+import argparse
+from typing import Any
+
+from .common import add_all_standard_arguments
+
+# OpenAPI-specific options as data: each entry holds the "flags" and "kwargs" that
+# add_openapi_arguments() unpacks into parser.add_argument(). Shared options (name,
+# output, enhance-level, ...) are registered by add_all_standard_arguments().
+OPENAPI_ARGUMENTS: dict[str, dict[str, Any]] = {
+    "spec": {  # spec given as a local file path; argparse dest: spec
+        "flags": ("--spec",),
+        "kwargs": {
+            "type": str,
+            "help": "Path to OpenAPI/Swagger spec file",
+            "metavar": "PATH",
+        },
+    },
+    "spec_url": {  # spec given as a URL string; argparse dest: spec_url
+        "flags": ("--spec-url",),
+        "kwargs": {
+            "type": str,
+            "help": "URL to OpenAPI/Swagger spec",
+            "metavar": "URL",
+        },
+    },
+    "from_json": {  # no default here: argparse yields None when the flag is omitted
+        "flags": ("--from-json",),
+        "kwargs": {
+            "type": str,
+            "help": "Build skill from extracted JSON",
+            "metavar": "FILE",
+        },
+    },
+}
+
+
+def add_openapi_arguments(parser: argparse.ArgumentParser) -> None:
+    """Register every option of the ``openapi`` command on ``parser``.
+
+    The shared options are installed first by ``add_all_standard_arguments()``;
+    the entries of ``OPENAPI_ARGUMENTS`` are then added in dict order.
+    Any action whose dest is ``enhance_level`` gets its default reset to 0
+    and its help text replaced with an OpenAPI-specific description.
+
+    The parser is modified in place; nothing is returned.
+    """
+    add_all_standard_arguments(parser)
+
+    # OpenAPI keeps AI enhancement off (level 0) unless the user asks for it.
+    enhance_help = (
+        "AI enhancement level (auto-detects API vs LOCAL mode): "
+        "0=disabled (default for OpenAPI), 1=SKILL.md only, "
+        "2=+architecture/config, 3=full enhancement. "
+        "Mode selection: uses API if ANTHROPIC_API_KEY is set, "
+        "otherwise LOCAL (Claude Code)"
+    )
+    for registered in parser._actions:
+        if getattr(registered, "dest", None) != "enhance_level":
+            continue
+        registered.default = 0
+        registered.help = enhance_help
+
+    # Source-specific options, declared as data in OPENAPI_ARGUMENTS.
+    for spec in OPENAPI_ARGUMENTS.values():
+        parser.add_argument(*spec["flags"], **spec["kwargs"])
diff --git a/src/skill_seekers/cli/arguments/pptx.py b/src/skill_seekers/cli/arguments/pptx.py
new file mode 100644
index 0000000..ce0b114
--- /dev/null
+++ b/src/skill_seekers/cli/arguments/pptx.py
@@ -0,0 +1,68 @@
+"""PPTX command argument definitions.
+
+This module defines ALL arguments for the pptx command in ONE place.
+Both pptx_scraper.py (standalone) and parsers/pptx_parser.py (unified CLI)
+import and use these definitions.
+
+Shared arguments (name, description, output, enhance-level, api-key,
+dry-run, verbose, quiet, workflow args) come from common.py / workflow.py
+via ``add_all_standard_arguments()``.
+"""
+
+import argparse
+from typing import Any
+
+from .common import add_all_standard_arguments
+
+# PPTX-specific options as data: each entry holds the "flags" and "kwargs" that
+# add_pptx_arguments() unpacks into parser.add_argument(). Shared options (name,
+# output, enhance-level, ...) are registered by add_all_standard_arguments().
+PPTX_ARGUMENTS: dict[str, dict[str, Any]] = {
+    "pptx": {  # input location; argparse dest: pptx
+        "flags": ("--pptx",),
+        "kwargs": {
+            "type": str,
+            "help": "Path to PowerPoint file (.pptx)",
+            "metavar": "PATH",
+        },
+    },
+    "from_json": {  # no default here: argparse yields None when the flag is omitted
+        "flags": ("--from-json",),
+        "kwargs": {
+            "type": str,
+            "help": "Build skill from extracted JSON",
+            "metavar": "FILE",
+        },
+    },
+}
+
+
+def add_pptx_arguments(parser: argparse.ArgumentParser) -> None:
+    """Register every option of the ``pptx`` command on ``parser``.
+
+    The shared options are installed first by ``add_all_standard_arguments()``;
+    the entries of ``PPTX_ARGUMENTS`` are then added in dict order.
+    Any action whose dest is ``enhance_level`` gets its default reset to 0
+    and its help text replaced with a PPTX-specific description.
+
+    The parser is modified in place; nothing is returned.
+    """
+    add_all_standard_arguments(parser)
+
+    # PPTX keeps AI enhancement off (level 0) unless the user asks for it.
+    enhance_help = (
+        "AI enhancement level (auto-detects API vs LOCAL mode): "
+        "0=disabled (default for PPTX), 1=SKILL.md only, "
+        "2=+architecture/config, 3=full enhancement. "
+        "Mode selection: uses API if ANTHROPIC_API_KEY is set, "
+        "otherwise LOCAL (Claude Code)"
+    )
+    for registered in parser._actions:
+        if getattr(registered, "dest", None) != "enhance_level":
+            continue
+        registered.default = 0
+        registered.help = enhance_help
+
+    # Source-specific options, declared as data in PPTX_ARGUMENTS.
+    for spec in PPTX_ARGUMENTS.values():
+        parser.add_argument(*spec["flags"], **spec["kwargs"])
diff --git a/src/skill_seekers/cli/arguments/rss.py b/src/skill_seekers/cli/arguments/rss.py
new file mode 100644
index 0000000..6ca89c7
--- /dev/null
+++ b/src/skill_seekers/cli/arguments/rss.py
@@ -0,0 +1,101 @@
+"""RSS command argument definitions.
+
+This module defines ALL arguments for the rss command in ONE place.
+Both rss_scraper.py (standalone) and parsers/rss_parser.py (unified CLI)
+import and use these definitions.
+
+Shared arguments (name, description, output, enhance-level, api-key,
+dry-run, verbose, quiet, workflow args) come from common.py / workflow.py
+via ``add_all_standard_arguments()``.
+"""
+
+import argparse
+from typing import Any
+
+from .common import add_all_standard_arguments
+
+# RSS-specific options as data: each entry holds the "flags" and "kwargs" that
+# add_rss_arguments() unpacks into parser.add_argument(). Shared options (name,
+# output, enhance-level, ...) are registered by add_all_standard_arguments().
+RSS_ARGUMENTS: dict[str, dict[str, Any]] = {
+    "feed_url": {  # feed given as a URL string; argparse dest: feed_url
+        "flags": ("--feed-url",),
+        "kwargs": {
+            "type": str,
+            "help": "URL of the RSS/Atom feed",
+            "metavar": "URL",
+        },
+    },
+    "feed_path": {  # feed given as a local file; argparse dest: feed_path
+        "flags": ("--feed-path",),
+        "kwargs": {
+            "type": str,
+            "help": "Path to local RSS/Atom feed file",
+            "metavar": "PATH",
+        },
+    },
+    "follow_links": {  # default is already True, so passing this flag changes nothing
+        "flags": ("--follow-links",),
+        "kwargs": {
+            "action": "store_true",
+            "default": True,
+            "help": "Follow article links and extract full content (default: True)",
+        },
+    },
+    "no_follow_links": {  # the effective switch: stores False into dest follow_links
+        "flags": ("--no-follow-links",),
+        "kwargs": {
+            "action": "store_false",
+            "dest": "follow_links",
+            "help": "Do not follow article links; use feed summary only",
+        },
+    },
+    "max_articles": {  # converted to int by argparse; 50 when the flag is omitted
+        "flags": ("--max-articles",),
+        "kwargs": {
+            "type": int,
+            "default": 50,
+            "help": "Maximum number of articles to extract (default: 50)",
+            "metavar": "N",
+        },
+    },
+    "from_json": {  # no default here: argparse yields None when the flag is omitted
+        "flags": ("--from-json",),
+        "kwargs": {
+            "type": str,
+            "help": "Build skill from extracted JSON",
+            "metavar": "FILE",
+        },
+    },
+}
+
+
+def add_rss_arguments(parser: argparse.ArgumentParser) -> None:
+    """Register every option of the ``rss`` command on ``parser``.
+
+    The shared options are installed first by ``add_all_standard_arguments()``;
+    the entries of ``RSS_ARGUMENTS`` are then added in dict order.
+    Any action whose dest is ``enhance_level`` gets its default reset to 0
+    and its help text replaced with an RSS-specific description.
+
+    The parser is modified in place; nothing is returned.
+    """
+    add_all_standard_arguments(parser)
+
+    # RSS keeps AI enhancement off (level 0) unless the user asks for it.
+    enhance_help = (
+        "AI enhancement level (auto-detects API vs LOCAL mode): "
+        "0=disabled (default for RSS), 1=SKILL.md only, "
+        "2=+architecture/config, 3=full enhancement. "
+        "Mode selection: uses API if ANTHROPIC_API_KEY is set, "
+        "otherwise LOCAL (Claude Code)"
+    )
+    for registered in parser._actions:
+        if getattr(registered, "dest", None) != "enhance_level":
+            continue
+        registered.default = 0
+        registered.help = enhance_help
+
+    # Source-specific options, declared as data in RSS_ARGUMENTS.
+    for spec in RSS_ARGUMENTS.values():
+        parser.add_argument(*spec["flags"], **spec["kwargs"])
diff --git a/src/skill_seekers/cli/asciidoc_scraper.py b/src/skill_seekers/cli/asciidoc_scraper.py
new file mode 100644
index 0000000..b5082ed
--- /dev/null
+++ b/src/skill_seekers/cli/asciidoc_scraper.py
@@ -0,0 +1,1085 @@
+#!/usr/bin/env python3
+"""
+AsciiDoc Documentation to Skill Converter
+
+Converts AsciiDoc (.adoc, .asciidoc) documentation files into AI-ready skills.
+Supports both single files and directories of AsciiDoc documents.
+
+Uses the ``asciidoc`` library when available for accurate HTML rendering,
+falling back to a comprehensive regex-based parser that handles headings,
+code blocks, tables, admonitions, include directives, and inline formatting.
+
+Usage:
+    skill-seekers asciidoc --asciidoc-path doc.adoc --name myskill
+    skill-seekers asciidoc --asciidoc-path docs/ --name myskill
+    skill-seekers asciidoc --from-json doc_extracted.json
+"""
+
+import argparse
+import json
+import logging
+import os
+import re
+import sys
+from pathlib import Path
+
+# Optional dependency guard — asciidoc library for HTML conversion
+try:
+    import asciidoc as asciidoc_lib  # noqa: F401
+
+    ASCIIDOC_AVAILABLE = True
+except ImportError:
+    ASCIIDOC_AVAILABLE = False
+
+logger = logging.getLogger(__name__)
+
+ASCIIDOC_EXTENSIONS = {".adoc", ".asciidoc", ".asc", ".ad"}
+ADMONITION_TYPES = ("NOTE", "TIP", "WARNING", "IMPORTANT", "CAUTION")
+
+# Regex patterns for AsciiDoc structure
+#
+# NOTE: separators inside single-line patterns use ``[ \t]`` rather than
+# ``\s``. ``\s`` also matches "\n", which let a pattern that is meant to
+# describe ONE line silently continue onto the next one.
+
+# Section title: 1-5 "=" followed by blanks and the title text on the SAME
+# line. With ``\s+`` a bare "====" block delimiter followed by a text line was
+# reported as a level-4 heading.
+RE_HEADING = re.compile(r"^(={1,5})[ \t]+(.+)$", re.MULTILINE)
+RE_SOURCE_ATTR = re.compile(r"^\[source(?:,\s*(\w[\w+#.-]*))?(?:,.*?)?\]$", re.MULTILINE)
+RE_LISTING_DELIM = re.compile(r"^(-{4,})$", re.MULTILINE)
+RE_LITERAL_DELIM = re.compile(r"^(\.{4,})$", re.MULTILINE)
+RE_TABLE_DELIM = re.compile(r"^\|={3,}$", re.MULTILINE)
+RE_TABLE_CELL = re.compile(r"^\|(.+)$", re.MULTILINE)
+RE_ADMONITION_PARA = re.compile(
+    r"^(NOTE|TIP|WARNING|IMPORTANT|CAUTION):\s+(.+?)(?:\n\n|\Z)",
+    re.MULTILINE | re.DOTALL,
+)
+RE_ADMONITION_BLOCK = re.compile(
+    r"^\[(NOTE|TIP|WARNING|IMPORTANT|CAUTION)\]\n={4,}\n(.*?)\n={4,}",
+    re.MULTILINE | re.DOTALL,
+)
+RE_INCLUDE = re.compile(r"^include::(.+?)\[([^\]]*)\]$", re.MULTILINE)
+# Attribute entry ":name: value". The value may be empty (":toc:"); with
+# ``\s*`` an empty value captured the whole following line instead.
+RE_ATTRIBUTE = re.compile(r"^:([a-zA-Z0-9_-]+):[ \t]*(.*)$", re.MULTILINE)
+RE_ATTR_REF = re.compile(r"\{([a-zA-Z0-9_-]+)\}")
+# Constrained inline formatting: the marker must not touch a word character
+# (or another marker) on its outer side. This keeps snake_case identifiers and
+# already-doubled "**strong**" markup untouched.
+RE_BOLD = re.compile(r"(?<![\w*])\*([^\s*](?:.*?[^\s*])?)\*(?![\w*])")
+RE_ITALIC = re.compile(r"(?<!\w)_([^\s_](?:.*?[^\s_])?)_(?!\w)")
+RE_MONO = re.compile(r"`([^`]+)`")
+RE_LINK = re.compile(r"(https?://\S+)\[([^\]]*)\]")
+RE_XREF = re.compile(r"<<([^,>]+)(?:,\s*([^>]+))?>>")
+
+
+def _check_asciidoc_deps() -> None:
+    """Emit a debug log entry when the optional ``asciidoc`` import failed.
+
+    Nothing is raised: the message only records that the regex-based parser is
+    in use and how the library can be installed.
+    """
+    if ASCIIDOC_AVAILABLE:
+        return
+    install_hint = 'Install with: pip install "skill-seekers[asciidoc]" or: pip install asciidoc'
+    logger.debug("asciidoc library not installed; using regex-based parser.\n" + install_hint)
+
+
+def infer_description_from_asciidoc(metadata: dict | None = None, name: str = "") -> str:
+    """Derive a "Use when ..." skill description from document metadata.
+
+    Precedence:
+      1. ``metadata["description"]`` when longer than 20 characters
+         (lower-cased; cut to 147 characters plus "..." when the stripped
+         text exceeds 150 characters).
+      2. ``metadata["title"]`` when longer than 10 characters.
+      3. A generic sentence mentioning *name*, or a fully generic sentence
+         when *name* is empty.
+    """
+    meta = metadata or {}
+
+    raw_description = meta.get("description")
+    if raw_description and len(str(raw_description)) > 20:
+        summary = str(raw_description).strip()
+        if len(summary) > 150:
+            return f"Use when {summary[:147].lower()}..."
+        return f"Use when {summary.lower()}"
+
+    raw_title = meta.get("title")
+    if raw_title and len(str(raw_title)) > 10:
+        return f"Use when working with {str(raw_title).lower()}"
+
+    if name:
+        return f"Use when referencing {name} documentation"
+    return "Use when referencing this documentation"
+
+
+def _score_code_quality(code: str) -> float:
+    """Rate a code snippet on a 0-10 scale with a handful of cheap heuristics.
+
+    Starting from 5.0 the score gains points for length (5+/10+ lines),
+    definition keywords, import-like statements, 4-space indented lines and
+    common punctuation, and loses 2.0 when the snippet is under 30 characters.
+    Empty input scores 0.0. The result is clamped to [0.0, 10.0].
+    """
+    if not code:
+        return 0.0
+
+    n_lines = code.strip().count("\n") + 1
+    if n_lines >= 10:
+        length_bonus = 2.0
+    elif n_lines >= 5:
+        length_bonus = 1.0
+    else:
+        length_bonus = 0.0
+    score = 5.0 + length_bonus
+
+    # (pattern, regex flags, bonus) — applied in this order.
+    signals = (
+        (r"\b(def |class |function |func |fn )", 0, 1.5),
+        (r"\b(import |from .+ import|require\(|#include|using )", 0, 0.5),
+        (r"^    ", re.MULTILINE, 0.5),
+        (r"[=:{}()\[\]]", 0, 0.3),
+    )
+    for pattern, flags, bonus in signals:
+        if re.search(pattern, code, flags):
+            score += bonus
+
+    if len(code) < 30:
+        score -= 2.0
+    return min(10.0, max(0.0, score))
+
+
+class AsciiDocToSkillConverter:
+    """Convert AsciiDoc documentation to an AI-ready skill.
+
+    Handles single ``.adoc`` files and directories. Content is parsed into
+    intermediate JSON, categorised, then rendered into the standard skill
+    directory layout (SKILL.md, references/, etc.).
+    """
+
+    def __init__(self, config: dict) -> None:
+        """Store *config* and derive the output locations from its ``name``.
+
+        Args:
+            config: Must contain ``name``. ``asciidoc_path``, ``description``
+                and ``categories`` are optional.
+
+        Raises:
+            KeyError: If ``name`` is missing from *config*.
+        """
+        self.config = config
+        skill_name: str = config["name"]
+        fallback_description = f"Use when referencing {skill_name} documentation"
+
+        self.name: str = skill_name
+        self.asciidoc_path: str = config.get("asciidoc_path", "")
+        # An empty/missing description falls back to the generic sentence.
+        self.description: str = config.get("description") or fallback_description
+        self.skill_dir: str = f"output/{skill_name}"
+        self.data_file: str = f"output/{skill_name}_extracted.json"
+        self.categories: dict = config.get("categories", {})
+        # Filled by extract_asciidoc() or load_extracted_data().
+        self.extracted_data: dict | None = None
+
+    # ------------------------------------------------------------------
+    # Extraction
+    # ------------------------------------------------------------------
+
+    def extract_asciidoc(self) -> bool:
+        """Read the configured AsciiDoc file(s) and persist the parsed result.
+
+        For every discovered file the attributes are extracted and substituted,
+        ``include::`` directives are inlined and the text is split into
+        sections. Each section is enriched with its code samples, tables,
+        admonitions, unresolved includes and a Markdown rendering. Code samples
+        without a declared language are passed to ``LanguageDetector``. The
+        aggregate is written to ``self.data_file`` as JSON and kept in
+        ``self.extracted_data``.
+
+        Returns:
+            True on success.
+
+        Raises:
+            FileNotFoundError: If ``self.asciidoc_path`` does not exist.
+            ValueError: If no AsciiDoc file is found at that path.
+        """
+        _check_asciidoc_deps()
+        from skill_seekers.cli.language_detector import LanguageDetector
+
+        print(f"\n🔍 Extracting from AsciiDoc: {self.asciidoc_path}")
+        source_root = Path(self.asciidoc_path)
+        if not source_root.exists():
+            raise FileNotFoundError(f"AsciiDoc path not found: {self.asciidoc_path}")
+
+        adoc_files = self._discover_files(source_root)
+        if not adoc_files:
+            expected = ", ".join(sorted(ASCIIDOC_EXTENSIONS))
+            raise ValueError(
+                f"No AsciiDoc files found at: {self.asciidoc_path}\nExpected extensions: {expected}"
+            )
+        print(f"   Found {len(adoc_files)} AsciiDoc file(s)")
+
+        collected: list[dict] = []
+        doc_meta: dict = {}
+
+        for adoc_file in sorted(adoc_files):
+            source_text = adoc_file.read_text(encoding="utf-8", errors="replace")
+            attrs = self._extract_attributes(source_text)
+            expanded = self._resolve_includes(
+                self._resolve_attributes(source_text, attrs), adoc_file.parent
+            )
+            # Metadata comes from the first file that yields any.
+            if not doc_meta:
+                doc_meta = self._build_metadata(attrs, adoc_file)
+
+            for sec in self._parse_asciidoc_sections(expanded):
+                # Sections are numbered 1..N across all files.
+                sec["section_number"] = len(collected) + 1
+                sec["source_file"] = str(adoc_file)
+                raw_body = sec.pop("body", "")
+                sec["code_samples"] = self._extract_code_blocks(raw_body)
+                sec["tables"] = self._extract_tables(raw_body)
+                sec["admonitions"] = self._extract_admonitions(raw_body)
+                sec["includes"] = self._extract_includes(raw_body)
+                sec["text"] = self._convert_to_markdown(raw_body)
+                collected.append(sec)
+
+        # Language detection: count declared languages first, then try to
+        # detect a language for the samples that have none.
+        detector = LanguageDetector(min_confidence=0.15)
+        samples = [cs for sec in collected for cs in sec.get("code_samples", [])]
+        total_code_blocks = len(samples)
+        language_counts: dict[str, int] = {}
+        for sample in samples:
+            declared = sample.get("language")
+            if declared:
+                language_counts[declared] = language_counts.get(declared, 0) + 1
+        for sample in samples:
+            if sample.get("language") or not sample.get("code"):
+                continue
+            guess, confidence = detector.detect_from_code(sample["code"])
+            if guess and confidence >= 0.3:
+                sample["language"] = guess
+                language_counts[guess] = language_counts.get(guess, 0) + 1
+
+        if not self.config.get("description"):
+            self.description = infer_description_from_asciidoc(doc_meta, self.name)
+
+        table_total = sum(len(sec.get("tables", [])) for sec in collected)
+        admonition_total = sum(len(sec.get("admonitions", [])) for sec in collected)
+        result_data = {
+            "source_path": self.asciidoc_path,
+            "metadata": doc_meta,
+            "total_sections": len(collected),
+            "total_files": len(adoc_files),
+            "total_code_blocks": total_code_blocks,
+            "total_tables": table_total,
+            "total_admonitions": admonition_total,
+            "languages_detected": language_counts,
+            "pages": collected,
+        }
+
+        os.makedirs(os.path.dirname(self.data_file) or ".", exist_ok=True)
+        with open(self.data_file, "w", encoding="utf-8") as sink:
+            json.dump(result_data, sink, indent=2, ensure_ascii=False, default=str)
+
+        print(f"\n💾 Saved extracted data to: {self.data_file}")
+        self.extracted_data = result_data
+        print(
+            f"✅ Extracted {len(collected)} sections, {total_code_blocks} code blocks, "
+            f"{table_total} tables, {admonition_total} admonitions"
+        )
+        return True
+
+    def _discover_files(self, path: Path) -> list[Path]:
+        """Return the sorted AsciiDoc files found at *path*.
+
+        Args:
+            path: A single file or a directory that is searched recursively.
+
+        Returns:
+            ``[path]`` when *path* is a file with a known extension, an empty
+            list when it is a file with any other extension, otherwise every
+            regular file below *path* whose extension is in
+            ASCIIDOC_EXTENSIONS.
+
+        The extension comparison is case-insensitive in both branches, so a
+        ``GUIDE.ADOC`` inside a directory is treated the same way as when it is
+        passed directly. Directories that merely carry an AsciiDoc-looking
+        name are skipped because they cannot be read as text.
+        """
+        if path.is_file():
+            return [path] if path.suffix.lower() in ASCIIDOC_EXTENSIONS else []
+        return sorted(
+            candidate
+            for candidate in path.rglob("*")
+            if candidate.suffix.lower() in ASCIIDOC_EXTENSIONS and candidate.is_file()
+        )
+
+    # ------------------------------------------------------------------
+    # Attribute / include resolution
+    # ------------------------------------------------------------------
+
+    @staticmethod
+    def _extract_attributes(text: str) -> dict[str, str]:
+        """Collect ``:attr-name: value`` entries found in *text*.
+
+        Returns a mapping of attribute name to stripped value; when a name is
+        defined more than once the last definition wins.
+        """
+        found: dict[str, str] = {}
+        for entry in RE_ATTRIBUTE.finditer(text):
+            attr_name, attr_value = entry.group(1), entry.group(2)
+            found[attr_name] = attr_value.strip()
+        return found
+
+    @staticmethod
+    def _resolve_attributes(text: str, attributes: dict[str, str]) -> str:
+        """Substitute ``{attr-name}`` references using *attributes*.
+
+        References to names missing from *attributes* are left untouched.
+        """
+
+        def _lookup(ref: re.Match) -> str:
+            attr_name = ref.group(1)
+            if attr_name in attributes:
+                return attributes[attr_name]
+            return ref.group(0)
+
+        return RE_ATTR_REF.sub(_lookup, text)
+
+    def _resolve_includes(self, text: str, base_dir: Path) -> str:
+        """Inline the files referenced by ``include::`` directives.
+
+        Args:
+            text: AsciiDoc source that may contain ``include::path[opts]``
+                lines.
+            base_dir: Directory that include paths in *text* are relative to.
+
+        Returns:
+            *text* with every readable include replaced by the (recursively
+            resolved) content of the target file. Targets that are missing or
+            unreadable become a ``// include::path[] (not resolved)`` comment.
+
+        Includes found inside an included file are resolved relative to THAT
+        file's directory, so ``chapters/intro.adoc`` can include a sibling
+        without repeating the ``chapters/`` prefix. Recursion stops after
+        ``max_depth`` levels, which also bounds circular includes; directives
+        at the cut-off depth are left in the text unchanged.
+        """
+        max_depth = 5
+
+        def _resolve_once(src: str, current_dir: Path, depth: int) -> str:
+            if depth >= max_depth:
+                return src
+
+            def _replacer(match: re.Match) -> str:
+                inc_path = match.group(1).strip()
+                inc_file = current_dir / inc_path
+                if inc_file.is_file():
+                    try:
+                        included = inc_file.read_text(encoding="utf-8", errors="replace")
+                    except OSError:
+                        logger.debug("Could not read include file: %s", inc_file)
+                    else:
+                        # Nested directives are relative to the included file.
+                        return _resolve_once(included, inc_file.parent, depth + 1)
+                return f"// include::{inc_path}[] (not resolved)"
+
+            return RE_INCLUDE.sub(_replacer, src)
+
+        return _resolve_once(text, base_dir, 0)
+
+    @staticmethod
+    def _build_metadata(attributes: dict[str, str], file_path: Path) -> dict:
+        """Assemble the document metadata record from parsed attributes.
+
+        Each field is taken from the first attribute name that is present
+        (e.g. ``doctitle`` before ``title``); the title falls back to the file
+        stem, every other field to an empty string.
+        """
+
+        def _first_present(names: tuple, fallback: str) -> str:
+            for candidate in names:
+                if candidate in attributes:
+                    return attributes[candidate]
+            return fallback
+
+        record = {"title": _first_present(("doctitle", "title"), file_path.stem)}
+        record["author"] = _first_present(("author",), "")
+        record["email"] = _first_present(("email",), "")
+        record["revision"] = _first_present(("revnumber", "version"), "")
+        record["date"] = _first_present(("revdate", "date"), "")
+        record["description"] = _first_present(("description",), "")
+        record["keywords"] = _first_present(("keywords",), "")
+        record["source_file"] = str(file_path)
+        return record
+
+    # ------------------------------------------------------------------
+    # Section parsing
+    # ------------------------------------------------------------------
+
+    def _parse_asciidoc_sections(self, text: str) -> list[dict]:
+        """Split *text* into one section per heading matched by RE_HEADING.
+
+        Text before the first heading becomes an untitled ``h1`` section when
+        it is non-empty; a document without any heading yields a single
+        untitled section. Every section dict carries ``heading``,
+        ``heading_level`` (``h1``..), ``body`` and ``headings`` (deeper
+        headings found inside the body).
+        """
+        hits = list(RE_HEADING.finditer(text))
+        if not hits:
+            return [{"heading": "", "heading_level": "h1", "body": text.strip(), "headings": []}]
+
+        result: list[dict] = []
+        lead_in = text[: hits[0].start()].strip()
+        if lead_in:
+            result.append({"heading": "", "heading_level": "h1", "body": lead_in, "headings": []})
+
+        # A section body runs from the end of its heading line up to the start
+        # of the next heading (or the end of the text for the last one).
+        stops = [later.start() for later in hits[1:]] + [len(text)]
+        for hit, stop in zip(hits, stops):
+            depth = len(hit.group(1))
+            content = text[hit.end() : stop].strip()
+            nested = []
+            for inner in RE_HEADING.finditer(content):
+                inner_depth = len(inner.group(1))
+                if inner_depth > depth:
+                    nested.append({"level": f"h{inner_depth}", "text": inner.group(2).strip()})
+            result.append(
+                {
+                    "heading": hit.group(2).strip(),
+                    "heading_level": f"h{depth}",
+                    "body": content,
+                    "headings": nested,
+                }
+            )
+        return result
+
+    # ------------------------------------------------------------------
+    # Code block extraction
+    # ------------------------------------------------------------------
+
+    def _extract_code_blocks(self, text: str) -> list[dict]:
+        """Extract source/listing/literal code blocks from AsciiDoc text.
+
+        Three passes are made, in this order:
+          1. ``[source,lang]`` attribute lines followed by a ``----`` block
+             (the language comes from the attribute).
+          2. Bare ``----`` listing blocks not already claimed.
+          3. ``....`` literal blocks not already claimed.
+
+        Returns:
+            A list of ``{"code", "language", "quality_score"}`` dicts. Blocks
+            without content are not returned.
+
+        Every closed block — including an empty one — is recorded as consumed.
+        Otherwise the closing delimiter of an empty block would be picked up
+        as the opener of a new block in a later pass and the prose that
+        follows it would be reported as code.
+        """
+        blocks: list[dict] = []
+        consumed: list[tuple[int, int]] = []
+
+        def _capture(open_m: re.Match, span_start: int, language: str) -> None:
+            """Read the block opened by *open_m* and register it."""
+            delim = open_m.group(1)
+            body_from = open_m.end() + 1  # skip the newline after the opener
+            close_m = re.search(r"^" + re.escape(delim) + r"$", text[body_from:], re.MULTILINE)
+            if not close_m:
+                return
+            abs_close = body_from + close_m.start()
+            code = text[open_m.end() : abs_close].strip("\n")
+            if code:
+                blocks.append(
+                    {"code": code, "language": language, "quality_score": _score_code_quality(code)}
+                )
+            consumed.append((span_start, abs_close + len(close_m.group(0))))
+
+        # Pattern 1: [source,lang] + ---- block
+        for attr_m in RE_SOURCE_ATTR.finditer(text):
+            open_m = RE_LISTING_DELIM.search(text, attr_m.end())
+            if not open_m:
+                continue
+            # Multi-line text between the attribute and the delimiter (other
+            # than a ".Title" line) means the delimiter belongs elsewhere.
+            between = text[attr_m.end() : open_m.start()].strip()
+            if between and not between.startswith(".") and "\n" in between:
+                continue
+            _capture(open_m, attr_m.start(), (attr_m.group(1) or "").strip())
+
+        # Pattern 2: bare ---- listing blocks, then pattern 3: .... literal blocks
+        for delim_re in (RE_LISTING_DELIM, RE_LITERAL_DELIM):
+            for open_m in delim_re.finditer(text):
+                if self._in_range(open_m.start(), consumed):
+                    continue
+                _capture(open_m, open_m.start(), "")
+
+        return blocks
+
+    # ------------------------------------------------------------------
+    # Table extraction
+    # ------------------------------------------------------------------
+
+    def _extract_tables(self, text: str) -> list[dict]:
+        """Collect the tables enclosed by pairs of ``|===`` delimiter lines.
+
+        Delimiters are paired in order of appearance (1st with 2nd, 3rd with
+        4th, ...); an unpaired trailing delimiter is ignored. Empty bodies and
+        bodies that do not parse into a table are skipped.
+        """
+        marks = list(RE_TABLE_DELIM.finditer(text))
+        parsed_tables: list[dict] = []
+        for opener, closer in zip(marks[0::2], marks[1::2]):
+            inner = text[opener.end() : closer.start()].strip()
+            if not inner:
+                continue
+            table = self._parse_table_body(inner)
+            if table:
+                parsed_tables.append(table)
+        return parsed_tables
+
+    @staticmethod
+    def _parse_table_body(table_body: str) -> dict | None:
+        """Parse the text between two ``|===`` lines into headers and rows.
+
+        Args:
+            table_body: Raw table content without the delimiter lines.
+
+        Returns:
+            ``{"headers": [...], "rows": [[...], ...]}`` or ``None`` when no
+            cell line (a line starting with ``|``) is present.
+
+        Layout rules:
+          * Blank lines split the body into groups. With several groups the
+            first one is the header (multiple header lines are merged cell by
+            cell) and every ``|`` line of the remaining groups is a row.
+          * With a single group the first ``|`` line is the header and the
+            remaining ``|`` lines are rows.
+
+        Empty cells are kept as ``""`` so that the cells after them stay in
+        their column; a line whose cells are all empty is ignored.
+        """
+        groups = re.split(r"\n\s*\n", table_body.strip())
+
+        def _parse_row(row_text: str) -> list[str]:
+            # The line starts with "|", so the element before it is dropped.
+            cells = [cell.strip() for cell in row_text.strip().split("|")[1:]]
+            return cells if any(cells) else []
+
+        def _rows_of(group: str) -> list[list[str]]:
+            candidates = (
+                _parse_row(line)
+                for line in group.strip().splitlines()
+                if line.strip().startswith("|")
+            )
+            return [cells for cells in candidates if cells]
+
+        headers: list[str] = []
+        rows: list[list[str]] = []
+
+        if len(groups) == 1:
+            # No blank line after the header: first line = header, rest = rows.
+            lines = _rows_of(groups[0])
+            if lines:
+                headers, rows = lines[0], lines[1:]
+        else:
+            # First group → headers (continuation lines are merged per column)
+            for parsed in _rows_of(groups[0]):
+                if not headers:
+                    headers = parsed
+                    continue
+                for i, cell in enumerate(parsed):
+                    if i < len(headers):
+                        headers[i] = f"{headers[i]} {cell}".strip()
+                    else:
+                        headers.append(cell)
+            # Remaining groups → rows
+            for group in groups[1:]:
+                rows.extend(_rows_of(group))
+
+        return {"headers": headers, "rows": rows} if headers or rows else None
+
+    # ------------------------------------------------------------------
+    # Admonition extraction
+    # ------------------------------------------------------------------
+
+    def _extract_admonitions(self, text: str) -> list[dict]:
+        """Find block-style and paragraph-style admonitions in *text*.
+
+        Block admonitions are collected before paragraph admonitions. An
+        admonition whose stripped text was already collected is not repeated;
+        empty ones are dropped.
+        """
+        found: list[dict] = []
+        texts_seen: set[str] = set()
+        for regex in (RE_ADMONITION_BLOCK, RE_ADMONITION_PARA):
+            for hit in regex.finditer(text):
+                content = hit.group(2).strip()
+                if not content or content in texts_seen:
+                    continue
+                texts_seen.add(content)
+                found.append({"type": hit.group(1), "text": content})
+        return found
+
+    # ------------------------------------------------------------------
+    # Include directive extraction
+    # ------------------------------------------------------------------
+
+    @staticmethod
+    def _extract_includes(text: str) -> list[dict]:
+        """List the ``include::`` directives that are still present in *text*.
+
+        Each entry holds the stripped target ``path`` and the stripped
+        ``options`` found between the square brackets.
+        """
+        directives: list[dict] = []
+        for hit in RE_INCLUDE.finditer(text):
+            target, options = hit.group(1), hit.group(2)
+            directives.append({"path": target.strip(), "options": options.strip()})
+        return directives
+
+    # ------------------------------------------------------------------
+    # AsciiDoc → Markdown conversion
+    # ------------------------------------------------------------------
+
+    def _convert_to_markdown(self, text: str) -> str:
+        """Convert AsciiDoc markup in *text* to Markdown equivalents.
+
+        Steps, in order: drop block delimiters / ``[source]`` / attribute
+        lines, drop admonition block markers, convert headings, convert
+        bold / italic / links / cross references, convert list markers,
+        turn ``.Title`` block titles into bold text, rewrite unresolved
+        include comments, strip leading table-cell pipes and collapse runs of
+        blank lines.
+
+        The list-marker and block-title patterns only accept blanks/tabs
+        (``[ \t]``) on the marker line. ``\s`` also matches "\n", which let a
+        marker on an otherwise empty line pull the next line up and let a
+        ``.Title`` line absorb the paragraph below it into the bold span.
+        """
+        result = text
+        # Remove processed block delimiters and attribute lines
+        for pat in (
+            RE_LISTING_DELIM,
+            RE_LITERAL_DELIM,
+            RE_TABLE_DELIM,
+            RE_SOURCE_ATTR,
+            RE_ATTRIBUTE,
+        ):
+            result = pat.sub("", result)
+        # Remove admonition block markers and delimiters
+        result = re.sub(
+            r"^\[(NOTE|TIP|WARNING|IMPORTANT|CAUTION)\]\s*$", "", result, flags=re.MULTILINE
+        )
+        result = re.sub(r"^={4,}$", "", result, flags=re.MULTILINE)
+        # Headings: = Title → # Title
+        result = RE_HEADING.sub(lambda m: f"{'#' * len(m.group(1))} {m.group(2).strip()}", result)
+        # Inline formatting
+        result = RE_BOLD.sub(r"**\1**", result)
+        result = RE_ITALIC.sub(r"*\1*", result)
+        result = RE_LINK.sub(r"[\2](\1)", result)
+        result = RE_XREF.sub(lambda m: f"*{m.group(2) or m.group(1)}*", result)
+        # Lists: * item → - item, . item → 1. item (nesting depth → indent)
+        result = re.sub(
+            r"^(\*{1,5})[ \t]+",
+            lambda m: "  " * (len(m.group(1)) - 1) + "- ",
+            result,
+            flags=re.MULTILINE,
+        )
+        result = re.sub(
+            r"^(\.{1,5})[ \t]+",
+            lambda m: "  " * (len(m.group(1)) - 1) + "1. ",
+            result,
+            flags=re.MULTILINE,
+        )
+        # Block titles: .Title → **Title** (single line only)
+        result = re.sub(r"^\.([A-Z][\w \t]+)$", r"**\1**", result, flags=re.MULTILINE)
+        # Include comments
+        result = re.sub(
+            r"^//\s*include::(.+?)\[\].*$", r"*(included: \1)*", result, flags=re.MULTILINE
+        )
+        # Remove leftover table cell markers
+        result = re.sub(r"^\|\s*", "", result, flags=re.MULTILINE)
+        # Collapse blank lines
+        result = re.sub(r"\n{3,}", "\n\n", result)
+        return result.strip()
+
+    # ------------------------------------------------------------------
+    # Load / categorize / build
+    # ------------------------------------------------------------------
+
+    def load_extracted_data(self, json_path: str) -> bool:
+        """Read a JSON file written by a previous extraction run.
+
+        The parsed content is stored in ``self.extracted_data`` and the
+        section count (``total_sections``, or the number of ``pages`` when
+        that key is absent) is printed.
+
+        Returns:
+            True once the file has been loaded.
+        """
+        print(f"\n📂 Loading extracted data from: {json_path}")
+        with open(json_path, encoding="utf-8") as handle:
+            payload = json.load(handle)
+        self.extracted_data = payload
+
+        page_count = len(payload.get("pages", []))
+        section_total = payload.get("total_sections", page_count)
+        print(f"✅ Loaded {section_total} sections")
+        return True
+
+    def categorize_content(self) -> dict:
+        """Group the extracted sections into named categories.
+
+        Strategy, first match wins:
+          1. ``asciidoc_path`` is a file → one category named after the file.
+          2. ``asciidoc_path`` is a directory → one category per source file
+             (only when this produces at least one category).
+          3. ``self.categories`` is set → either taken as ready-made page lists
+             (first value is a list of dicts) or used as keyword lists, each
+             section going to the category with the most keyword hits in its
+             text/heading, or to ``other`` when nothing matches.
+          4. Otherwise everything goes into a single ``content`` category.
+
+        Returns:
+            Mapping of category key to ``{"title": str, "pages": list}``.
+        """
+        print("\n📋 Categorizing content...")
+        buckets: dict[str, dict] = {}
+        all_pages = self.extracted_data.get("pages", [])
+        source = Path(self.asciidoc_path) if self.asciidoc_path else None
+
+        def _print_breakdown() -> None:
+            for bucket in buckets.values():
+                print(f"   - {bucket['title']}: {len(bucket['pages'])} sections")
+
+        # 1. Single source file
+        if source is not None and source.is_file():
+            stem = source.stem
+            buckets[self._sanitize_filename(stem)] = {"title": stem, "pages": all_pages}
+            print(f"✅ Created 1 category (single file): {stem}: {len(all_pages)} sections")
+            return buckets
+
+        # 2. Directory: group by the file each section came from
+        if source is not None and source.is_dir():
+            for page in all_pages:
+                stem = Path(page.get("source_file", "unknown")).stem
+                slot = self._sanitize_filename(stem)
+                if slot not in buckets:
+                    buckets[slot] = {"title": stem, "pages": []}
+                buckets[slot]["pages"].append(page)
+            if buckets:
+                print(f"✅ Created {len(buckets)} categories (by source file)")
+                _print_breakdown()
+                return buckets
+
+        if not self.categories:
+            # 4. Nothing to go by
+            buckets["content"] = {"title": "Content", "pages": all_pages}
+        else:
+            # 3. Configured categories
+            sample = next(iter(self.categories.values()), None)
+            ready_made = isinstance(sample, list) and sample and isinstance(sample[0], dict)
+            if ready_made:
+                for cat_name, cat_pages in self.categories.items():
+                    buckets[cat_name] = {
+                        "title": cat_name.replace("_", " ").title(),
+                        "pages": cat_pages,
+                    }
+            else:
+                for cat_name in self.categories:
+                    buckets[cat_name] = {"title": cat_name.replace("_", " ").title(), "pages": []}
+                for page in all_pages:
+                    body_lc = page.get("text", "").lower()
+                    heading_lc = page.get("heading", "").lower()
+                    # Earliest category with the highest (non-zero) hit count.
+                    winner, winner_hits = None, 0
+                    for cat_name, keywords in self.categories.items():
+                        if not isinstance(keywords, list):
+                            continue
+                        hits = 0
+                        for kw in keywords:
+                            if not isinstance(kw, str):
+                                continue
+                            needle = kw.lower()
+                            if needle in body_lc or needle in heading_lc:
+                                hits += 1
+                        if hits > winner_hits:
+                            winner, winner_hits = cat_name, hits
+                    if winner is not None:
+                        buckets[winner]["pages"].append(page)
+                    else:
+                        fallback = buckets.setdefault("other", {"title": "Other", "pages": []})
+                        fallback["pages"].append(page)
+
+        print(f"✅ Created {len(buckets)} categories")
+        _print_breakdown()
+        return buckets
+
+    def build_skill(self) -> None:
+        """Create the skill directory tree and write all generated files.
+
+        Creates ``references/``, ``scripts/`` and ``assets/`` below
+        ``self.skill_dir``, categorizes the extracted content, then writes one
+        reference file per category followed by the index and SKILL.md.
+        """
+        print(f"\n🏗️  Building skill: {self.name}")
+        for leaf in ("references", "scripts", "assets"):
+            os.makedirs(f"{self.skill_dir}/{leaf}", exist_ok=True)
+
+        categorized = self.categorize_content()
+
+        print("\n📝 Generating reference files...")
+        category_count = len(categorized)
+        position = 0
+        for cat_key in categorized:
+            position += 1
+            self._generate_reference_file(cat_key, categorized[cat_key], position, category_count)
+
+        self._generate_index(categorized)
+        self._generate_skill_md(categorized)
+
+        print(f"\n✅ Skill built successfully: {self.skill_dir}/")
+        print(f"\n📦 Next step: Package with: skill-seekers package {self.skill_dir}/")
+
+    # ------------------------------------------------------------------
+    # Private generation methods
+    # ------------------------------------------------------------------
+
+    def _ref_filename(self, cat_data: dict, section_num: int, total: int) -> str:
+        """Return the path of the reference file for one category.
+
+        Args:
+            cat_data: Category dict; only ``pages`` is read.
+            section_num: 1-based position of the category, used for the name
+                of a category without pages.
+            total: Number of categories being written.
+
+        Naming: ``section_NN.md`` for a category without pages; with pages,
+        ``<base>.md`` when there is a single category (base = source file stem
+        or ``main``), otherwise ``<base>_s<min>-s<max>.md`` (base = source file
+        stem or ``section``) using the lowest/highest section number.
+        """
+        refs_dir = f"{self.skill_dir}/references"
+        members = cat_data["pages"]
+
+        # The source stem is only used when asciidoc_path is an existing file.
+        stem = ""
+        if self.asciidoc_path:
+            candidate = Path(self.asciidoc_path)
+            if candidate.is_file():
+                stem = candidate.stem
+
+        if not members:
+            return f"{refs_dir}/section_{section_num:02d}.md"
+
+        numbers = [
+            member.get("section_number", position + 1) for position, member in enumerate(members)
+        ]
+        if total == 1:
+            return f"{refs_dir}/{stem or 'main'}.md"
+        prefix = stem or "section"
+        return f"{refs_dir}/{prefix}_s{min(numbers)}-s{max(numbers)}.md"
+
+    def _generate_reference_file(
+        self, _cat_key: str, cat_data: dict, section_num: int, total: int
+    ) -> None:
+        """Write the Markdown reference file of a single category.
+
+        The file starts with the category title; each section then contributes
+        its source marker, heading, sub-headings, converted text, fenced code
+        samples, tables and admonitions, and is closed by a horizontal rule.
+        Headings are written one level deeper than their ``hN`` tag.
+        """
+        target = self._ref_filename(cat_data, section_num, total)
+
+        def _atx(level_tag: str) -> str:
+            # "h2" → "###": one "#" more than the numeric level.
+            return "#" * (int(level_tag[1]) + 1)
+
+        def _table_line(cells: list) -> str:
+            return "| " + " | ".join(cells) + " |\n"
+
+        with open(target, "w", encoding="utf-8") as out:
+            out.write(f"# {cat_data['title']}\n\n")
+            for page in cat_data["pages"]:
+                number = page.get("section_number", "?")
+                title = page.get("heading", "")
+                level_tag = page.get("heading_level", "h1")
+
+                out.write(f"---\n\n**📄 Source: Section {number}**\n\n")
+                if title:
+                    out.write(f"{_atx(level_tag)} {title}\n\n")
+
+                for child in page.get("headings", []):
+                    child_tag = child.get("level", "h3")
+                    if child.get("text"):
+                        out.write(f"{_atx(child_tag)} {child['text']}\n\n")
+
+                if page.get("text"):
+                    out.write(f"{page['text']}\n\n")
+
+                samples = page.get("code_samples")
+                if samples:
+                    out.write("### Code Examples\n\n")
+                    for sample in samples:
+                        out.write(f"```{sample.get('language', '')}\n{sample['code']}\n```\n\n")
+
+                tables = page.get("tables")
+                if tables:
+                    out.write("### Tables\n\n")
+                    for table in tables:
+                        header_cells = table.get("headers", [])
+                        if header_cells:
+                            out.write(_table_line([str(cell) for cell in header_cells]))
+                            out.write(_table_line(["---"] * len(header_cells)))
+                        for data_row in table.get("rows", []):
+                            out.write(_table_line([str(cell) for cell in data_row]))
+                        out.write("\n")
+
+                notes = page.get("admonitions")
+                if notes:
+                    out.write("### Notes & Warnings\n\n")
+                    for note in notes:
+                        out.write(f"> **{note.get('type', 'NOTE')}:** {note.get('text', '')}\n\n")
+
+                out.write("---\n\n")
+        print(f"   Generated: {target}")
+
+    def _generate_index(self, categorized: dict) -> None:
+        """Write ``references/index.md``: category links plus statistics.
+
+        Every category is listed with a link to its reference file (the file
+        name is taken from _ref_filename(), so both always agree), its section
+        count and its section-number range. The statistics block reports the
+        totals stored in ``self.extracted_data`` and, when present, the
+        document author and date.
+        """
+        index_path = f"{self.skill_dir}/references/index.md"
+        category_count = len(categorized)
+        stat_labels = (
+            ("total_sections", "Total sections"),
+            ("total_code_blocks", "Code blocks"),
+            ("total_tables", "Tables"),
+            ("total_admonitions", "Admonitions"),
+            ("total_files", "Source files"),
+        )
+
+        with open(index_path, "w", encoding="utf-8") as out:
+            out.write(f"# {self.name.title()} Documentation Reference\n\n## Categories\n\n")
+            for position, entry in enumerate(categorized.values(), 1):
+                members = entry["pages"]
+                # Only the file name is needed: the index lives next to the files.
+                link = self._ref_filename(entry, position, category_count).rsplit("/", 1)[-1]
+                if members:
+                    numbers = [m.get("section_number", j + 1) for j, m in enumerate(members)]
+                    span = f"Sections {min(numbers)}-{max(numbers)}"
+                else:
+                    span = "N/A"
+                out.write(f"- [{entry['title']}]({link}) ({len(members)} sections, {span})\n")
+
+            out.write("\n## Statistics\n\n")
+            for stat_key, stat_label in stat_labels:
+                out.write(f"- {stat_label}: {self.extracted_data.get(stat_key, 0)}\n")
+
+            doc_meta = self.extracted_data.get("metadata", {})
+            for meta_key, meta_label in (("author", "Author"), ("date", "Date")):
+                if doc_meta.get(meta_key):
+                    out.write(f"- {meta_label}: {doc_meta[meta_key]}\n")
+        print(f"   Generated: {index_path}")
+
+    def _generate_skill_md(self, categorized: dict) -> None:
+        """Write SKILL.md: frontmatter, overview, key concepts, samples, stats, navigation."""
+        filename = f"{self.skill_dir}/SKILL.md"
+        skill_name = self.name.lower().replace("_", "-").replace(" ", "-")[:64]  # max 64 chars
+        desc = self.description[:1024]  # frontmatter copy is capped at 1024 chars
+        ed = self.extracted_data  # shorthand
+
+        with open(filename, "w", encoding="utf-8") as f:
+            f.write(f"---\nname: {skill_name}\ndescription: {desc}\n---\n\n")  # frontmatter
+            f.write(f"# {self.name.title()} Documentation Skill\n\n{self.description}\n\n")
+
+            # Document metadata (section is emitted when any metadata value is truthy)
+            meta = ed.get("metadata", {})
+            if any(v for v in meta.values() if v):
+                f.write("## 📋 Document Information\n\n")
+                for key, label in [
+                    ("title", "Title"),
+                    ("author", "Author"),
+                    ("revision", "Revision"),
+                    ("date", "Date"),
+                    ("description", "Description"),
+                ]:
+                    if meta.get(key):
+                        f.write(f"**{label}:** {meta[key]}\n\n")
+
+            f.write("## 💡 When to Use This Skill\n\nUse this skill when you need to:\n")
+            f.write(f"- Understand {self.name} concepts and fundamentals\n")
+            f.write("- Look up API references and technical specifications\n")
+            f.write("- Find code examples and implementation patterns\n")
+            f.write("- Review tutorials, guides, and best practices\n")
+            f.write("- Explore the complete documentation structure\n\n")
+
+            # Section Overview: total count plus one line per category
+            f.write(
+                f"## 📖 Section Overview\n\n**Total Sections:** {ed.get('total_sections', 0)}\n\n"
+            )
+            f.write("**Content Breakdown:**\n\n")
+            for cd in categorized.values():
+                f.write(f"- **{cd['title']}**: {len(cd['pages'])} sections\n")
+            f.write("\n")
+
+            f.write(self._format_key_concepts())
+            f.write("## ⚡ Quick Reference\n\n")
+            f.write(self._format_patterns_from_content())
+
+            # Code examples: the 15 highest quality_score samples, grouped by language
+            all_code = [c for s in ed.get("pages", []) for c in s.get("code_samples", [])]
+            all_code.sort(key=lambda x: x.get("quality_score", 0), reverse=True)
+            if all_code[:15]:
+                f.write("## 📝 Code Examples\n\n*High-quality examples from documentation*\n\n")
+                by_lang: dict[str, list] = {}
+                for c in all_code[:15]:
+                    by_lang.setdefault(c.get("language", "unknown"), []).append(c)
+                for lang in sorted(by_lang):
+                    exs = by_lang[lang]
+                    f.write(f"### {lang.title()} Examples ({len(exs)})\n\n")
+                    for i, c in enumerate(exs[:5], 1):  # show at most 5 per language
+                        ct = c.get("code", "")
+                        f.write(
+                            f"**Example {i}** (Quality: {c.get('quality_score', 0):.1f}/10):\n\n"
+                        )
+                        f.write(f"```{lang}\n{ct[:500]}{'...' if len(ct) > 500 else ''}\n```\n\n")
+
+            # Table summary: first 5 tables, at most 5 rows each (header-less tables are skipped)
+            all_tables = [
+                (s.get("heading", ""), t) for s in ed.get("pages", []) for t in s.get("tables", [])
+            ]
+            if all_tables:
+                f.write(f"## 📊 Table Summary\n\n*{len(all_tables)} table(s) found*\n\n")
+                for sh, t in all_tables[:5]:
+                    if sh:
+                        f.write(f"**From section: {sh}**\n\n")
+                    hdrs = t.get("headers", [])
+                    if hdrs:
+                        f.write("| " + " | ".join(str(h) for h in hdrs) + " |\n")
+                        f.write("| " + " | ".join("---" for _ in hdrs) + " |\n")
+                        for row in t.get("rows", [])[:5]:
+                            f.write("| " + " | ".join(str(c) for c in row) + " |\n")
+                        f.write("\n")
+
+            # Admonition summary: up to 5 per type, each cut to 120 chars
+            all_adm = [a for s in ed.get("pages", []) for a in s.get("admonitions", [])]
+            if all_adm:
+                f.write("## ⚠️ Admonition Summary\n\n")
+                by_type: dict[str, list[str]] = {}
+                for a in all_adm:
+                    by_type.setdefault(a.get("type", "NOTE"), []).append(a.get("text", ""))
+                for at in sorted(by_type):
+                    items = by_type[at]
+                    f.write(f"**{at}** ({len(items)}):\n\n")
+                    for txt in items[:5]:
+                        f.write(f"> {txt[:120]}{'...' if len(txt) > 120 else ''}\n\n")
+
+            # Statistics: counters read from the extracted data (missing ones print as 0)
+            f.write("## 📊 Documentation Statistics\n\n")
+            for key, label in [
+                ("total_sections", "Total Sections"),
+                ("total_code_blocks", "Code Blocks"),
+                ("total_tables", "Tables"),
+                ("total_admonitions", "Admonitions"),
+                ("total_files", "Source Files"),
+            ]:
+                f.write(f"- **{label}**: {ed.get(key, 0)}\n")
+            langs = ed.get("languages_detected", {})
+            if langs:
+                f.write(f"- **Programming Languages**: {len(langs)}\n\n**Language Breakdown:**\n\n")
+                for lang, count in sorted(langs.items(), key=lambda x: x[1], reverse=True):
+                    f.write(f"- {lang}: {count} examples\n")
+                f.write("\n")
+
+            # Navigation: one entry per category, named via _sanitize_filename(title)
+            f.write("## 🗺️ Navigation\n\n**Reference Files:**\n\n")
+            for cd in categorized.values():
+                cf = self._sanitize_filename(cd["title"])
+                f.write(f"- `references/{cf}.md` - {cd['title']}\n")
+            f.write("\nSee `references/index.md` for complete documentation structure.\n\n")
+            f.write("---\n\n**Generated by Skill Seeker** | AsciiDoc Scraper\n")
+
+        with open(filename, encoding="utf-8") as f:  # re-read only to report the line count
+            print(f"   Generated: {filename} ({len(f.read().splitlines())} lines)")
+
+    # ------------------------------------------------------------------
+    # Content analysis helpers
+    # ------------------------------------------------------------------
+
+    def _format_key_concepts(self) -> str:
+        """Build the "Key Concepts" markdown block from h1/h2 headings ("" if no headings)."""
+        collected: list[tuple[str, str]] = []  # (level, text) pairs in document order
+        for page in self.extracted_data.get("pages", []):
+            title = page.get("heading", "").strip()
+            if len(title) > 3:  # headings of three characters or fewer are ignored
+                collected.append((page.get("heading_level", "h1"), title))
+            for entry in page.get("headings", []):
+                text = entry.get("text", "").strip()
+                if len(text) > 3:
+                    collected.append((entry.get("level", "h3"), text))
+        if not collected:
+            return ""
+        parts = ["## 🔑 Key Concepts\n\n*Main topics covered in this documentation*\n\n"]
+        for level, label, limit in (("h1", "Major Topics", 10), ("h2", "Subtopics", 15)):
+            names = [text for lv, text in collected if lv == level][:limit]
+            if names:
+                parts.append(f"**{label}:**\n\n")
+                parts.extend(f"- {name}\n" for name in names)
+                parts.append("\n")
+        return "".join(parts)
+
+    def _format_patterns_from_content(self) -> str:
+        """Summarize sections whose heading contains a well-known documentation keyword."""
+        keywords = (
+            "getting started",
+            "installation",
+            "configuration",
+            "usage",
+            "api",
+            "examples",
+            "tutorial",
+            "guide",
+            "best practices",
+            "troubleshooting",
+            "faq",
+        )
+
+        # Bucket matching sections under the title-cased keyword; the first keyword listed wins.
+        grouped: dict[str, list[tuple[str, int]]] = {}
+        for page in self.extracted_data.get("pages", []):
+            heading = page.get("heading", "")
+            lowered = heading.lower()
+            hit = next((kw for kw in keywords if kw in lowered), None)
+            if hit is None:
+                continue
+            entry = (heading, page.get("section_number", 0))
+            grouped.setdefault(hit.title(), []).append(entry)
+
+        if not grouped:
+            return "*See reference files for detailed content*\n\n"
+
+        # Render the groups alphabetically, listing at most three sections per group.
+        chunks = ["*Common documentation patterns found:*\n\n"]
+        for kind in sorted(grouped):
+            entries = grouped[kind]
+            chunks.append(f"**{kind}** ({len(entries)} sections):\n")
+            chunks.extend(
+                f"- {title} (section {number})\n" for title, number in entries[:3]
+            )
+            chunks.append("\n")
+        return "".join(chunks)
+
+    # ------------------------------------------------------------------
+    # Utilities
+    # ------------------------------------------------------------------
+
+    @staticmethod
+    def _sanitize_filename(name: str) -> str:
+        """Slugify name: lowercase, drop punctuation, collapse spaces/hyphens into "_"."""
+        lowered = name.lower()
+        return re.sub(r"[-\s]+", "_", re.sub(r"[^\w\s-]", "", lowered))
+
+    @staticmethod
+    def _in_range(pos: int, ranges: list[tuple[int, int]]) -> bool:
+        """Return True when pos lies inside at least one half-open (start, end) range."""
+        return not all(pos < lo or hi <= pos for lo, hi in ranges)
+
+
+# ============================================================================
+# CLI entry point
+# ============================================================================
+
+
+def main() -> int:
+    """Parse CLI args and run the AsciiDoc pipeline; returns 0, or exits with status 1 on error."""
+    from skill_seekers.cli.arguments.asciidoc import add_asciidoc_arguments
+
+    parser = argparse.ArgumentParser(
+        description="Convert AsciiDoc documentation to skill",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+
+    add_asciidoc_arguments(parser)
+
+    args = parser.parse_args()
+
+    # Set logging level on the root logger (--quiet is checked before --verbose)
+    if getattr(args, "quiet", False):
+        logging.getLogger().setLevel(logging.WARNING)
+    elif getattr(args, "verbose", False):
+        logging.getLogger().setLevel(logging.DEBUG)
+
+    # Handle --dry-run: report the resolved inputs and stop before validation or any work
+    if getattr(args, "dry_run", False):
+        source = (
+            getattr(args, "asciidoc_path", None) or getattr(args, "from_json", None) or "(none)"
+        )
+        print(f"\n{'=' * 60}")
+        print("DRY RUN: AsciiDoc Extraction")
+        print(f"{'=' * 60}")
+        print(f"Source:         {source}")
+        print(f"Name:           {getattr(args, 'name', None) or '(auto-detect)'}")
+        print(f"Enhance level:  {getattr(args, 'enhance_level', 0)}")
+        print(f"\n✅ Dry run complete")
+        return 0
+
+    # Validate inputs: one of the two sources is required
+    if not (getattr(args, "asciidoc_path", None) or getattr(args, "from_json", None)):
+        parser.error("Must specify --asciidoc-path or --from-json")
+
+    # Build from JSON workflow: rebuild from saved data; no extraction or enhancement here
+    if getattr(args, "from_json", None):
+        name = Path(args.from_json).stem.replace("_extracted", "")  # default skill name
+        config = {
+            "name": getattr(args, "name", None) or name,
+            "description": getattr(args, "description", None)
+            or f"Use when referencing {name} documentation",
+        }
+        try:
+            converter = AsciiDocToSkillConverter(config)
+            converter.load_extracted_data(args.from_json)
+            converter.build_skill()
+        except Exception as e:
+            print(f"\n❌ Error: {e}", file=sys.stderr)
+            sys.exit(1)
+        return 0
+
+    # Direct AsciiDoc mode: default the name to the file stem, else the path's last component
+    if not getattr(args, "name", None):
+        p = Path(args.asciidoc_path)
+        args.name = p.stem if p.is_file() else p.name
+
+    config = {
+        "name": args.name,
+        "asciidoc_path": args.asciidoc_path,
+        "description": getattr(args, "description", None),
+    }
+
+    try:
+        converter = AsciiDocToSkillConverter(config)
+
+        # Extract (a falsy result is treated as failure)
+        if not converter.extract_asciidoc():
+            print("\n❌ AsciiDoc extraction failed - see error above", file=sys.stderr)
+            sys.exit(1)
+
+        # Build skill
+        converter.build_skill()
+
+        # Enhancement Workflow Integration
+        from skill_seekers.cli.workflow_runner import run_workflows
+
+        workflow_executed, workflow_names = run_workflows(args)
+        workflow_name = ", ".join(workflow_names) if workflow_names else None
+
+        # Traditional enhancement (complements workflow system)
+        if getattr(args, "enhance_level", 0) > 0:
+            api_key = getattr(args, "api_key", None) or os.environ.get("ANTHROPIC_API_KEY")
+            mode = "API" if api_key else "LOCAL"  # API mode only when a key is available
+
+            print("\n" + "=" * 80)
+            print(f"🤖 Traditional AI Enhancement ({mode} mode, level {args.enhance_level})")
+            print("=" * 80)
+            if workflow_executed:
+                print(f"   Running after workflow: {workflow_name}")
+                print(
+                    "   (Workflow provides specialized analysis,"
+                    " enhancement provides general improvements)"
+                )
+            print("")
+
+            skill_dir = converter.skill_dir
+            if api_key:
+                try:
+                    from skill_seekers.cli.enhance_skill import enhance_skill_md
+
+                    enhance_skill_md(skill_dir, api_key)
+                    print("✅ API enhancement complete!")
+                except ImportError:  # raised by the import above or by anything it imports
+                    print("❌ API enhancement not available. Falling back to LOCAL mode...")
+                    from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer
+
+                    enhancer = LocalSkillEnhancer(Path(skill_dir))
+                    enhancer.run(headless=True)
+                    print("✅ Local enhancement complete!")
+            else:
+                from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer
+
+                enhancer = LocalSkillEnhancer(Path(skill_dir))
+                enhancer.run(headless=True)
+                print("✅ Local enhancement complete!")
+
+    except (FileNotFoundError, ValueError, RuntimeError) as e:  # message only, no traceback
+        print(f"\n❌ Error: {e}", file=sys.stderr)
+        sys.exit(1)
+    except Exception as e:  # anything else: message plus full traceback
+        print(f"\n❌ Unexpected error during AsciiDoc processing: {e}", file=sys.stderr)
+        import traceback
+
+        traceback.print_exc()
+        sys.exit(1)
+    return 0
+
+
+if __name__ == "__main__":  # run as a script: main()'s return value becomes the exit status
+    sys.exit(main())
diff --git a/src/skill_seekers/cli/chat_scraper.py b/src/skill_seekers/cli/chat_scraper.py
new file mode 100644
index 0000000..2d60c7c
--- /dev/null
+++ b/src/skill_seekers/cli/chat_scraper.py
@@ -0,0 +1,1920 @@
+#!/usr/bin/env python3
+"""
+Slack/Discord Chat Export to Skill Converter
+
+Converts chat history from Slack and Discord into AI-ready skills.
+Supports two modes of operation per platform:
+
+**Export mode** (offline, no API key required):
+  - Slack: Parse workspace export ZIP/directory (JSON files per channel per day)
+  - Discord: Parse DiscordChatExporter JSON output
+
+**API mode** (live, requires authentication token):
+  - Slack: Fetch messages via Slack Web API (slack_sdk)
+  - Discord: Fetch messages via Discord HTTP API (discord.py or aiohttp)
+
+Extracted content includes messages, threads, reactions, code snippets,
+shared links, attachments, and user references. Messages are categorized
+by channel, date, and detected topic for structured skill output.
+
+Usage:
+    # Slack workspace export (directory or ZIP)
+    skill-seekers chat --export-path ./slack-export/ --platform slack --name myteam
+
+    # Slack API (live fetch)
+    skill-seekers chat --platform slack --token xoxb-... --channel C01234 --name myteam
+
+    # Discord export (DiscordChatExporter JSON)
+    skill-seekers chat --export-path ./discord-export.json --platform discord --name myserver
+
+    # Discord API (live fetch)
+    skill-seekers chat --platform discord --token Bot ... --channel 12345 --name myserver
+
+    # Build from previously extracted JSON
+    skill-seekers chat --from-json myteam_extracted.json --name myteam
+"""
+
+import argparse
+import json
+import logging
+import os
+import re
+import sys
+from collections import defaultdict
+from datetime import datetime, timezone
+from pathlib import Path
+
+# Optional dependency guard — Slack SDK
+try:
+    from slack_sdk import WebClient
+    from slack_sdk.errors import SlackApiError
+
+    SLACK_AVAILABLE = True
+except ImportError:
+    SLACK_AVAILABLE = False
+
+# Optional dependency guard — Discord
+try:
+    import discord  # noqa: F401
+
+    DISCORD_AVAILABLE = True
+except ImportError:
+    DISCORD_AVAILABLE = False
+
+logger = logging.getLogger(__name__)
+
+# ---------------------------------------------------------------------------
+# Constants
+# ---------------------------------------------------------------------------
+
+# Maximum messages to fetch per channel when using API mode
+DEFAULT_MAX_MESSAGES = 5000
+
+# Topic name -> keyword list for automatic content categorization (entries may be stems)
+_TOPIC_KEYWORDS: dict[str, list[str]] = {
+    "troubleshooting": [
+        "error",
+        "bug",
+        "fix",
+        "issue",
+        "broken",
+        "crash",
+        "exception",
+        "traceback",
+        "debug",
+        "failing",
+        "stacktrace",
+        "segfault",
+    ],
+    "setup": [
+        "install",
+        "setup",
+        "configure",
+        "config",
+        "environment",
+        "docker",
+        "deploy",
+        "ci/cd",
+        "pipeline",
+        "build",
+        "dependency",
+    ],
+    "architecture": [
+        "design",
+        "architecture",
+        "pattern",
+        "refactor",
+        "abstraction",
+        "interface",
+        "module",
+        "service",
+        "microservice",
+        "api",
+    ],
+    "code_review": [
+        "review",
+        "pr",
+        "pull request",
+        "merge",
+        "approve",
+        "lgtm",
+        "nit",
+        "suggestion",
+        "feedback",
+        "diff",
+    ],
+    "howto": [
+        "how to",
+        "how do",
+        "tutorial",
+        "example",
+        "guide",
+        "walkthrough",
+        "step by step",
+        "documentation",
+        "docs",
+    ],
+    "release": [
+        "release",
+        "version",
+        "changelog",
+        "migration",
+        "upgrade",
+        "breaking change",
+        "deprecat",  # stem: deprecated / deprecation
+        "v1",
+        "v2",
+        "v3",
+    ],
+    "performance": [
+        "performance",
+        "slow",
+        "fast",
+        "optimize",
+        "latency",
+        "throughput",
+        "benchmark",
+        "profil",  # stem: profile / profiler / profiling
+        "memory",
+        "cpu",
+    ],
+    "testing": [
+        "test",
+        "pytest",
+        "unittest",
+        "coverage",
+        "mock",
+        "fixture",
+        "assertion",
+        "spec",
+        "e2e",
+        "integration test",
+    ],
+}
+
+
+# ---------------------------------------------------------------------------
+# Dependency checks
+# ---------------------------------------------------------------------------
+
+
+def _check_slack_deps() -> None:
+    """Fail with install instructions (RuntimeError) when slack_sdk could not be imported."""
+    if SLACK_AVAILABLE:
+        return
+    message = "slack_sdk is required for Slack API support.\n"
+    message += 'Install with: pip install "skill-seekers[slack]"\n'
+    message += "Or: pip install slack_sdk"
+    raise RuntimeError(message)
+
+
+def _check_discord_deps() -> None:
+    """Fail with install instructions (RuntimeError) when discord.py could not be imported."""
+    if DISCORD_AVAILABLE:
+        return
+    message = "discord.py is required for Discord API support.\n"
+    message += 'Install with: pip install "skill-seekers[discord]"\n'
+    message += "Or: pip install discord.py"
+    raise RuntimeError(message)
+
+
+# ---------------------------------------------------------------------------
+# Helper: code quality scoring (consistent with other scrapers)
+# ---------------------------------------------------------------------------
+
+
+def _score_code_quality(code: str) -> float:
+    """Score a snippet on a 0-10 scale: 5.0 baseline adjusted by simple heuristics.
+
+    Args:
+        code: Source code text to score (empty input scores 0.0).
+
+    Returns:
+        Float quality score clamped to the range 0.0-10.0.
+    """
+    if not code:
+        return 0.0
+
+    score = 5.0  # neutral baseline
+    # Longer snippets are rewarded: +1.0 from 5 lines, +2.0 from 10 lines.
+    line_count = len(code.strip().split("\n"))
+
+    if line_count >= 10:
+        score += 2.0
+    elif line_count >= 5:
+        score += 1.0
+    # Structural cues. "#include" is matched without "\b", which cannot precede "#" at line start.
+    if re.search(r"\b(def |class |function |func |fn )", code):
+        score += 1.5
+    if re.search(r"\b(?:import |from .+ import|require\(|using )|#include", code):
+        score += 0.5
+    if re.search(r"^    ", code, re.MULTILINE):  # some line is indented by four spaces
+        score += 0.5
+    if re.search(r"[=:{}()\[\]]", code):  # contains code-like punctuation
+        score += 0.3
+    if len(code) < 30:  # very short fragments are penalized
+        score -= 2.0
+
+    return min(10.0, max(0.0, score))
+
+
+# ---------------------------------------------------------------------------
+# Main converter class
+# ---------------------------------------------------------------------------
+
+
+class ChatToSkillConverter:
+    """Convert Slack or Discord chat history into an AI-ready skill.
+
+    Follows the same pipeline pattern as the EPUB, Jupyter, and PPTX scrapers:
+    extract -> categorize -> build_skill (reference files + index + SKILL.md).
+
+    Supports two input modes per platform:
+    - **Export mode**: Parse a previously exported archive (Slack workspace
+      export directory/ZIP or DiscordChatExporter JSON).
+    - **API mode**: Fetch messages live from the platform's API using an
+      authentication token.
+
+    The extraction phase produces a normalized intermediate JSON containing
+    messages with text, user, timestamp, reactions, threads, attachments,
+    code snippets, and shared links. Messages are then categorized by
+    channel, date range, and detected topic.
+    """
+
+    def __init__(self, config: dict) -> None:
+        """Initialize the converter with a configuration dictionary.
+
+        Args:
+            config: Configuration dict; optional keys may be absent or None. Keys:
+                - name (str): Skill name (required).
+                - export_path (str): Path to export file/directory (optional).
+                - platform (str): "slack" or "discord" (default "slack").
+                - token (str): API authentication token (optional, API mode).
+                - channel (str): Channel ID to fetch (optional, API mode).
+                - max_messages (int): Max messages per channel (default 5000).
+                - description (str): Skill description (optional, inferred if absent).
+        """
+        self.config = config
+        self.name: str = config["name"]
+        self.export_path: str = config.get("export_path") or ""
+        self.platform: str = (config.get("platform") or "slack").lower()
+        self.token: str = config.get("token") or ""
+        self.channel: str = config.get("channel") or ""
+        # Only None selects the default, so an explicit 0 is kept as given.
+        limit = config.get("max_messages")
+        self.max_messages: int = DEFAULT_MAX_MESSAGES if limit is None else limit
+        self.description: str = (
+            config.get("description") or f"Use when referencing {self.name} chat knowledge base"
+        )
+
+        # Output paths
+        self.skill_dir: str = f"output/{self.name}"
+        self.data_file: str = f"output/{self.name}_extracted.json"
+
+        # Extracted data (populated by extract_chat or load_extracted_data)
+        self.extracted_data: dict | None = None
+
+    # ------------------------------------------------------------------
+    # Extraction — public entry point
+    # ------------------------------------------------------------------
+
+    def extract_chat(self) -> bool:
+        """Extract chat content, save it as JSON, and keep it in ``self.extracted_data``.
+
+        Dispatches to the appropriate extraction method:
+        - Export mode (export_path set; wins if a token is also set): parse local files.
+        - API mode (token set): fetch messages via platform API.
+
+        Returns:
+            True once the extracted data has been written to ``self.data_file``.
+
+        Raises:
+            ValueError: Unknown platform, or neither export_path nor token is provided.
+            RuntimeError: API mode was requested but the platform SDK is not installed.
+        """
+        if self.platform not in ("slack", "discord"):
+            raise ValueError(
+                f"Unsupported platform: '{self.platform}'. Supported platforms: 'slack', 'discord'"
+            )
+
+        # Determine mode (export_path is checked first, then token)
+        if self.export_path:
+            print(f"\n🔍 Extracting {self.platform} chat from export: {self.export_path}")
+            if self.platform == "slack":
+                messages = self._extract_slack_export()
+            else:
+                messages = self._extract_discord_export()
+        elif self.token:
+            print(f"\n🔍 Fetching {self.platform} chat via API...")
+            if self.platform == "slack":
+                _check_slack_deps()
+                messages = self._extract_slack_api()
+            else:
+                _check_discord_deps()
+                messages = self._extract_discord_api()
+        else:
+            raise ValueError(
+                "Must provide either --export-path (export mode) "
+                "or --token (API mode) for chat extraction."
+            )
+
+        if not messages:  # warn only; the (empty) result is still processed and saved below
+            logger.warning("No messages extracted from %s source", self.platform)
+            print("   ⚠️  No messages were extracted.")
+
+        # Identify threads and extract enrichment
+        threads = self._identify_threads(messages)
+        code_snippets = self._extract_code_snippets(messages)
+        links = self._extract_links(messages)
+        channel_summaries = self._summarize_channels(messages)
+
+        # Group messages into sections by channel
+        sections = self._build_sections(messages, threads)
+
+        # Compute statistics
+        total_messages = len(messages)
+        total_threads = len(threads)
+        total_code_snippets = len(code_snippets)
+        total_links = len(links)
+        unique_users = len({m.get("user", "unknown") for m in messages})
+        channels_found = list(channel_summaries.keys())
+
+        result_data = {
+            "source": self.export_path or f"{self.platform}-api",
+            "platform": self.platform,
+            "metadata": {
+                "total_messages": total_messages,
+                "total_threads": total_threads,
+                "total_code_snippets": total_code_snippets,
+                "total_links": total_links,
+                "unique_users": unique_users,
+                "channels": channels_found,
+            },
+            "total_sections": len(sections),
+            "total_code_blocks": total_code_snippets,  # same value as total_code_snippets
+            "channel_summaries": channel_summaries,
+            "code_snippets": code_snippets[:100],  # Only the first 100 are saved (JSON size)
+            "links": links[:200],  # only the first 200 are saved
+            "pages": sections,
+        }
+
+        # Save extracted data (the parent directory is created if missing)
+        os.makedirs(os.path.dirname(self.data_file) or ".", exist_ok=True)
+        with open(self.data_file, "w", encoding="utf-8") as f:
+            json.dump(result_data, f, indent=2, ensure_ascii=False, default=str)
+
+        print(f"\n💾 Saved extracted data to: {self.data_file}")
+        self.extracted_data = result_data
+        print(
+            f"✅ Extracted {total_messages} messages across "
+            f"{len(channels_found)} channel(s), "
+            f"{total_threads} threads, "
+            f"{total_code_snippets} code snippets"
+        )
+        return True
+
+    # ------------------------------------------------------------------
+    # Load previously extracted data
+    # ------------------------------------------------------------------
+
+    def load_extracted_data(self, json_path: str) -> bool:
+        """Read a saved extraction JSON file into ``self.extracted_data``.
+
+        Args:
+            json_path: Location of the previously extracted JSON file.
+
+        Returns:
+            Always True; file and JSON errors propagate as exceptions.
+        """
+        print(f"\n📂 Loading extracted data from: {json_path}")
+        data = json.loads(Path(json_path).read_text(encoding="utf-8"))
+        self.extracted_data = data
+        section_count = data.get("total_sections", len(data.get("pages", [])))
+        print(f"✅ Loaded {section_count} sections")
+        return True
+
+    # ------------------------------------------------------------------
+    # Categorization
+    # ------------------------------------------------------------------
+
+    def categorize_content(self) -> dict[str, dict]:
+        """Categorize sections by channel or, for a single channel, by detected topic.
+
+        With several channels each channel becomes one category. With one
+        channel, each section goes to the ``_TOPIC_KEYWORDS`` topic with the
+        most keyword hits (at least 2); all other sections go to "general".
+
+        Returns:
+            Dict mapping category keys to dicts with 'title' and 'pages'.
+        """
+        print("\n📋 Categorizing content...")
+
+        categorized: dict[str, dict] = {}
+        sections = self.extracted_data.get("pages", [])
+
+        if not sections:
+            categorized["content"] = {"title": "Chat Content", "pages": []}
+            print("✅ Created 0 categories (no content)")
+            return categorized
+
+        # Group sections by channel name
+        by_channel: dict[str, list[dict]] = defaultdict(list)
+        for section in sections:
+            channel = section.get("channel", "general")
+            by_channel[channel].append(section)
+
+        def keyword_hit(keyword: str, text: str) -> bool:
+            # Plain substring tests made "pr" fire on "process" and "nit" on "unit".
+            # Anchor at a word start; tokens of <= 2 chars must be whole words, longer
+            # ones stay prefix matches so stems like "deprecat" keep working.
+            tail = r"\b" if len(keyword) <= 2 else ""
+            return re.search(rf"\b{re.escape(keyword.lower())}{tail}", text) is not None
+
+        if len(by_channel) <= 1:
+            # Single channel — categorize by topic instead
+            topic_buckets: dict[str, list[dict]] = defaultdict(list)
+            uncategorized: list[dict] = []
+
+            for section in sections:
+                combined = self._section_text(section)
+                matched_topic = ""
+                best_score = 0
+                for topic, keywords in _TOPIC_KEYWORDS.items():
+                    score = sum(1 for kw in keywords if keyword_hit(kw, combined))
+                    if score > best_score:  # strict ">" keeps the first topic on ties
+                        best_score = score
+                        matched_topic = topic
+                if matched_topic and best_score >= 2:
+                    topic_buckets[matched_topic].append(section)
+                else:
+                    uncategorized.append(section)
+
+            # Emit topic categories alphabetically, then the catch-all bucket.
+            for topic, pages in sorted(topic_buckets.items()):
+                title = topic.replace("_", " ").title()
+                categorized[topic] = {"title": title, "pages": pages}
+            if uncategorized:
+                categorized["general"] = {"title": "General Discussion", "pages": uncategorized}
+        else:
+            # Multiple channels — use channel names as categories
+            for channel, channel_sections in sorted(by_channel.items()):
+                cat_key = self._sanitize_filename(channel)
+                categorized[cat_key] = {"title": f"#{channel}", "pages": channel_sections}
+
+        # Safety net: both branches above always add a category when sections exist.
+        if not categorized:
+            categorized["content"] = {"title": "Chat Content", "pages": sections}
+
+        print(f"✅ Created {len(categorized)} categories")
+        for cat_data in categorized.values():
+            print(f"   - {cat_data['title']}: {len(cat_data['pages'])} sections")
+
+        return categorized
+
+    # ------------------------------------------------------------------
+    # Build skill
+    # ------------------------------------------------------------------
+
+    def build_skill(self) -> None:
+        """Assemble the skill directory on disk from the extracted data.
+
+        Layout produced under ``self.skill_dir``:
+        - references/ — one markdown file per category
+        - references/index.md — category index with statistics
+        - SKILL.md — main skill file with frontmatter and overview
+        - scripts/ — reserved for future use
+        - assets/ — reserved for future use
+        """
+        print(f"\n🏗️  Building skill: {self.name}")
+
+        # The three top-level folders must exist before any file is written.
+        for subdir in ("references", "scripts", "assets"):
+            os.makedirs(f"{self.skill_dir}/{subdir}", exist_ok=True)
+
+        categories = self.categorize_content()
+
+        print("\n📝 Generating reference files...")
+        category_count = len(categories)
+        position = 0
+        for key, data in categories.items():
+            # Reference files are numbered from 1 in category order.
+            position += 1
+            self._generate_reference_file(key, data, position, category_count)
+
+        self._generate_index(categories)
+        self._generate_skill_md(categories)
+
+        print(f"\n✅ Skill built successfully: {self.skill_dir}/")
+        print(f"\n📦 Next step: Package with: skill-seekers package {self.skill_dir}/")
+
+    # ------------------------------------------------------------------
+    # Slack export extraction
+    # ------------------------------------------------------------------
+
+    def _extract_slack_export(self) -> list[dict]:
+        """Parse a Slack workspace export (directory or ``.zip`` archive).
+
+        Slack exports contain one directory per channel, each with JSON
+        files named by date (e.g., ``2024-01-15.json``). Each JSON file
+        is a list of message objects. Files that cannot be read or parsed
+        are logged and skipped; files whose top level is not a list and
+        list entries that are not objects are skipped as well, so a single
+        malformed record never aborts the whole extraction.
+
+        Returns:
+            List of normalized message dicts.
+
+        Raises:
+            FileNotFoundError: If export_path does not exist.
+            ValueError: If the path structure is not a valid Slack export.
+        """
+        export_path = Path(self.export_path)
+        if not export_path.exists():
+            raise FileNotFoundError(f"Slack export path not found: {self.export_path}")
+
+        # Handle ZIP archives (suffix compared case-insensitively: ".ZIP" is valid too)
+        if export_path.is_file() and export_path.suffix.lower() == ".zip":
+            export_path = self._unzip_export(export_path)
+
+        if not export_path.is_dir():
+            raise ValueError(
+                f"Expected a directory for Slack export, got: {self.export_path}\n"
+                "Slack workspace exports are directories containing channel "
+                "subdirectories with daily JSON files."
+            )
+
+        messages: list[dict] = []
+        # Every non-hidden subdirectory of the export root is treated as a channel.
+        channel_dirs = sorted(
+            d for d in export_path.iterdir() if d.is_dir() and not d.name.startswith(".")
+        )
+
+        if not channel_dirs:
+            raise ValueError(
+                f"No channel directories found in Slack export: {self.export_path}\n"
+                "Expected subdirectories named after channels (e.g., general/, random/)."
+            )
+
+        # Load users.json if available (for display name resolution)
+        users_map = self._load_slack_users(export_path)
+
+        for channel_dir in channel_dirs:
+            channel_name = channel_dir.name
+            json_files = sorted(channel_dir.glob("*.json"))
+
+            for json_file in json_files:
+                try:
+                    with open(json_file, encoding="utf-8") as f:
+                        day_messages = json.load(f)
+                except (json.JSONDecodeError, OSError) as e:
+                    logger.warning("Failed to parse %s: %s", json_file, e)
+                    continue
+
+                if not isinstance(day_messages, list):
+                    continue
+
+                for raw_msg in day_messages:
+                    # The parser calls dict methods on each entry; skip nulls,
+                    # strings, etc. instead of crashing on them.
+                    if not isinstance(raw_msg, dict):
+                        continue
+                    parsed = self._parse_slack_message(raw_msg, channel_name, users_map)
+                    if parsed:
+                        messages.append(parsed)
+
+            print(f"   📁 #{channel_name}: {len(json_files)} day file(s)")
+
+        print(f"   Total messages parsed: {len(messages)}")
+        return messages
+
+    def _load_slack_users(self, export_dir: Path) -> dict[str, str]:
+        """Load user ID -> display name mapping from users.json.
+
+        Loading is best-effort: a missing, unreadable, or malformed
+        ``users.json`` yields an empty mapping rather than an error, and
+        individual entries that are not objects or have no ``id`` are
+        skipped. The display name is the first non-empty value among
+        ``profile.display_name``, ``profile.real_name``, ``real_name`` and
+        ``name``; when all are empty the user ID itself is used.
+
+        Args:
+            export_dir: Root directory of the Slack export.
+
+        Returns:
+            Dict mapping user IDs to display names.
+        """
+        users_file = export_dir / "users.json"
+        if not users_file.exists():
+            return {}
+
+        try:
+            with open(users_file, encoding="utf-8") as f:
+                users_list = json.load(f)
+        except (ValueError, OSError) as e:
+            # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
+            logger.warning("Failed to load %s: %s", users_file, e)
+            return {}
+
+        if not isinstance(users_list, list):
+            return {}
+
+        users_map: dict[str, str] = {}
+        for user in users_list:
+            if not isinstance(user, dict):
+                continue
+            uid = user.get("id", "")
+            if not uid:
+                continue
+            # "profile" may be absent or explicitly null in the JSON.
+            profile = user.get("profile")
+            if not isinstance(profile, dict):
+                profile = {}
+            users_map[uid] = (
+                profile.get("display_name")
+                or profile.get("real_name")
+                or user.get("real_name")
+                or user.get("name")
+                or uid
+            )
+
+        return users_map
+
+    def _unzip_export(self, zip_path: Path) -> Path:
+        """Extract a ZIP export into a sibling directory named after the archive.
+
+        The target is ``<zip parent>/<zip stem>``. If that path already
+        exists it is reused without re-extracting. If extraction fails, the
+        partially written directory is removed so a later run does not
+        mistake it for a complete extraction.
+
+        Args:
+            zip_path: Path to the ZIP archive.
+
+        Returns:
+            Path to the extracted directory.
+
+        Raises:
+            zipfile.BadZipFile: If the archive is not a valid ZIP file.
+        """
+        import shutil
+        import zipfile
+
+        extract_dir = zip_path.parent / zip_path.stem
+        if extract_dir.exists():
+            print(f"   Using existing extracted directory: {extract_dir}")
+            return extract_dir
+
+        print(f"   Extracting ZIP: {zip_path} -> {extract_dir}")
+        try:
+            with zipfile.ZipFile(zip_path, "r") as zf:
+                zf.extractall(extract_dir)
+        except BaseException:
+            # Also covers Ctrl-C: never leave a half-extracted directory behind,
+            # because the "already exists" shortcut above would reuse it.
+            shutil.rmtree(extract_dir, ignore_errors=True)
+            raise
+
+        return extract_dir
+
+    # ------------------------------------------------------------------
+    # Slack API extraction
+    # ------------------------------------------------------------------
+
+    def _extract_slack_api(self) -> list[dict]:
+        """Fetch messages from Slack via the Web API using slack_sdk.
+
+        Requires ``self.token`` to be set to a valid Slack Bot or User
+        token. If ``self.channel`` is set, only that channel is fetched
+        (its value is used both as the channel ID and as the display
+        name); otherwise all accessible public and private channels are
+        listed, following the API's cursor pagination, and iterated.
+
+        Returns:
+            List of normalized message dicts.
+
+        Raises:
+            RuntimeError: If the API call fails.
+        """
+        client = WebClient(token=self.token)
+        messages: list[dict] = []
+
+        try:
+            # Determine channels to fetch
+            if self.channel:
+                channel_ids = [self.channel]
+                channel_names = {self.channel: self.channel}
+            else:
+                # List all accessible channels. conversations.list returns at
+                # most `limit` channels per call, so keep following next_cursor
+                # until it comes back empty.
+                channels: list[dict] = []
+                list_cursor = None
+                while True:
+                    list_kwargs: dict = {
+                        "types": "public_channel,private_channel",
+                        "limit": 200,
+                    }
+                    if list_cursor:
+                        list_kwargs["cursor"] = list_cursor
+                    result = client.conversations_list(**list_kwargs)
+                    channels.extend(result.get("channels", []))
+                    list_cursor = (result.get("response_metadata") or {}).get("next_cursor")
+                    if not list_cursor:
+                        break
+
+                channel_ids = [ch["id"] for ch in channels]
+                channel_names = {ch["id"]: ch.get("name", ch["id"]) for ch in channels}
+                print(f"   Found {len(channel_ids)} channel(s)")
+
+            for ch_id in channel_ids:
+                ch_name = channel_names.get(ch_id, ch_id)
+                ch_messages = self._fetch_slack_channel_messages(client, ch_id, ch_name)
+                messages.extend(ch_messages)
+                print(f"   📡 #{ch_name}: {len(ch_messages)} messages")
+
+        except SlackApiError as e:
+            raise RuntimeError(
+                f"Slack API error: {e.response['error']}\n"
+                "Check your token permissions (channels:history, channels:read)."
+            ) from e
+
+        print(f"   Total messages fetched: {len(messages)}")
+        return messages
+
+    def _fetch_slack_channel_messages(
+        self, client: "WebClient", channel_id: str, channel_name: str
+    ) -> list[dict]:
+        """Page through one channel's history and normalize each message.
+
+        Requests are issued in pages of at most 200 messages until either
+        ``self.max_messages`` raw messages have been received, a page comes
+        back empty, or the response carries no ``next_cursor``.
+
+        Args:
+            client: Authenticated slack_sdk WebClient.
+            channel_id: Slack channel ID.
+            channel_name: Human-readable channel name.
+
+        Returns:
+            List of normalized message dicts.
+        """
+        collected: list[dict] = []
+        next_cursor = None
+        received = 0  # raw messages seen so far, including ones the parser drops
+
+        while received < self.max_messages:
+            request: dict = {
+                "channel": channel_id,
+                "limit": min(200, self.max_messages - received),
+            }
+            if next_cursor:
+                request["cursor"] = next_cursor
+
+            response = client.conversations_history(**request)
+            page = response.get("messages", [])
+            if not page:
+                break
+
+            # No user map is available here, so an empty one is passed.
+            normalized = (self._parse_slack_message(item, channel_name, {}) for item in page)
+            collected.extend(entry for entry in normalized if entry)
+
+            received += len(page)
+
+            # An absent or empty cursor means this was the last page.
+            next_cursor = response.get("response_metadata", {}).get("next_cursor")
+            if not next_cursor:
+                break
+
+        return collected
+
+    # ------------------------------------------------------------------
+    # Discord export extraction
+    # ------------------------------------------------------------------
+
+    def _extract_discord_export(self) -> list[dict]:
+        """Parse a Discord chat export in DiscordChatExporter JSON format.
+
+        DiscordChatExporter produces a single JSON file per channel with
+        a ``messages`` array. Each message object has ``id``, ``content``,
+        ``author``, ``timestamp``, ``attachments``, ``reactions``, etc.
+        A bare top-level list of messages is accepted too. ``export_path``
+        may be one JSON file or a directory of ``*.json`` files.
+
+        Files that cannot be read or parsed, or whose structure is not
+        recognised, are logged and skipped; message entries that are not
+        objects are skipped.
+
+        Returns:
+            List of normalized message dicts.
+
+        Raises:
+            FileNotFoundError: If export_path does not exist.
+            ValueError: If the path is neither a file nor a directory, or
+                the directory contains no JSON files.
+        """
+        export_path = Path(self.export_path)
+        if not export_path.exists():
+            raise FileNotFoundError(f"Discord export path not found: {self.export_path}")
+
+        # Support single file or directory of JSON files
+        json_files: list[Path] = []
+        if export_path.is_file():
+            json_files = [export_path]
+        elif export_path.is_dir():
+            json_files = sorted(export_path.glob("*.json"))
+        else:
+            raise ValueError(f"Invalid export path: {self.export_path}")
+
+        if not json_files:
+            raise ValueError(f"No JSON files found in Discord export: {self.export_path}")
+
+        messages: list[dict] = []
+
+        for json_file in json_files:
+            try:
+                with open(json_file, encoding="utf-8") as f:
+                    export_data = json.load(f)
+            except (json.JSONDecodeError, OSError) as e:
+                logger.warning("Failed to parse %s: %s", json_file, e)
+                continue
+
+            # DiscordChatExporter format: top-level object with "messages" key
+            if isinstance(export_data, dict):
+                channel_info = export_data.get("channel", {})
+                # Fall back to the file name when the channel object or its
+                # name is missing, null, or empty.
+                channel_name = (
+                    channel_info.get("name") if isinstance(channel_info, dict) else None
+                ) or json_file.stem
+                raw_messages = export_data.get("messages", [])
+            elif isinstance(export_data, list):
+                # Some exporters produce a bare list of messages
+                channel_name = json_file.stem
+                raw_messages = export_data
+            else:
+                logger.warning("Unexpected JSON structure in %s", json_file)
+                continue
+
+            # "messages" may be null or a non-list in a damaged export.
+            if not isinstance(raw_messages, list):
+                logger.warning("Unexpected 'messages' value in %s", json_file)
+                continue
+
+            for raw_msg in raw_messages:
+                # The parser calls dict methods on each entry; skip anything else.
+                if not isinstance(raw_msg, dict):
+                    continue
+                parsed = self._parse_discord_message(raw_msg, channel_name)
+                if parsed:
+                    messages.append(parsed)
+
+            print(f"   📁 #{channel_name}: {len(raw_messages)} messages")
+
+        print(f"   Total messages parsed: {len(messages)}")
+        return messages
+
+    # ------------------------------------------------------------------
+    # Discord API extraction
+    # ------------------------------------------------------------------
+
+    def _extract_discord_api(self) -> list[dict]:
+        """Fetch messages from Discord via the HTTP API.
+
+        Uses aiohttp directly (not the discord.py gateway client) to
+        fetch channel history. Requires a Bot token and channel ID.
+        History is paged backwards with the ``before`` parameter, at most
+        100 messages per request, until ``self.max_messages`` raw messages
+        have been received or an empty page is returned.
+
+        HTTP 429 (rate limited) responses are retried after the delay given
+        in the ``Retry-After`` header, up to a fixed number of consecutive
+        attempts. Any other non-200 response while paging is logged and
+        ends the fetch with the messages collected so far.
+
+        Returns:
+            List of normalized message dicts.
+
+        Raises:
+            RuntimeError: If aiohttp is not installed or the initial
+                channel lookup fails.
+            ValueError: If no channel ID is provided.
+        """
+        if not self.channel:
+            raise ValueError(
+                "Discord API mode requires --channel (channel ID). "
+                "Find channel IDs in Discord Developer Mode."
+            )
+
+        import asyncio
+
+        try:
+            import aiohttp
+        except ImportError:
+            raise RuntimeError(
+                "aiohttp is required for Discord API mode.\nInstall with: pip install aiohttp"
+            ) from None
+
+        async def _fetch() -> list[dict]:
+            messages: list[dict] = []
+            base_url = "https://discord.com/api/v10"
+            headers = {"Authorization": f"Bot {self.token}"}
+            max_rate_limit_retries = 5
+
+            # Get channel info
+            async with aiohttp.ClientSession() as session:
+                async with session.get(
+                    f"{base_url}/channels/{self.channel}", headers=headers
+                ) as resp:
+                    if resp.status != 200:
+                        body = await resp.text()
+                        raise RuntimeError(
+                            f"Discord API error (HTTP {resp.status}): {body}\n"
+                            "Check your Bot token and channel ID."
+                        )
+                    channel_info = await resp.json()
+                    channel_name = channel_info.get("name", self.channel)
+
+                # Fetch messages with pagination (before= cursor)
+                before: str | None = None
+                fetched = 0
+                rate_limit_retries = 0
+
+                while fetched < self.max_messages:
+                    params: dict[str, str | int] = {"limit": min(100, self.max_messages - fetched)}
+                    if before:
+                        params["before"] = before
+
+                    retry_delay: float | None = None
+                    async with session.get(
+                        f"{base_url}/channels/{self.channel}/messages",
+                        headers=headers,
+                        params=params,
+                    ) as resp:
+                        if resp.status == 429 and rate_limit_retries < max_rate_limit_retries:
+                            # Rate limited: wait as instructed and retry the same
+                            # page rather than returning a truncated history.
+                            try:
+                                retry_delay = max(0.0, float(resp.headers.get("Retry-After", "1")))
+                            except (TypeError, ValueError):
+                                retry_delay = 1.0
+                        elif resp.status != 200:
+                            body = await resp.text()
+                            logger.warning("Discord API error fetching messages: %s", body)
+                            break
+                        else:
+                            batch = await resp.json()
+
+                    if retry_delay is not None:
+                        # Sleep after the response has been released.
+                        rate_limit_retries += 1
+                        logger.warning(
+                            "Discord rate limit hit; retrying in %.2fs (attempt %d/%d)",
+                            retry_delay,
+                            rate_limit_retries,
+                            max_rate_limit_retries,
+                        )
+                        await asyncio.sleep(retry_delay)
+                        continue
+                    rate_limit_retries = 0
+
+                    if not batch:
+                        break
+
+                    for raw_msg in batch:
+                        parsed = self._parse_discord_message(raw_msg, channel_name)
+                        if parsed:
+                            messages.append(parsed)
+
+                    fetched += len(batch)
+                    # The last message of the page becomes the next "before" cursor.
+                    before = batch[-1]["id"]
+
+            print(f"   📡 #{channel_name}: {len(messages)} messages")
+            return messages
+
+        loop = asyncio.new_event_loop()
+        try:
+            return loop.run_until_complete(_fetch())
+        finally:
+            loop.close()
+
+    # ------------------------------------------------------------------
+    # Message parsing
+    # ------------------------------------------------------------------
+
+    def _parse_slack_message(
+        self, raw: dict, channel: str, users_map: dict[str, str]
+    ) -> dict | None:
+        """Parse a single Slack message into normalized format.
+
+        Handles regular messages, bot messages, and subtypes like
+        ``file_share``. System subtypes (join/leave/topic/purpose/name/
+        archive) are skipped, as are messages with no text, files, or
+        attachments.
+
+        Slack markup in the text is decoded: user mentions (``<@U…>``,
+        ``<@W…>``, and the labelled ``<@U…|name>`` form) become ``@name``,
+        links become ``label (url)`` or the bare URL, and the three
+        entities Slack escapes (``&amp;``, ``&lt;``, ``&gt;``) are restored.
+
+        Args:
+            raw: Raw Slack message dict from export or API.
+            channel: Channel name this message belongs to.
+            users_map: User ID -> display name mapping.
+
+        Returns:
+            Normalized message dict, or None if the message should be skipped.
+        """
+        # Skip system messages
+        subtype = raw.get("subtype", "")
+        skip_subtypes = {
+            "channel_join",
+            "channel_leave",
+            "channel_topic",
+            "channel_purpose",
+            "channel_name",
+            "channel_archive",
+            "channel_unarchive",
+            "group_join",
+            "group_leave",
+        }
+        if subtype in skip_subtypes:
+            return None
+
+        # "text" can be present but null; treat that like an empty string.
+        text = (raw.get("text") or "").strip()
+        if not text and not raw.get("files") and not raw.get("attachments"):
+            return None
+
+        # Resolve user
+        user_id = raw.get("user", raw.get("bot_id", "unknown"))
+        user_name = users_map.get(user_id, user_id)
+        if raw.get("username"):
+            # An explicit username on the message takes precedence.
+            user_name = raw["username"]
+
+        # Parse timestamp
+        ts = raw.get("ts", "0")
+        try:
+            timestamp = datetime.fromtimestamp(float(ts), tz=timezone.utc).isoformat()
+        except (ValueError, TypeError, OSError, OverflowError):
+            # Keep the raw value when it is not a usable epoch number.
+            timestamp = ts
+
+        # Resolve user mentions in text: <@U12345> or <@U12345|label> -> @username
+        def _resolve_mention(match: re.Match) -> str:
+            uid = match.group(1)
+            label = match.group(2)
+            return f"@{users_map.get(uid) or label or uid}"
+
+        text = re.sub(r"<@([UW][A-Z0-9]+)(?:\|([^>]*))?>", _resolve_mention, text)
+
+        # Decode Slack link format: <url|label> -> label (url)
+        text = re.sub(r"<(https?://[^|>]+)\|([^>]+)>", r"\2 (\1)", text)
+        text = re.sub(r"<(https?://[^>]+)>", r"\1", text)
+
+        # Restore the characters Slack escapes in message text. "&amp;" goes
+        # last so "&amp;lt;" decodes to the literal "&lt;" rather than "<".
+        text = text.replace("&lt;", "<").replace("&gt;", ">").replace("&amp;", "&")
+
+        # Reactions
+        reactions = [
+            {
+                "emoji": reaction.get("name", ""),
+                "count": reaction.get("count", 0),
+            }
+            for reaction in raw.get("reactions") or []
+        ]
+
+        # Attachments / files
+        attachments = []
+        for f in raw.get("files") or []:
+            attachments.append(
+                {
+                    "name": f.get("name", f.get("title", "unnamed")),
+                    "type": f.get("mimetype", f.get("filetype", "")),
+                    "url": f.get("url_private", f.get("permalink", "")),
+                }
+            )
+        for att in raw.get("attachments") or []:
+            attachments.append(
+                {
+                    "name": att.get("title", att.get("fallback", "attachment")),
+                    "type": "link",
+                    "url": att.get("from_url", att.get("title_link", "")),
+                    "text": att.get("text", ""),
+                }
+            )
+
+        # Thread info: a parent is a message whose thread_ts equals its own ts
+        # and that reports at least one reply.
+        thread_ts = raw.get("thread_ts")
+        raw_reply_count = raw.get("reply_count") or 0
+        is_thread_parent = thread_ts == ts and raw_reply_count > 0
+        reply_count = raw_reply_count if is_thread_parent else 0
+
+        return {
+            "platform": "slack",
+            "channel": channel,
+            "user": user_name,
+            "user_id": user_id,
+            "text": text,
+            "timestamp": timestamp,
+            "ts": ts,
+            "thread_ts": thread_ts,
+            "is_thread_parent": is_thread_parent,
+            "reply_count": reply_count,
+            "reactions": reactions,
+            "attachments": attachments,
+            "subtype": subtype,
+        }
+
+    def _parse_discord_message(self, raw: dict, channel: str) -> dict | None:
+        """Parse a single Discord message into normalized format.
+
+        Handles regular messages, embeds, and attachments. System messages
+        (type != 0 and type != 19, or a type name other than ``Default`` /
+        ``Reply``) are skipped, as are messages with no content, embeds, or
+        attachments.
+
+        Both key styles are understood: DiscordChatExporter's camelCase
+        (``reference.messageId``, ``fileName``) and the HTTP API's
+        snake_case (``message_reference.message_id``, ``filename``).
+
+        Args:
+            raw: Raw Discord message dict from export or API.
+            channel: Channel name this message belongs to.
+
+        Returns:
+            Normalized message dict, or None if the message should be skipped.
+            ``thread_ts`` holds the ID of the message being replied to, or
+            None when the message is not a reply.
+        """
+        # Skip system messages (type 0 = DEFAULT, 19 = REPLY)
+        msg_type = raw.get("type", 0)
+        if isinstance(msg_type, int) and msg_type not in (0, 19):
+            return None
+        # DiscordChatExporter uses string type names
+        if isinstance(msg_type, str) and msg_type not in ("Default", "Reply"):
+            return None
+
+        # "content" can be present but null; treat that like an empty string.
+        content = (raw.get("content") or "").strip()
+
+        # Extract author info
+        author = raw.get("author") or {}
+        if isinstance(author, dict):
+            user_name = (
+                author.get("nickname")
+                or author.get("name")
+                or author.get("global_name")
+                or author.get("username")
+                or "unknown"
+            )
+            user_id = str(author.get("id", "unknown"))
+        else:
+            user_name = str(author)
+            user_id = str(author)
+
+        # Parse timestamp
+        raw_ts = raw.get("timestamp", "")
+        try:
+            if isinstance(raw_ts, str) and raw_ts:
+                # ISO 8601 format from Discord API
+                dt = datetime.fromisoformat(raw_ts.replace("Z", "+00:00"))
+                timestamp = dt.isoformat()
+            else:
+                timestamp = str(raw_ts)
+        except (ValueError, TypeError):
+            # Keep the raw string when it is not parseable as ISO 8601.
+            timestamp = str(raw_ts)
+
+        # Skip empty messages with no content and no attachments
+        embeds = raw.get("embeds") or []
+        attachments_raw = raw.get("attachments") or []
+        if not content and not embeds and not attachments_raw:
+            return None
+
+        # Reactions
+        reactions = []
+        for reaction in raw.get("reactions") or []:
+            emoji_data = reaction.get("emoji", {})
+            if isinstance(emoji_data, dict):
+                emoji_name = emoji_data.get("name", "")
+            else:
+                emoji_name = str(emoji_data)
+            reactions.append(
+                {
+                    "emoji": emoji_name,
+                    "count": reaction.get("count", 0),
+                }
+            )
+
+        # Attachments
+        attachments = []
+        for att in attachments_raw:
+            attachments.append(
+                {
+                    "name": att.get("fileName", att.get("filename", "unnamed")),
+                    "type": att.get("contentType", att.get("content_type", "")),
+                    "url": att.get("url", ""),
+                }
+            )
+
+        # Embeds as additional content
+        embed_texts: list[str] = []
+        for embed in embeds:
+            # Null titles/descriptions must not render as the text "None".
+            title = embed.get("title") or ""
+            desc = embed.get("description") or ""
+            if title or desc:
+                embed_texts.append(f"[Embed: {title}] {desc}".strip())
+
+        if embed_texts:
+            embed_block = "\n".join(embed_texts)
+            content = f"{content}\n{embed_block}" if content else embed_block
+
+        # Thread / reply info. The export uses "reference"/"messageReference"
+        # with "messageId"; the HTTP API uses "message_reference" with "message_id".
+        reference = (
+            raw.get("reference") or raw.get("messageReference") or raw.get("message_reference")
+        )
+        thread_ts = None
+        if isinstance(reference, dict):
+            ref_id = reference.get("messageId") or reference.get("message_id")
+            thread_ts = str(ref_id) if ref_id else None
+
+        msg_id = str(raw.get("id", ""))
+
+        return {
+            "platform": "discord",
+            "channel": channel,
+            "user": user_name,
+            "user_id": user_id,
+            "text": content,
+            "timestamp": timestamp,
+            "ts": msg_id,
+            "thread_ts": thread_ts,
+            "is_thread_parent": False,  # Not known from a single message
+            "reply_count": 0,
+            "reactions": reactions,
+            "attachments": attachments,
+            "subtype": "",
+        }
+
+    # ------------------------------------------------------------------
+    # Content enrichment
+    # ------------------------------------------------------------------
+
+    def _extract_code_snippets(self, messages: list[dict]) -> list[dict]:
+        """Collect triple-backtick fenced code blocks from every message.
+
+        A block is recognised when the opening fence is followed by an
+        optional word-character language tag and a newline; everything up
+        to the next closing fence is the code. Blocks that are empty after
+        stripping whitespace are ignored.
+
+        Args:
+            messages: List of normalized message dicts.
+
+        Returns:
+            List of code snippet dicts with 'code', 'language',
+            'quality_score', 'channel', 'user', and 'timestamp', ordered
+            by 'quality_score' from highest to lowest.
+        """
+        fence_re = re.compile(r"```(\w*)\n(.*?)```", re.DOTALL)
+        found: list[dict] = []
+
+        for message in messages:
+            for fence in fence_re.finditer(message.get("text", "")):
+                body = fence.group(2).strip()
+                if not body:
+                    continue
+                found.append(
+                    {
+                        "code": body,
+                        "language": fence.group(1) or "",
+                        "quality_score": _score_code_quality(body),
+                        "channel": message.get("channel", ""),
+                        "user": message.get("user", ""),
+                        "timestamp": message.get("timestamp", ""),
+                    }
+                )
+
+        # Best-scoring snippets first; equal scores keep their discovery order.
+        return sorted(found, key=lambda s: s.get("quality_score", 0), reverse=True)
+
+    def _extract_links(self, messages: list[dict]) -> list[dict]:
+        """Gather the distinct HTTP/HTTPS URLs mentioned in the messages.
+
+        Trailing punctuation is trimmed from each match, and only the first
+        occurrence of a URL is kept.
+
+        Args:
+            messages: List of normalized message dicts.
+
+        Returns:
+            List of link dicts with 'url', 'channel', 'user', 'timestamp',
+            and 'context' (the message text from 40 characters before the
+            match to 40 characters after it, stripped).
+        """
+        url_re = re.compile(r"https?://[^\s<>\"')\]]+")
+        collected: list[dict] = []
+        already_seen: set[str] = set()
+
+        for message in messages:
+            body = message.get("text", "")
+            for hit in url_re.finditer(body):
+                url = hit.group(0).rstrip(".,;:!?)")
+                if url not in already_seen:
+                    already_seen.add(url)
+
+                    # Context window is measured from the untrimmed match.
+                    window_start = max(0, hit.start() - 40)
+                    window_end = min(len(body), hit.end() + 40)
+
+                    collected.append(
+                        {
+                            "url": url,
+                            "channel": message.get("channel", ""),
+                            "user": message.get("user", ""),
+                            "timestamp": message.get("timestamp", ""),
+                            "context": body[window_start:window_end].strip(),
+                        }
+                    )
+
+        return collected
+
+    def _identify_threads(self, messages: list[dict]) -> list[dict]:
+        """Group messages into conversation threads.
+
+        Messages are grouped by their ``thread_ts`` value. The parent is the
+        message whose ``ts`` equals that value. A parent may itself carry the
+        same ``thread_ts`` (as Slack thread parents do) or carry none (as a
+        Discord message that others reply to does); both are handled.
+
+        A thread is reported when it has a known parent and at least one
+        reply. When the parent is not among ``messages``, at least two
+        replies are required; the earliest becomes the parent and the rest
+        are the replies, so no message is listed twice.
+
+        Args:
+            messages: List of normalized message dicts.
+
+        Returns:
+            List of thread dicts with 'parent', 'replies' (chronological),
+            'channel', 'reply_count', and 'participants' (unique users in
+            order of first appearance, parent first).
+        """
+        # Group by thread_ts
+        thread_map: dict[str, list[dict]] = defaultdict(list)
+        msg_by_ts: dict[str, dict] = {}
+
+        for msg in messages:
+            ts = msg.get("ts", "")
+            if ts:
+                msg_by_ts[ts] = msg
+
+            thread_ts = msg.get("thread_ts")
+            if thread_ts:
+                thread_map[thread_ts].append(msg)
+
+        threads: list[dict] = []
+        for thread_ts, thread_msgs in thread_map.items():
+            # Everything in the group except the parent itself, oldest first.
+            replies = sorted(
+                (m for m in thread_msgs if m.get("ts") != thread_ts),
+                key=lambda m: m.get("timestamp", ""),
+            )
+
+            parent = msg_by_ts.get(thread_ts)
+            if parent is None:
+                # Parent not in the data set: promote the earliest reply.
+                if len(replies) < 2:
+                    continue
+                parent, replies = replies[0], replies[1:]
+            elif not replies:
+                continue
+
+            # dict.fromkeys de-duplicates while keeping a deterministic order.
+            participants = list(
+                dict.fromkeys(m.get("user", "unknown") for m in [parent, *replies])
+            )
+
+            threads.append(
+                {
+                    "parent": parent,
+                    "replies": replies,
+                    "channel": parent.get("channel", ""),
+                    "reply_count": len(replies),
+                    "participants": participants,
+                }
+            )
+
+        return threads
+
+    def _summarize_channels(self, messages: list[dict]) -> dict[str, dict]:
+        """Compute per-channel statistics over the given messages.
+
+        Messages without a 'channel' key are counted under "unknown", and
+        messages without a 'user' key are attributed to "unknown".
+
+        Args:
+            messages: List of normalized message dicts.
+
+        Returns:
+            Dict mapping channel names to summary dicts with message_count,
+            unique_users, date_range (earliest/latest non-empty timestamp
+            strings, or "" when there are none), top_users (up to five,
+            busiest first), and has_code (True when any message text
+            contains a triple backtick).
+        """
+        grouped: dict[str, list[dict]] = defaultdict(list)
+        for message in messages:
+            grouped[message.get("channel", "unknown")].append(message)
+
+        result: dict[str, dict] = {}
+        for name, items in grouped.items():
+            # Tally posts per author in order of first appearance.
+            tally: dict[str, int] = {}
+            for item in items:
+                author = item.get("user", "unknown")
+                tally[author] = tally.get(author, 0) + 1
+
+            ranked = sorted(tally.items(), key=lambda pair: pair[1], reverse=True)[:5]
+            stamps = [item["timestamp"] for item in items if item.get("timestamp")]
+
+            contains_code = False
+            for item in items:
+                if "```" in item.get("text", ""):
+                    contains_code = True
+                    break
+
+            if stamps:
+                span = {"earliest": min(stamps), "latest": max(stamps)}
+            else:
+                span = {"earliest": "", "latest": ""}
+
+            result[name] = {
+                "message_count": len(items),
+                "unique_users": len(tally),
+                "date_range": span,
+                "top_users": [{"user": who, "count": posts} for who, posts in ranked],
+                "has_code": contains_code,
+            }
+
+        return result
+
+    # Alias for single-channel usage in _build_sections
+    _summarize_channel = _summarize_channels
+
+    # ------------------------------------------------------------------
+    # Section building
+    # ------------------------------------------------------------------
+
+    def _build_sections(self, messages: list[dict], threads: list[dict]) -> list[dict]:
+        """Build sections from messages, grouping by channel and date.
+
+        Each section holds the conversation from a single channel on a single
+        date (the first 10 characters of the message timestamp). Sections use
+        the same keys as the pipeline's intermediate JSON 'pages' format.
+
+        Args:
+            messages: List of normalized message dicts. A missing, ``None`` or
+                non-string ``timestamp`` is treated as "" (date "unknown"), and
+                a ``None`` ``text`` as "", so malformed messages cannot crash
+                the sort or the slicing below.
+            threads: List of thread dicts. Currently not read by this method.
+
+        Returns:
+            List of section dicts with heading, text, code_samples, etc.,
+            numbered from 1 in sorted (channel, date) order.
+        """
+
+        def timestamp_of(msg: dict) -> str:
+            """Return the message timestamp, or "" when it is not a string."""
+            raw = msg.get("timestamp")
+            return raw if isinstance(raw, str) else ""
+
+        # Compiled once: the pattern is the same for every message.
+        code_block_pattern = re.compile(r"```(\w*)\n(.*?)```", re.DOTALL)
+
+        # Group by (channel, date)
+        groups: dict[tuple[str, str], list[dict]] = defaultdict(list)
+        for msg in messages:
+            date_str = timestamp_of(msg)[:10] or "unknown"
+            groups[(msg.get("channel", "general"), date_str)].append(msg)
+
+        sections: list[dict] = []
+
+        for section_number, ((channel, date_str), group_msgs) in enumerate(
+            sorted(groups.items()), 1
+        ):
+            # Sort messages chronologically (stable for equal timestamps)
+            group_msgs.sort(key=timestamp_of)
+
+            text_parts: list[str] = []
+            code_samples: list[dict] = []
+            for msg in group_msgs:
+                user = msg.get("user", "unknown")
+                text = msg.get("text") or ""
+                text_parts.append(f"**{user}** ({timestamp_of(msg)[:19]}): {text}")
+
+                # Add reactions
+                reactions = msg.get("reactions", [])
+                if reactions:
+                    reaction_str = " ".join(f":{r['emoji']}: ({r['count']})" for r in reactions)
+                    text_parts.append(f"  Reactions: {reaction_str}")
+
+                # Extract inline code blocks
+                for match in code_block_pattern.finditer(text):
+                    code = match.group(2).strip()
+                    if code:
+                        code_samples.append(
+                            {
+                                "code": code,
+                                "language": match.group(1) or "",
+                                "quality_score": _score_code_quality(code),
+                            }
+                        )
+
+            sections.append(
+                {
+                    "section_number": section_number,
+                    "heading": f"#{channel} - {date_str}",
+                    "heading_level": "h2",
+                    "text": "\n\n".join(text_parts),
+                    "headings": [],
+                    "code_samples": code_samples,
+                    "tables": [],
+                    "images": [],
+                    "channel": channel,
+                    "date": date_str,
+                    "message_count": len(group_msgs),
+                }
+            )
+
+        return sections
+
+    # ------------------------------------------------------------------
+    # Output generation (private)
+    # ------------------------------------------------------------------
+
+    def _generate_reference_file(
+        self,
+        _cat_key: str,
+        cat_data: dict,
+        section_num: int,
+        total_sections: int,
+    ) -> None:
+        """Write one category's sections to a markdown file under references/.
+
+        The file is named ``main.md`` when there is a single category,
+        ``<_cat_key>_s<min>-s<max>.md`` when there are several, and
+        ``section_<NN>.md`` when the category has no sections.
+
+        Args:
+            _cat_key: Category key; used as the filename prefix.
+            cat_data: Category dict with 'title' and 'pages'.
+            section_num: 1-based index among all categories.
+            total_sections: Total number of categories being generated.
+        """
+        pages = cat_data["pages"]
+
+        if not pages:
+            filename = f"{self.skill_dir}/references/section_{section_num:02d}.md"
+        else:
+            section_nums = [s.get("section_number", i + 1) for i, s in enumerate(pages)]
+            if total_sections == 1:
+                filename = f"{self.skill_dir}/references/main.md"
+            else:
+                sec_range = f"s{min(section_nums)}-s{max(section_nums)}"
+                filename = f"{self.skill_dir}/references/{_cat_key}_{sec_range}.md"
+
+        with open(filename, "w", encoding="utf-8") as f:
+            f.write(f"# {cat_data['title']}\n\n")
+
+            for section in pages:
+                sec_num = section.get("section_number", "?")
+                msg_count = section.get("message_count", 0)
+                f.write(f"---\n\n**📄 Section {sec_num}**")
+                f.write(f" ({msg_count} messages)\n\n")
+
+                if heading := section.get("heading", ""):
+                    f.write(f"## {heading}\n\n")
+
+                # Message text
+                if text := section.get("text", "").strip():
+                    f.write(f"{text}\n\n")
+
+                # Code samples
+                if code_list := section.get("code_samples", []):
+                    f.write("### Code Snippets\n\n")
+                    for code in code_list:
+                        lang = code.get("language", "")
+                        f.write(f"```{lang}\n{code['code']}\n```\n\n")
+
+                f.write("---\n\n")
+
+        print(f"   Generated: {filename}")
+
+    def _generate_index(self, categorized: dict[str, dict]) -> None:
+        """Write references/index.md: category links, statistics, channel summaries.
+
+        Args:
+            categorized: Dict mapping category keys to category dicts.
+        """
+        filename = f"{self.skill_dir}/references/index.md"
+        single_category = len(categorized) == 1
+
+        with open(filename, "w", encoding="utf-8") as f:
+            f.write(f"# {self.name.title()} Chat Reference\n\n")
+            f.write("## Categories\n\n")
+
+            for section_num, (_ck, cd) in enumerate(categorized.items(), 1):
+                pages = cd["pages"]
+                count = len(pages)
+                total_msgs = sum(p.get("message_count", 0) for p in pages)
+                if not pages:
+                    rng = "N/A"
+                    link = f"section_{section_num:02d}.md"
+                else:
+                    snums = [s.get("section_number", i + 1) for i, s in enumerate(pages)]
+                    rng = f"Sections {min(snums)}-{max(snums)}"
+                    if single_category:
+                        link = "main.md"
+                    else:
+                        link = f"{_ck}_s{min(snums)}-s{max(snums)}.md"
+
+                f.write(
+                    f"- [{cd['title']}]({link}) ({count} sections, {total_msgs} messages, {rng})\n"
+                )
+
+            # Statistics
+            f.write("\n## Statistics\n\n")
+            meta = self.extracted_data.get("metadata", {})
+            f.write(f"- Platform: {self.extracted_data.get('platform', 'unknown')}\n")
+            f.write(f"- Total messages: {meta.get('total_messages', 0)}\n")
+            f.write(f"- Total threads: {meta.get('total_threads', 0)}\n")
+            f.write(f"- Code snippets: {meta.get('total_code_snippets', 0)}\n")
+            f.write(f"- Shared links: {meta.get('total_links', 0)}\n")
+            f.write(f"- Unique users: {meta.get('unique_users', 0)}\n")
+            f.write(f"- Channels: {len(meta.get('channels', []))}\n")
+
+            # Channel summaries (section omitted entirely when there are none)
+            if channel_summaries := self.extracted_data.get("channel_summaries", {}):
+                f.write("\n## Channel Summary\n\n")
+                for ch_name, summary in sorted(channel_summaries.items()):
+                    f.write(f"### #{ch_name}\n\n")
+                    f.write(f"- Messages: {summary.get('message_count', 0)}\n")
+                    f.write(f"- Users: {summary.get('unique_users', 0)}\n")
+                    dr = summary.get("date_range", {})
+                    if dr.get("earliest") and dr.get("latest"):
+                        f.write(f"- Date range: {dr['earliest'][:10]} to {dr['latest'][:10]}\n")
+                    if summary.get("has_code"):
+                        f.write("- Contains code snippets\n")
+                    if top_users := summary.get("top_users", []):
+                        top_str = ", ".join(f"{u['user']} ({u['count']})" for u in top_users[:3])
+                        f.write(f"- Top contributors: {top_str}\n")
+                    f.write("\n")
+
+        print(f"   Generated: {filename}")
+
+    def _generate_skill_md(self, categorized: dict[str, dict]) -> None:
+        """Write SKILL.md: YAML frontmatter, overview, snippets, links and navigation.
+
+        Args:
+            categorized: Dict mapping category keys to category dicts ('title', 'pages').
+        """
+        filename = f"{self.skill_dir}/SKILL.md"
+        skill_name = self.name.lower().replace("_", "-").replace(" ", "-")[:64]
+        desc = self.description[:1024]
+        meta = self.extracted_data.get("metadata", {})
+
+        with open(filename, "w", encoding="utf-8") as f:
+            # YAML frontmatter
+            f.write("---\n")
+            f.write(f"name: {skill_name}\n")
+            f.write(f"description: {desc}\n")
+            f.write("---\n\n")
+
+            platform_label = self.platform.title()
+            f.write(f"# {self.name.title()} {platform_label} Chat Skill\n\n")
+            f.write(f"{self.description}\n\n")
+
+            # Chat metadata
+            f.write(f"## 📋 {platform_label} Chat Information\n\n")
+            f.write(f"**Platform:** {platform_label}\n\n")
+            f.write(f"**Source:** {self.extracted_data.get('source', 'N/A')}\n\n")
+            f.write(f"**Total Messages:** {meta.get('total_messages', 0)}\n\n")
+            f.write(f"**Unique Users:** {meta.get('unique_users', 0)}\n\n")
+            channels = meta.get("channels", [])
+            if channels:
+                f.write(f"**Channels:** {', '.join(f'#{c}' for c in channels)}\n\n")
+
+            # When to Use
+            f.write("## 💡 When to Use This Skill\n\n")
+            f.write("Use this skill when you need to:\n")
+            f.write(f"- Find solutions discussed in {self.name} chat history\n")
+            f.write("- Reference code snippets shared by team members\n")
+            f.write("- Understand team decisions and architectural discussions\n")
+            f.write("- Look up troubleshooting steps from past conversations\n")
+            f.write("- Find shared links and resources from the team\n\n")
+
+            # Section overview
+            total_sections = self.extracted_data.get("total_sections", 0)
+            f.write("## 📖 Content Overview\n\n")
+            f.write(f"**Total Sections:** {total_sections}\n\n")
+            f.write("**Content Breakdown:**\n\n")
+            for cd in categorized.values():
+                f.write(f"- **{cd['title']}**: {len(cd['pages'])} sections\n")
+            f.write("\n")
+
+            # Key topics
+            f.write(self._format_key_topics())
+
+            # Top code examples (first 15 snippets, at most 3 shown per language)
+            code_snippets = self.extracted_data.get("code_snippets", [])
+            if code_snippets:
+                f.write("## 📝 Top Code Snippets\n\n")
+                f.write("*High-quality code shared in chat*\n\n")
+
+                by_lang: dict[str, list] = {}
+                for cs in code_snippets[:15]:
+                    lang = cs.get("language", "unknown") or "unknown"
+                    by_lang.setdefault(lang, []).append(cs)
+
+                for lang in sorted(by_lang.keys()):
+                    examples = by_lang[lang]
+                    f.write(f"### {lang.title()} ({len(examples)} snippets)\n\n")
+                    for i, cs in enumerate(examples[:3], 1):
+                        quality = cs.get("quality_score", 0)
+                        user = cs.get("user", "")
+                        code_text = cs.get("code", "")
+                        f.write(f"**Snippet {i}**")
+                        if user:
+                            f.write(f" (by {user})")
+                        f.write(f" (Quality: {quality:.1f}/10):\n\n")
+                        f.write(f"```{lang}\n")
+                        f.write(code_text if len(code_text) <= 500 else code_text[:500] + "\n...")
+                        f.write("\n```\n\n")
+
+            # Shared links (first 20 listed, the rest only counted)
+            links = self.extracted_data.get("links", [])
+            if links:
+                f.write(f"## 🔗 Shared Links ({len(links)})\n\n")
+                f.write("*Key resources shared in chat*\n\n")
+                for link in links[:20]:
+                    url = link.get("url", "")
+                    user = link.get("user", "")
+                    channel = link.get("channel", "")
+                    f.write(f"- {url}")
+                    parts = [f"by {user}"] if user else []
+                    parts += [f"in #{channel}"] if channel else []
+                    if parts:
+                        f.write(f" ({', '.join(parts)})")
+                    f.write("\n")
+                if len(links) > 20:
+                    f.write(f"\n*... and {len(links) - 20} more links*\n")
+                f.write("\n")
+
+            # Statistics
+            f.write("## 📊 Chat Statistics\n\n")
+            f.write(f"- **Total Messages**: {meta.get('total_messages', 0)}\n")
+            f.write(f"- **Total Threads**: {meta.get('total_threads', 0)}\n")
+            f.write(f"- **Code Snippets**: {meta.get('total_code_snippets', 0)}\n")
+            f.write(f"- **Shared Links**: {meta.get('total_links', 0)}\n")
+            f.write(f"- **Unique Users**: {meta.get('unique_users', 0)}\n")
+            f.write(f"- **Channels**: {len(meta.get('channels', []))}\n\n")
+
+            # Channel breakdown (most active channel first)
+            channel_summaries = self.extracted_data.get("channel_summaries", {})
+            if channel_summaries:
+                f.write("**Channel Activity:**\n\n")
+                for ch_name, summary in sorted(
+                    channel_summaries.items(),
+                    key=lambda x: x[1].get("message_count", 0),
+                    reverse=True,
+                ):
+                    msg_count = summary.get("message_count", 0)
+                    user_count = summary.get("unique_users", 0)
+                    f.write(f"- #{ch_name}: {msg_count} messages, {user_count} users\n")
+                f.write("\n")
+
+            # Navigation (file names mirror the links written by _generate_index)
+            f.write("## 🗺️ Navigation\n\n")
+            f.write("**Reference Files:**\n\n")
+            for section_num, (cat_key, cd) in enumerate(categorized.items(), 1):
+                pages = cd["pages"]
+                if not pages:
+                    cat_file = f"section_{section_num:02d}.md"
+                elif len(categorized) == 1:
+                    cat_file = "main.md"
+                else:
+                    snums = [s.get("section_number", i + 1) for i, s in enumerate(pages)]
+                    cat_file = f"{cat_key}_s{min(snums)}-s{max(snums)}.md"
+                f.write(f"- `references/{cat_file}` - {cd['title']}\n")
+            f.write("\nSee `references/index.md` for complete chat structure.\n\n")
+
+            # Footer
+            f.write("---\n\n")
+            f.write(f"**Generated by Skill Seeker** | {platform_label} Chat Scraper\n")
+
+        line_count = len(Path(filename).read_text(encoding="utf-8").split("\n"))
+        print(f"   Generated: {filename} ({line_count} lines)")
+
+    # ------------------------------------------------------------------
+    # Content analysis helpers
+    # ------------------------------------------------------------------
+
+    def _format_key_topics(self) -> str:
+        """Build the "Key Discussion Topics" markdown block for SKILL.md.
+
+        A section counts towards a topic when at least two of that topic's
+        keywords occur in the section's combined lowercase text.
+
+        Returns:
+            Markdown string, or "" when there are no sections or no topic matched.
+        """
+        sections = self.extracted_data.get("pages", [])
+        if not sections:
+            return ""
+
+        topic_counts: dict[str, int] = defaultdict(int)
+        for section in sections:
+            combined = self._section_text(section)
+            for topic, keywords in _TOPIC_KEYWORDS.items():
+                hits = [kw for kw in keywords if kw.lower() in combined]
+                if len(hits) >= 2:
+                    topic_counts[topic] += 1
+
+        if not topic_counts:
+            return ""
+
+        ranked = sorted(topic_counts.items(), key=lambda x: x[1], reverse=True)
+        lines = ["## 🔑 Key Discussion Topics\n\n", "*Topics frequently discussed in chat*\n\n"]
+        for topic, count in ranked:
+            label = topic.replace("_", " ").title()
+            lines.append(f"- **{label}**: {count} conversations\n")
+        lines.append("\n")
+        return "".join(lines)
+
+    def _section_text(self, section: dict) -> str:
+        """Flatten a section into one lowercase string for keyword matching.
+
+        Args:
+            section: Section dict; 'text', 'heading' and 'code_samples' are optional.
+
+        Returns:
+            "<text> <heading> <code>" in lowercase, code samples joined by spaces.
+        """
+        # Each piece is lowercased on its own, then the three are space-joined.
+        parts = [section.get("text", "").lower(), section.get("heading", "").lower()]
+        parts.append(" ".join(cs.get("code", "").lower() for cs in section.get("code_samples", [])))
+        return " ".join(parts)
+
+    def _sanitize_filename(self, name: str) -> str:
+        """Turn an arbitrary string into a lowercase, underscore-separated name.
+
+        Args:
+            name: Input string to sanitize.
+
+        Returns:
+            Lowercased text without punctuation; hyphen/whitespace runs become "_".
+        """
+        without_punctuation = re.sub(r"[^\w\s-]", "", name.lower())
+        return "_".join(re.split(r"[-\s]+", without_punctuation))
+
+
+# ---------------------------------------------------------------------------
+# CLI entry point
+# ---------------------------------------------------------------------------
+
+
+def _run_local_enhancement(skill_dir) -> None:
+    """Run the LOCAL skill enhancer headlessly on ``skill_dir`` and report completion."""
+    from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer
+
+    LocalSkillEnhancer(Path(skill_dir)).run(headless=True)
+    print("✅ Local enhancement complete!")
+
+
+def main() -> int:
+    """CLI entry point for the Slack/Discord chat scraper.
+
+    Parses command-line arguments and runs the extraction and
+    skill-building pipeline. Supports export import, API fetch,
+    and loading from previously extracted JSON.
+
+    Handles ``--dry-run`` first, then ``--from-json`` (build only); otherwise
+    extracts, builds, runs workflows and the optional traditional enhancement.
+
+    Returns:
+        Exit code 0 on success. Failures do not return: they print to stderr
+        and call ``sys.exit(1)``; a missing input source exits via ``parser.error``.
+    """
+    from .arguments.chat import add_chat_arguments
+
+    parser = argparse.ArgumentParser(
+        description="Convert Slack/Discord chat history to AI-ready skill",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+    # Slack workspace export
+    %(prog)s --export-path ./slack-export/ --platform slack --name myteam
+
+    # Slack API
+    %(prog)s --platform slack --token xoxb-... --channel C01234 --name myteam
+
+    # Discord export (DiscordChatExporter)
+    %(prog)s --export-path ./discord-export.json --platform discord --name myserver
+
+    # Discord API
+    %(prog)s --platform discord --token Bot-token --channel 12345 --name myserver
+
+    # From previously extracted JSON
+    %(prog)s --from-json myteam_extracted.json --name myteam
+        """,
+    )
+
+    add_chat_arguments(parser)
+
+    args = parser.parse_args()
+
+    # Set logging level
+    if getattr(args, "quiet", False):
+        logging.getLogger().setLevel(logging.WARNING)
+    elif getattr(args, "verbose", False):
+        logging.getLogger().setLevel(logging.DEBUG)
+
+    # Handle --dry-run
+    if args.dry_run:
+        source = args.export_path or args.from_json or f"{args.platform}-api"
+        print(f"\n{'=' * 60}")
+        print("DRY RUN: Chat Extraction")
+        print(f"{'=' * 60}")
+        print(f"Platform:       {args.platform}")
+        print(f"Source:         {source}")
+        print(f"Name:           {args.name or '(auto-detect)'}")
+        print(f"Channel:        {args.channel or '(all)'}")
+        print(f"Max messages:   {args.max_messages}")
+        print(f"Enhance level:  {args.enhance_level}")
+        print("\n✅ Dry run complete")
+        return 0
+
+    # Validate inputs
+    if args.from_json:
+        # Build from previously extracted JSON
+        name = args.name or Path(args.from_json).stem.replace("_extracted", "")
+        config = {
+            "name": name,
+            "description": (args.description or f"Use when referencing {name} chat knowledge base"),
+        }
+        try:
+            converter = ChatToSkillConverter(config)
+            converter.load_extracted_data(args.from_json)
+            converter.build_skill()
+        except Exception as e:
+            print(f"\n❌ Error: {e}", file=sys.stderr)
+            sys.exit(1)
+        return 0
+
+    # Require either --export-path or --token for extraction
+    if not args.export_path and not args.token:
+        parser.error(
+            "Must specify --export-path (export mode), --token (API mode), "
+            "or --from-json (build from extracted data)"
+        )
+
+    if not args.name:
+        if args.export_path:
+            args.name = Path(args.export_path).stem
+        else:
+            args.name = f"{args.platform}_chat"
+
+    config = {
+        "name": args.name,
+        "export_path": args.export_path or "",
+        "platform": args.platform,
+        "token": args.token or "",
+        "channel": args.channel or "",
+        "max_messages": args.max_messages,
+        "description": args.description,
+    }
+
+    try:
+        converter = ChatToSkillConverter(config)
+
+        # Extract
+        if not converter.extract_chat():
+            print(
+                "\n❌ Chat extraction failed - see error above",
+                file=sys.stderr,
+            )
+            sys.exit(1)
+
+        # Build skill
+        converter.build_skill()
+
+        # Enhancement Workflow Integration
+        from skill_seekers.cli.workflow_runner import run_workflows
+
+        workflow_executed, workflow_names = run_workflows(args)
+        workflow_name = ", ".join(workflow_names) if workflow_names else None
+
+        # Traditional enhancement (complements workflow system)
+        if getattr(args, "enhance_level", 0) > 0:
+            api_key = getattr(args, "api_key", None) or os.environ.get("ANTHROPIC_API_KEY")
+            mode = "API" if api_key else "LOCAL"
+
+            print("\n" + "=" * 80)
+            print(f"🤖 Traditional AI Enhancement ({mode} mode, level {args.enhance_level})")
+            print("=" * 80)
+            if workflow_executed:
+                print(f"   Running after workflow: {workflow_name}")
+                print(
+                    "   (Workflow provides specialized analysis, "
+                    "enhancement provides general improvements)"
+                )
+            print("")
+
+            skill_dir = converter.skill_dir
+            if api_key:
+                try:
+                    from skill_seekers.cli.enhance_skill import enhance_skill_md
+
+                    enhance_skill_md(skill_dir, api_key)
+                    print("✅ API enhancement complete!")
+                except ImportError:
+                    print("❌ API enhancement not available. Falling back to LOCAL mode...")
+                    _run_local_enhancement(skill_dir)
+            else:
+                _run_local_enhancement(skill_dir)
+
+    except (FileNotFoundError, ValueError) as e:
+        print(f"\n❌ Input error: {e}", file=sys.stderr)
+        sys.exit(1)
+    except RuntimeError as e:
+        print(f"\n❌ Error: {e}", file=sys.stderr)
+        sys.exit(1)
+    except Exception as e:
+        print(
+            f"\n❌ Unexpected error during chat processing: {e}",
+            file=sys.stderr,
+        )
+        import traceback
+
+        traceback.print_exc()
+        sys.exit(1)
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/src/skill_seekers/cli/config_validator.py b/src/skill_seekers/cli/config_validator.py
index c55e73d..086d2ef 100644
--- a/src/skill_seekers/cli/config_validator.py
+++ b/src/skill_seekers/cli/config_validator.py
@@ -7,6 +7,19 @@ Validates unified config format that supports multiple sources:
 - github (repository scraping)
 - pdf (PDF document scraping)
 - local (local codebase analysis)
+- word (Word .docx document scraping)
+- video (video transcript/visual extraction)
+- epub (EPUB e-book extraction)
+- jupyter (Jupyter Notebook extraction)
+- html (local HTML file extraction)
+- openapi (OpenAPI/Swagger spec extraction)
+- asciidoc (AsciiDoc document extraction)
+- pptx (PowerPoint presentation extraction)
+- confluence (Confluence wiki extraction)
+- notion (Notion page extraction)
+- rss (RSS/Atom feed extraction)
+- manpage (man page extraction)
+- chat (Slack/Discord chat export extraction)
 
 Legacy config format support removed in v2.11.0.
 All configs must use unified format with 'sources' array.
@@ -27,7 +40,25 @@ class ConfigValidator:
     """
 
     # Valid source types
-    VALID_SOURCE_TYPES = {"documentation", "github", "pdf", "local", "word", "video"}
+    VALID_SOURCE_TYPES = {
+        "documentation",
+        "github",
+        "pdf",
+        "local",
+        "word",
+        "video",
+        "epub",
+        "jupyter",
+        "html",
+        "openapi",
+        "asciidoc",
+        "pptx",
+        "confluence",
+        "notion",
+        "rss",
+        "manpage",
+        "chat",
+    }
 
     # Valid merge modes
     VALID_MERGE_MODES = {"rule-based", "claude-enhanced"}
@@ -159,6 +190,32 @@ class ConfigValidator:
             self._validate_pdf_source(source, index)
         elif source_type == "local":
             self._validate_local_source(source, index)
+        elif source_type == "word":
+            self._validate_word_source(source, index)
+        elif source_type == "video":
+            self._validate_video_source(source, index)
+        elif source_type == "epub":
+            self._validate_epub_source(source, index)
+        elif source_type == "jupyter":
+            self._validate_jupyter_source(source, index)
+        elif source_type == "html":
+            self._validate_html_source(source, index)
+        elif source_type == "openapi":
+            self._validate_openapi_source(source, index)
+        elif source_type == "asciidoc":
+            self._validate_asciidoc_source(source, index)
+        elif source_type == "pptx":
+            self._validate_pptx_source(source, index)
+        elif source_type == "confluence":
+            self._validate_confluence_source(source, index)
+        elif source_type == "notion":
+            self._validate_notion_source(source, index)
+        elif source_type == "rss":
+            self._validate_rss_source(source, index)
+        elif source_type == "manpage":
+            self._validate_manpage_source(source, index)
+        elif source_type == "chat":
+            self._validate_chat_source(source, index)
 
     def _validate_documentation_source(self, source: dict[str, Any], index: int):
         """Validate documentation source configuration."""
@@ -253,12 +310,126 @@ class ConfigValidator:
                     f"Source {index} (local): Invalid ai_mode '{ai_mode}'. Must be one of {self.VALID_AI_MODES}"
                 )
 
+    def _validate_word_source(self, source: dict[str, Any], index: int):
+        """Check a Word (.docx) source: 'path' is required; a missing file only warns."""
+        if "path" not in source:
+            raise ValueError(f"Source {index} (word): Missing required field 'path'")
+        # Existence is advisory here, so it is logged rather than raised.
+        if not Path(word_path := source["path"]).exists():
+            logger.warning(f"Source {index} (word): File not found: {word_path}")
+
+    def _validate_video_source(self, source: dict[str, Any], index: int):
+        """Check a video source: at least one of 'url', 'path', 'playlist' must be present."""
+        locator_fields = ("url", "path", "playlist")
+        if any(field in source for field in locator_fields):
+            return
+
+        raise ValueError(
+            f"Source {index} (video): Missing required field 'url', 'path', or 'playlist'"
+        )
+
+    def _validate_epub_source(self, source: dict[str, Any], index: int):
+        """Check an EPUB source: 'path' is required; a missing file only warns."""
+        if "path" not in source:
+            raise ValueError(f"Source {index} (epub): Missing required field 'path'")
+        # Existence is advisory here, so it is logged rather than raised.
+        if not Path(epub_path := source["path"]).exists():
+            logger.warning(f"Source {index} (epub): File not found: {epub_path}")
+
+    def _validate_jupyter_source(self, source: dict[str, Any], index: int):
+        """Check a Jupyter Notebook source: 'path' is required; a missing path only warns."""
+        if "path" not in source:
+            raise ValueError(f"Source {index} (jupyter): Missing required field 'path'")
+        # Existence is advisory here, so it is logged rather than raised.
+        if not Path(nb_path := source["path"]).exists():
+            logger.warning(f"Source {index} (jupyter): Path not found: {nb_path}")
+
+    def _validate_html_source(self, source: dict[str, Any], index: int):
+        """Check a local HTML source: 'path' is required; a missing path only warns."""
+        if "path" not in source:
+            raise ValueError(f"Source {index} (html): Missing required field 'path'")
+        # Existence is advisory here, so it is logged rather than raised.
+        if not Path(html_path := source["path"]).exists():
+            logger.warning(f"Source {index} (html): Path not found: {html_path}")
+
+    def _validate_openapi_source(self, source: dict[str, Any], index: int):
+        """Check an OpenAPI/Swagger source: needs 'path' or 'url'; a missing file only warns."""
+        if not ("path" in source or "url" in source):
+            raise ValueError(f"Source {index} (openapi): Missing required field 'path' or 'url'")
+        if "path" in source and not Path(source["path"]).exists():
+            logger.warning(f"Source {index} (openapi): File not found: {source['path']}")
+
+    def _validate_asciidoc_source(self, source: dict[str, Any], index: int):
+        """Check an AsciiDoc source: 'path' is required; a missing path only warns."""
+        if "path" not in source:
+            raise ValueError(f"Source {index} (asciidoc): Missing required field 'path'")
+        # Existence is advisory here, so it is logged rather than raised.
+        if not Path(adoc_path := source["path"]).exists():
+            logger.warning(f"Source {index} (asciidoc): Path not found: {adoc_path}")
+
+    def _validate_pptx_source(self, source: dict[str, Any], index: int):
+        """Check a PowerPoint source: 'path' is required; a missing file only warns."""
+        if "path" not in source:
+            raise ValueError(f"Source {index} (pptx): Missing required field 'path'")
+        # Existence is advisory here, so it is logged rather than raised.
+        if not Path(pptx_path := source["path"]).exists():
+            logger.warning(f"Source {index} (pptx): File not found: {pptx_path}")
+
+    def _validate_confluence_source(self, source: dict[str, Any], index: int):
+        """Check a Confluence source: API locator ('url'/'base_url') or export 'path'."""
+        api_mode = any(key in source for key in ("url", "base_url"))
+        export_mode = "path" in source
+        if not (api_mode or export_mode):
+            raise ValueError(
+                f"Source {index} (confluence): Missing required field 'url'/'base_url' "
+                f"(for API) or 'path' (for export)"
+            )
+        if api_mode and not export_mode and "space_key" not in source:
+            logger.warning(f"Source {index} (confluence): No 'space_key' specified for API mode")
+
+    def _validate_notion_source(self, source: dict[str, Any], index: int):
+        """Check a Notion source: any of 'url', 'database_id', 'page_id' or export 'path'."""
+        api_keys = ("url", "database_id", "page_id")
+        # Presence only: the values themselves are not inspected here.
+        if "path" not in source and not any(key in source for key in api_keys):
+            raise ValueError(
+                f"Source {index} (notion): Missing required field 'url'/'database_id'/'page_id' "
+                f"(for API) or 'path' (for export)"
+            )
+
+    def _validate_rss_source(self, source: dict[str, Any], index: int):
+        """Check an RSS/Atom feed source: one of 'url' or 'path' must be present."""
+        if not ("url" in source or "path" in source):
+            raise ValueError(f"Source {index} (rss): Missing required field 'url' or 'path'")
+
+    def _validate_manpage_source(self, source: dict[str, Any], index: int):
+        """Check a man page source: needs 'path' or 'names'; a missing path only warns."""
+        if not ("path" in source or "names" in source):
+            raise ValueError(f"Source {index} (manpage): Missing required field 'path' or 'names'")
+        if "path" in source and not Path(source["path"]).exists():
+            logger.warning(f"Source {index} (manpage): Path not found: {source['path']}")
+
+    def _validate_chat_source(self, source: dict[str, Any], index: int):
+        """Check a Slack/Discord chat source: export 'path' or API 'token'/'webhook_url'."""
+        api_mode = any(key in source for key in ("token", "webhook_url"))
+        if not api_mode:
+            if "path" not in source:
+                raise ValueError(
+                    f"Source {index} (chat): Missing required field 'path' (for export) "
+                    f"or 'token' (for API)"
+                )
+            return
+        if not any(key in source for key in ("channel", "channel_id")):
+            logger.warning(
+                f"Source {index} (chat): No 'channel' or 'channel_id' specified for API mode"
+            )
+
     def get_sources_by_type(self, source_type: str) -> list[dict[str, Any]]:
         """
         Get all sources of a specific type.
 
         Args:
-            source_type: 'documentation', 'github', 'pdf', or 'local'
+            source_type: Any valid source type string
 
         Returns:
             List of sources matching the type
diff --git a/src/skill_seekers/cli/confluence_scraper.py b/src/skill_seekers/cli/confluence_scraper.py
new file mode 100644
index 0000000..6204606
--- /dev/null
+++ b/src/skill_seekers/cli/confluence_scraper.py
@@ -0,0 +1,2166 @@
+#!/usr/bin/env python3
+"""
+Confluence Documentation to Skill Converter
+
+Converts Confluence spaces into AI-ready skills by extracting page content,
+hierarchy, code blocks, tables, and attachments. Supports two extraction modes:
+
+1. **API mode**: Connects to a Confluence instance via the Atlassian REST API
+   (requires ``atlassian-python-api``). Fetches pages from a specified space,
+   preserving the parent-child hierarchy. Requires ``--base-url``, ``--space-key``,
+   and ``--username`` / ``--token`` (or ``CONFLUENCE_USERNAME`` / ``CONFLUENCE_TOKEN``).
+
+2. **Export mode**: Parses a Confluence HTML/XML export directory previously
+   downloaded from the Confluence admin UI. Requires ``--export-path`` pointing
+   to the extracted export directory containing ``entities.xml`` or HTML files.
+
+Usage:
+    # API mode
+    skill-seekers confluence --base-url https://wiki.example.com \\
+        --space-key PROJ --username user@example.com --token $CONFLUENCE_TOKEN \\
+        --name my-project-wiki
+
+    # Export mode
+    skill-seekers confluence --export-path ./confluence-export/ --name my-wiki
+
+    # Build from previously extracted JSON
+    skill-seekers confluence --from-json my-wiki_extracted.json
+
+    # Standalone execution
+    python3 -m skill_seekers.cli.confluence_scraper --base-url https://wiki.example.com \\
+        --space-key DEV --name dev-wiki --max-pages 200
+"""
+
+import argparse
+import json
+import logging
+import os
+import re
+import sys
+from pathlib import Path
+from typing import Any
+
+# Optional dependency guard for atlassian-python-api
+try:
+    from atlassian import Confluence
+
+    ATLASSIAN_AVAILABLE = True
+except ImportError:
+    ATLASSIAN_AVAILABLE = False
+
+# BeautifulSoup is a core dependency (always available)
+from bs4 import BeautifulSoup, Comment, Tag
+
+logger = logging.getLogger(__name__)
+
+# HTML class names of Confluence macro and boilerplate wrappers to strip during cleaning
+_CONFLUENCE_MACRO_CLASSES = {
+    "confluence-information-macro",
+    "confluence-information-macro-body",
+    "confluence-information-macro-icon",
+    "expand-container",
+    "expand-content",
+    "expand-control",
+    "plugin-tabmeta",
+    "plugin_pagetree",
+    "page-metadata",
+    "aui-message",
+}
+
+# Namespaced ``ac:*`` / ``ri:*`` element names from Confluence storage format (not only macros)
+_STORAGE_MACRO_TAGS = {
+    "ac:structured-macro",
+    "ac:rich-text-body",
+    "ac:parameter",
+    "ac:plain-text-body",
+    "ac:image",
+    "ac:link",
+    "ac:emoticon",
+    "ac:task-list",
+    "ac:task",
+    "ac:task-body",
+    "ac:task-status",
+    "ri:attachment",
+    "ri:page",
+    "ri:space",
+    "ri:url",
+    "ri:user",
+}
+
+# Maps Confluence code-macro language names/aliases to canonical names ("" means no language)
+_CODE_MACRO_LANGS = {
+    "py": "python",
+    "python": "python",
+    "python3": "python",
+    "js": "javascript",
+    "javascript": "javascript",
+    "ts": "typescript",
+    "typescript": "typescript",
+    "java": "java",
+    "bash": "bash",
+    "sh": "bash",
+    "shell": "bash",
+    "sql": "sql",
+    "xml": "xml",
+    "html": "html",
+    "css": "css",
+    "json": "json",
+    "yaml": "yaml",
+    "yml": "yaml",
+    "ruby": "ruby",
+    "go": "go",
+    "golang": "go",
+    "rust": "rust",
+    "c": "c",
+    "cpp": "cpp",
+    "csharp": "csharp",
+    "cs": "csharp",
+    "kotlin": "kotlin",
+    "swift": "swift",
+    "scala": "scala",
+    "groovy": "groovy",
+    "perl": "perl",
+    "php": "php",
+    "r": "r",
+    "powershell": "powershell",
+    "dockerfile": "dockerfile",
+    "terraform": "hcl",
+    "hcl": "hcl",
+    "markdown": "markdown",
+    "text": "",
+    "none": "",
+}
+
+
+def _check_atlassian_deps() -> None:
+    """Raise RuntimeError with install hints when ATLASSIAN_AVAILABLE is False."""
+    if not ATLASSIAN_AVAILABLE:
+        raise RuntimeError(
+            "atlassian-python-api is required for Confluence API mode.\n"
+            "Install with: pip install atlassian-python-api\n"
+            'Or: pip install "skill-seekers[confluence]"'
+        )
+
+
+def infer_description_from_confluence(
+    space_info: dict | None = None,
+    name: str = "",
+) -> str:
+    """Infer skill description from Confluence space metadata.
+
+    Args:
+        space_info: Confluence space metadata dict (name, description, key).
+        name: Skill name for fallback.
+
+    Returns:
+        "Use when ..." string built from the description, space name, or ``name``.
+    """
+    if space_info:
+        desc_text = space_info.get("description", "")
+        if isinstance(desc_text, dict):
+            # Confluence API returns description as {"plain": {"value": "..."}}
+            plain = (desc_text.get("plain") or {}).get("value", "")
+            desc_text = plain or (desc_text.get("view") or {}).get("value", "")
+        # Strip tags BEFORE the length check so markup-only text is never used
+        clean = re.sub(r"<[^>]+>", "", desc_text or "").strip()
+        if len(clean) > 20:
+            if len(clean) > 150:
+                clean = clean[:147] + "..."
+            return f"Use when {clean.lower()}"
+        space_name = space_info.get("name", "")
+        if space_name and len(space_name) > 5:
+            return f"Use when working with {space_name.lower()} documentation"
+    return (
+        f"Use when referencing {name} documentation"
+        if name
+        else "Use when referencing this Confluence documentation"
+    )
+
+
+class ConfluenceToSkillConverter:
+    """Convert Confluence space documentation to an AI-ready skill.
+
+    Supports two extraction modes:
+
+    - **API mode**: Uses the Atlassian Confluence REST API to fetch pages from
+      a space, including page hierarchy, labels, and storage-format content.
+      Requires ``base_url``, ``space_key``, and authentication credentials.
+
+    - **Export mode**: Parses a Confluence HTML/XML export directory that has
+      been downloaded and extracted from the Confluence admin interface.
+      Requires ``export_path`` pointing to the extracted directory.
+
+    After extraction, the converter categorises pages by their parent-child
+    hierarchy, generates reference markdown files, an index, and the main
+    SKILL.md manifest.
+
+    Attributes:
+        config: Configuration dictionary.
+        name: Skill name used for output directory and filenames.
+        base_url: Confluence instance base URL (API mode).
+        space_key: Confluence space key (API mode).
+        export_path: Path to exported Confluence directory (export mode).
+        username: Confluence username / email for API authentication.
+        token: Confluence API token or password.
+        description: Skill description for SKILL.md frontmatter.
+        max_pages: Page cap for API fetches and HTML-export parsing (not entities.xml).
+        skill_dir: Output directory for the generated skill.
+        data_file: Path to the intermediate extracted JSON file.
+        extracted_data: Structured extraction results dict.
+    """
+
+    def __init__(self, config: dict) -> None:
+        """Copy settings out of ``config`` and derive the output locations.
+
+        Args:
+            config: Settings mapping. Recognised keys:
+                - name (str): Skill name; the only required key.
+                - base_url (str): Confluence instance URL, used by API mode.
+                - space_key (str): Confluence space key, used by API mode.
+                - export_path (str): Export directory, used by export mode.
+                - username (str): API username / email; env vars are the fallback.
+                - token (str): API token; env vars are the fallback.
+                - description (str): Skill description; derived from ``name`` if empty.
+                - max_pages (int): Page cap, coerced with ``int()``; default 500.
+        """
+        self.config = config
+        self.name: str = config["name"]
+        # Mode-specific settings fall back to empty strings when absent
+        self.base_url: str = config.get("base_url", "")
+        self.space_key: str = config.get("space_key", "")
+        self.export_path: str = config.get("export_path", "")
+        self.username: str = config.get("username", "")
+        self.token: str = config.get("token", "")
+        self.max_pages: int = int(config.get("max_pages", 500))
+        default_description = f"Use when referencing {self.name} documentation"
+        self.description: str = config.get("description") or default_description
+
+        # Output locations, relative to the current working directory
+        self.data_file = f"output/{self.name}_extracted.json"
+        self.skill_dir = f"output/{self.name}"
+
+        # Set by extract_confluence() once extraction has run
+        self.extracted_data: dict[str, Any] | None = None
+
+    # ──────────────────────────────────────────────────────────────────────
+    # Extraction dispatcher
+    # ──────────────────────────────────────────────────────────────────────
+
+    def extract_confluence(self) -> bool:
+        """Extract content from Confluence, dispatching to API or export mode.
+
+        Determines the extraction mode based on the provided configuration:
+        - If ``base_url`` and ``space_key`` are both set, uses API mode.
+        - Otherwise, if ``export_path`` is set, uses export mode.
+        - Raises ValueError if neither mode is configured.
+
+        After extraction, saves intermediate JSON to ``output/{name}_extracted.json``
+        and updates the description from space metadata if not explicitly set.
+
+        Returns:
+            Always True; failures are reported by raising, never by returning False.
+
+        Raises:
+            ValueError: If neither API nor export configuration is provided.
+            RuntimeError: If API dependencies or credentials are missing (API mode).
+        """
+        if self.base_url and self.space_key:
+            print(f"\n  Extracting from Confluence API: {self.base_url}")
+            print(f"  Space: {self.space_key}")
+            raw_pages = self._extract_via_api()
+        elif self.export_path:
+            print(f"\n  Extracting from Confluence export: {self.export_path}")
+            raw_pages = self._extract_from_export()
+        else:
+            raise ValueError(
+                "No Confluence source configured. Provide either:\n"
+                "  - --base-url and --space-key (API mode), or\n"
+                "  - --export-path (export mode)"
+            )
+
+        if not raw_pages:
+            logger.warning("No pages extracted from Confluence")
+
+        # Build page hierarchy tree
+        page_tree = self._extract_page_tree(raw_pages)
+
+        # Parse each page's HTML into a structured section (empty bodies are skipped)
+        sections: list[dict[str, Any]] = []
+        total_code_blocks = 0
+        total_images = 0
+        section_number = 0
+
+        for page in raw_pages:
+            page_id = page.get("id", "")
+            page_title = page.get("title", "Untitled")
+            body_html = page.get("body", "")
+            labels = page.get("labels", [])
+            parent_id = page.get("parent_id", "")
+
+            if not body_html:
+                logger.debug("Skipping page with no body: %s", page_title)
+                continue
+
+            # Parse the Confluence HTML content
+            parsed = self._parse_confluence_html(body_html, page_title)
+
+            section_number += 1
+            section_data: dict[str, Any] = {
+                "section_number": section_number,
+                "page_id": page_id,
+                "heading": page_title,
+                "heading_level": "h1",
+                "parent_id": parent_id,
+                "labels": labels,
+                "text": parsed.get("text", ""),
+                "headings": parsed.get("headings", []),
+                "code_samples": parsed.get("code_samples", []),
+                "tables": parsed.get("tables", []),
+                "images": parsed.get("images", []),
+                "links": parsed.get("links", []),
+                "macros": parsed.get("macros", []),
+            }
+            sections.append(section_data)
+            total_code_blocks += len(parsed.get("code_samples", []))
+            total_images += len(parsed.get("images", []))
+
+        # Space metadata is read from the first raw page (each page carries a copy)
+        space_info = raw_pages[0].get("space_info", {}) if raw_pages else {}
+
+        # Update description from space metadata if not explicitly set
+        if not self.config.get("description"):
+            self.description = infer_description_from_confluence(space_info, self.name)
+
+        # Count code samples per declared language (samples without one are ignored)
+        languages_detected: dict[str, int] = {}
+        for section in sections:
+            for code_sample in section.get("code_samples", []):
+                lang = code_sample.get("language", "")
+                if lang:
+                    languages_detected[lang] = languages_detected.get(lang, 0) + 1
+
+        result_data: dict[str, Any] = {
+            "source": self.base_url or self.export_path,
+            "space_key": self.space_key,
+            "space_info": space_info,
+            "page_tree": page_tree,
+            "total_sections": len(sections),
+            "total_pages": len(raw_pages),  # includes pages skipped for an empty body
+            "total_code_blocks": total_code_blocks,
+            "total_images": total_images,
+            "languages_detected": languages_detected,
+            "pages": sections,
+        }
+
+        # Save extracted data (default=str stringifies values json cannot encode)
+        os.makedirs(os.path.dirname(self.data_file) or ".", exist_ok=True)
+        with open(self.data_file, "w", encoding="utf-8") as f:
+            json.dump(result_data, f, indent=2, ensure_ascii=False, default=str)
+
+        print(f"\n  Saved extracted data to: {self.data_file}")
+        self.extracted_data = result_data
+        print(
+            f"  Extracted {len(sections)} pages, "
+            f"{total_code_blocks} code blocks, "
+            f"{total_images} images"
+        )
+        return True
+
+    # ──────────────────────────────────────────────────────────────────────
+    # API extraction
+    # ──────────────────────────────────────────────────────────────────────
+
+    def _extract_via_api(self) -> list[dict[str, Any]]:
+        """Fetch pages from a Confluence space using the REST API.
+
+        Connects via ``atlassian-python-api`` and pages through the configured
+        space (up to ``max_pages``), normalising each result. A failed batch is
+        logged and ends paging; pages fetched so far are still returned.
+
+        Authentication is resolved in priority order:
+        1. Constructor arguments (username/token)
+        2. Env vars CONFLUENCE_USERNAME / CONFLUENCE_TOKEN, then ATLASSIAN_USERNAME / ATLASSIAN_TOKEN
+
+        Returns:
+            List of page dicts with keys: id, title, body, parent_id, labels,
+            url, space_info, version, created, modified.
+
+        Raises:
+            RuntimeError: If atlassian-python-api is not installed, credentials
+                          are missing, or the client cannot be created.
+        """
+        _check_atlassian_deps()
+
+        # Resolve credentials: explicit config first, then CONFLUENCE_*, then ATLASSIAN_* env vars
+        username = (
+            self.username
+            or os.environ.get("CONFLUENCE_USERNAME", "")
+            or os.environ.get("ATLASSIAN_USERNAME", "")
+        )
+        token = (
+            self.token
+            or os.environ.get("CONFLUENCE_TOKEN", "")
+            or os.environ.get("ATLASSIAN_TOKEN", "")
+        )
+
+        if not username or not token:
+            raise RuntimeError(
+                "Confluence API authentication required.\n"
+                "Provide --username and --token, or set CONFLUENCE_USERNAME "
+                "and CONFLUENCE_TOKEN environment variables."
+            )
+
+        # Connect to Confluence
+        try:
+            confluence = Confluence(
+                url=self.base_url,
+                username=username,
+                password=token,
+                cloud=self._is_cloud_instance(),
+            )
+        except Exception as e:
+            raise RuntimeError(f"Failed to connect to Confluence at {self.base_url}: {e}") from e
+
+        # Fetch space information (best effort: on failure fall back to the bare space key)
+        space_info: dict[str, Any] = {}
+        try:
+            space_data = confluence.get_space(self.space_key, expand="description.plain,homepage")
+            space_info = {
+                "key": space_data.get("key", self.space_key),
+                "name": space_data.get("name", self.space_key),
+                "description": space_data.get("description", {}),
+                "type": space_data.get("type", "global"),
+                "homepage_id": (
+                    space_data.get("homepage", {}).get("id", "")
+                    if space_data.get("homepage")
+                    else ""
+                ),
+            }
+            print(f"  Space: {space_info.get('name', self.space_key)}")
+        except Exception as e:
+            logger.warning("Could not fetch space info: %s", e)
+            space_info = {"key": self.space_key, "name": self.space_key}
+
+        # Fetch all pages in the space, paginated
+        pages: list[dict[str, Any]] = []
+        start = 0
+        limit = 50  # Confluence API page size
+        expand_fields = "body.storage,version,ancestors,metadata.labels"
+
+        print(f"  Fetching pages (max {self.max_pages})...")
+
+        while len(pages) < self.max_pages:
+            try:
+                batch = confluence.get_all_pages_from_space(
+                    self.space_key,
+                    start=start,
+                    limit=min(limit, self.max_pages - len(pages)),
+                    expand=expand_fields,
+                    content_type="page",
+                )
+            except Exception as e:
+                logger.error("Failed to fetch pages at offset %d: %s", start, e)
+                break
+
+            if not batch:
+                break
+
+            for page_data in batch:
+                page_id = str(page_data.get("id", ""))
+                title = page_data.get("title", "Untitled")
+
+                # Extract body (storage format HTML)
+                body = page_data.get("body", {}).get("storage", {}).get("value", "")
+
+                # Extract parent ID: the last entry of ``ancestors`` is used as the parent
+                ancestors = page_data.get("ancestors", [])
+                parent_id = str(ancestors[-1]["id"]) if ancestors else ""
+
+                # Extract labels
+                labels_data = page_data.get("metadata", {}).get("labels", {}).get("results", [])
+                labels = [lbl.get("name", "") for lbl in labels_data if lbl.get("name")]
+
+                # Version and dates ("created" is only filled when the fetched version is 1)
+                version_info = page_data.get("version", {})
+                version_number = version_info.get("number", 1)
+                created = version_info.get("when", "") if version_number == 1 else ""
+                modified = version_info.get("when", "")
+
+                # Build page URL; NOTE(review): "/wiki" is always inserted - confirm for Server/DC
+                page_url = f"{self.base_url}/wiki/spaces/{self.space_key}/pages/{page_id}"
+                links = page_data.get("_links", {})
+                if links.get("webui"):
+                    page_url = f"{self.base_url}/wiki{links['webui']}"
+
+                page_dict: dict[str, Any] = {
+                    "id": page_id,
+                    "title": title,
+                    "body": body,
+                    "parent_id": parent_id,
+                    "labels": labels,
+                    "url": page_url,
+                    "space_info": space_info,
+                    "version": version_number,
+                    "created": created,
+                    "modified": modified,
+                }
+                pages.append(page_dict)
+
+            print(f"    Fetched {len(pages)} pages...")
+            start += len(batch)
+
+            # If we got fewer results than the limit, we've reached the end
+            if len(batch) < limit:
+                break
+
+        print(f"  Total pages fetched: {len(pages)}")
+        return pages
+
+    def _is_cloud_instance(self) -> bool:
+        """Guess whether ``base_url`` refers to an Atlassian Cloud instance.
+
+        The check is a case-insensitive substring match for ``atlassian.net``.
+
+        Returns:
+            True when the marker occurs anywhere in the URL, otherwise False.
+        """
+        return self.base_url.lower().find("atlassian.net") >= 0
+
+    # ──────────────────────────────────────────────────────────────────────
+    # Export extraction
+    # ──────────────────────────────────────────────────────────────────────
+
+    def _extract_from_export(self) -> list[dict[str, Any]]:
+        """Parse a Confluence HTML/XML export directory into page dicts.
+
+        Confluence exports can contain either:
+        - An ``entities.xml`` file (full XML export from admin)
+        - A directory of HTML files (HTML export)
+
+        ``entities.xml`` is tried first; if it is absent or yields no pages, the
+        HTML files are parsed with BeautifulSoup to extract content and metadata.
+
+        Returns:
+            List of normalised page dicts (same structure as API mode).
+
+        Raises:
+            FileNotFoundError: If the export path does not exist.
+            ValueError: If the path is not a directory or holds no HTML files.
+        """
+        export_dir = Path(self.export_path)
+        if not export_dir.exists():
+            raise FileNotFoundError(f"Confluence export path not found: {self.export_path}")
+        if not export_dir.is_dir():
+            raise ValueError(f"Export path is not a directory: {self.export_path}")
+
+        pages: list[dict[str, Any]] = []
+        space_info: dict[str, Any] = {"key": self.space_key or "EXPORT", "name": self.name}
+
+        # Check for entities.xml (full XML export); this path is not capped by max_pages
+        entities_xml = export_dir / "entities.xml"
+        if entities_xml.exists():
+            pages = self._parse_entities_xml(entities_xml, space_info)
+            if pages:
+                print(f"  Parsed entities.xml: {len(pages)} pages")
+                return pages
+
+        # Fall back to HTML file export (files named index.html are not treated as pages)
+        html_files = sorted(
+            f for f in export_dir.rglob("*.html") if f.is_file() and f.name != "index.html"
+        )
+
+        if not html_files:
+            # Only when no .html pages exist: look for .htm files instead
+            html_files = sorted(
+                f for f in export_dir.rglob("*.htm") if f.is_file() and f.name != "index.htm"
+            )
+
+        if not html_files:
+            raise ValueError(
+                f"No HTML files found in export directory: {self.export_path}\n"
+                "Expected either entities.xml or HTML files from Confluence export."
+            )
+
+        print(f"  Found {len(html_files)} HTML files in export")
+
+        # Parse the top-level index.html for page hierarchy if available
+        index_file = export_dir / "index.html"
+        hierarchy_map: dict[str, str] = {}  # filename stem -> parent filename stem
+        if index_file.exists():
+            hierarchy_map = self._parse_export_index(index_file)
+
+        for idx, html_file in enumerate(html_files):
+            if idx >= self.max_pages:
+                logger.info("Reached max_pages limit (%d)", self.max_pages)
+                break
+
+            try:
+                raw_html = html_file.read_text(encoding="utf-8", errors="ignore")
+            except Exception as e:
+                logger.warning("Could not read %s: %s", html_file, e)
+                continue
+
+            soup = BeautifulSoup(raw_html, "html.parser")
+
+            # Extract title (falls back to the filename stem when <title> is missing)
+            title_tag = soup.find("title")
+            title = title_tag.get_text(strip=True) if title_tag else html_file.stem
+
+            # Find main content area: first match wins, whole <body> as the last resort
+            main_content = (
+                soup.find("div", id="main-content")
+                or soup.find("div", class_="wiki-content")
+                or soup.find("div", id="content")
+                or soup.find("body")
+            )
+
+            body_html = str(main_content) if main_content else ""
+            file_key = html_file.stem
+            parent_key = hierarchy_map.get(file_key, "")
+
+            page_dict: dict[str, Any] = {
+                "id": file_key,
+                "title": title,
+                "body": body_html,
+                "parent_id": parent_key,
+                "labels": [],
+                "url": str(html_file),
+                "space_info": space_info,
+                "version": 1,
+                "created": "",
+                "modified": "",
+            }
+            pages.append(page_dict)
+
+        print(f"  Parsed {len(pages)} pages from HTML export")
+        return pages
+
+    def _parse_entities_xml(
+        self,
+        xml_path: Path,
+        space_info: dict[str, Any],
+    ) -> list[dict[str, Any]]:
+        """Read ``entities.xml`` and return one page dict per usable ``Page`` object.
+
+        The whole document is loaded with ``ElementTree.parse``. Every ``<object>``
+        whose ``class`` attribute is ``Page`` is scanned for its title, id, body
+        and parent reference; objects lacking a title or an id are dropped.
+
+        Args:
+            xml_path: Location of the entities.xml file.
+            space_info: Space metadata dict stored on every returned page.
+
+        Returns:
+            Page dicts in document order, or ``[]`` when the file cannot be parsed.
+        """
+        collected: list[dict[str, Any]] = []
+
+        try:
+            # The full tree is loaded into memory (this is not a streaming parse)
+            import xml.etree.ElementTree as ET
+
+            document = ET.parse(xml_path)  # noqa: S314
+            root = document.getroot()
+        except Exception as e:
+            logger.warning("Failed to parse entities.xml: %s", e)
+            return []
+
+        # Visit every <object class="Page"> anywhere in the document
+        for candidate in root.iter("object"):
+            if candidate.get("class", "") != "Page":
+                continue
+
+            fields: dict[str, str] = {}
+            for child in candidate:
+                role = child.get("name", "")
+                if role in ("title", "id"):
+                    fields[role] = child.text or ""
+                elif role == "bodyContents":
+                    # Take the text of any nested object's "body" child (last one wins)
+                    for nested in child.iter("object"):
+                        for nested_child in nested:
+                            if nested_child.get("name") == "body":
+                                fields["body"] = nested_child.text or ""
+                elif role == "parent":
+                    # The parent is referenced through a nested <id> element
+                    parent_ref = child.find("id")
+                    if parent_ref is not None and parent_ref.text:
+                        fields["parent_id"] = parent_ref.text
+
+            # Pages without both a title and an id are skipped
+            if not (fields.get("title") and fields.get("id")):
+                continue
+            collected.append(
+                {
+                    "id": fields["id"],
+                    "title": fields["title"],
+                    "body": fields.get("body", ""),
+                    "parent_id": fields.get("parent_id", ""),
+                    "labels": [],
+                    "url": "",
+                    "space_info": space_info,
+                    "version": 1,
+                    "created": "",
+                    "modified": "",
+                }
+            )
+
+        return collected
+
+    def _parse_export_index(self, index_path: Path) -> dict[str, str]:
+        """Derive child-to-parent page links from an HTML export's index page.
+
+        The first ``<ul>`` in the file is walked recursively; a link found in an
+        ``<li>`` becomes the parent of the links in that item's directly nested ``<ul>``.
+
+        Args:
+            index_path: Location of the export's index.html.
+
+        Returns:
+            Mapping of child filename stem to parent filename stem (roots omitted).
+        """
+        parents: dict[str, str] = {}
+
+        try:
+            markup = index_path.read_text(encoding="utf-8", errors="ignore")
+            document = BeautifulSoup(markup, "html.parser")
+
+            # Expected layout: nested <ul><li><a href="..."> lists
+            def _descend(listing: Tag, owner: str = "") -> None:
+                for item in listing.find_all("li", recursive=False):
+                    anchor = item.find("a", href=True)
+                    if not anchor:
+                        continue
+                    target = anchor.get("href", "")
+                    # The filename stem of the href identifies the page
+                    stem = Path(target).stem if target else ""
+                    if stem and owner:
+                        parents[stem] = owner
+
+                    # Children of this item live in its directly nested <ul>
+                    sublist = item.find("ul", recursive=False)
+                    if sublist:
+                        _descend(sublist, stem)
+
+            root_list = document.find("ul")
+            if root_list:
+                _descend(root_list)
+
+        except Exception as e:
+            logger.warning("Failed to parse export index: %s", e)
+
+        return parents
+
+    # ──────────────────────────────────────────────────────────────────────
+    # HTML / content parsing
+    # ──────────────────────────────────────────────────────────────────────
+
+    def _parse_confluence_html(
+        self,
+        html_content: str,
+        page_title: str = "",
+    ) -> dict[str, Any]:
+        """Parse Confluence storage format HTML into structured content.
+
+        Confluence uses a custom XHTML-based storage format with proprietary
+        macro elements (``ac:structured-macro``, ``ac:rich-text-body``, etc.).
+        This method:
+
+        1. Extracts code macros and panel macros before cleaning.
+        2. Cleans Confluence-specific markup (macros, boilerplate divs).
+        3. Extracts sub-headings, text content, code blocks, tables, images,
+           and links from the cleaned HTML.
+
+        Args:
+            html_content: Raw HTML string in Confluence storage format.
+            page_title: Page title for context in logging.
+
+        Returns:
+            Dict with keys: text, headings, code_samples, tables, images,
+            links, macros.
+        """
+        soup = BeautifulSoup(html_content, "html.parser")
+
+        # Step 1: Extract macros before cleaning (they contain valuable content)
+        macros = self._extract_macros(soup)
+
+        # Step 2: Clean Confluence-specific HTML
+        cleaned_soup = self._clean_confluence_html(soup)
+
+        # Step 3: Extract structured content from cleaned HTML
+        text_parts: list[str] = []
+        headings: list[dict[str, str]] = []
+        code_samples: list[dict[str, Any]] = []
+        tables: list[dict[str, Any]] = []
+        images: list[dict[str, str]] = []
+        links: list[dict[str, str]] = []
+
+        # Add code samples from extracted macros
+        for macro in macros:
+            if macro.get("type") == "code":
+                code_samples.append(
+                    {
+                        "code": macro.get("content", ""),
+                        "language": macro.get("language", ""),
+                        "title": macro.get("title", ""),
+                        "quality_score": _score_code_quality(macro.get("content", "")),
+                    }
+                )
+
+        # Extract headings (h1-h6)
+        for heading_tag in cleaned_soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"]):
+            heading_text = heading_tag.get_text(strip=True)
+            if heading_text:
+                headings.append(
+                    {
+                        "level": heading_tag.name,
+                        "text": heading_text,
+                    }
+                )
+
+        # Extract code blocks from <pre>/<code> elements (non-macro code)
+        for pre_tag in cleaned_soup.find_all("pre"):
+            code_elem = pre_tag.find("code")
+            if code_elem:
+                code_text = code_elem.get_text()
+                lang = self._detect_language_from_classes(code_elem)
+            else:
+                code_text = pre_tag.get_text()
+                lang = self._detect_language_from_classes(pre_tag)
+
+            code_text = code_text.strip()
+            if code_text and len(code_text) > 10:
+                # Avoid duplicates from macro extraction
+                is_duplicate = any(cs.get("code", "").strip() == code_text for cs in code_samples)
+                if not is_duplicate:
+                    code_samples.append(
+                        {
+                            "code": code_text,
+                            "language": lang,
+                            "title": "",
+                            "quality_score": _score_code_quality(code_text),
+                        }
+                    )
+            pre_tag.decompose()
+
+        # Extract tables
+        for table_tag in cleaned_soup.find_all("table"):
+            table_data = self._extract_table(table_tag)
+            if table_data:
+                tables.append(table_data)
+            table_tag.decompose()
+
+        # Extract images
+        for img_tag in cleaned_soup.find_all("img"):
+            src = img_tag.get("src", "")
+            alt = img_tag.get("alt", "")
+            if src:
+                images.append({"src": src, "alt": alt})
+
+        # Extract links
+        for a_tag in cleaned_soup.find_all("a", href=True):
+            href = a_tag.get("href", "")
+            link_text = a_tag.get_text(strip=True)
+            if href and link_text and not href.startswith("javascript:"):
+                links.append({"href": href, "text": link_text})
+
+        # Extract remaining text content
+        body_text = self._html_to_text(cleaned_soup)
+        if body_text and body_text.strip():
+            text_parts.append(body_text.strip())
+
+        return {
+            "text": "\n\n".join(text_parts),
+            "headings": headings,
+            "code_samples": code_samples,
+            "tables": tables,
+            "images": images,
+            "links": links,
+            "macros": [m for m in macros if m.get("type") != "code"],
+        }
+
+    def _extract_macros(self, soup: BeautifulSoup) -> list[dict[str, Any]]:
+        """Extract Confluence macros from storage format HTML.
+
+        Identifies and parses structured macros including:
+        - **code**: Code blocks with language specification.
+        - **panel** / **info** / **note** / **warning** / **tip**: Callout panels.
+        - **expand**: Expandable content sections.
+        - **toc**: Table of contents macro.
+        - **jira**: JIRA issue references.
+        - **excerpt**: Page excerpts.
+
+        Macros are handled innermost-first, so a macro nested inside another is
+        captured on its own; every handled macro is then removed from the soup.
+
+        Args:
+            soup: BeautifulSoup object containing Confluence storage format HTML.
+
+        Returns:
+            List of macro dicts with type, content, language (for code), title.
+        """
+        macros: list[dict[str, Any]] = []
+
+        # Innermost first: decomposing a macro also destroys any macro nested in it
+        for macro_elem in reversed(soup.find_all("ac:structured-macro")):
+            macro_name = macro_elem.get("ac:name", "") or macro_elem.get("data-macro-name", "")
+            if not macro_name:
+                continue
+
+            # Extract parameters
+            params: dict[str, str] = {}
+            for param in macro_elem.find_all("ac:parameter"):
+                param_name = param.get("ac:name", "") or param.get("name", "")
+                param_value = param.get_text(strip=True)
+                if param_name:
+                    params[param_name] = param_value
+
+            # Extract body content
+            body_elem = macro_elem.find("ac:rich-text-body") or macro_elem.find(
+                "ac:plain-text-body"
+            )
+            body_content = ""
+            if body_elem:
+                if body_elem.name.endswith("plain-text-body"):
+                    body_content = body_elem.get_text()
+                else:
+                    body_content = body_elem.get_text(strip=True)
+
+            macro_dict: dict[str, Any] = {
+                "type": macro_name,
+                "params": params,
+                "content": body_content,
+            }
+
+            # Special handling for code macros
+            if macro_name == "code":
+                lang_raw = params.get("language", "").lower().strip()
+                macro_dict["language"] = _CODE_MACRO_LANGS.get(lang_raw, lang_raw)
+                macro_dict["title"] = params.get("title", "")
+
+            # Panel-type macros
+            elif macro_name in ("panel", "info", "note", "warning", "tip", "excerpt"):
+                macro_dict["title"] = params.get("title", "")
+
+            macros.append(macro_dict)
+            # Remove the macro element to avoid double-processing
+            macro_elem.decompose()
+
+        macros.reverse()  # restore document order after the reversed walk
+
+        # Also handle legacy rendered code blocks (<div class="code ..."> wrapping <pre>)
+        for code_div in soup.find_all("div", class_="code"):
+            pre_elem = code_div.find("pre")
+            if pre_elem:
+                code_text = pre_elem.get_text()
+                if code_text and code_text.strip():
+                    macros.append(
+                        {
+                            "type": "code",
+                            "params": {},
+                            "content": code_text.strip(),
+                            "language": "",
+                            "title": "",
+                        }
+                    )
+            code_div.decompose()
+
+        return macros
+
+    def _clean_confluence_html(self, soup: BeautifulSoup) -> BeautifulSoup:
+        """Strip Confluence-specific markup from parsed HTML.
+
+        Removes, in this order:
+        - Script, style and noscript elements.
+        - HTML comments.
+        - Elements carrying any class listed in ``_CONFLUENCE_MACRO_CLASSES``.
+        - Tags named in ``_STORAGE_MACRO_TAGS`` (replaced by their text, if any).
+        - ``<time>`` and ``ac:emoticon`` elements.
+        - ``<div>``/``<span>`` with no text and no ``img``/``table``/``pre``.
+
+        Args:
+            soup: BeautifulSoup object to clean (modified in-place and returned).
+
+        Returns:
+            The cleaned BeautifulSoup object.
+        """
+        # Remove script, style, noscript
+        for tag in soup(["script", "style", "noscript"]):
+            tag.decompose()
+
+        # Remove HTML comments
+        for comment in soup.find_all(string=lambda t: isinstance(t, Comment)):
+            comment.extract()
+
+        # Remove Confluence-specific boilerplate elements by class
+        for css_class in _CONFLUENCE_MACRO_CLASSES:
+            for elem in soup.find_all(class_=css_class):
+                elem.decompose()
+
+        # Flatten the storage-format tags that macro extraction left behind:
+        # each is swapped for its stripped text, or dropped when it has none
+        for tag_name in list(_STORAGE_MACRO_TAGS):
+            for elem in soup.find_all(tag_name):
+                # Preserve text content by replacing element with its text
+                text_content = elem.get_text(strip=True)
+                if text_content:
+                    elem.replace_with(text_content)
+                else:
+                    elem.decompose()
+
+        # Remove <time> and ac:emoticon elements together with their content
+        for elem in soup.find_all("time"):
+            elem.decompose()
+        for elem in soup.find_all("ac:emoticon"):
+            elem.decompose()
+
+        # Remove empty wrapper divs and spans (cleanup after macro removal)
+        for tag_name in ("div", "span"):
+            for elem in soup.find_all(tag_name):
+                if not elem.get_text(strip=True) and not elem.find(["img", "table", "pre"]):
+                    elem.decompose()
+
+        return soup
+
+    def _extract_page_tree(
+        self,
+        pages: list[dict[str, Any]],
+    ) -> list[dict[str, Any]]:
+        """Build a hierarchical page tree from a flat list of pages.
+
+        Constructs a tree structure based on parent_id relationships. Pages
+        whose parent is missing from ``pages`` (or that name themselves as
+        parent) become roots. Depth is assigned top-down after linking, so
+        it is correct whatever order the pages arrive in.
+
+        Args:
+            pages: Flat list of page dicts with id and parent_id fields.
+
+        Returns:
+            List of tree node dicts, each with keys: id, title, children,
+            depth, labels. Siblings are sorted by lower-cased title.
+        """
+        # Build lookup maps
+        by_id: dict[str, dict[str, Any]] = {}
+        for page in pages:
+            page_id = page.get("id", "")
+            if page_id:
+                by_id[page_id] = {
+                    "id": page_id,
+                    "title": page.get("title", ""),
+                    "children": [],
+                    "depth": 0,
+                    "labels": page.get("labels", []),
+                }
+
+        # Build parent-child relationships (depth is filled in afterwards)
+        roots: list[dict[str, Any]] = []
+        for page in pages:
+            page_id = page.get("id", "")
+            parent_id = page.get("parent_id", "")
+            node = by_id.get(page_id)
+            if not node:
+                continue
+
+            if parent_id and parent_id != page_id and parent_id in by_id:
+                by_id[parent_id]["children"].append(node)
+            else:
+                roots.append(node)
+
+        # Sort children alphabetically and assign depth from the root down
+        def _finalise(node: dict[str, Any], depth: int) -> None:
+            node["depth"] = depth
+            node["children"].sort(key=lambda n: n.get("title", "").lower())
+            for child in node["children"]:
+                _finalise(child, depth + 1)
+
+        for root in roots:
+            _finalise(root, 0)
+
+        roots.sort(key=lambda n: n.get("title", "").lower())
+        return roots
+
+    def _extract_table(self, table_elem: Tag) -> dict[str, Any] | None:
+        """Extract an HTML table to a markdown-ready dict.
+
+        Headers come from the first ``<thead>`` row, else from a leading row
+        containing ``<th>``, else the first data row is promoted to headers.
+
+        Args:
+            table_elem: BeautifulSoup ``<table>`` Tag.
+
+        Returns:
+            Dict with 'headers' and 'rows' lists, or None if empty.
+        """
+        headers: list[str] = []
+        rows: list[list[str]] = []
+
+        # Try <thead> for headers (only its first <tr> is read)
+        thead = table_elem.find("thead")
+        if thead:
+            header_row = thead.find("tr")
+            if header_row:
+                headers = [th.get_text(strip=True) for th in header_row.find_all(["th", "td"])]
+
+        # Body rows: the first <tbody> if present, otherwise every <tr> in the table
+        tbody = table_elem.find("tbody") or table_elem
+        all_rows = tbody.find_all("tr")
+
+        for row in all_rows:
+            cells = row.find_all(["td", "th"])
+            cell_texts = [c.get_text(strip=True) for c in cells]
+
+            # If no thead and first row has <th> elements, use as headers
+            if not headers and row.find("th") and not rows:
+                headers = cell_texts
+                continue
+
+            # Skip cell-less rows and any row whose text repeats the header row
+            if cell_texts and cell_texts != headers:
+                rows.append(cell_texts)
+
+        # If still no headers, promote first row
+        if not headers and rows:
+            headers = rows.pop(0)
+
+        if not headers and not rows:
+            return None
+
+        return {"headers": headers, "rows": rows}
+
+    def _detect_language_from_classes(self, elem: Tag) -> str:
+        """Detect programming language from CSS classes on an element.
+
+        Checks for common class conventions: ``language-python``,
+        ``brush: java``, ``code-python``, or bare language names.
+
+        Args:
+            elem: BeautifulSoup Tag with potential language class hints.
+
+        Returns:
+            Normalised language string, or empty string if undetected.
+        """
+        classes = elem.get("class", [])
+        if not classes:
+            return ""
+
+        prefixes = ("language-", "lang-", "code-", "highlight-", "brush:")
+        for cls in classes:
+            cls_lower = cls.lower().strip()
+            for prefix in prefixes:
+                lang_raw = cls_lower[len(prefix) :].strip()
+                # "brush: java" splits into ["brush:", "java"]: skip an empty suffix
+                if cls_lower.startswith(prefix) and lang_raw:
+                    return _CODE_MACRO_LANGS.get(lang_raw, lang_raw)
+
+        # Check for bare language names
+        for cls in classes:
+            if cls.lower() in _CODE_MACRO_LANGS:
+                return _CODE_MACRO_LANGS.get(cls.lower(), cls.lower())
+
+        return ""
+
+    def _html_to_text(self, elem: Tag | BeautifulSoup) -> str:
+        """Convert an HTML element to clean markdown-like text.
+
+        Handles bare text, paragraphs, bold/italic, links, lists, blockquotes,
+        inline code, headings, definition lists, and horizontal rules.
+
+        Args:
+            elem: BeautifulSoup Tag or soup to convert.
+
+        Returns:
+            Cleaned text string with basic markdown formatting.
+        """
+        if not hasattr(elem, "children"):
+            return str(elem).strip()
+
+        parts: list[str] = []
+
+        for child in elem.children:
+            tag = getattr(child, "name", None)
+            if tag is None:
+                # Text node. bs4 strings expose ``name`` as None, so they
+                # cannot be recognised with hasattr(); handling them here is
+                # what keeps plain paragraph text in the output.
+                text = str(child)
+                if text.strip():
+                    parts.append(text)
+                continue
+
+            if tag == "br":
+                parts.append("\n")
+            elif tag in ("p", "div"):
+                inner = self._html_to_text(child)
+                if inner.strip():
+                    parts.append(f"\n\n{inner.strip()}\n\n")
+            elif tag in ("strong", "b"):
+                inner = child.get_text(strip=True)
+                if inner:
+                    parts.append(f"**{inner}**")
+            elif tag in ("em", "i"):
+                inner = child.get_text(strip=True)
+                if inner:
+                    parts.append(f"*{inner}*")
+            elif tag == "a" and child.get("href"):
+                link_text = child.get_text(strip=True)
+                href = child.get("href", "")
+                if link_text and href and not href.startswith("javascript:"):
+                    parts.append(f"[{link_text}]({href})")
+                elif link_text:
+                    parts.append(link_text)
+            elif tag in ("ul", "ol"):
+                items = child.find_all("li", recursive=False)
+                for idx, li in enumerate(items):
+                    li_text = li.get_text(strip=True)
+                    if li_text:
+                        prefix = f"{idx + 1}." if tag == "ol" else "-"
+                        parts.append(f"\n{prefix} {li_text}")
+                parts.append("\n")
+            elif tag == "blockquote":
+                bq_text = child.get_text(strip=True)
+                if bq_text:
+                    lines = bq_text.split("\n")
+                    quoted = "\n".join(f"> {line}" for line in lines)
+                    parts.append(f"\n\n{quoted}\n\n")
+            elif tag == "code":
+                if child.find_parent("pre") is None:
+                    code_text = child.get_text()
+                    if code_text.strip():
+                        parts.append(f"`{code_text.strip()}`")
+            elif tag in ("h1", "h2", "h3", "h4", "h5", "h6"):
+                level = int(tag[1])
+                inner = child.get_text(strip=True)
+                if inner:
+                    parts.append(f"\n\n{'#' * level} {inner}\n\n")
+            elif tag == "dl":
+                for dt in child.find_all("dt"):
+                    term = dt.get_text(strip=True)
+                    dd = dt.find_next_sibling("dd")
+                    definition = dd.get_text(strip=True) if dd else ""
+                    parts.append(f"\n**{term}**: {definition}")
+                parts.append("\n")
+            elif tag == "hr":
+                parts.append("\n\n---\n\n")
+            else:
+                # Any other tag (span, section, ...) is flattened recursively
+                inner = self._html_to_text(child)
+                if inner.strip():
+                    parts.append(inner)
+
+        # Collapse runs of three or more newlines left by the block wrappers
+        result = "".join(parts)
+        result = re.sub(r"\n{3,}", "\n\n", result)
+        return result
+
+    # ──────────────────────────────────────────────────────────────────────
+    # Load extracted data
+    # ──────────────────────────────────────────────────────────────────────
+
+    def load_extracted_data(self, json_path: str) -> bool:
+        """Restore ``self.extracted_data`` from an intermediate JSON dump.
+
+        Args:
+            json_path: Location of the previously written extraction JSON.
+
+        Returns:
+            Always True; failures surface as exceptions instead.
+
+        Raises:
+            FileNotFoundError: When ``json_path`` is not present on disk.
+        """
+        print(f"\n  Loading extracted data from: {json_path}")
+        if not os.path.exists(json_path):
+            raise FileNotFoundError(f"Extracted data file not found: {json_path}")
+
+        with open(json_path, encoding="utf-8") as handle:
+            self.extracted_data = json.load(handle)
+
+        page_total = self.extracted_data.get("total_sections", 0)
+        print(f"  Loaded {page_total} pages")
+        return True
+
+    # ──────────────────────────────────────────────────────────────────────
+    # Categorisation
+    # ──────────────────────────────────────────────────────────────────────
+
+    def categorize_content(self) -> dict[str, dict[str, Any]]:
+        """Categorise pages by parent-page hierarchy.
+
+        Each root of the stored page tree becomes one category holding the
+        root page and all of its descendants (deep nesting is flattened).
+        Pages without a ``page_id`` cannot be matched to the tree and are
+        skipped by this strategy.
+
+        If the tree yields no categories, pages are grouped by ``parent_id``
+        instead; failing that, everything lands in one "content" category.
+        Categories whose titles sanitise to the same key are merged instead
+        of overwriting one another.
+
+        Returns:
+            Dict mapping category key to dict with 'title' and 'pages' lists.
+        """
+        print("\n  Categorising content...")
+
+        categorised: dict[str, dict[str, Any]] = {}
+        sections = self.extracted_data.get("pages", [])
+        page_tree = self.extracted_data.get("page_tree", [])
+
+        if not sections:
+            categorised["content"] = {"title": "Content", "pages": []}
+            return categorised
+
+        # Build a lookup from page_id to section
+        sections_by_id: dict[str, dict[str, Any]] = {}
+        for section in sections:
+            page_id = section.get("page_id", "")
+            if page_id:
+                sections_by_id[page_id] = section
+
+        # Strategy 1: Use page hierarchy if available
+        if page_tree:
+            for root_node in page_tree:
+                root_id = root_node.get("id", "")
+                root_title = root_node.get("title", "Untitled")
+                cat_key = self._sanitize_filename(root_title)
+
+                # Collect the root page and all its descendants
+                descendant_ids = self._collect_descendant_ids(root_node)
+                all_ids = [root_id] + descendant_ids
+
+                cat_pages = [sections_by_id[pid] for pid in all_ids if pid in sections_by_id]
+
+                if cat_pages:
+                    # Roots whose titles sanitise to the same key share one
+                    # category instead of silently replacing each other.
+                    bucket = categorised.setdefault(cat_key, {"title": root_title, "pages": []})
+                    bucket["pages"].extend(cat_pages)
+
+        # Strategy 2: Group by parent_id when no tree is available
+        if not categorised:
+            parent_groups: dict[str, list[dict[str, Any]]] = {}
+            for section in sections:
+                parent_id = section.get("parent_id", "")
+                # Pages lacking a parent_id are collected under the "root" key
+                group_key = parent_id or "root"
+                parent_groups.setdefault(group_key, []).append(section)
+
+            for group_key, group_pages in parent_groups.items():
+                if group_key == "root":
+                    cat_title = "Root Pages"
+                else:
+                    # Try to find the parent page title
+                    parent_section = sections_by_id.get(group_key)
+                    cat_title = (
+                        parent_section.get("heading", "Section")
+                        if parent_section
+                        else f"Section {group_key}"
+                    )
+
+                cat_key = self._sanitize_filename(cat_title)
+                # Merge on key collision (e.g. two parents with the same heading)
+                bucket = categorised.setdefault(cat_key, {"title": cat_title, "pages": []})
+                bucket["pages"].extend(group_pages)
+
+        # Strategy 3: Single category fallback
+        if not categorised:
+            categorised["content"] = {
+                "title": "Content",
+                "pages": sections,
+            }
+
+        print(f"  Created {len(categorised)} categories")
+        for cat_key, cat_data in categorised.items():
+            print(f"    - {cat_data['title']}: {len(cat_data['pages'])} pages")
+
+        return categorised
+
+    def _collect_descendant_ids(self, node: dict[str, Any]) -> list[str]:
+        """Gather the IDs of every page below ``node`` in pre-order.
+
+        Args:
+            node: Tree node dict; its 'children' list holds nested nodes.
+
+        Returns:
+            Flat list of descendant page IDs (``node`` itself is excluded).
+        """
+        found: list[str] = []
+        pending = list(reversed(node.get("children", [])))
+        while pending:
+            current = pending.pop()
+            found.append(current.get("id", ""))
+            pending.extend(reversed(current.get("children", [])))
+        return [page_id for page_id in found if page_id]
+
+    # ──────────────────────────────────────────────────────────────────────
+    # Skill building
+    # ──────────────────────────────────────────────────────────────────────
+
+    def build_skill(self) -> None:
+        """Write the whole skill directory from the extracted data.
+
+        Ensures the output directories exist, categorises the pages, then emits:
+        - One reference markdown file per category.
+        - The reference index file.
+        - The top-level SKILL.md manifest.
+
+        The resulting layout follows the standard skill structure::
+
+            output/{name}/
+                SKILL.md
+                references/
+                    index.md
+                    {category}.md
+                scripts/
+                assets/
+        """
+        print(f"\n  Building skill: {self.name}")
+
+        # Ensure the standard sub-directories exist; ones already present
+        # are left untouched (exist_ok)
+        for subdir in ("references", "scripts", "assets"):
+            os.makedirs(f"{self.skill_dir}/{subdir}", exist_ok=True)
+
+        # Group the extracted pages into categories
+        categorised = self.categorize_content()
+
+        # One reference file per category; categories are numbered from 1
+        # in the iteration order of the mapping
+        print("\n  Generating reference files...")
+        total_categories = len(categorised)
+        for section_num, (cat_key, cat_data) in enumerate(categorised.items(), start=1):
+            self._generate_reference_file(cat_key, cat_data, section_num, total_categories)
+
+        # Index linking all reference files
+        self._generate_index(categorised)
+
+        # Top-level manifest
+        self._generate_skill_md(categorised)
+
+        # Report the result and the follow-up command
+        print(f"\n  Skill built successfully: {self.skill_dir}/")
+        print(f"\n  Next step: Package with: skill-seekers package {self.skill_dir}/")
+
+    # ──────────────────────────────────────────────────────────────────────
+    # Private generators
+    # ──────────────────────────────────────────────────────────────────────
+
+    def _generate_reference_file(
+        self,
+        cat_key: str,
+        cat_data: dict[str, Any],
+        section_num: int,
+        total_categories: int,
+    ) -> None:
+        """Generate a reference markdown file for a content category.
+
+        Creates a markdown file containing all pages in the category, with
+        headings, text content, code examples, tables, images, and links.
+        Info/note/tip/warning macros are rendered as blockquote callouts.
+
+        Table cells have ``|`` escaped and newlines flattened so that each
+        extracted row remains a single, valid markdown table row.
+
+        Args:
+            cat_key: Category key (unused; the file stem comes from the title).
+            cat_data: Category dict with 'title' and 'pages' keys.
+            section_num: Position of this category (currently unused).
+            total_categories: Total number of categories (currently unused).
+        """
+        sections = cat_data["pages"]
+        # The file stem is derived from the title, as are the index.md links.
+        safe_key = self._sanitize_filename(cat_data["title"])
+        filename = f"{self.skill_dir}/references/{safe_key}.md"
+
+        def _cell(value: Any) -> str:
+            # A raw "|" or newline inside a cell would break the table row.
+            return str(value).replace("|", "\\|").replace("\n", " ")
+
+        with open(filename, "w", encoding="utf-8") as f:
+            f.write(f"# {cat_data['title']}\n\n")
+
+            for section in sections:
+                sec_num = section.get("section_number", "?")
+                heading = section.get("heading", "")
+                labels = section.get("labels", [])
+
+                f.write(f"---\n\n**Page {sec_num}: {heading}**\n\n")
+
+                # Labels
+                if labels:
+                    label_str = ", ".join(f"`{lbl}`" for lbl in labels)
+                    f.write(f"**Labels:** {label_str}\n\n")
+
+                # Sub-headings (shifted one level down, capped at six '#')
+                for sub in section.get("headings", []):
+                    sub_level = sub.get("level", "h3")
+                    sub_text = sub.get("text", "")
+                    if sub_text:
+                        md_depth = int(sub_level[1]) + 1 if sub_level else 4
+                        md_depth = min(md_depth, 6)
+                        f.write(f"{'#' * md_depth} {sub_text}\n\n")
+
+                # Text content
+                if section.get("text"):
+                    f.write(f"{section['text']}\n\n")
+
+                # Code samples
+                code_list = section.get("code_samples", [])
+                if code_list:
+                    f.write("### Code Examples\n\n")
+                    for code in code_list:
+                        lang = code.get("language", "")
+                        title = code.get("title", "")
+                        if title:
+                            f.write(f"**{title}**\n\n")
+                        f.write(f"```{lang}\n{code['code']}\n```\n\n")
+
+                # Tables
+                table_list = section.get("tables", [])
+                if table_list:
+                    for table in table_list:
+                        headers = table.get("headers", [])
+                        rows = table.get("rows", [])
+                        if headers:
+                            f.write("| " + " | ".join(_cell(h) for h in headers) + " |\n")
+                            f.write("| " + " | ".join("---" for _ in headers) + " |\n")
+                        for row in rows:
+                            f.write("| " + " | ".join(_cell(c) for c in row) + " |\n")
+                        f.write("\n")
+
+                # Images
+                image_list = section.get("images", [])
+                if image_list:
+                    for img in image_list:
+                        alt = img.get("alt", "Image")
+                        src = img.get("src", "")
+                        if src:
+                            f.write(f"![{alt}]({src})\n\n")
+
+                # Links (only the first 20 per page are listed)
+                link_list = section.get("links", [])
+                if link_list:
+                    f.write("### Related Links\n\n")
+                    for link in link_list[:20]:
+                        f.write(f"- [{link['text']}]({link['href']})\n")
+                    f.write("\n")
+
+                # Non-code macros (panels, notes, warnings, etc.)
+                macro_list = section.get("macros", [])
+                if macro_list:
+                    for macro in macro_list:
+                        macro_type = macro.get("type", "")
+                        macro_content = macro.get("content", "")
+                        macro_title = macro.get("title", "")
+
+                        if macro_type in ("info", "note", "tip", "warning"):
+                            # All four callouts share one blockquote layout,
+                            # labelled with the upper-cased macro type.
+                            header = f"> **{macro_type.upper()}**"
+                            if macro_title:
+                                header += f": {macro_title}"
+                            f.write(f"{header}\n")
+                            for line in macro_content.split("\n"):
+                                f.write(f"> {line}\n")
+                            f.write("\n")
+                        elif macro_type == "panel":
+                            if macro_title:
+                                f.write(f"**{macro_title}**\n\n")
+                            if macro_content:
+                                f.write(f"{macro_content}\n\n")
+                        elif macro_type == "expand":
+                            expand_title = macro_title or "Details"
+                            f.write(f"<details>\n<summary>{expand_title}</summary>\n\n")
+                            f.write(f"{macro_content}\n\n")
+                            f.write("</details>\n\n")
+                        elif macro_content:
+                            f.write(f"{macro_content}\n\n")
+
+                f.write("---\n\n")
+
+        print(f"    Generated: {filename}")
+
+    def _generate_index(self, categorised: dict[str, dict[str, Any]]) -> None:
+        """Write ``references/index.md`` summarising the generated references.
+
+        The index names the space when known, links every category file with
+        its page count, reports statistics and renders the stored page tree.
+
+        Args:
+            categorised: Dict of category_key -> category data.
+        """
+        filename = f"{self.skill_dir}/references/index.md"
+
+        with open(filename, "w", encoding="utf-8") as f:
+            f.write(f"# {self.name.title()} Confluence Reference\n\n")
+            data = self.extracted_data
+
+            # Optional space banner: "**Space:** <name> (<key>)"
+            space = data.get("space_info", {})
+            if space.get("name"):
+                key_suffix = f" ({space['key']})" if space.get("key") else ""
+                f.write(f"**Space:** {space['name']}{key_suffix}\n\n")
+
+            # One bullet per category, linking to its reference file
+            f.write("## Categories\n\n")
+            for entry in categorised.values():
+                target = self._sanitize_filename(entry["title"])
+                f.write(f"- [{entry['title']}]({target}.md) ({len(entry['pages'])} pages)\n")
+
+            f.write("\n## Statistics\n\n")
+            f.write(f"- Total pages: {data.get('total_sections', 0)}\n")
+            f.write(f"- Code blocks: {data.get('total_code_blocks', 0)}\n")
+            f.write(f"- Images: {data.get('total_images', 0)}\n")
+
+            # Languages, most frequent first
+            langs = data.get("languages_detected", {})
+            if langs:
+                f.write(f"- Programming languages: {len(langs)}\n\n")
+                f.write("**Language Breakdown:**\n\n")
+                ranked = sorted(langs.items(), key=lambda item: item[1], reverse=True)
+                for lang, count in ranked:
+                    f.write(f"- {lang}: {count} examples\n")
+
+            # Page tree structure
+            page_tree = data.get("page_tree", [])
+            if page_tree:
+                f.write("\n## Page Tree\n\n```\n")
+                self._write_tree_structure(f, page_tree, indent=0)
+                f.write("```\n")
+
+        print(f"    Generated: {filename}")
+
+    def _write_tree_structure(
+        self,
+        f: Any,
+        nodes: list[dict[str, Any]],
+        indent: int = 0,
+    ) -> None:
+        """Emit a nested page tree as an indented bullet list.
+
+        Every node becomes one ``- <title>`` line, indented by two spaces per
+        nesting level. A node's children are written depth-first, directly
+        beneath it, before its next sibling.
+
+        Args:
+            f: Writable text handle that receives the output.
+            nodes: Tree nodes; each may carry a 'title' and a 'children' list.
+            indent: Nesting depth of ``nodes`` (0 for the root level).
+        """
+        pad = "  " * indent
+        for entry in nodes:
+            f.write(f"{pad}- {entry.get('title', 'Untitled')}\n")
+            descendants = entry.get("children", [])
+            if not descendants:
+                continue
+            self._write_tree_structure(f, descendants, indent + 1)
+
+    def _generate_skill_md(self, categorised: dict[str, dict[str, Any]]) -> None:
+        """Generate the main SKILL.md file.
+
+        Writes, in order: YAML frontmatter (name, description), a title and
+        description, optional space metadata, usage guidance, a content
+        overview listing each category, key topics, up to ten of the
+        highest-scoring code examples grouped by language (at most three
+        shown per language), documentation statistics, navigation links to
+        the reference files, and a footer. Finally prints the generated path
+        together with its line count.
+
+        Args:
+            categorised: Dict of category_key -> category data; each value
+                must provide 'title' and 'pages'.
+        """
+        filename = f"{self.skill_dir}/SKILL.md"
+        space_info = self.extracted_data.get("space_info", {})
+
+        # Skill name for frontmatter (lowercase, hyphens, max 64 chars)
+        skill_name = self.name.lower().replace("_", "-").replace(" ", "-")[:64]
+        # The frontmatter value is written as a single-line scalar, so collapse
+        # line breaks / whitespace runs first: a multi-line description would
+        # otherwise spill out of the "description:" key. Capped at 1024 chars.
+        desc = " ".join(self.description.split())[:1024]
+
+        with open(filename, "w", encoding="utf-8") as f:
+            # YAML frontmatter
+            f.write("---\n")
+            f.write(f"name: {skill_name}\n")
+            f.write(f"description: {desc}\n")
+            f.write("---\n\n")
+
+            # Header (the body keeps the description exactly as provided)
+            space_name = space_info.get("name", self.name.title())
+            f.write(f"# {space_name} Documentation Skill\n\n")
+            f.write(f"{self.description}\n\n")
+
+            # Space metadata
+            if space_info.get("key"):
+                f.write("## Space Information\n\n")
+                f.write(f"**Space:** {space_info.get('name', 'N/A')}\n")
+                f.write(f"**Key:** {space_info.get('key', 'N/A')}\n")
+                source = self.extracted_data.get("source", "")
+                if source:
+                    f.write(f"**Source:** {source}\n")
+                f.write(f"**Pages:** {self.extracted_data.get('total_pages', 0)}\n\n")
+
+            # When to Use
+            f.write("## When to Use This Skill\n\n")
+            f.write("Use this skill when you need to:\n")
+            f.write(f"- Understand {space_name} concepts and architecture\n")
+            f.write("- Look up API references and technical specifications\n")
+            f.write("- Find code examples and implementation patterns\n")
+            f.write("- Review processes, guidelines, and best practices\n")
+            f.write("- Navigate the documentation structure and find related pages\n\n")
+
+            # Content overview
+            total_pages = self.extracted_data.get("total_sections", 0)
+            f.write("## Content Overview\n\n")
+            f.write(f"**Total Pages:** {total_pages}\n\n")
+            f.write("**Categories:**\n\n")
+            for cat_data in categorised.values():
+                page_count = len(cat_data["pages"])
+                f.write(f"- **{cat_data['title']}**: {page_count} pages\n")
+            f.write("\n")
+
+            # Key topics from headings
+            f.write(self._format_key_topics())
+
+            # Code examples (top quality). Samples are copied before tagging so
+            # the extracted data itself is not modified.
+            all_code: list[dict[str, Any]] = []
+            for section in self.extracted_data.get("pages", []):
+                for code in section.get("code_samples", []):
+                    code_copy = dict(code)
+                    code_copy["source_page"] = section.get("heading", "")
+                    all_code.append(code_copy)
+
+            all_code.sort(key=lambda x: x.get("quality_score", 0), reverse=True)
+            top_code = all_code[:10]
+
+            if top_code:
+                f.write("## Code Examples\n\n")
+                f.write("*Top code examples from the documentation*\n\n")
+
+                by_lang: dict[str, list[dict[str, Any]]] = {}
+                for code in top_code:
+                    lang = code.get("language", "") or "unknown"
+                    by_lang.setdefault(lang, []).append(code)
+
+                for lang in sorted(by_lang.keys()):
+                    examples = by_lang[lang]
+                    lang_display = lang.title() if lang != "unknown" else "Other"
+                    f.write(f"### {lang_display} ({len(examples)} examples)\n\n")
+                    for i, code in enumerate(examples[:3], 1):
+                        quality = code.get("quality_score", 0)
+                        source = code.get("source_page", "")
+                        title = code.get("title", "")
+                        code_text = code.get("code", "")
+
+                        header_parts = [f"**Example {i}**"]
+                        if title:
+                            header_parts.append(f"({title})")
+                        if source:
+                            header_parts.append(f"from *{source}*")
+                        header_parts.append(f"[Quality: {quality:.1f}/10]")
+                        f.write(" ".join(header_parts) + ":\n\n")
+
+                        f.write(f"```{lang}\n")
+                        # Long samples are cut at 500 characters and marked "..."
+                        if len(code_text) <= 500:
+                            f.write(code_text)
+                        else:
+                            f.write(code_text[:500] + "\n...")
+                        f.write("\n```\n\n")
+
+            # Statistics
+            f.write("## Documentation Statistics\n\n")
+            f.write(f"- **Total Pages**: {total_pages}\n")
+            f.write(f"- **Code Blocks**: {self.extracted_data.get('total_code_blocks', 0)}\n")
+            f.write(f"- **Images**: {self.extracted_data.get('total_images', 0)}\n")
+            f.write(f"- **Categories**: {len(categorised)}\n")
+
+            langs = self.extracted_data.get("languages_detected", {})
+            if langs:
+                f.write(f"- **Programming Languages**: {len(langs)}\n\n")
+                f.write("**Language Breakdown:**\n\n")
+                for lang, count in sorted(langs.items(), key=lambda x: x[1], reverse=True):
+                    f.write(f"- {lang}: {count} examples\n")
+            # Blank line closes the statistics section in either case
+            f.write("\n")
+
+            # Navigation
+            f.write("## Navigation\n\n")
+            f.write("**Reference Files:**\n\n")
+            for cat_data in categorised.values():
+                safe_name = self._sanitize_filename(cat_data["title"])
+                f.write(f"- `references/{safe_name}.md` - {cat_data['title']}\n")
+            f.write("\n")
+            f.write("See `references/index.md` for complete documentation structure.\n\n")
+
+            # Footer
+            f.write("---\n\n")
+            f.write("**Generated by Skill Seekers** | Confluence Documentation Scraper\n")
+
+        # Count actual lines; splitting on "\n" would report one extra line
+        # because the file ends with a newline.
+        with open(filename, encoding="utf-8") as f:
+            line_count = sum(1 for _ in f)
+        print(f"    Generated: {filename} ({line_count} lines)")
+
+    def _format_key_topics(self) -> str:
+        """Build the "Key Topics" markdown section from extracted headings.
+
+        Gathers every page heading longer than three characters, plus h2/h3
+        sub-headings of the same minimum length (a sub-heading without a
+        level is treated as h3). At most 15 page titles and 20 distinct
+        sub-headings are listed; anything beyond that is summarised with an
+        "...and N more" bullet.
+
+        Returns:
+            The markdown section, or an empty string when nothing qualifies.
+        """
+        titles: list[str] = []
+        subtopics: list[str] = []
+
+        for page in self.extracted_data.get("pages", []):
+            page_heading = page.get("heading", "").strip()
+            if len(page_heading) > 3:
+                titles.append(page_heading)
+
+            for entry in page.get("headings", []):
+                label = entry.get("text", "").strip()
+                if len(label) <= 3:
+                    continue
+                if entry.get("level", "h3") in ("h2", "h3"):
+                    subtopics.append(label)
+
+        if not (titles or subtopics):
+            return ""
+
+        parts: list[str] = [
+            "## Key Topics\n\n",
+            "*Main topics covered in this documentation*\n\n",
+        ]
+
+        if titles:
+            parts.append("**Pages:**\n\n")
+            parts.extend(f"- {shown}\n" for shown in titles[:15])
+            hidden_titles = len(titles) - 15
+            if hidden_titles > 0:
+                parts.append(f"- *...and {hidden_titles} more*\n")
+            parts.append("\n")
+
+        if subtopics:
+            # dict.fromkeys drops repeats while keeping first-seen order
+            distinct = list(dict.fromkeys(subtopics))
+            parts.append("**Subtopics:**\n\n")
+            parts.extend(f"- {shown}\n" for shown in distinct[:20])
+            hidden_subs = len(distinct) - 20
+            if hidden_subs > 0:
+                parts.append(f"- *...and {hidden_subs} more*\n")
+            parts.append("\n")
+
+        return "".join(parts)
+
+    # ──────────────────────────────────────────────────────────────────────
+    # Utility helpers
+    # ──────────────────────────────────────────────────────────────────────
+
+    def _sanitize_filename(self, name: str) -> str:
+        """Convert a string to a safe filename.
+
+        Lowercases the input, removes every character that is not a word
+        character, whitespace, or hyphen, then collapses each run of
+        whitespace/hyphens into a single underscore. The result is capped at
+        100 characters.
+
+        Args:
+            name: Raw string to sanitise.
+
+        Returns:
+            Filesystem-safe filename string; ``"unnamed"`` when nothing of the
+            input survives sanitisation.
+        """
+        safe = re.sub(r"[^\w\s-]", "", name.lower())
+        safe = re.sub(r"[-\s]+", "_", safe)
+        # Limit filename length. Fall back to a placeholder so a title made up
+        # solely of punctuation does not yield an empty name (i.e. ".md").
+        return safe[:100] or "unnamed"
+
+
+# ──────────────────────────────────────────────────────────────────────────────
+# Module-level helpers
+# ──────────────────────────────────────────────────────────────────────────────
+
+
+def _score_code_quality(code: str) -> float:
+    """Simple quality heuristic for code blocks (0-10 scale).
+
+    Starts from a neutral 5.0 and adjusts it:
+
+    - +2.0 for 10 or more lines, +1.0 for 5 to 9 lines.
+    - +1.5 if a function/class definition keyword is present.
+    - +0.5 if an import/require/#include/using statement is present.
+    - +0.5 if any line starts with four spaces of indentation.
+    - +0.3 if common code punctuation (``= : { } ( ) [ ]``) is present.
+    - -2.0 if the snippet is shorter than 30 characters.
+
+    Args:
+        code: Source code string.
+
+    Returns:
+        Quality score between 0.0 and 10.0 (0.0 for empty input).
+    """
+    if not code:
+        return 0.0
+
+    score = 5.0
+    lines = code.strip().split("\n")
+    line_count = len(lines)
+
+    # More lines = more substantial
+    if line_count >= 10:
+        score += 2.0
+    elif line_count >= 5:
+        score += 1.0
+
+    # Has function/class definitions
+    if re.search(r"\b(def |class |function |func |fn )", code):
+        score += 1.5
+
+    # Has imports/require. "#include" is matched outside the \b group: "#" is
+    # not a word character, so \b can never match directly before it at the
+    # start of a line and the directive would otherwise go undetected.
+    if re.search(r"\b(import |from .+ import|require\(|using )|#include", code):
+        score += 0.5
+
+    # Has indentation (structured code)
+    if re.search(r"^    ", code, re.MULTILINE):
+        score += 0.5
+
+    # Has assignment, operators, or common code syntax
+    if re.search(r"[=:{}()\[\]]", code):
+        score += 0.3
+
+    # Very short snippets get penalised
+    if len(code) < 30:
+        score -= 2.0
+
+    # Clamp to the documented range
+    return min(10.0, max(0.0, score))
+
+
+# ──────────────────────────────────────────────────────────────────────────────
+# CLI entry point
+# ──────────────────────────────────────────────────────────────────────────────
+
+
+def main() -> int:
+    """CLI entry point for the Confluence scraper.
+
+    Parses command-line arguments and runs the extraction/build pipeline.
+    Supports three workflows:
+
+    1. **API mode**: ``--base-url URL --space-key KEY --name my-skill``
+    2. **Export mode**: ``--export-path ./export-dir/ --name my-skill``
+    3. **Build from JSON**: ``--from-json my-skill_extracted.json``
+
+    Processing failures are reported on stderr and signalled through the
+    return value instead of ``sys.exit``, so code that calls ``main()``
+    in-process receives the exit code. Invalid argument combinations are
+    still rejected through ``parser.error``.
+
+    Returns:
+        Exit code (0 for success, non-zero for failure).
+    """
+    parser = argparse.ArgumentParser(
+        description="Convert Confluence documentation to AI-ready skills",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog=(
+            "Examples:\n"
+            "  %(prog)s --base-url https://wiki.example.com "
+            "--space-key PROJ --name my-wiki\n"
+            "  %(prog)s --export-path ./confluence-export/ --name my-wiki\n"
+            "  %(prog)s --from-json my-wiki_extracted.json\n"
+        ),
+    )
+
+    # Standard shared arguments
+    from .arguments.common import add_all_standard_arguments
+
+    add_all_standard_arguments(parser)
+
+    # Override enhance-level default to 0 for Confluence
+    for action in parser._actions:
+        if hasattr(action, "dest") and action.dest == "enhance_level":
+            action.default = 0
+            action.help = (
+                "AI enhancement level (auto-detects API vs LOCAL mode): "
+                "0=disabled (default for Confluence), 1=SKILL.md only, "
+                "2=+architecture/config, 3=full enhancement. "
+                "Mode selection: uses API if ANTHROPIC_API_KEY is set, "
+                "otherwise LOCAL (Claude Code)"
+            )
+
+    # Confluence-specific arguments
+    parser.add_argument(
+        "--base-url",
+        type=str,
+        help="Confluence instance base URL (e.g., https://wiki.example.com)",
+        metavar="URL",
+    )
+    parser.add_argument(
+        "--space-key",
+        type=str,
+        help="Confluence space key to extract (e.g., PROJ, DEV)",
+        metavar="KEY",
+    )
+    parser.add_argument(
+        "--export-path",
+        type=str,
+        help="Path to Confluence HTML/XML export directory",
+        metavar="PATH",
+    )
+    parser.add_argument(
+        "--username",
+        type=str,
+        help=("Confluence username / email for API auth (or set CONFLUENCE_USERNAME env var)"),
+        metavar="USER",
+    )
+    parser.add_argument(
+        "--token",
+        type=str,
+        help=("Confluence API token for API auth (or set CONFLUENCE_TOKEN env var)"),
+        metavar="TOKEN",
+    )
+    parser.add_argument(
+        "--max-pages",
+        type=int,
+        default=500,
+        help="Maximum number of pages to fetch (default: 500)",
+        metavar="N",
+    )
+    parser.add_argument(
+        "--from-json",
+        type=str,
+        help="Build skill from previously extracted JSON data",
+        metavar="FILE",
+    )
+
+    args = parser.parse_args()
+
+    # Setup logging
+    if getattr(args, "quiet", False):
+        logging.basicConfig(level=logging.WARNING, format="%(message)s")
+    elif getattr(args, "verbose", False):
+        logging.basicConfig(level=logging.DEBUG, format="%(levelname)s: %(message)s")
+    else:
+        logging.basicConfig(level=logging.INFO, format="%(message)s")
+
+    # Handle --dry-run: report what would be processed, then stop before any
+    # validation or extraction takes place.
+    if getattr(args, "dry_run", False):
+        source = (
+            getattr(args, "base_url", None)
+            or getattr(args, "export_path", None)
+            or getattr(args, "from_json", None)
+            or "(none)"
+        )
+        print(f"\n{'=' * 60}")
+        print("DRY RUN: Confluence Extraction")
+        print(f"{'=' * 60}")
+        print(f"Source:         {source}")
+        print(f"Space key:      {getattr(args, 'space_key', None) or '(N/A)'}")
+        print(f"Name:           {getattr(args, 'name', None) or '(auto-detect)'}")
+        print(f"Max pages:      {getattr(args, 'max_pages', 500)}")
+        print(f"Enhance level:  {getattr(args, 'enhance_level', 0)}")
+        print("\n  Dry run complete")
+        return 0
+
+    # Validate inputs (API mode needs both the base URL and the space key)
+    has_api = getattr(args, "base_url", None) and getattr(args, "space_key", None)
+    has_export = getattr(args, "export_path", None)
+    has_json = getattr(args, "from_json", None)
+
+    if not (has_api or has_export or has_json):
+        parser.error(
+            "Must specify one of:\n"
+            "  --base-url URL --space-key KEY (API mode)\n"
+            "  --export-path PATH (export mode)\n"
+            "  --from-json FILE (build from JSON)"
+        )
+
+    # Build from pre-extracted JSON (takes precedence over the other modes;
+    # no extraction or enhancement is run on this path)
+    if has_json:
+        name = getattr(args, "name", None) or Path(args.from_json).stem.replace("_extracted", "")
+        config: dict[str, Any] = {
+            "name": name,
+            "description": (
+                getattr(args, "description", None) or f"Use when referencing {name} documentation"
+            ),
+        }
+        try:
+            converter = ConfluenceToSkillConverter(config)
+            converter.load_extracted_data(args.from_json)
+            converter.build_skill()
+        except Exception as e:
+            print(f"\n  Error: {e}", file=sys.stderr)
+            return 1
+        return 0
+
+    # Determine name
+    if not getattr(args, "name", None):
+        if has_api:
+            args.name = args.space_key.lower()
+        elif has_export:
+            args.name = Path(args.export_path).name
+        else:
+            args.name = "confluence-skill"
+
+    # Build config
+    config = {
+        "name": args.name,
+        "base_url": getattr(args, "base_url", "") or "",
+        "space_key": getattr(args, "space_key", "") or "",
+        "export_path": getattr(args, "export_path", "") or "",
+        "username": getattr(args, "username", "") or "",
+        "token": getattr(args, "token", "") or "",
+        "max_pages": getattr(args, "max_pages", 500),
+    }
+    if getattr(args, "description", None):
+        config["description"] = args.description
+
+    def _run_local_enhancement(skill_dir: Any) -> None:
+        """Run the LOCAL enhancer headlessly on ``skill_dir``."""
+        from skill_seekers.cli.enhance_skill_local import (
+            LocalSkillEnhancer,
+        )
+
+        enhancer = LocalSkillEnhancer(Path(skill_dir))
+        enhancer.run(headless=True)
+        print("  Local enhancement complete!")
+
+    # Create converter and run
+    try:
+        converter = ConfluenceToSkillConverter(config)
+
+        if not converter.extract_confluence():
+            print("\n  Confluence extraction failed", file=sys.stderr)
+            return 1
+
+        converter.build_skill()
+
+        # Enhancement workflow integration
+        from skill_seekers.cli.workflow_runner import run_workflows
+
+        workflow_executed, workflow_names = run_workflows(args)
+        workflow_name = ", ".join(workflow_names) if workflow_names else None
+
+        # Traditional enhancement (complements workflow system)
+        if getattr(args, "enhance_level", 0) > 0:
+            api_key = getattr(args, "api_key", None) or os.environ.get("ANTHROPIC_API_KEY")
+            mode = "API" if api_key else "LOCAL"
+
+            print("\n" + "=" * 80)
+            print(f"  AI Enhancement ({mode} mode, level {args.enhance_level})")
+            print("=" * 80)
+            if workflow_executed:
+                print(f"    Running after workflow: {workflow_name}")
+                print(
+                    "    (Workflow provides specialised analysis,"
+                    " enhancement provides general improvements)"
+                )
+            print("")
+
+            skill_dir = converter.skill_dir
+            if api_key:
+                try:
+                    from skill_seekers.cli.enhance_skill import enhance_skill_md
+
+                    enhance_skill_md(skill_dir, api_key)
+                    print("  API enhancement complete!")
+                except ImportError:
+                    print("  API enhancement not available. Falling back to LOCAL mode...")
+                    _run_local_enhancement(skill_dir)
+            else:
+                _run_local_enhancement(skill_dir)
+
+    except (ValueError, RuntimeError, FileNotFoundError) as e:
+        print(f"\n  Error: {e}", file=sys.stderr)
+        return 1
+    except Exception as e:
+        print(f"\n  Unexpected error during Confluence processing: {e}", file=sys.stderr)
+        import traceback
+
+        traceback.print_exc()
+        return 1
+
+    return 0
+
+
+# Script entry point: run the CLI and hand main()'s return value to sys.exit.
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/src/skill_seekers/cli/create_command.py b/src/skill_seekers/cli/create_command.py
index d7374e4..6f01023 100644
--- a/src/skill_seekers/cli/create_command.py
+++ b/src/skill_seekers/cli/create_command.py
@@ -140,6 +140,26 @@ class CreateCommand:
             return self._route_video()
         elif self.source_info.type == "config":
             return self._route_config()
+        elif self.source_info.type == "jupyter":
+            return self._route_generic("jupyter_scraper", "--notebook")
+        elif self.source_info.type == "html":
+            return self._route_generic("html_scraper", "--html-path")
+        elif self.source_info.type == "openapi":
+            return self._route_generic("openapi_scraper", "--spec")
+        elif self.source_info.type == "asciidoc":
+            return self._route_generic("asciidoc_scraper", "--asciidoc-path")
+        elif self.source_info.type == "pptx":
+            return self._route_generic("pptx_scraper", "--pptx")
+        elif self.source_info.type == "rss":
+            return self._route_generic("rss_scraper", "--feed-path")
+        elif self.source_info.type == "manpage":
+            return self._route_generic("man_scraper", "--man-path")
+        elif self.source_info.type == "confluence":
+            return self._route_generic("confluence_scraper", "--export-path")
+        elif self.source_info.type == "notion":
+            return self._route_generic("notion_scraper", "--export-path")
+        elif self.source_info.type == "chat":
+            return self._route_generic("chat_scraper", "--export-path")
         else:
             logger.error(f"Unknown source type: {self.source_info.type}")
             return 1
@@ -485,6 +505,40 @@ class CreateCommand:
         finally:
             sys.argv = original_argv
 
+    def _route_generic(self, module_name: str, file_flag: str) -> int:
+        """Dispatch to a scraper module that takes a single source-path flag.
+
+        Shared by the newer source types (jupyter, html, openapi, asciidoc,
+        pptx, rss, manpage, confluence, notion, chat). The scraper module is
+        imported by name, an argv of ``[module_name, file_flag, <file_path>]``
+        plus the common arguments is assembled, and the module's ``main()``
+        is invoked with ``sys.argv`` temporarily swapped for that list.
+
+        Args:
+            module_name: Python module name under skill_seekers.cli (e.g., "jupyter_scraper")
+            file_flag: CLI flag for the source file (e.g., "--notebook")
+
+        Returns:
+            Exit code from scraper
+        """
+        import importlib
+
+        scraper = importlib.import_module(f"skill_seekers.cli.{module_name}")
+
+        # The flag is only passed when the parsed source carries a file path
+        source_path = self.source_info.parsed.get("file_path", "")
+        argv = [module_name, file_flag, source_path] if source_path else [module_name]
+
+        self._add_common_args(argv)
+
+        logger.debug(f"Calling {module_name} with argv: {argv}")
+        saved_argv = sys.argv
+        sys.argv = argv
+        try:
+            return scraper.main()
+        finally:
+            # Always restore the caller's argv, even if the scraper raises
+            sys.argv = saved_argv
+
     def _add_common_args(self, argv: list[str]) -> None:
         """Add truly universal arguments to argv list.
 
diff --git a/src/skill_seekers/cli/html_scraper.py b/src/skill_seekers/cli/html_scraper.py
new file mode 100644
index 0000000..1109c16
--- /dev/null
+++ b/src/skill_seekers/cli/html_scraper.py
@@ -0,0 +1,1942 @@
+#!/usr/bin/env python3
+"""
+Local HTML Documentation to Skill Converter
+
+Converts local HTML files or directories of HTML files into skills.
+Uses BeautifulSoup for HTML parsing and content extraction. Supports single
+HTML files (.html/.htm) and directories containing multiple HTML files.
+
+Extracts document structure, headings, main content, code blocks, tables,
+images, and links. Converts extracted content to clean markdown-like output
+suitable for AI skill consumption.
+
+Usage:
+    skill-seekers html --html-path page.html --name myskill
+    skill-seekers html --html-path ./docs/ --name myskill
+    skill-seekers html --from-json page_extracted.json
+"""
+
+import argparse
+import json
+import logging
+import os
+import re
+import sys
+from pathlib import Path
+
+# BeautifulSoup is a core dependency (always available)
+from bs4 import BeautifulSoup, Comment, Tag
+
+logger = logging.getLogger(__name__)
+
+# File extensions treated as HTML
+HTML_EXTENSIONS = {".html", ".htm", ".xhtml"}
+
+
+def infer_description_from_html(metadata: dict | None = None, name: str = "") -> str:
+    """Derive a "Use when ..." skill description from HTML metadata.
+
+    Preference order:
+
+    1. The metadata description, if longer than 20 characters: stripped,
+       shortened to 150 characters (147 plus "...") and lowercased.
+    2. The metadata title, if longer than 10 characters, lowercased.
+    3. A generic sentence built from ``name``, or a fully generic one when
+       ``name`` is empty.
+
+    Args:
+        metadata: HTML document metadata dict (title, description, author, etc.)
+        name: Skill name for fallback
+
+    Returns:
+        Description string suitable for "Use when..." format
+    """
+    meta = metadata or {}
+
+    summary = meta.get("description")
+    if summary and len(summary) > 20:
+        summary = summary.strip()
+        if len(summary) > 150:
+            summary = summary[:147] + "..."
+        return f"Use when {summary.lower()}"
+
+    title = meta.get("title")
+    if title and len(title) > 10:
+        return f"Use when working with {title.lower()}"
+
+    if name:
+        return f"Use when referencing {name} documentation"
+    return "Use when referencing this documentation"
+
+
+def _collect_html_files(html_path: str) -> list[Path]:
+    """Resolve a path to the list of HTML files it denotes.
+
+    A file path yields a one-element list, provided its suffix is one of
+    HTML_EXTENSIONS (compared case-insensitively). A directory is searched
+    recursively and every matching file is returned in sorted order.
+
+    Args:
+        html_path: Path to an HTML file or directory containing HTML files.
+
+    Returns:
+        Sorted list of Path objects pointing to HTML files.
+
+    Raises:
+        FileNotFoundError: If the path does not exist.
+        ValueError: If the file is not HTML, the directory holds no HTML
+            files, or the path is neither a file nor a directory.
+    """
+    target = Path(html_path)
+
+    if not target.exists():
+        raise FileNotFoundError(f"HTML path not found: {html_path}")
+
+    if target.is_file():
+        if target.suffix.lower() in HTML_EXTENSIONS:
+            return [target]
+        raise ValueError(f"Not an HTML file (expected .html/.htm/.xhtml): {html_path}")
+
+    if not target.is_dir():
+        raise ValueError(f"Path is neither a file nor a directory: {html_path}")
+
+    matches = [
+        entry
+        for entry in target.rglob("*")
+        if entry.is_file() and entry.suffix.lower() in HTML_EXTENSIONS
+    ]
+    matches.sort()
+    if not matches:
+        raise ValueError(f"No HTML files found in directory: {html_path}")
+    return matches
+
+
+class HtmlToSkillConverter:
+    """Convert local HTML files to a skill.
+
+    Supports single HTML files and directories of HTML files. Parses document
+    structure, extracts headings, content, code blocks, tables, images, and
+    links, then builds a complete skill directory structure.
+
+    Attributes:
+        config: Configuration dict with name, html_path, description.
+        name: Skill name.
+        html_path: Path to the HTML file or directory.
+        description: Skill description text.
+        skill_dir: Output directory for the built skill.
+        data_file: Path to the intermediate extracted JSON file.
+        extracted_data: Parsed extraction results dict.
+    """
+
+    def __init__(self, config: dict) -> None:
+        """Set up converter state from a configuration dict.
+
+        Args:
+            config: Configuration dict containing:
+                - name (str): Skill name (required).
+                - html_path (str): Path to HTML file or directory (optional).
+                - description (str): Skill description (optional); a generic
+                  "Use when referencing <name> documentation" is used when
+                  missing or empty.
+                - categories (dict): Category definitions for content grouping.
+        """
+        self.config = config
+        skill_name: str = config["name"]
+        self.name: str = skill_name
+        self.html_path: str = config.get("html_path", "")
+
+        fallback_description = f"Use when referencing {skill_name} documentation"
+        self.description: str = config.get("description") or fallback_description
+
+        # Output locations, both under ./output and keyed by the skill name
+        self.skill_dir = f"output/{skill_name}"
+        self.data_file = f"output/{skill_name}_extracted.json"
+
+        # Optional category definitions
+        self.categories: dict = config.get("categories", {})
+
+        # No extraction results yet
+        self.extracted_data: dict | None = None
+
+    # ------------------------------------------------------------------
+    # Extraction
+    # ------------------------------------------------------------------
+
+    def extract_html(self) -> bool:
+        """Extract content from local HTML file(s).
+
+        Workflow:
+        1. Collect HTML files from path (single file or directory)
+        2. For each file: parse with BeautifulSoup (html.parser)
+        3. Extract document metadata (title, meta tags)
+        4. Extract main content using common selectors (article, main, etc.)
+        5. Split content by h1/h2 heading boundaries into sections
+        6. Extract code blocks from <pre>/<code> elements
+        7. Extract tables and convert to markdown-ready dicts
+        8. Extract images and links
+        9. Detect code languages via LanguageDetector
+        10. Save intermediate JSON to {name}_extracted.json
+
+        Returns:
+            True on success.
+
+        Raises:
+            FileNotFoundError: If the HTML path does not exist.
+            ValueError: If no valid HTML files are found.
+        """
+        from skill_seekers.cli.language_detector import LanguageDetector
+
+        print(f"\n🔍 Extracting from HTML: {self.html_path}")
+
+        html_files = _collect_html_files(self.html_path)
+        print(f"   Found {len(html_files)} HTML file(s)")
+
+        # Aggregate metadata from the first file
+        aggregate_metadata: dict = {}
+        all_sections: list[dict] = []
+        total_images = 0
+        section_number = 0
+
+        for file_path in html_files:
+            try:
+                raw_html = file_path.read_text(encoding="utf-8", errors="ignore")
+            except Exception as e:
+                logger.warning("Could not read %s: %s", file_path, e)
+                continue
+
+            soup = BeautifulSoup(raw_html, "html.parser")
+
+            # Extract metadata from first file (or merge)
+            file_meta = self._extract_metadata(soup, file_path)
+            if not aggregate_metadata:
+                aggregate_metadata = file_meta
+            elif file_meta.get("title"):
+                # Keep track of all titles for multi-file mode
+                existing = aggregate_metadata.get("all_titles", [])
+                if aggregate_metadata.get("title"):
+                    existing.append(aggregate_metadata["title"])
+                existing.append(file_meta["title"])
+                aggregate_metadata["all_titles"] = existing
+
+            print(f"   Processing: {file_path.name}")
+
+            # Clean the soup
+            self._clean_soup(soup)
+
+            # Find main content area
+            main_content = self._find_main_content(soup)
+
+            # Split into sections by heading boundaries
+            file_sections, img_count = self._extract_sections(
+                main_content, section_number, file_path
+            )
+            section_number += len(file_sections)
+            total_images += img_count
+            all_sections.extend(file_sections)
+
+        # If no sections were created, warn
+        if not all_sections:
+            logger.warning("No sections extracted from HTML files")
+
+        # Update description from metadata if not set explicitly
+        if not self.config.get("description"):
+            self.description = infer_description_from_html(aggregate_metadata, self.name)
+
+        print(f"   Title: {aggregate_metadata.get('title', 'Unknown')}")
+        print(f"   Author: {aggregate_metadata.get('author', 'Unknown')}")
+
+        # Detect languages for code samples
+        detector = LanguageDetector(min_confidence=0.15)
+        languages_detected: dict[str, int] = {}
+        total_code_blocks = 0
+
+        for section in all_sections:
+            for code_sample in section.get("code_samples", []):
+                lang = code_sample.get("language", "")
+                if lang:
+                    languages_detected[lang] = languages_detected.get(lang, 0) + 1
+                total_code_blocks += 1
+
+        # Detect languages for samples without language
+        for section in all_sections:
+            for code_sample in section.get("code_samples", []):
+                if not code_sample.get("language"):
+                    code = code_sample.get("code", "")
+                    if code:
+                        lang, confidence = detector.detect_from_code(code)
+                        if lang and confidence >= 0.3:
+                            code_sample["language"] = lang
+                            languages_detected[lang] = languages_detected.get(lang, 0) + 1
+
+        result_data = {
+            "source_file": self.html_path,
+            "metadata": aggregate_metadata,
+            "total_sections": len(all_sections),
+            "total_code_blocks": total_code_blocks,
+            "total_images": total_images,
+            "total_files": len(html_files),
+            "languages_detected": languages_detected,
+            "pages": all_sections,
+        }
+
+        # Save extracted data
+        os.makedirs(os.path.dirname(self.data_file), exist_ok=True)
+        with open(self.data_file, "w", encoding="utf-8") as f:
+            json.dump(result_data, f, indent=2, ensure_ascii=False, default=str)
+
+        print(f"\n💾 Saved extracted data to: {self.data_file}")
+        self.extracted_data = result_data
+        print(
+            f"✅ Extracted {len(all_sections)} sections, "
+            f"{total_code_blocks} code blocks, "
+            f"{total_images} images from {len(html_files)} file(s)"
+        )
+        return True
+
+    # ------------------------------------------------------------------
+    # Metadata extraction
+    # ------------------------------------------------------------------
+
+    def _extract_metadata(self, soup: BeautifulSoup, file_path: Path) -> dict:
+        """Collect document-level metadata from a parsed HTML page.
+
+        Sources, in order of precedence:
+        - ``<title>`` for the title and ``<meta name="...">`` for description,
+          author, keywords and generator
+        - OpenGraph ``og:title`` / ``og:description`` when the title or the
+          description is still empty
+        - ``<html lang="...">`` for the language
+        - the file name (underscores/hyphens turned into spaces, title-cased)
+          as the last-resort title
+
+        Args:
+            soup: Parsed BeautifulSoup document.
+            file_path: Path to the source file (recorded as ``source_file``
+                and used for the fallback title).
+
+        Returns:
+            Dict with the keys title, author, description, language,
+            keywords, generator and source_file. Fields that could not be
+            determined stay None.
+        """
+
+        def _meta_content(attr: str, value: str) -> str | None:
+            # Stripped ``content`` of the first matching <meta>, or None when
+            # the tag is missing or its content attribute is empty.
+            node = soup.find("meta", attrs={attr: value})
+            if node and node.get("content"):
+                return node["content"].strip()
+            return None
+
+        metadata: dict[str, str | None] = dict.fromkeys(
+            ("title", "author", "description", "language", "keywords", "generator")
+        )
+        metadata["source_file"] = str(file_path)
+
+        # <title> tag
+        title_node = soup.find("title")
+        if title_node:
+            metadata["title"] = title_node.get_text(strip=True)
+
+        # Standard <meta name="..."> fields map one-to-one onto metadata keys
+        for field in ("description", "author", "keywords", "generator"):
+            content = _meta_content("name", field)
+            if content is not None:
+                metadata[field] = content
+
+        # OpenGraph values only fill gaps left by the standard tags
+        for field in ("title", "description"):
+            if metadata[field]:
+                continue
+            content = _meta_content("property", f"og:{field}")
+            if content is not None:
+                metadata[field] = content
+
+        # Language from <html lang="...">
+        root = soup.find("html")
+        if root and root.get("lang"):
+            metadata["language"] = root["lang"]
+
+        # Last resort: derive a readable title from the file name
+        if not metadata["title"]:
+            metadata["title"] = file_path.stem.replace("_", " ").replace("-", " ").title()
+
+        return metadata
+
+    # ------------------------------------------------------------------
+    # Soup cleaning
+    # ------------------------------------------------------------------
+
+    def _clean_soup(self, soup: BeautifulSoup) -> None:
+        """Remove non-content elements from the parsed HTML.
+
+        Strips scripts, styles, navigation, footers, ads, comments, and other
+        boilerplate elements that should not be part of the extracted content.
+
+        Boilerplate is recognised by tag name (``nav``, ``footer``, ``header``)
+        and by substring patterns in the ``class`` / ``id`` attributes.
+        ``<html>`` and ``<body>`` are never removed, even when their class or
+        id matches a pattern (e.g. ``<body class="has-sidebar">``), because
+        that would discard the whole document.
+
+        Args:
+            soup: BeautifulSoup object to clean in-place.
+        """
+        # Remove script and style elements
+        for tag in soup(["script", "style", "noscript"]):
+            tag.decompose()
+
+        # Remove HTML comments
+        for comment in soup.find_all(string=lambda t: isinstance(t, Comment)):
+            comment.extract()
+
+        # Remove common boilerplate elements by tag
+        for tag_name in ("nav", "footer", "header"):
+            for tag in soup.find_all(tag_name):
+                # Keep a header that holds an h1/h2 (likely the document title)
+                if tag_name == "header" and tag.find(["h1", "h2"]):
+                    continue
+                tag.decompose()
+
+        # Remove common boilerplate by class/id patterns
+        boilerplate_patterns = [
+            "sidebar",
+            "menu",
+            "navbar",
+            "breadcrumb",
+            "pagination",
+            "cookie",
+            "banner",
+            "advertisement",
+            "ad-",
+            "social-share",
+            "share-buttons",
+            "comment-section",
+            "comments",
+        ]
+        # All patterns are plain substrings except "ad-", which must start a
+        # token; otherwise it would also hit "read-only", "download-link",
+        # "head-title" and similar unrelated names.
+        needles = [
+            r"(?<![a-z0-9])ad-" if pattern == "ad-" else re.escape(pattern)
+            for pattern in boilerplate_patterns
+        ]
+        boilerplate_re = re.compile("|".join(needles))
+
+        def _is_boilerplate(value) -> bool:
+            # BeautifulSoup calls an attribute filter with a *string*: for the
+            # multi-valued ``class`` attribute it passes each class token in
+            # turn and then the space-joined value. Joining the characters of
+            # that string (as a list would be joined) can never match, so the
+            # value is only joined when it really is a list/tuple.
+            if not value:
+                return False
+            if isinstance(value, (list, tuple)):
+                value = " ".join(value)
+            return boilerplate_re.search(str(value).lower()) is not None
+
+        for attr_name in ("class", "id"):
+            for elem in soup.find_all(attrs={attr_name: _is_boilerplate}):
+                # Descendants of an element removed earlier in this loop are
+                # already gone; the document root and body must survive.
+                if getattr(elem, "decomposed", False) or elem.name in ("html", "body"):
+                    continue
+                elem.decompose()
+
+    # ------------------------------------------------------------------
+    # Main content detection
+    # ------------------------------------------------------------------
+
+    def _find_main_content(self, soup: BeautifulSoup) -> Tag | BeautifulSoup:
+        """Locate the element that most likely holds the page's real content.
+
+        Candidates are tried in this order and the first acceptable one wins:
+        1. ``<main>``, then ``<article>``, then any element with
+           ``role="main"`` (each needs more than 50 characters of text)
+        2. a ``<div>`` or ``<section>`` carrying a well-known content class
+           or id such as ``content`` or ``main-content`` (more than 50
+           characters of text)
+        3. the ``<div>`` with the most text, if it exceeds 200 characters
+        4. ``<body>``, or the whole soup when there is no body
+
+        Args:
+            soup: Cleaned BeautifulSoup document.
+
+        Returns:
+            The chosen element, or the soup itself as the final fallback.
+        """
+
+        def _has_text(node, minimum: int = 50) -> bool:
+            # True when the node exists and carries more than ``minimum``
+            # characters of stripped text.
+            return bool(node) and len(node.get_text(strip=True)) > minimum
+
+        # Semantic HTML5 containers and the ARIA main landmark
+        semantic_lookups = (
+            lambda: soup.find("main"),
+            lambda: soup.find("article"),
+            lambda: soup.find(attrs={"role": "main"}),
+        )
+        for lookup in semantic_lookups:
+            candidate = lookup()
+            if _has_text(candidate):
+                return candidate
+
+        # Conventional class names first, then conventional ids
+        class_names = (
+            "content",
+            "main-content",
+            "page-content",
+            "post-content",
+            "entry-content",
+            "article-content",
+            "documentation",
+            "doc-content",
+        )
+        id_names = ("content", "main-content", "main", "article", "documentation")
+        selectors = [{"class_": name} for name in class_names]
+        selectors += [{"id": name} for name in id_names]
+
+        for selector in selectors:
+            # class_ matches an element when the name is one of its classes
+            candidate = soup.find("div", **selector) or soup.find("section", **selector)
+            if _has_text(candidate):
+                return candidate
+
+        # Heuristic: the <div> holding the most text (first one wins a tie)
+        biggest = None
+        biggest_len = -1
+        for div in soup.find_all("div"):
+            size = len(div.get_text(strip=True))
+            if size > biggest_len:
+                biggest, biggest_len = div, size
+        if biggest is not None and biggest_len > 200:
+            return biggest
+
+        # Fallback: body or entire soup
+        body = soup.find("body")
+        if body:
+            return body
+        return soup
+
+    # ------------------------------------------------------------------
+    # Section extraction
+    # ------------------------------------------------------------------
+
+    def _extract_sections(
+        self,
+        content: Tag | BeautifulSoup,
+        start_section_number: int,
+        source_file: Path,
+    ) -> tuple[list[dict], int]:
+        """Split the content element into sections at h1/h2 boundaries.
+
+        Only the direct children of ``content`` are examined. Every h1 or h2
+        closes the section collected so far and opens a new one; all other
+        tags are accumulated and handed to ``_build_section``. Content that
+        precedes the first heading becomes a section without a heading.
+        Text nodes that are direct children of ``content`` are skipped.
+
+        Args:
+            content: BeautifulSoup Tag containing the main content.
+            start_section_number: Number of sections emitted so far; the
+                first section produced here is numbered one higher.
+            source_file: Path to the source HTML file.
+
+        Returns:
+            Tuple of (sections list, image count).
+        """
+        sections: list[dict] = []
+        image_total = 0
+        last_number = start_section_number
+
+        pending_heading: str | None = None
+        pending_level: str | None = None
+        pending: list = []
+
+        def _flush() -> None:
+            # Emit the section collected so far; a no-op when nothing is
+            # pending (no heading seen yet and no elements gathered).
+            nonlocal last_number, image_total
+            if pending_heading is None and not pending:
+                return
+            last_number += 1
+            built, found_images = self._build_section(
+                last_number,
+                pending_heading,
+                pending_level,
+                pending,
+                source_file,
+            )
+            sections.append(built)
+            image_total += found_images
+
+        for node in content.children:
+            tag_name = getattr(node, "name", None)
+            if tag_name is None:
+                # Bare text node directly under the container: not collected
+                continue
+
+            if tag_name not in ("h1", "h2"):
+                pending.append(node)
+                continue
+
+            # A new top-level heading closes whatever came before it
+            _flush()
+            pending_heading = node.get_text(strip=True)
+            pending_level = tag_name
+            pending = []
+
+        # Whatever follows the last heading forms the final section
+        _flush()
+
+        return sections, image_total
+
+    def _build_section(
+        self,
+        section_number: int,
+        heading: str | None,
+        heading_level: str | None,
+        elements: list,
+        source_file: Path,
+    ) -> tuple[dict, int]:
+        """Build a section dict from a list of BeautifulSoup elements.
+
+        Processes each element to extract text paragraphs, code samples,
+        tables, sub-headings, images, and links. Top-level h3-h6, pre/code,
+        table and img elements are consumed directly; every other element is
+        treated as a container that is searched for nested code blocks,
+        tables, images and links before its remaining content is rendered
+        to text.
+
+        Note:
+            Nested ``<pre>`` and ``<table>`` elements are removed from the
+            tree (``decompose``) once captured, so the passed elements are
+            modified in place.
+
+        Args:
+            section_number: 1-based section index.
+            heading: Heading text (or None for preamble content).
+            heading_level: Heading level string ('h1', 'h2', etc.).
+            elements: List of BeautifulSoup Tag objects in this section.
+            source_file: Path to the source HTML file for resolving links.
+
+        Returns:
+            Tuple of (section dict, image count found in this section).
+        """
+        text_parts: list[str] = []
+        code_samples: list[dict] = []
+        tables: list[dict] = []
+        sub_headings: list[dict] = []
+        images: list[dict] = []
+        links: list[dict] = []
+
+        for elem in elements:
+            if not hasattr(elem, "name") or elem.name is None:
+                continue
+
+            tag = elem.name
+
+            # Sub-headings (h3, h4, h5, h6) within the section
+            if tag in ("h3", "h4", "h5", "h6"):
+                sub_text = elem.get_text(strip=True)
+                if sub_text:
+                    sub_headings.append({"level": tag, "text": sub_text})
+                continue
+
+            # Code blocks — <pre> or standalone <code>
+            if tag == "pre" or (tag == "code" and elem.find_parent("pre") is None):
+                extracted = self._extract_code_blocks(elem)
+                if extracted:
+                    code_samples.extend(extracted)
+                continue
+
+            # Tables
+            if tag == "table":
+                table_data = self._extract_tables(elem)
+                if table_data:
+                    tables.append(table_data)
+                continue
+
+            # Images (top-level)
+            if tag == "img":
+                img_info = self._extract_image_info(elem, source_file)
+                if img_info:
+                    img_info["index"] = len(images)
+                    images.append(img_info)
+                continue
+
+            # For container elements, recursively look for nested content
+            nested_codes = elem.find_all("pre")
+            for pre in nested_codes:
+                extracted = self._extract_code_blocks(pre)
+                if extracted:
+                    code_samples.extend(extracted)
+                pre.decompose()  # Remove so we don't double-count text
+
+            nested_tables = elem.find_all("table")
+            for tbl in nested_tables:
+                table_data = self._extract_tables(tbl)
+                if table_data:
+                    tables.append(table_data)
+                tbl.decompose()  # Same reason: keep table cells out of the text
+
+            nested_images = elem.find_all("img")
+            for img in nested_images:
+                img_info = self._extract_image_info(img, source_file)
+                if img_info:
+                    img_info["index"] = len(images)
+                    images.append(img_info)
+
+            # Extract links from this element. find_all() only searches
+            # descendants, so an <a> that is itself a direct child of the
+            # content area has to be added explicitly.
+            link_tags = elem.find_all("a", href=True)
+            if tag == "a" and elem.get("href") is not None:
+                link_tags = [elem, *link_tags]
+            for a_tag in link_tags:
+                link_info = self._extract_link_info(a_tag, source_file)
+                if link_info:
+                    links.append(link_info)
+
+            # Regular text/paragraph content
+            # NOTE(review): _html_to_text formats the *children* of elem, so a
+            # top-level <ul>/<ol>/<blockquote> is rendered without its own
+            # list/quote markers — confirm whether that is intended.
+            text = self._html_to_text(elem)
+            if text and text.strip():
+                text_parts.append(text.strip())
+
+        image_count = len(images)
+
+        section_dict = {
+            "section_number": section_number,
+            "heading": heading or "",
+            "heading_level": heading_level or "h1",
+            "text": "\n\n".join(text_parts),
+            "headings": sub_headings,
+            "code_samples": code_samples,
+            "tables": tables,
+            "images": images,
+            "links": links,
+            "source_file": str(source_file.name),
+        }
+        return section_dict, image_count
+
+    # ------------------------------------------------------------------
+    # Code block extraction
+    # ------------------------------------------------------------------
+
+    def _extract_code_blocks(self, elem: Tag) -> list[dict]:
+        """Turn a <pre> or <code> element into a list of code sample dicts.
+
+        Recognised shapes:
+        - ``<pre><code class="language-python">...</code></pre>``: text of
+          the inner ``<code>``; language from its classes, falling back to
+          the classes on the ``<pre>``
+        - ``<pre class="code">...</pre>``: text and classes of the ``<pre>``
+        - standalone ``<code>...</code>``: only when longer than 30
+          characters after stripping
+
+        The language comes from CSS classes only; it is an empty string when
+        no class gives a hint.
+
+        Args:
+            elem: A BeautifulSoup Tag (<pre> or <code>).
+
+        Returns:
+            List with at most one dict holding 'code', 'language' and
+            'quality_score'; empty for blank code or any other tag.
+        """
+        kind = elem.name
+
+        if kind == "pre":
+            inner = elem.find("code")
+            if inner:
+                snippet = inner.get_text()
+                language = self._detect_language_from_classes(
+                    inner
+                ) or self._detect_language_from_classes(elem)
+            else:
+                snippet = elem.get_text()
+                language = self._detect_language_from_classes(elem)
+            snippet = snippet.strip()
+            if not snippet:
+                return []
+        elif kind == "code":
+            # Short inline snippets are not worth recording as samples
+            snippet = elem.get_text().strip()
+            if len(snippet) <= 30:
+                return []
+            language = self._detect_language_from_classes(elem)
+        else:
+            return []
+
+        return [
+            {
+                "code": snippet,
+                "language": language,
+                "quality_score": _score_code_quality(snippet),
+            }
+        ]
+
+    def _detect_language_from_classes(self, elem: Tag) -> str:
+        """Detect programming language from CSS classes on an element.
+
+        Common conventions: ``language-python``, ``lang-js``, ``code-ruby``,
+        ``highlight-go``, ``brush: python`` (SyntaxHighlighter), and bare
+        language names as class values.
+
+        A class that consists of a prefix only (``brush:`` followed by a
+        separate ``python`` token, or a stray ``language-``) does not end the
+        search with an empty result; the remaining classes are still checked.
+
+        Args:
+            elem: BeautifulSoup Tag with potential language class.
+
+        Returns:
+            Detected language string (lower-case), or empty string if not
+            found.
+        """
+        raw_classes = elem.get("class", [])
+        if not raw_classes:
+            return ""
+
+        # The class attribute is normally a list of tokens, but a parser set
+        # up without multi-valued attributes hands back one plain string.
+        if isinstance(raw_classes, str):
+            raw_classes = raw_classes.split()
+        classes = [str(cls).lower() for cls in raw_classes]
+
+        # Known class prefixes for language hints
+        prefixes = ("language-", "lang-", "code-", "highlight-", "brush:")
+        for position, cls in enumerate(classes):
+            for prefix in prefixes:
+                if not cls.startswith(prefix):
+                    continue
+                remainder = cls[len(prefix) :]
+                if remainder:
+                    return remainder
+                # class="brush: js; toolbar: false" is split on whitespace,
+                # so the language is the token following "brush:".
+                if prefix == "brush:" and position + 1 < len(classes):
+                    following = classes[position + 1].rstrip(";")
+                    if following:
+                        return following
+
+        # Check for bare language names
+        known_langs = {
+            "python",
+            "javascript",
+            "typescript",
+            "java",
+            "ruby",
+            "go",
+            "rust",
+            "cpp",
+            "c",
+            "csharp",
+            "php",
+            "swift",
+            "kotlin",
+            "scala",
+            "html",
+            "css",
+            "sql",
+            "bash",
+            "shell",
+            "json",
+            "yaml",
+            "xml",
+            "markdown",
+            "r",
+            "perl",
+            "lua",
+            "dart",
+            "haskell",
+            "elixir",
+            "clojure",
+            "jsx",
+            "tsx",
+        }
+        for cls in classes:
+            if cls in known_langs:
+                return cls
+
+        return ""
+
+    def _detect_language(self, code: str) -> str:
+        """Detect programming language from code content using heuristics.
+
+        Performs lightweight pattern matching against common language
+        features and returns the language of the first pattern that matches.
+        Unambiguous markers (a shell shebang, ``<?php``) are checked before
+        the keyword heuristics, and ``export`` only counts as JavaScript when
+        followed by a JavaScript construct, so shell ``export VAR=value``
+        lines are not mislabelled.
+
+        Args:
+            code: Source code string.
+
+        Returns:
+            Best-guess language string, or empty string if unknown or if the
+            input is shorter than 10 characters.
+        """
+        if not code or len(code) < 10:
+            return ""
+
+        # Quick heuristic patterns (ordered by specificity)
+        patterns: list[tuple[str, str]] = [
+            # Explicit markers that identify the language on their own
+            (r"#!\s*/(usr/)?bin/(env\s+)?(ba)?sh\b", "bash"),
+            (r"<\?php\b", "php"),
+            (r"\bdef\s+\w+\s*\(.*\)\s*(->\s*\w+)?\s*:", "python"),
+            (r"\bimport\s+\w+\s*\n|from\s+\w+\s+import\b", "python"),
+            (r"\bclass\s+\w+.*:\s*$", "python"),
+            (r"\bfunction\s+\w+\s*\(", "javascript"),
+            (r"\bconst\s+\w+\s*=\s*(async\s+)?\(", "javascript"),
+            # ES module export; a bare "export NAME=..." is shell, not JS
+            (
+                r"\bexport\s+(?:(?:default|const|let|var|function|class|async)\b|[{*])",
+                "javascript",
+            ),
+            (r"\binterface\s+\w+\s*\{", "typescript"),
+            (r":\s*(string|number|boolean|void)\b", "typescript"),
+            (r"\bpackage\s+\w+;", "java"),
+            (r"\bpublic\s+class\s+\w+", "java"),
+            (r"\bfn\s+\w+\s*\(", "rust"),
+            (r"\blet\s+mut\s+", "rust"),
+            (r"\bfunc\s+\w+\s*\(", "go"),
+            (r"\bpackage\s+main\b", "go"),
+            (r"\$\w+\s*=\s*", "php"),
+            (r"#include\s*<\w+", "c"),
+            (r"\bint\s+main\s*\(", "c"),
+            (r"\bstd::", "cpp"),
+            (r"\busing\s+namespace\s+", "cpp"),
+            (r"\brequire\s*\(", "javascript"),
+            (r"^\s*<\w+[\s>]", "html"),
+            (r"SELECT\s+.*\s+FROM\s+", "sql"),
+            (r"\b(if|for|while)\s*\[", "bash"),
+        ]
+
+        for pattern, lang in patterns:
+            if re.search(pattern, code, re.MULTILINE | re.IGNORECASE):
+                return lang
+
+        return ""
+
+    # ------------------------------------------------------------------
+    # Table extraction
+    # ------------------------------------------------------------------
+
+    def _extract_tables(self, table_elem: Tag) -> dict | None:
+        """Extract an HTML table and convert to a markdown-ready dict.
+
+        Handles <thead>/<tbody> structure as well as header-less tables.
+        The first row of an explicit <thead> provides the headers; without a
+        <thead> the first non-empty row is used instead. Every other row of
+        the table is returned as data, including rows in additional <tbody>
+        sections and in <tfoot>, and including data rows whose text happens
+        to equal the header row.
+
+        Args:
+            table_elem: BeautifulSoup <table> Tag.
+
+        Returns:
+            Dict with 'headers' (list[str]) and 'rows' (list[list[str]]),
+            or None if the table has no meaningful content.
+        """
+        headers: list[str] = []
+        rows: list[list[str]] = []
+
+        # Try <thead> first for headers
+        thead = table_elem.find("thead")
+        header_row = thead.find("tr") if thead else None
+        if header_row is not None:
+            headers = [th.get_text(strip=True) for th in header_row.find_all(["th", "td"])]
+
+        # Data rows: every <tr> except the header row itself. The header is
+        # skipped by identity rather than by comparing cell text, so a data
+        # row that repeats the header values is not lost.
+        for row in table_elem.find_all("tr"):
+            if row is header_row:
+                continue
+            cells = [td.get_text(strip=True) for td in row.find_all(["td", "th"])]
+            if cells:
+                rows.append(cells)
+
+        # If no explicit thead, use first row as header
+        if not headers and rows:
+            headers = rows.pop(0)
+
+        if not headers and not rows:
+            return None
+
+        return {"headers": headers, "rows": rows}
+
+    # ------------------------------------------------------------------
+    # Image and link extraction
+    # ------------------------------------------------------------------
+
+    def _extract_image_info(self, img_elem: Tag, source_file: Path) -> dict | None:
+        """Extract image information from an <img> tag.
+
+        Captures src, alt text, title, and dimensions. Resolves relative
+        src paths against the source file location.
+
+        ``width`` / ``height`` are reported as whole pixels. Values that are
+        not a pixel count (``"100%"``, ``"auto"``, empty or missing) become 0
+        instead of raising, so a single odd attribute cannot abort the
+        extraction of a whole document.
+
+        Args:
+            img_elem: BeautifulSoup <img> Tag.
+            source_file: Path to the containing HTML file.
+
+        Returns:
+            Image info dict or None if the img has no src.
+        """
+        src = img_elem.get("src", "")
+        if not src:
+            return None
+
+        def _pixels(raw) -> int:
+            # Accepts "640", " 640 ", "640px" and "12.5px" (truncated to 12);
+            # anything else, e.g. a percentage, yields 0.
+            match = re.fullmatch(
+                r"\s*(\d+)(?:\.\d+)?\s*(?:px)?\s*", str(raw or ""), re.IGNORECASE
+            )
+            return int(match.group(1)) if match else 0
+
+        # Resolve relative paths
+        resolved_src = self._resolve_relative_path(src, source_file)
+
+        return {
+            "src": resolved_src,
+            "alt": img_elem.get("alt", ""),
+            "title": img_elem.get("title", ""),
+            "width": _pixels(img_elem.get("width")),
+            "height": _pixels(img_elem.get("height")),
+            "data": b"",  # Placeholder; actual image data loaded separately
+        }
+
+    def _extract_link_info(self, a_elem: Tag, source_file: Path) -> dict | None:
+        """Describe a hyperlink found in the document.
+
+        Links without an href, ``javascript:`` links, in-page anchors
+        (``#...``) and links without visible text are ignored. The href of
+        every other link is resolved via ``_resolve_relative_path``.
+
+        Args:
+            a_elem: BeautifulSoup <a> Tag with href attribute.
+            source_file: Path to the containing HTML file.
+
+        Returns:
+            Dict with 'href', 'text' and 'title', or None for ignored links.
+        """
+        target = a_elem.get("href", "")
+        if not target:
+            return None
+        if target.startswith(("javascript:", "#")):
+            return None
+
+        label = a_elem.get_text(strip=True)
+        if not label:
+            return None
+
+        return {
+            "href": self._resolve_relative_path(target, source_file),
+            "text": label,
+            "title": a_elem.get("title", ""),
+        }
+
+    def _resolve_relative_path(self, path: str, source_file: Path) -> str:
+        """Resolve a relative path against the source file's directory.
+
+        Anything that carries a URL scheme (``https:``, ``data:``,
+        ``mailto:``, ``tel:``, ``ftp:``, ...) and protocol-relative URLs
+        (``//host/...``) are returned as-is. A scheme needs at least two
+        characters, so Windows drive letters such as ``C:`` are still
+        treated as file paths. Relative paths are resolved against the
+        source file's parent directory and returned as POSIX-style strings.
+
+        Args:
+            path: The URL or relative path to resolve.
+            source_file: The HTML file containing this reference.
+
+        Returns:
+            Resolved path or URL string. If the path cannot be resolved it
+            is returned unchanged.
+        """
+        # URLs of any scheme and protocol-relative URLs — return as-is
+        if path.startswith("//") or re.match(r"[A-Za-z][A-Za-z0-9+.\-]+:", path):
+            return path
+
+        # Resolve relative to source file directory
+        try:
+            return (source_file.parent / path).resolve().as_posix()
+        except (OSError, RuntimeError, ValueError):
+            # Best effort: an unresolvable reference is kept verbatim
+            return path
+
+    # ------------------------------------------------------------------
+    # HTML-to-text conversion
+    # ------------------------------------------------------------------
+
+    def _html_to_text(self, elem: Tag) -> str:
+        """Convert an HTML element to clean markdown-like text.
+
+        Processes the element's children recursively, converting:
+        - text nodes to plain text (runs of whitespace collapsed to a space)
+        - <p>/<div> to paragraphs with double newlines
+        - <br> to newlines
+        - <strong>/<b> to **bold**
+        - <em>/<i> to *italic*
+        - <a> to [text](href) markdown links
+        - <ul>/<ol> to markdown list items
+        - <blockquote> to > prefixed lines
+        - <code> (inline) to `backticks`
+        - <h3>-<h6> to markdown headings
+        - <dl> to **term**: definition lines, <hr> to ---
+
+        Text nodes are recognised by their ``name`` being None, which is how
+        BeautifulSoup's NavigableString reports itself; testing for the mere
+        presence of a ``name`` attribute does not work because strings have
+        one too. HTML comments are also text nodes, so the tree is expected
+        to have been stripped of them beforehand.
+
+        Args:
+            elem: BeautifulSoup Tag to convert.
+
+        Returns:
+            Text string with markdown formatting; never more than one blank
+            line in a row.
+        """
+        if elem.name is None:
+            return str(elem).strip()
+
+        parts: list[str] = []
+
+        for child in elem.children:
+            if getattr(child, "name", None) is None:
+                # Text node (raw text between tags)
+                text = str(child)
+                if text.strip():
+                    # Source indentation and line breaks carry no meaning
+                    parts.append(re.sub(r"\s+", " ", text))
+                elif parts and not parts[-1][-1:].isspace():
+                    # Whitespace-only node: keep one space so neighbouring
+                    # inline elements do not run together
+                    parts.append(" ")
+                continue
+
+            tag = child.name
+
+            if tag == "br":
+                parts.append("\n")
+            elif tag in ("p", "div"):
+                inner = self._html_to_text(child)
+                if inner.strip():
+                    parts.append(f"\n\n{inner.strip()}\n\n")
+            elif tag in ("strong", "b"):
+                inner = child.get_text(strip=True)
+                if inner:
+                    parts.append(f"**{inner}**")
+            elif tag in ("em", "i"):
+                inner = child.get_text(strip=True)
+                if inner:
+                    parts.append(f"*{inner}*")
+            elif tag == "a" and child.get("href"):
+                link_text = child.get_text(strip=True)
+                href = child.get("href", "")
+                if link_text and href and not href.startswith("javascript:"):
+                    parts.append(f"[{link_text}]({href})")
+                elif link_text:
+                    parts.append(link_text)
+            elif tag in ("ul", "ol"):
+                items = child.find_all("li", recursive=False)
+                for idx, li in enumerate(items):
+                    li_text = li.get_text(strip=True)
+                    if li_text:
+                        prefix = f"{idx + 1}." if tag == "ol" else "-"
+                        parts.append(f"\n{prefix} {li_text}")
+                parts.append("\n")
+            elif tag == "blockquote":
+                bq_text = child.get_text(strip=True)
+                if bq_text:
+                    lines = bq_text.split("\n")
+                    quoted = "\n".join(f"> {line}" for line in lines)
+                    parts.append(f"\n\n{quoted}\n\n")
+            elif tag == "code":
+                # Inline code (not inside <pre>)
+                if child.find_parent("pre") is None:
+                    code_text = child.get_text()
+                    if code_text.strip():
+                        parts.append(f"`{code_text.strip()}`")
+            elif tag in ("h3", "h4", "h5", "h6"):
+                level = int(tag[1])
+                inner = child.get_text(strip=True)
+                if inner:
+                    parts.append(f"\n\n{'#' * level} {inner}\n\n")
+            elif tag == "dl":
+                # Definition lists
+                for dt in child.find_all("dt"):
+                    term = dt.get_text(strip=True)
+                    dd = dt.find_next_sibling("dd")
+                    definition = dd.get_text(strip=True) if dd else ""
+                    parts.append(f"\n**{term}**: {definition}")
+                parts.append("\n")
+            elif tag == "hr":
+                parts.append("\n\n---\n\n")
+            else:
+                # Generic element — extract text
+                inner = self._html_to_text(child)
+                if inner.strip():
+                    parts.append(inner)
+
+        result = "".join(parts)
+        # Collapse excessive whitespace
+        result = re.sub(r"\n{3,}", "\n\n", result)
+        return result
+
+    # ------------------------------------------------------------------
+    # Load / Categorize / Build
+    # ------------------------------------------------------------------
+
+    def load_extracted_data(self, json_path: str) -> bool:
+        """Read an intermediate extraction JSON file into ``self.extracted_data``.
+
+        The parsed document is stored on the instance and a one-line summary
+        is printed, using its ``total_sections`` value or, when that key is
+        absent, the number of entries under ``pages``.
+
+        Args:
+            json_path: Location of the previously extracted JSON file.
+
+        Returns:
+            Always ``True``; failures to open or parse the file propagate as
+            exceptions.
+        """
+        print(f"\n📂 Loading extracted data from: {json_path}")
+
+        with open(json_path, encoding="utf-8") as handle:
+            self.extracted_data = json.load(handle)
+
+        data = self.extracted_data
+        page_count = len(data.get("pages", []))
+        section_total = data.get("total_sections", page_count)
+        print(f"✅ Loaded {section_total} sections")
+        return True
+
+    def categorize_content(self) -> dict:
+        """Group the extracted sections into named categories.
+
+        Strategies are tried in this order:
+
+        1. ``total_files`` is 1 and ``self.html_path`` is an existing file:
+           every section goes into one category named after the file stem.
+        2. ``total_files`` is greater than 1: one category per
+           ``source_file`` stem.
+        3. ``self.categories`` is non-empty: it is either taken as already
+           categorized (its first value is a non-empty list of dicts), or
+           each section is assigned to the category with the most matching
+           keywords in the section text/heading; sections matching nothing
+           go to ``other``.
+        4. Otherwise all sections are placed in a single ``content`` category.
+
+        Returns:
+            Dict mapping category keys to dicts with 'title' and 'pages'.
+        """
+        print("\n📋 Categorizing content...")
+
+        data = self.extracted_data
+        all_sections = data.get("pages", [])
+        file_count = data.get("total_files", 1)
+        result: dict[str, dict] = {}
+
+        # Strategy 1: a single HTML file becomes a single category.
+        if file_count == 1 and self.html_path:
+            html_file = Path(self.html_path)
+            if html_file.is_file():
+                stem = html_file.stem
+                result[self._sanitize_filename(stem)] = {
+                    "title": stem,
+                    "pages": all_sections,
+                }
+                print("✅ Created 1 category (single HTML file)")
+                print(f"   - {stem}: {len(all_sections)} sections")
+                return result
+
+        # Strategy 2: several files are grouped by the file each section came from.
+        if file_count > 1:
+            for entry in all_sections:
+                stem = Path(entry.get("source_file", "unknown")).stem
+                bucket = result.setdefault(
+                    self._sanitize_filename(stem),
+                    {"title": stem, "pages": []},
+                )
+                bucket["pages"].append(entry)
+
+            print(f"✅ Created {len(result)} categories (multi-file)")
+            for bucket in result.values():
+                print(f"   - {bucket['title']}: {len(bucket['pages'])} sections")
+            return result
+
+        if not self.categories:
+            # Strategy 4: nothing configured, so everything lands in one bucket.
+            result["content"] = {"title": "Content", "pages": all_sections}
+        else:
+            sample = next(iter(self.categories.values()), None)
+            if isinstance(sample, list) and sample and isinstance(sample[0], dict):
+                # Strategy 3a: values are already lists of section dicts.
+                for key, pages in self.categories.items():
+                    result[key] = {
+                        "title": key.replace("_", " ").title(),
+                        "pages": pages,
+                    }
+            else:
+                # Strategy 3b: values are keyword lists; start with empty buckets.
+                for key in self.categories:
+                    result[key] = {
+                        "title": key.replace("_", " ").title(),
+                        "pages": [],
+                    }
+
+                for entry in all_sections:
+                    body = entry.get("text", "").lower()
+                    title = entry.get("heading", "").lower()
+
+                    hits: dict[str, int] = {}
+                    for key, keywords in self.categories.items():
+                        if not isinstance(keywords, list):
+                            continue
+                        matched = sum(
+                            1
+                            for kw in keywords
+                            if isinstance(kw, str) and (kw.lower() in body or kw.lower() in title)
+                        )
+                        if matched > 0:
+                            hits[key] = matched
+
+                    if hits:
+                        target = result[max(hits, key=hits.get)]
+                    else:
+                        target = result.setdefault("other", {"title": "Other", "pages": []})
+                    target["pages"].append(entry)
+
+        print(f"✅ Created {len(result)} categories")
+        for bucket in result.values():
+            print(f"   - {bucket['title']}: {len(bucket['pages'])} sections")
+
+        return result
+
+    def build_skill(self) -> None:
+        """Write the full skill directory for the currently loaded data.
+
+        Ensures the ``references``, ``scripts`` and ``assets`` directories
+        exist under ``self.skill_dir``, categorizes the extracted sections,
+        then writes one reference file per category, the reference index and
+        finally ``SKILL.md``.
+        """
+        print(f"\n🏗️  Building skill: {self.name}")
+
+        # Output layout
+        for subdir in ("references", "scripts", "assets"):
+            os.makedirs(f"{self.skill_dir}/{subdir}", exist_ok=True)
+
+        categorized = self.categorize_content()
+
+        # One reference file per category, numbered from 1
+        print("\n📝 Generating reference files...")
+        category_total = len(categorized)
+        for position, (cat_key, cat_data) in enumerate(categorized.items(), start=1):
+            self._generate_reference_file(cat_key, cat_data, position, category_total)
+
+        self._generate_index(categorized)
+        self._generate_skill_md(categorized)
+
+        print(f"\n✅ Skill built successfully: {self.skill_dir}/")
+        print(f"\n📦 Next step: Package with: skill-seekers package {self.skill_dir}/")
+
+    # ------------------------------------------------------------------
+    # Private generators
+    # ------------------------------------------------------------------
+
+    def _generate_reference_file(
+        self,
+        _cat_key: str,
+        cat_data: dict,
+        section_num: int,
+        total_sections: int,
+    ) -> None:
+        """Write the markdown reference file for one category.
+
+        The file starts with the category title and then renders every
+        section in order: a source banner, the section heading, its
+        sub-headings, body text, code samples, tables, images and up to 20
+        links.
+
+        File naming (inside ``<skill_dir>/references``):
+
+        * empty category: ``section_<NN>.md`` built from ``section_num``;
+        * exactly one category overall: ``<stem>.md``, or ``main.md`` when no
+          HTML path is set;
+        * otherwise: ``<stem or "section">_s<min>-s<max>.md`` built from the
+          section numbers in the category.
+
+        Args:
+            _cat_key: Category key (not used by this method).
+            cat_data: Category dict with 'title' and 'pages' keys.
+            section_num: 1-based position of this category, used in the
+                fallback filename.
+            total_sections: Total number of categories being written.
+        """
+        entries = cat_data["pages"]
+        refs_dir = f"{self.skill_dir}/references"
+
+        # Base name: file stem for a single file, skill name for a directory.
+        stem = ""
+        if self.html_path:
+            source_path = Path(self.html_path)
+            stem = source_path.stem if source_path.is_file() else self.name
+
+        if not entries:
+            filename = f"{refs_dir}/section_{section_num:02d}.md"
+        else:
+            numbers = [e.get("section_number", pos + 1) for pos, e in enumerate(entries)]
+            if total_sections == 1:
+                filename = f"{refs_dir}/{stem}.md" if stem else f"{refs_dir}/main.md"
+            else:
+                prefix = stem if stem else "section"
+                filename = f"{refs_dir}/{prefix}_s{min(numbers)}-s{max(numbers)}.md"
+
+        def table_row(cells) -> str:
+            """Render one markdown table row from an iterable of cell values."""
+            return "| " + " | ".join(str(cell) for cell in cells) + " |\n"
+
+        with open(filename, "w", encoding="utf-8") as out:
+            out.write(f"# {cat_data['title']}\n\n")
+
+            for entry in entries:
+                number = entry.get("section_number", "?")
+                heading = entry.get("heading", "")
+                level = entry.get("heading_level", "h1")
+                origin = entry.get("source_file", "")
+
+                # Source banner, optionally naming the originating file
+                banner = f"---\n\n**📄 Source: Section {number}**"
+                if origin:
+                    banner += f" *({origin})*"
+                out.write(banner + "\n\n")
+
+                # Section heading, rendered one level deeper than in the HTML
+                if heading:
+                    hashes = "#" * (int(level[1]) + 1) if level else "##"
+                    out.write(f"{hashes} {heading}\n\n")
+
+                # Sub-headings recorded inside the section
+                for sub in entry.get("headings", []):
+                    sub_level = sub.get("level", "h3")
+                    sub_text = sub.get("text", "")
+                    if not sub_text:
+                        continue
+                    hashes = "#" * (int(sub_level[1]) + 1) if sub_level else "###"
+                    out.write(f"{hashes} {sub_text}\n\n")
+
+                # Body text
+                if entry.get("text"):
+                    out.write(f"{entry['text']}\n\n")
+
+                # Fenced code samples
+                samples = entry.get("code_samples", [])
+                if samples:
+                    out.write("### Code Examples\n\n")
+                    for sample in samples:
+                        fence_lang = sample.get("language", "")
+                        out.write(f"```{fence_lang}\n{sample['code']}\n```\n\n")
+
+                # Tables; the header and separator rows appear only when headers exist
+                tables = entry.get("tables", [])
+                if tables:
+                    out.write("### Tables\n\n")
+                    for table in tables:
+                        header_cells = table.get("headers", [])
+                        body_rows = table.get("rows", [])
+                        if header_cells:
+                            out.write(table_row(header_cells))
+                            out.write(table_row("---" for _ in header_cells))
+                        for row in body_rows:
+                            out.write(table_row(row))
+                        out.write("\n")
+
+                # Images that have at least an alt text or a source
+                pictures = entry.get("images", [])
+                if pictures:
+                    out.write("### Images\n\n")
+                    for pic in pictures:
+                        alt = pic.get("alt", "")
+                        src = pic.get("src", "")
+                        title = pic.get("title", "")
+                        if not (alt or src):
+                            continue
+                        label = alt or title or f"Image {pic.get('index', 0)}"
+                        out.write(f"![{label}]({src})\n\n")
+
+                # Links, capped at 20 per section
+                hyperlinks = entry.get("links", [])
+                if hyperlinks:
+                    out.write("### Links\n\n")
+                    for hyperlink in hyperlinks[:20]:
+                        out.write(f"- [{hyperlink['text']}]({hyperlink['href']})\n")
+                    out.write("\n")
+
+                out.write("---\n\n")
+
+        print(f"   Generated: {filename}")
+
+    def _generate_index(self, categorized: dict) -> None:
+        """Write ``references/index.md`` listing every category.
+
+        Each category gets a bullet linking to its reference file (named with
+        the same rules ``_generate_reference_file`` uses), its section count
+        and the range of section numbers it covers. A statistics block with
+        totals read from ``self.extracted_data`` follows, plus the author
+        when the metadata provides one.
+
+        Args:
+            categorized: Dict of category_key -> category data.
+        """
+        index_path = f"{self.skill_dir}/references/index.md"
+
+        # Base name: file stem for a single file, skill name for a directory.
+        stem = ""
+        if self.html_path:
+            source_path = Path(self.html_path)
+            stem = source_path.stem if source_path.is_file() else self.name
+
+        single_category = len(categorized) == 1
+
+        with open(index_path, "w", encoding="utf-8") as out:
+            out.write(f"# {self.name.title()} Documentation Reference\n\n")
+            out.write("## Categories\n\n")
+
+            for position, cat_data in enumerate(categorized.values(), start=1):
+                entries = cat_data["pages"]
+
+                if entries:
+                    numbers = [e.get("section_number", pos + 1) for pos, e in enumerate(entries)]
+                    low, high = min(numbers), max(numbers)
+                    coverage = f"Sections {low}-{high}"
+                    if single_category:
+                        target = f"{stem}.md" if stem else "main.md"
+                    else:
+                        prefix = stem if stem else "section"
+                        target = f"{prefix}_s{low}-s{high}.md"
+                else:
+                    target = f"section_{position:02d}.md"
+                    coverage = "N/A"
+
+                out.write(
+                    f"- [{cat_data['title']}]({target}) ({len(entries)} sections, {coverage})\n"
+                )
+
+            out.write("\n## Statistics\n\n")
+            stat_rows = (
+                ("Total sections", "total_sections"),
+                ("Code blocks", "total_code_blocks"),
+                ("Images", "total_images"),
+                ("HTML files processed", "total_files"),
+            )
+            for label, key in stat_rows:
+                out.write(f"- {label}: {self.extracted_data.get(key, 0)}\n")
+
+            # Author line only when the metadata carries one
+            author = self.extracted_data.get("metadata", {}).get("author")
+            if author:
+                out.write(f"- Author: {author}\n")
+
+        print(f"   Generated: {index_path}")
+
+    def _generate_skill_md(self, categorized: dict) -> None:
+        """Generate main SKILL.md file.
+
+        Creates the top-level SKILL.md with YAML frontmatter, document
+        information, usage guidance, section overview, key concepts,
+        quick-reference patterns, code examples, table summary, statistics,
+        and navigation links.
+
+        The navigation section lists the reference files under the names they
+        are actually written with (see ``_reference_link_name``), so every
+        listed path exists after ``build_skill`` has run.
+
+        Args:
+            categorized: Dict of category_key -> category data, each with
+                'title' and 'pages' keys.
+        """
+        filename = f"{self.skill_dir}/SKILL.md"
+
+        # Frontmatter limits: name is cut to 64 characters, description to 1024.
+        skill_name = self.name.lower().replace("_", "-").replace(" ", "-")[:64]
+        desc = self.description[:1024]
+
+        with open(filename, "w", encoding="utf-8") as f:
+            # YAML frontmatter
+            f.write("---\n")
+            f.write(f"name: {skill_name}\n")
+            f.write(f"description: {desc}\n")
+            f.write("---\n\n")
+
+            f.write(f"# {self.name.title()} Documentation Skill\n\n")
+            f.write(f"{self.description}\n\n")
+
+            # Document metadata (only when at least one field is non-empty)
+            metadata = self.extracted_data.get("metadata", {})
+            if any(v for v in metadata.values() if v):
+                f.write("## 📋 Document Information\n\n")
+                if metadata.get("title"):
+                    f.write(f"**Title:** {metadata['title']}\n\n")
+                if metadata.get("author"):
+                    f.write(f"**Author:** {metadata['author']}\n\n")
+                if metadata.get("language"):
+                    f.write(f"**Language:** {metadata['language']}\n\n")
+                if metadata.get("description"):
+                    f.write(f"**Description:** {metadata['description']}\n\n")
+                if metadata.get("keywords"):
+                    f.write(f"**Keywords:** {metadata['keywords']}\n\n")
+                total_files = self.extracted_data.get("total_files", 1)
+                if total_files > 1:
+                    f.write(f"**Source files:** {total_files} HTML files\n\n")
+
+            # When to Use
+            f.write("## 💡 When to Use This Skill\n\n")
+            f.write("Use this skill when you need to:\n")
+            f.write(f"- Understand {self.name} concepts and fundamentals\n")
+            f.write("- Look up API references and technical specifications\n")
+            f.write("- Find code examples and implementation patterns\n")
+            f.write("- Review tutorials, guides, and best practices\n")
+            f.write("- Explore the complete documentation structure\n\n")
+
+            # Section Overview
+            total_sections = self.extracted_data.get("total_sections", 0)
+            f.write("## 📖 Section Overview\n\n")
+            f.write(f"**Total Sections:** {total_sections}\n\n")
+            f.write("**Content Breakdown:**\n\n")
+            for cat_data in categorized.values():
+                section_count = len(cat_data["pages"])
+                f.write(f"- **{cat_data['title']}**: {section_count} sections\n")
+            f.write("\n")
+
+            # Key Concepts from headings
+            f.write(self._format_key_concepts())
+
+            # Quick Reference patterns
+            f.write("## ⚡ Quick Reference\n\n")
+            f.write(self._format_patterns_from_content())
+
+            # Code examples (top 15 by quality score, grouped by language)
+            all_code: list[dict] = []
+            for section in self.extracted_data.get("pages", []):
+                all_code.extend(section.get("code_samples", []))
+
+            all_code.sort(key=lambda x: x.get("quality_score", 0), reverse=True)
+            top_code = all_code[:15]
+
+            if top_code:
+                f.write("## 📝 Code Examples\n\n")
+                f.write("*High-quality examples extracted from documentation*\n\n")
+
+                by_lang: dict[str, list] = {}
+                for code in top_code:
+                    lang = code.get("language", "unknown")
+                    by_lang.setdefault(lang, []).append(code)
+
+                for lang in sorted(by_lang.keys()):
+                    examples = by_lang[lang]
+                    f.write(f"### {lang.title()} Examples ({len(examples)})\n\n")
+                    # At most five examples per language, each cut to 500 chars
+                    for i, code in enumerate(examples[:5], 1):
+                        quality = code.get("quality_score", 0)
+                        code_text = code.get("code", "")
+                        f.write(f"**Example {i}** (Quality: {quality:.1f}/10):\n\n")
+                        f.write(f"```{lang}\n")
+                        if len(code_text) <= 500:
+                            f.write(code_text)
+                        else:
+                            f.write(code_text[:500] + "\n...")
+                        f.write("\n```\n\n")
+
+            # Table Summary (first 5 tables, first 5 rows each)
+            all_tables: list[tuple[str, dict]] = []
+            for section in self.extracted_data.get("pages", []):
+                for table in section.get("tables", []):
+                    all_tables.append((section.get("heading", ""), table))
+
+            if all_tables:
+                f.write("## 📊 Table Summary\n\n")
+                f.write(f"*{len(all_tables)} table(s) found in document*\n\n")
+                for section_heading, table in all_tables[:5]:
+                    if section_heading:
+                        f.write(f"**From section: {section_heading}**\n\n")
+                    headers = table.get("headers", [])
+                    rows = table.get("rows", [])
+                    if headers:
+                        f.write("| " + " | ".join(str(h) for h in headers) + " |\n")
+                        f.write("| " + " | ".join("---" for _ in headers) + " |\n")
+                        for row in rows[:5]:
+                            f.write("| " + " | ".join(str(c) for c in row) + " |\n")
+                        f.write("\n")
+
+            # Statistics
+            f.write("## 📊 Documentation Statistics\n\n")
+            f.write(f"- **Total Sections**: {total_sections}\n")
+            f.write(f"- **Code Blocks**: {self.extracted_data.get('total_code_blocks', 0)}\n")
+            f.write(f"- **Images/Diagrams**: {self.extracted_data.get('total_images', 0)}\n")
+            f.write(f"- **Tables**: {len(all_tables)}\n")
+            f.write(f"- **HTML Files**: {self.extracted_data.get('total_files', 0)}\n")
+
+            langs = self.extracted_data.get("languages_detected", {})
+            if langs:
+                f.write(f"- **Programming Languages**: {len(langs)}\n\n")
+                f.write("**Language Breakdown:**\n\n")
+                for lang, count in sorted(langs.items(), key=lambda x: x[1], reverse=True):
+                    f.write(f"- {lang}: {count} examples\n")
+                f.write("\n")
+
+            # Navigation: list the files exactly as they are named on disk.
+            # Deriving the name from the sanitized category title would point
+            # at files that are never written.
+            f.write("## 🗺️ Navigation\n\n")
+            f.write("**Reference Files:**\n\n")
+            total_categories = len(categorized)
+            for position, cat_data in enumerate(categorized.values(), start=1):
+                link_name = self._reference_link_name(cat_data, position, total_categories)
+                f.write(f"- `references/{link_name}` - {cat_data['title']}\n")
+            f.write("\n")
+            f.write("See `references/index.md` for complete documentation structure.\n\n")
+
+            # Footer
+            f.write("---\n\n")
+            f.write("**Generated by Skill Seeker** | HTML Scraper\n")
+
+        with open(filename, encoding="utf-8") as f:
+            line_count = len(f.read().split("\n"))
+        print(f"   Generated: {filename} ({line_count} lines)")
+
+    def _reference_link_name(self, cat_data: dict, position: int, total_categories: int) -> str:
+        """Return the reference filename (no directory) used for a category.
+
+        Mirrors the naming rules applied when the reference files and the
+        reference index are written.
+
+        Args:
+            cat_data: Category dict with a 'pages' list.
+            position: 1-based position of the category among all categories.
+            total_categories: Total number of categories.
+
+        Returns:
+            ``section_<NN>.md`` for an empty category; ``<stem>.md`` (or
+            ``main.md`` without an HTML path) when there is a single
+            category; otherwise ``<stem or "section">_s<min>-s<max>.md``.
+        """
+        entries = cat_data["pages"]
+        if not entries:
+            return f"section_{position:02d}.md"
+
+        # File stem for a single file, skill name for a directory source.
+        stem = ""
+        if self.html_path:
+            source_path = Path(self.html_path)
+            stem = source_path.stem if source_path.is_file() else self.name
+
+        if total_categories == 1:
+            return f"{stem}.md" if stem else "main.md"
+
+        numbers = [e.get("section_number", pos + 1) for pos, e in enumerate(entries)]
+        prefix = stem if stem else "section"
+        return f"{prefix}_s{min(numbers)}-s{max(numbers)}.md"
+
+    # ------------------------------------------------------------------
+    # Content analysis helpers
+    # ------------------------------------------------------------------
+
+    def _format_key_concepts(self) -> str:
+        """Build the "Key Concepts" markdown section from section headings.
+
+        Looks at every section's main heading and its recorded sub-headings.
+        Headings of level ``h1`` are listed as major topics (first 10) and
+        headings of level ``h2`` as subtopics (first 15); deeper levels are
+        not listed. Headings of three characters or fewer are ignored.
+
+        Returns:
+            Formatted markdown for inclusion in SKILL.md, or an empty string
+            when there is no qualifying h1/h2 heading, so that no empty
+            "Key Concepts" section is emitted.
+        """
+        major_topics: list[str] = []
+        subtopics: list[str] = []
+
+        def collect(level: str, raw_text) -> None:
+            """File a heading under the right list if it is long enough."""
+            text = (raw_text or "").strip()
+            if len(text) <= 3:
+                return
+            if level == "h1":
+                major_topics.append(text)
+            elif level == "h2":
+                subtopics.append(text)
+
+        for section in self.extracted_data.get("pages", []):
+            # Main heading first, then the sub-headings, to keep document order
+            collect(section.get("heading_level", "h1"), section.get("heading", ""))
+            for sub in section.get("headings", []):
+                collect(sub.get("level", "h3"), sub.get("text", ""))
+
+        if not major_topics and not subtopics:
+            return ""
+
+        chunks = [
+            "## 🔑 Key Concepts\n\n",
+            "*Main topics covered in this documentation*\n\n",
+        ]
+
+        if major_topics:
+            chunks.append("**Major Topics:**\n\n")
+            chunks.extend(f"- {topic}\n" for topic in major_topics[:10])
+            chunks.append("\n")
+
+        if subtopics:
+            chunks.append("**Subtopics:**\n\n")
+            chunks.extend(f"- {topic}\n" for topic in subtopics[:15])
+            chunks.append("\n")
+
+        return "".join(chunks)
+
+    def _format_patterns_from_content(self) -> str:
+        """Summarize sections whose headings match common documentation topics.
+
+        Each section heading is lower-cased and checked against a fixed,
+        ordered keyword list ('getting started', 'installation', 'api', ...);
+        the first keyword contained in the heading decides the section's
+        type. Matches are grouped by type, types are listed alphabetically
+        and at most three sections are shown per type.
+
+        Returns:
+            Formatted markdown string with pattern descriptions, or a short
+            pointer to the reference files when nothing matched.
+        """
+        known_topics = (
+            "getting started",
+            "installation",
+            "configuration",
+            "usage",
+            "api",
+            "examples",
+            "tutorial",
+            "guide",
+            "best practices",
+            "troubleshooting",
+            "faq",
+            "reference",
+            "changelog",
+        )
+
+        grouped: dict[str, list] = {}
+        for section in self.extracted_data.get("pages", []):
+            lowered = section.get("heading", "").lower()
+            number = section.get("section_number", 0)
+
+            # Only the first matching keyword counts for a section
+            topic = next((kw for kw in known_topics if kw in lowered), None)
+            if topic is None:
+                continue
+            grouped.setdefault(topic.title(), []).append(
+                {
+                    "type": topic.title(),
+                    "heading": section.get("heading", ""),
+                    "section": number,
+                }
+            )
+
+        if not grouped:
+            return "*See reference files for detailed content*\n\n"
+
+        chunks = ["*Common documentation patterns found:*\n\n"]
+        for kind in sorted(grouped):
+            members = grouped[kind]
+            chunks.append(f"**{kind}** ({len(members)} sections):\n")
+            for member in members[:3]:
+                chunks.append(f"- {member['heading']} (section {member['section']})\n")
+            chunks.append("\n")
+
+        return "".join(chunks)
+
+    def _sanitize_filename(self, name: str) -> str:
+        """Turn an arbitrary string into a lowercase, underscore-separated name.
+
+        After lower-casing, every character that is not a word character,
+        whitespace or a hyphen is dropped; each run of hyphens and/or
+        whitespace is then collapsed into a single underscore.
+
+        Args:
+            name: Raw string to sanitize.
+
+        Returns:
+            The sanitized string.
+        """
+        lowered = name.lower()
+        kept = re.sub(r"[^\w\s-]", "", lowered)
+        return re.sub(r"[-\s]+", "_", kept)
+
+
+# ---------------------------------------------------------------------------
+# Module-level helpers
+# ---------------------------------------------------------------------------
+
+
+def _score_code_quality(code: str) -> float:
+    """Simple quality heuristic for code blocks (0-10 scale).
+
+    Starts from a base score of 5.0 and adjusts it:
+
+    * +2.0 for 10 or more lines, +1.0 for 5 to 9 lines;
+    * +1.5 when a function/class definition keyword is present;
+    * +0.5 when an import-like statement is present (``import``,
+      ``from ... import``, ``require(``, ``#include``, ``using``);
+    * +0.5 when some line starts with four spaces of indentation;
+    * +0.3 when common syntax characters (``= : { } ( ) [ ]``) occur;
+    * -2.0 when the snippet is shorter than 30 characters.
+
+    Args:
+        code: Source code string.
+
+    Returns:
+        Quality score between 0.0 and 10.0; 0.0 for empty or
+        whitespace-only input.
+    """
+    if not code or not code.strip():
+        return 0.0
+
+    score = 5.0
+    line_count = len(code.strip().split("\n"))
+
+    # More lines = more substantial
+    if line_count >= 10:
+        score += 2.0
+    elif line_count >= 5:
+        score += 1.0
+
+    # Has function/class definitions
+    if re.search(r"\b(def |class |function |func |fn )", code):
+        score += 1.5
+
+    # Has imports/require. "#include" is matched outside the \b group: "#" is
+    # not a word character, so "\b#include" can never match at the start of a
+    # line (or after whitespace), which is where the directive appears.
+    if re.search(r"\b(import |from .+ import|require\(|using )|#include", code):
+        score += 0.5
+
+    # Has indentation (structured code)
+    if re.search(r"^    ", code, re.MULTILINE):
+        score += 0.5
+
+    # Has assignment, operators, or common code syntax
+    if re.search(r"[=:{}()\[\]]", code):
+        score += 0.3
+
+    # Very short snippets get penalized
+    if len(code) < 30:
+        score -= 2.0
+
+    return min(10.0, max(0.0, score))
+
+
+# ---------------------------------------------------------------------------
+# CLI entry point
+# ---------------------------------------------------------------------------
+
+
+def main() -> int:
+    """CLI entry point for the HTML scraper.
+
+    Parses command-line arguments and runs the extraction/build pipeline.
+    Supports two workflows:
+    1. Direct HTML extraction: ``--html-path page.html --name myskill``
+    2. Build from JSON: ``--from-json page_extracted.json``
+
+    ``--dry-run`` prints the resolved settings and returns before any input
+    validation or processing. The optional enhancement step (workflows and
+    ``enhance_level`` > 0) runs only in the direct HTML workflow.
+
+    Returns:
+        0 on success. Failures do not return a code: they print an error to
+        stderr and terminate via ``sys.exit(1)``; a missing input source is
+        reported through ``parser.error``.
+    """
+    parser = argparse.ArgumentParser(
+        description="Convert local HTML files to skill",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog=(
+            "Examples:\n"
+            "  %(prog)s --html-path page.html --name myskill\n"
+            "  %(prog)s --html-path ./docs/ --name myskill\n"
+            "  %(prog)s --from-json page_extracted.json\n"
+        ),
+    )
+
+    # Shared universal args
+    from .arguments.common import add_all_standard_arguments
+
+    add_all_standard_arguments(parser)
+
+    # Override enhance-level default to 0 for HTML
+    # (patches the already-registered argparse action in place)
+    for action in parser._actions:
+        if hasattr(action, "dest") and action.dest == "enhance_level":
+            action.default = 0
+            action.help = (
+                "AI enhancement level (auto-detects API vs LOCAL mode): "
+                "0=disabled (default for HTML), 1=SKILL.md only, "
+                "2=+architecture/config, 3=full enhancement. "
+                "Mode selection: uses API if ANTHROPIC_API_KEY is set, "
+                "otherwise LOCAL (Claude Code)"
+            )
+
+    # HTML-specific args
+    parser.add_argument(
+        "--html-path",
+        type=str,
+        help="Path to HTML file or directory of HTML files",
+        metavar="PATH",
+    )
+    parser.add_argument(
+        "--from-json",
+        type=str,
+        help="Build skill from previously extracted JSON",
+        metavar="FILE",
+    )
+
+    args = parser.parse_args()
+
+    # Set logging level (quiet takes precedence over verbose)
+    if getattr(args, "quiet", False):
+        logging.getLogger().setLevel(logging.WARNING)
+    elif getattr(args, "verbose", False):
+        logging.getLogger().setLevel(logging.DEBUG)
+
+    # Handle --dry-run
+    # NOTE: this happens before input validation, so a dry run without any
+    # source still succeeds and reports the source as "(none)".
+    if getattr(args, "dry_run", False):
+        source = getattr(args, "html_path", None) or getattr(args, "from_json", None) or "(none)"
+        print(f"\n{'=' * 60}")
+        print("DRY RUN: HTML Extraction")
+        print(f"{'=' * 60}")
+        print(f"Source:         {source}")
+        print(f"Name:           {getattr(args, 'name', None) or '(auto-detect)'}")
+        print(f"Enhance level:  {getattr(args, 'enhance_level', 0)}")
+        print(f"\n✅ Dry run complete")
+        return 0
+
+    # Validate inputs
+    if not (getattr(args, "html_path", None) or getattr(args, "from_json", None)):
+        parser.error("Must specify --html-path or --from-json")
+
+    # Build from JSON workflow
+    # (takes precedence over --html-path when both are given)
+    if getattr(args, "from_json", None):
+        # Default skill name: JSON file stem without the "_extracted" marker
+        name = Path(args.from_json).stem.replace("_extracted", "")
+        config = {
+            "name": getattr(args, "name", None) or name,
+            "description": getattr(args, "description", None)
+            or f"Use when referencing {name} documentation",
+        }
+        try:
+            converter = HtmlToSkillConverter(config)
+            converter.load_extracted_data(args.from_json)
+            converter.build_skill()
+        except Exception as e:
+            print(f"\n❌ Error: {e}", file=sys.stderr)
+            sys.exit(1)
+        return 0
+
+    # Direct HTML mode
+    if not getattr(args, "name", None):
+        # Auto-detect name from path
+        # (file stem for a file, final path component for a directory)
+        path = Path(args.html_path)
+        args.name = path.stem if path.is_file() else path.name
+
+    config = {
+        "name": args.name,
+        "html_path": args.html_path,
+        # Pass None so extract_html() can infer from HTML metadata
+        "description": getattr(args, "description", None),
+    }
+
+    try:
+        converter = HtmlToSkillConverter(config)
+
+        # Extract
+        if not converter.extract_html():
+            print(
+                "\n❌ HTML extraction failed - see error above",
+                file=sys.stderr,
+            )
+            sys.exit(1)
+
+        # Build skill
+        converter.build_skill()
+
+        # Enhancement Workflow Integration
+        from skill_seekers.cli.workflow_runner import run_workflows
+
+        workflow_executed, workflow_names = run_workflows(args)
+        workflow_name = ", ".join(workflow_names) if workflow_names else None
+
+        # Traditional enhancement (complements workflow system)
+        if getattr(args, "enhance_level", 0) > 0:
+            # API mode when a key is given on the command line or in the environment
+            api_key = getattr(args, "api_key", None) or os.environ.get("ANTHROPIC_API_KEY")
+            mode = "API" if api_key else "LOCAL"
+
+            print("\n" + "=" * 80)
+            print(f"🤖 Traditional AI Enhancement ({mode} mode, level {args.enhance_level})")
+            print("=" * 80)
+            if workflow_executed:
+                print(f"   Running after workflow: {workflow_name}")
+                print(
+                    "   (Workflow provides specialized analysis,"
+                    " enhancement provides general improvements)"
+                )
+            print("")
+
+            skill_dir = converter.skill_dir
+            if api_key:
+                try:
+                    from skill_seekers.cli.enhance_skill import (
+                        enhance_skill_md,
+                    )
+
+                    enhance_skill_md(skill_dir, api_key)
+                    print("✅ API enhancement complete!")
+                except ImportError:
+                    # Any ImportError raised in the try body (including from
+                    # inside enhance_skill_md) triggers the local fallback.
+                    print("❌ API enhancement not available. Falling back to LOCAL mode...")
+                    from skill_seekers.cli.enhance_skill_local import (
+                        LocalSkillEnhancer,
+                    )
+
+                    enhancer = LocalSkillEnhancer(Path(skill_dir))
+                    enhancer.run(headless=True)
+                    print("✅ Local enhancement complete!")
+            else:
+                from skill_seekers.cli.enhance_skill_local import (
+                    LocalSkillEnhancer,
+                )
+
+                enhancer = LocalSkillEnhancer(Path(skill_dir))
+                enhancer.run(headless=True)
+                print("✅ Local enhancement complete!")
+
+    # Expected failure types: short message only, no traceback.
+    # (sys.exit raises SystemExit, which none of these handlers intercept.)
+    except (FileNotFoundError, ValueError) as e:
+        print(f"\n❌ Error: {e}", file=sys.stderr)
+        sys.exit(1)
+    except RuntimeError as e:
+        print(f"\n❌ Error: {e}", file=sys.stderr)
+        sys.exit(1)
+    except Exception as e:
+        # Anything else is unexpected: include the traceback for debugging.
+        print(
+            f"\n❌ Unexpected error during HTML processing: {e}",
+            file=sys.stderr,
+        )
+        import traceback
+
+        traceback.print_exc()
+        sys.exit(1)
+
+    return 0
+
+
+# Script entry point: main()'s return value becomes the process exit status.
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/src/skill_seekers/cli/jupyter_scraper.py b/src/skill_seekers/cli/jupyter_scraper.py
new file mode 100644
index 0000000..ab7563e
--- /dev/null
+++ b/src/skill_seekers/cli/jupyter_scraper.py
@@ -0,0 +1,1209 @@
+#!/usr/bin/env python3
+"""
+Jupyter Notebook (.ipynb) to Skill Converter
+
+Converts Jupyter Notebooks into skills.
+Uses nbformat for notebook parsing, extracts markdown prose, code cells with
+outputs, kernel metadata, and cell-level tags.
+
+Supports both single .ipynb files and directories containing multiple notebooks.
+
+Usage:
+    skill-seekers jupyter --notebook notebook.ipynb --name myskill
+    skill-seekers jupyter --notebook ./notebooks/ --name myskill
+    skill-seekers jupyter --from-json notebook_extracted.json
+"""
+
+import argparse
+import json
+import logging
+import os
+import re
+import sys
+from pathlib import Path
+
+# Optional dependency guard
+try:
+    import nbformat
+
+    JUPYTER_AVAILABLE = True
+except ImportError:
+    JUPYTER_AVAILABLE = False
+
+logger = logging.getLogger(__name__)
+
+# Per-language regexes that spot import statements; group(1) is the imported module name.
+_IMPORT_PATTERNS: dict[str, list[re.Pattern]] = {
+    "python": [
+        re.compile(r"^\s*import\s+([\w.]+)", re.MULTILINE),
+        re.compile(r"^\s*from\s+([\w.]+)\s+import", re.MULTILINE),
+    ],
+    "r": [  # bare or quoted package name, optionally followed by further arguments
+        re.compile(r"""^\s*library\(\s*['"]?([\w.]+)['"]?\s*[,)]""", re.MULTILINE),
+        re.compile(r"""^\s*require\(\s*['"]?([\w.]+)['"]?\s*[,)]""", re.MULTILINE),
+    ],
+    "julia": [
+        re.compile(r"^\s*using\s+([\w.]+)", re.MULTILINE),
+        re.compile(r"^\s*import\s+([\w.]+)", re.MULTILINE),
+    ],
+    "javascript": [
+        re.compile(
+            r"""^\s*(?:const|let|var)\s+\w+\s*=\s*require\(['"]([\w./@-]+)['"]\)""", re.MULTILINE
+        ),
+        re.compile(r"""^\s*import\s+.*?\s+from\s+['"]([\w./@-]+)['"]""", re.MULTILINE),
+    ],
+    "scala": [re.compile(r"^\s*import\s+([\w.]+)", re.MULTILINE)],
+}
+
+# Topic -> keywords for auto-categorization; keywords are matched as lowercase substrings.
+_TOPIC_KEYWORDS: dict[str, list[str]] = {
+    "data_loading": [
+        "read_csv",
+        "read_json",
+        "read_excel",
+        "read_sql",
+        "load_data",
+        "open(",
+        "pd.read",
+        "fetch",
+        "download",
+        "dataset",
+    ],
+    "data_cleaning": [
+        "dropna",
+        "fillna",
+        "replace",
+        "strip",
+        "clean",
+        "preprocess",
+        "missing",
+        "null",
+        "nan",  # NOTE(review): substring match, so this also hits e.g. "finance"
+        "duplicate",
+        "rename",
+    ],
+    "visualization": [
+        "plot",
+        "plt.",
+        "figure",
+        "ax.",
+        "chart",
+        "graph",
+        "histogram",
+        "scatter",
+        "seaborn",
+        "sns.",
+        "bokeh",
+        "plotly",
+        "matplotlib",
+    ],
+    "modeling": [
+        "fit",  # NOTE(review): substring match, so this also hits e.g. "benefit"
+        "predict",
+        "train",
+        "model",
+        "classifier",
+        "regressor",
+        "sklearn",
+        "tensorflow",
+        "torch",
+        "keras",
+        "xgboost",
+    ],
+    "evaluation": [
+        "accuracy",
+        "precision",
+        "recall",
+        "f1",
+        "score",
+        "metric",
+        "confusion_matrix",
+        "roc",  # NOTE(review): substring match, so this also hits e.g. "process"
+        "auc",
+        "loss",
+        "evaluate",
+    ],
+    "feature_engineering": [
+        "feature",
+        "transform",
+        "encode",
+        "scale",
+        "normalize",
+        "one_hot",
+        "label_encode",
+        "polynomial",
+        "pca",
+    ],
+    "setup": [
+        "install",
+        "pip",  # NOTE(review): substring match, so this also hits e.g. "pipeline"
+        "conda",
+        "import",
+        "config",
+        "setup",
+        "environment",
+        "version",
+        "requirements",
+    ],
+    "analysis": [
+        "describe",
+        "info",
+        "shape",
+        "head",
+        "tail",
+        "summary",
+        "statistics",
+        "correlation",
+        "groupby",
+        "aggregate",
+    ],
+}
+
+
+def _check_jupyter_deps():
+    """Guard for the optional nbformat dependency; raises RuntimeError when it is missing."""
+    if JUPYTER_AVAILABLE:
+        return  # the module-level nbformat import succeeded
+    hint_lines = ["nbformat is required for Jupyter Notebook support."]
+    hint_lines.append('Install with: pip install "skill-seekers[jupyter]"')
+    hint_lines.append("Or: pip install nbformat")
+    raise RuntimeError("\n".join(hint_lines))
+
+
+def infer_description_from_notebook(metadata: dict | None = None, name: str = "") -> str:
+    """Build a "Use when ..." description for a notebook-derived skill.
+
+    Preference order: a title longer than 10 characters, then language plus kernel
+    display name, then language alone, then a generic sentence built from `name`.
+
+    Args:
+        metadata: Notebook-level metadata (may hold title, kernelspec, language_info).
+        name: Skill name, used only by the generic fallback.
+
+    Returns:
+        A one-sentence description string.
+    """
+    meta = metadata or {}
+    title = meta.get("title", "")
+    if title and len(title) > 10:
+        return f"Use when working with {title.lower()}"
+
+    def _field(section: str, key: str) -> str:
+        block = meta.get(section, {})
+        return block.get(key, "") if isinstance(block, dict) else ""
+    language, kernel = _field("language_info", "name"), _field("kernelspec", "display_name")
+    if kernel and len(kernel) > 3 and language:
+        return f"Use when working with {language} notebooks ({kernel} kernel)"
+    if language:
+        return f"Use when working with {language} notebook content"
+    return f"Use when referencing {name or 'this'} notebook documentation"
+
+
+class JupyterToSkillConverter:
+    """Convert Jupyter Notebook (.ipynb) to skill."""
+
+    def __init__(self, config: dict):
+        """Read converter settings from `config`; only the "name" key is required."""
+        self.config = config
+        self.name = config["name"]
+        default_description = f"Use when referencing {self.name} notebook documentation"
+        self.description = config.get("description") or default_description
+        self.notebook_path = config.get("notebook_path", "")
+        self.categories = config.get("categories", {})
+        self.skill_dir = f"output/{self.name}"
+        self.data_file = f"{self.skill_dir}_extracted.json"
+        self.extracted_data: dict | None = None  # set by extract_notebook / load_extracted_data
+
+    # ------------------------------------------------------------------
+    # Extraction
+    # ------------------------------------------------------------------
+
+    def extract_notebook(self) -> bool:
+        """Extract all notebooks under `self.notebook_path` into one combined data set.
+
+        Unparseable notebooks are logged and skipped; section numbers run continuously
+        across notebooks. The result is saved to `self.data_file`, kept on
+        `self.extracted_data`, and the method returns True.
+        """
+        _check_jupyter_deps()  # raises RuntimeError when nbformat is not installed
+        print(f"\n🔍 Extracting from Jupyter Notebook: {self.notebook_path}")
+
+        path = Path(self.notebook_path)
+        if not path.exists():
+            raise FileNotFoundError(f"Notebook path not found: {self.notebook_path}")
+
+        notebook_files = self._collect_notebook_files(path)
+        if not notebook_files:
+            raise ValueError(
+                f"No .ipynb files found at: {self.notebook_path}\n"
+                "Provide a path to a .ipynb file or directory containing notebooks."
+            )
+        print(f"   Found {len(notebook_files)} notebook(s)")
+
+        all_sections: list[dict] = []
+        combined_metadata: dict = {}
+        total_code_blocks = total_markdown_cells = total_raw_cells = 0
+        languages_detected: dict[str, int] = {}
+        all_imports: list[str] = []
+        section_number = 0
+
+        for nb_file in notebook_files:
+            try:
+                nb_data = self._parse_single_notebook(nb_file)
+            except Exception as e:  # best effort: a broken notebook must not abort the batch
+                logger.warning("Failed to parse notebook %s: %s", nb_file, e)
+                print(f"   ⚠️  Skipping {nb_file.name}: {e}")
+                continue  # NOTE(review): all-skipped input still saves an empty data set
+
+            if not combined_metadata:  # metadata of the first notebook that has any wins
+                combined_metadata = nb_data["metadata"]
+            nb_lang = nb_data["language"]
+            if nb_lang:
+                languages_detected[nb_lang] = (
+                    languages_detected.get(nb_lang, 0) + nb_data["code_cell_count"]
+                )
+            for section in nb_data["sections"]:
+                section_number += 1
+                section["section_number"] = section_number
+                section["source_notebook"] = nb_file.name
+            all_sections.extend(nb_data["sections"])
+            total_code_blocks += nb_data["code_cell_count"]
+            total_markdown_cells += nb_data["markdown_cell_count"]
+            total_raw_cells += nb_data["raw_cell_count"]
+            all_imports.extend(nb_data["imports"])
+            print(
+                f"   📓 {nb_file.name}: {nb_data['code_cell_count']} code, "
+                f"{nb_data['markdown_cell_count']} markdown, {nb_data['raw_cell_count']} raw cells"
+            )
+
+        if not self.config.get("description"):  # only infer when the config gave none
+            self.description = infer_description_from_notebook(combined_metadata, self.name)
+
+        # Fill in the language of unlabelled code samples (guesses need confidence >= 0.3)
+        try:
+            from skill_seekers.cli.language_detector import LanguageDetector
+
+            detector = LanguageDetector(min_confidence=0.15)
+        except ImportError:
+            detector = None  # detector module unavailable: leave unlabelled samples as they are
+        if detector:
+            for section in all_sections:
+                for cs in section.get("code_samples", []):
+                    if not cs.get("language") and cs.get("code"):
+                        lang, conf = detector.detect_from_code(cs["code"])
+                        if lang and conf >= 0.3:
+                            cs["language"] = lang
+                            languages_detected[lang] = languages_detected.get(lang, 0) + 1
+
+        result_data = {
+            "source_file": str(self.notebook_path),
+            "metadata": combined_metadata,
+            "total_sections": len(all_sections),
+            "total_code_blocks": total_code_blocks,
+            "total_markdown_cells": total_markdown_cells,
+            "total_raw_cells": total_raw_cells,
+            "total_notebooks": len(notebook_files),  # discovered files, skipped ones included
+            "languages_detected": languages_detected,
+            "imports": sorted(set(all_imports)),
+            "pages": all_sections,  # categorize_content() reads the sections from this key
+        }
+        os.makedirs(os.path.dirname(self.data_file) or ".", exist_ok=True)
+        with open(self.data_file, "w", encoding="utf-8") as f:
+            json.dump(result_data, f, indent=2, ensure_ascii=False, default=str)
+        print(f"\n💾 Saved extracted data to: {self.data_file}")
+        self.extracted_data = result_data
+        print(
+            f"✅ Extracted {len(all_sections)} sections, "
+            f"{total_code_blocks} code blocks, {total_markdown_cells} markdown cells"
+        )
+        return True
+
+    def _collect_notebook_files(self, path: Path) -> list[Path]:
+        """Return [path] for a .ipynb file, or a directory's notebooks minus checkpoint copies."""
+        if path.is_file():
+            if not path.name.endswith(".ipynb"):
+                raise ValueError(f"Not a Jupyter Notebook (expected .ipynb): {path}")
+            return [path]
+        if not path.is_dir():
+            raise ValueError(f"Path is not a file or directory: {path}")
+        candidates = sorted(path.glob("**/*.ipynb"))  # may include directories named *.ipynb
+        return [nb for nb in candidates if nb.is_file() and ".ipynb_checkpoints" not in nb.parts]
+
+    def _parse_single_notebook(self, nb_path: Path) -> dict:
+        """Read one .ipynb file (as nbformat v4) and convert its cells to section dicts.
+
+        Returns a dict with the notebook metadata, detected language, sections in cell
+        order, per-type cell counts, and the imports found in code cells.
+        """
+        with open(nb_path, encoding="utf-8") as handle:
+            notebook = nbformat.read(handle, as_version=4)
+        nb_meta = dict(notebook.metadata) if notebook.metadata else {}
+        nb_language = self._detect_language(nb_meta)
+        origin = nb_path.name
+        counts = {"code": 0, "markdown": 0, "raw": 0}
+        parsed: list[dict] = []
+        found_imports: list[str] = []
+        for idx, cell in enumerate(notebook.cells):
+            kind, body = cell.get("cell_type", ""), cell.get("source", "")
+            cell_tags = dict(cell.get("metadata", {})).get("tags", [])
+            if kind not in ("markdown", "code", "raw"):
+                continue  # any other cell type is ignored
+            counts[kind] += 1
+            if kind == "markdown":
+                parsed += self._parse_markdown_cell(body, idx, cell_tags, origin)
+            elif kind == "code":
+                parsed.append(self._parse_code_cell(cell, idx, nb_language, cell_tags, origin))
+                found_imports += self._extract_imports(body, nb_language)
+            else:
+                parsed.append(self._parse_raw_cell(body, idx, cell_tags, origin))
+
+        return {
+            "metadata": nb_meta,
+            "language": nb_language,
+            "sections": parsed,
+            "code_cell_count": counts["code"],
+            "markdown_cell_count": counts["markdown"],
+            "raw_cell_count": counts["raw"],
+            "imports": found_imports,
+        }
+
+    def _parse_markdown_cell(
+        self, source: str, cell_index: int, tags: list[str], notebook_name: str
+    ) -> list[dict]:
+        """Split a markdown cell into one section per ATX heading (`#` to `######`).
+
+        Text before the first heading becomes a section with an empty heading. Lines
+        inside a fenced code block (``` or ~~~) are never treated as headings, so a
+        `# comment` in an embedded code sample stays with its code.
+
+        Args:
+            source: Raw markdown text of the cell.
+            cell_index: Index of the cell within the notebook.
+            tags: Cell tags, passed through to every section.
+            notebook_name: File name of the originating notebook.
+
+        Returns:
+            Section dicts in document order; an empty list for a blank cell.
+        """
+        if not source.strip():
+            return []
+        sections: list[dict] = []
+        heading, level = "", ""
+        body: list[str] = []
+        fence = ""  # marker that opened the current code fence; "" while outside one
+        def flush() -> None:
+            # Emit the pending section; nothing is emitted before any content was seen.
+            if heading or body:
+                args = (heading, level, body, cell_index, tags, notebook_name)
+                sections.append(self._build_markdown_section(*args))
+
+        for line in source.split("\n"):
+            fence_match = re.match(r"^\s{0,3}(```|~~~)", line)
+            if fence_match and fence in ("", fence_match.group(1)):
+                fence = "" if fence else fence_match.group(1)
+            heading_match = None if fence else re.match(r"^(#{1,6})\s+(.+)", line)
+            if heading_match is None:
+                body.append(line)
+                continue
+            flush()
+            heading, level = heading_match.group(2).strip(), f"h{len(heading_match.group(1))}"
+            body = []
+        flush()
+        return sections
+
+    def _build_markdown_section(
+        self,
+        heading: str,
+        heading_level: str,
+        lines: list[str],
+        cell_index: int,
+        tags: list[str],
+        notebook_name: str,
+    ) -> dict:
+        """Build the section dict for one heading-delimited chunk of a markdown cell.
+
+        Fenced ``` blocks are lifted out of the prose into `code_samples`. `###`-or-deeper
+        headings are collected from the remaining prose only, so a comment line inside a
+        code sample is not reported as a heading. `section_number` is a 0 placeholder.
+        """
+        raw = "\n".join(lines)
+        text = raw.strip()
+        code_block_pattern = re.compile(r"```(\w*)\n(.*?)```", re.DOTALL)
+        code_samples = []
+        for match in code_block_pattern.finditer(text):
+            lang, code = match.group(1) or "", match.group(2).strip()
+            if code:
+                score = _score_code_quality(code)
+                code_samples.append({"code": code, "language": lang, "quality_score": score})
+        # Scan prose only: a "### ..." line inside a fenced sample is code, not a heading.
+        sub_headings = []
+        for line in code_block_pattern.sub("", raw).split("\n"):
+            sub_match = re.match(r"^(#{3,6})\s+(.+)", line)
+            sub_text = sub_match.group(2).strip() if sub_match else ""
+            if sub_text:
+                sub_headings.append({"level": f"h{len(sub_match.group(1))}", "text": sub_text})
+        return {
+            "section_number": 0,
+            "heading": heading,
+            "heading_level": heading_level or "h1",
+            "text": code_block_pattern.sub("", text).strip(),
+            "headings": sub_headings,
+            "code_samples": code_samples,
+            "tables": [],
+            "images": [],
+            "cell_type": "markdown",
+            "cell_index": cell_index,
+            "tags": tags,
+            "source_notebook": notebook_name,
+        }
+
+    def _parse_code_cell(
+        self, cell: dict, cell_index: int, language: str, tags: list[str], notebook_name: str
+    ) -> dict:
+        """Turn a code cell into a section dict holding its source and captured outputs.
+
+        stdout streams and text/plain results become the output text; stderr streams and
+        error outputs (tracebacks with ANSI colour codes removed) become errors; html, png
+        and svg payloads are recorded by MIME type only.
+        """
+        source = cell.get("source", "")
+        execution_count = cell.get("execution_count")
+        samples = []
+        if source.strip():
+            samples.append(
+                {
+                    "code": source.strip(),
+                    "language": language,
+                    "quality_score": _score_code_quality(source),
+                    "execution_count": execution_count,
+                }
+            )
+
+        def _as_text(value):
+            # Output text may arrive as a list of chunks; collapse it to one string.
+            return "".join(value) if isinstance(value, list) else value
+
+        stdout_parts: list[str] = []
+        error_parts: list[str] = []
+        displays: list[dict] = []
+        for out in cell.get("outputs", []):
+            kind = out.get("output_type", "")
+            if kind == "stream":
+                is_stderr = out.get("name", "stdout") == "stderr"
+                (error_parts if is_stderr else stdout_parts).append(_as_text(out.get("text", "")))
+            elif kind == "error":
+                message = f"{out.get('ename', 'Error')}: {out.get('evalue', '')}"
+                frames = out.get("traceback", [])
+                if frames:
+                    message += "\n" + "\n".join(re.sub(r"\x1b\[[0-9;]*m", "", fr) for fr in frames)
+                error_parts.append(message)
+            elif kind in ("execute_result", "display_data"):
+                payload = out.get("data", {})
+                plain = _as_text(payload.get("text/plain", ""))
+                if plain:
+                    stdout_parts.append(plain)
+                mimes = ("text/html", "image/png", "image/svg+xml")
+                displays.extend({"mime_type": m, "has_data": True} for m in mimes if m in payload)
+        joined_output = "\n".join(stdout_parts).strip()
+        return {
+            "section_number": 0,
+            "heading": self._infer_code_heading(source, execution_count),
+            "heading_level": "h3",
+            "text": joined_output,
+            "headings": [],
+            "code_samples": samples,
+            "tables": [],
+            "images": [],
+            "cell_type": "code",
+            "cell_index": cell_index,
+            "tags": tags,
+            "source_notebook": notebook_name,
+            "execution_count": execution_count,
+            "output_text": joined_output,
+            "output_errors": error_parts,
+            "output_display": displays,
+        }
+
+    def _parse_raw_cell(
+        self, source: str, cell_index: int, tags: list[str], notebook_name: str
+    ) -> dict:
+        """Wrap a raw (unrendered) cell as a section dict.
+
+        The section has an empty heading and no code samples, tables or images; it only
+        carries the stripped cell text plus bookkeeping fields.
+        """
+        stripped = source.strip()
+        # Keys are inserted in the same order as for markdown sections.
+        section: dict = {"section_number": 0, "heading": "", "heading_level": "h3"}
+        section["text"] = stripped
+        for list_field in ("headings", "code_samples", "tables", "images"):
+            section[list_field] = []
+        section.update(
+            cell_type="raw", cell_index=cell_index, tags=tags, source_notebook=notebook_name
+        )
+        return section
+
+    def _infer_code_heading(self, source: str, execution_count: int | None) -> str:
+        """Derive a short heading for a code cell from the first line of its source.
+
+        Tries comment, def/class, assignment, magic and `!shell` forms, in that order.
+        """
+        stripped = source.strip()
+        if not stripped:
+            return f"Code Cell [{execution_count or '?'}]"
+        first_line = stripped.split("\n")[0].strip()
+        if m := re.match(r"^#\s+(.+)", first_line):
+            heading = m.group(1).strip()
+            return heading if len(heading) <= 80 else heading[:77] + "..."
+        if m := re.match(r"^(?:def|class|async\s+def)\s+(\w+)", first_line):
+            return f"Define: {m.group(1)}"
+        m = re.match(r"^(\w+)\s*=", first_line)
+        if m and len(m.group(1)) > 1:
+            return f"Assign: {m.group(1)}"
+        if m := re.match(r"^(%+\w+)", first_line):
+            return f"Magic: {m.group(1)}"
+        if first_line.startswith("!"):
+            command = first_line[1:].strip()
+            return f"Shell: {command.split()[0] if command else 'shell'}"
+        return f"Code Cell {f'[{execution_count}]' if execution_count else ''}".strip()
+
+    def _detect_language(self, metadata: dict) -> str:
+        """Work out the notebook language from kernel metadata ("" when unknown).
+
+        Checked in order: `kernelspec.language`, known kernel names in
+        `kernelspec.name`, then `language_info.name`. The result is lowercase.
+        """
+        spec = metadata.get("kernelspec", {})
+        if isinstance(spec, dict):
+            declared = spec.get("language", "")
+            if declared:
+                return declared.lower()
+            kernel = spec.get("name", "")
+            if kernel:
+                kernel = kernel.lower()
+                substring_rules = (
+                    (("python",), "python"),
+                    (("julia",), "julia"),
+                    (("scala",), "scala"),
+                    (("rust",), "rust"),
+                    (("javascript", "node"), "javascript"),
+                    (("csharp", "dotnet"), "csharp"),
+                )
+                if kernel in ("ir", "r"):
+                    return "r"
+                for needles, lang in substring_rules:
+                    if any(needle in kernel for needle in needles):
+                        return lang
+        info = metadata.get("language_info", {})
+        fallback = info.get("name", "") if isinstance(info, dict) else ""
+        return fallback.lower() if fallback else ""
+
+    def _extract_imports(self, source: str, language: str) -> list[str]:
+        """List the modules imported in `source`, de-duplicated in first-seen order.
+
+        Languages without an entry in `_IMPORT_PATTERNS` (and an empty language) are
+        scanned with the Python patterns.
+        """
+        if not source.strip():
+            return []
+        fallback = _IMPORT_PATTERNS.get("python", [])
+        patterns = _IMPORT_PATTERNS.get(language.lower() if language else "python", fallback)
+        names = (m.group(1).strip() for pattern in patterns for m in pattern.finditer(source))
+        # dict.fromkeys keeps the first occurrence of each name and preserves order
+        return list(dict.fromkeys(name for name in names if name))
+
+    # ------------------------------------------------------------------
+    # Load / Categorize / Build
+    # ------------------------------------------------------------------
+
+    def load_extracted_data(self, json_path: str) -> bool:
+        """Read a previously saved extraction JSON into `self.extracted_data`; returns True."""
+        print(f"\n📂 Loading extracted data from: {json_path}")
+        loaded = json.loads(Path(json_path).read_text(encoding="utf-8"))
+        self.extracted_data = loaded
+        page_count = len(loaded.get("pages", []))  # used when "total_sections" is absent
+        print(f"✅ Loaded {loaded.get('total_sections', page_count)} sections")
+        return True
+
+    def categorize_content(self) -> dict[str, dict]:
+        """Group sections into {key: {"title", "pages"}}: single notebook, custom, or auto."""
+        print("\n📋 Categorizing content...")
+        categorized: dict[str, dict] = {}
+        sections = self.extracted_data.get("pages", [])
+
+        # Single notebook file: everything goes into one category named after the file stem
+        if self.notebook_path and Path(self.notebook_path).is_file():
+            nb_basename = Path(self.notebook_path).stem
+            categorized[self._sanitize_filename(nb_basename)] = {
+                "title": nb_basename,
+                "pages": sections,
+            }
+            print(f"✅ Created 1 category (single notebook source)")
+            print(f"   - {nb_basename}: {len(sections)} sections")
+            return categorized
+
+        # Custom categories from config: either pre-assigned page lists or keyword lists
+        if self.categories:
+            first_value = next(iter(self.categories.values()), None)
+            if isinstance(first_value, list) and first_value and isinstance(first_value[0], dict):
+                for cat_key, pages in self.categories.items():
+                    categorized[cat_key] = {
+                        "title": cat_key.replace("_", " ").title(),
+                        "pages": pages,
+                    }
+            else:
+                for cat_key in self.categories:
+                    categorized[cat_key] = {
+                        "title": cat_key.replace("_", " ").title(),
+                        "pages": [],
+                    }
+                for section in sections:
+                    combined = self._section_text(section)
+                    scores = {}
+                    for cat_key, keywords in self.categories.items():
+                        if isinstance(keywords, list):
+                            score = sum(
+                                1
+                                for kw in keywords
+                                if isinstance(kw, str) and kw.lower() in combined
+                            )
+                        else:
+                            score = 0
+                        if score > 0:
+                            scores[cat_key] = score
+                    if scores:  # best-scoring category; ties go to the earliest key
+                        categorized[max(scores, key=scores.get)]["pages"].append(section)
+                    else:
+                        categorized.setdefault("other", {"title": "Other", "pages": []})
+                        categorized["other"]["pages"].append(section)
+            self._print_categories(categorized)
+            return categorized
+
+        # Auto-categorize: the best-scoring topic wins, but only with at least 2 keyword hits
+        topic_buckets: dict[str, list[dict]] = {}
+        uncategorized: list[dict] = []
+        for section in sections:
+            combined = self._section_text(section)
+            matched_topic, best_score = "", 0
+            for topic, keywords in _TOPIC_KEYWORDS.items():
+                score = sum(1 for kw in keywords if kw.lower() in combined)
+                if score > best_score:  # strict ">" keeps the earlier topic on ties
+                    best_score, matched_topic = score, topic
+            if matched_topic and best_score >= 2:
+                topic_buckets.setdefault(matched_topic, []).append(section)
+            else:
+                uncategorized.append(section)
+        for topic, pages in sorted(topic_buckets.items()):
+            categorized[topic] = {"title": topic.replace("_", " ").title(), "pages": pages}
+        if uncategorized:
+            categorized["other"] = {"title": "Other", "pages": uncategorized}
+        if not categorized:  # no sections at all: still emit one (empty) category
+            categorized["content"] = {"title": "Content", "pages": sections}
+        self._print_categories(categorized)
+        return categorized
+
+    def _section_text(self, section: dict) -> str:
+        """Return the lowercased "text heading code" haystack used for keyword matching."""
+        body = section.get("text", "").lower()
+        title = section.get("heading", "").lower()
+        code_parts = [cs.get("code", "").lower() for cs in section.get("code_samples", [])]
+        return " ".join((body, title, " ".join(code_parts)))
+
+    def _print_categories(self, categorized: dict[str, dict]) -> None:
+        """Print the category count, then one "title: N sections" line per category."""
+        rows = (f"   - {c['title']}: {len(c['pages'])} sections" for c in categorized.values())
+        print(f"✅ Created {len(categorized)} categories", *rows, sep="\n")
+
+    def build_skill(self) -> None:
+        """Create the skill directory tree, then write reference files, index and skill file."""
+        print(f"\n🏗️  Building skill: {self.name}")
+        for subdir in ("references", "scripts", "assets"):
+            os.makedirs(f"{self.skill_dir}/{subdir}", exist_ok=True)
+        categorized = self.categorize_content()
+        print("\n📝 Generating reference files...")
+        category_total = len(categorized)
+        position = 0  # 1-based position of the category, passed on to the file writer
+        for cat_key, cat_data in categorized.items():
+            position += 1
+            self._generate_reference_file(cat_key, cat_data, position, category_total)
+        self._generate_index(categorized)
+        self._generate_skill_md(categorized)
+        print(f"\n✅ Skill built successfully: {self.skill_dir}/")
+        print(f"\n📦 Next step: Package with: skill-seekers package {self.skill_dir}/")
+
+    # ------------------------------------------------------------------
+    # Private generation methods
+    # ------------------------------------------------------------------
+
+    def _nb_basename(self) -> str:
+        """Stem of `notebook_path` when it names an existing single file, otherwise ""."""
+        source = self.notebook_path
+        is_single_file = bool(source) and Path(source).is_file()
+        return Path(source).stem if is_single_file else ""
+
+    def _ref_filename(self, sections: list[dict], section_num: int, total_sections: int) -> str:
+        """Pick the reference file path for one category.
+
+        Empty: section_NN.md; only category: <stem|main>.md; else <stem|section>_sA-sB.md.
+        """
+        stem, refs_dir = self._nb_basename(), f"{self.skill_dir}/references"
+        if not sections:
+            return f"{refs_dir}/section_{section_num:02d}.md"
+        numbers = [sec.get("section_number", pos) for pos, sec in enumerate(sections, 1)]
+        if total_sections == 1:
+            return f"{refs_dir}/{stem or 'main'}.md"
+        return f"{refs_dir}/{stem or 'section'}_s{min(numbers)}-s{max(numbers)}.md"
+
+    def _generate_reference_file(
+        self, _cat_key: str, cat_data: dict, section_num: int, total_sections: int
+    ) -> None:
+        """Write one category's sections to its reference markdown file.
+
+        Headings are demoted one level but capped at `######`; malformed levels use a
+        default depth. Code-cell output appears once (fenced **Output:**), not also as text.
+        """
+        sections = cat_data["pages"]
+        filename = self._ref_filename(sections, section_num, total_sections)
+        def hashes(level: str, default_depth: int) -> str:
+            digits = str(level)[1:] if str(level).startswith("h") else ""
+            return "#" * (min(int(digits) + 1, 6) if digits.isdecimal() else default_depth)
+
+        with open(filename, "w", encoding="utf-8") as f:
+            f.write(f"# {cat_data['title']}\n\n")
+            for section in sections:
+                sec_num = section.get("section_number", "?")
+                cell_type = section.get("cell_type", "markdown")
+                f.write(f"---\n\n**📄 Source: Section {sec_num}**")
+                if cell_type == "code":
+                    ec = section.get("execution_count")
+                    f.write(f" (Code Cell{f' [In {ec}]' if ec else ''})")
+                elif cell_type == "raw":
+                    f.write(" (Raw Cell)")
+                f.write("\n\n")
+                heading = section.get("heading", "")
+                if heading:
+                    f.write(f"{hashes(section.get('heading_level', 'h1'), 2)} {heading}\n\n")
+                for sub in section.get("headings", []):
+                    if sub.get("text", ""):
+                        f.write(f"{hashes(sub.get('level', 'h3'), 3)} {sub['text']}\n\n")
+                output_text = section.get("output_text")
+                # Code cells mirror their output into "text"; skip it here to avoid a duplicate.
+                if section.get("text") and section["text"] != output_text:
+                    f.write(f"{section['text']}\n\n")
+                for code in section.get("code_samples", []):
+                    ec = code.get("execution_count")
+                    if ec:
+                        f.write(f"**In [{ec}]:**\n\n")
+                    f.write(f"```{code.get('language', '')}\n{code['code']}\n```\n\n")
+                if output_text:
+                    f.write(f"**Output:**\n\n```\n{output_text}\n```\n\n")
+                for err in section.get("output_errors", []):
+                    f.write(f"**Errors:**\n\n```\n{err}\n```\n\n")
+                mimes = [d.get("mime_type", "") for d in section.get("output_display", [])]
+                if mimes:
+                    f.write(f"*Rich output: {', '.join(mimes)}*\n\n")
+                for table in section.get("tables", []):
+                    headers, rows = table.get("headers", []), table.get("rows", [])
+                    if headers:
+                        f.write("| " + " | ".join(str(h) for h in headers) + " |\n")
+                        f.write("| " + " | ".join("---" for _ in headers) + " |\n")
+                    for row in rows:
+                        f.write("| " + " | ".join(str(c) for c in row) + " |\n")
+                    f.write("\n")
+                tags = section.get("tags", [])
+                if tags:
+                    f.write(f"*Tags: {', '.join(str(t) for t in tags)}*\n\n")
+                f.write("---\n\n")
+        print(f"   Generated: {filename}")
+
+    def _generate_index(self, categorized: dict[str, dict]) -> None:
+        """Write ``references/index.md``: category links, cell statistics, and imports."""
+        index_path = f"{self.skill_dir}/references/index.md"
+        stem = self._nb_basename()
+        single_category = len(categorized) == 1
+
+        with open(index_path, "w", encoding="utf-8") as out:
+            out.write(f"# {self.name.title()} Notebook Reference\n\n## Categories\n\n")
+            for position, category in enumerate(categorized.values(), 1):
+                members = category["pages"]
+                if not members:
+                    # Empty category: fall back to the positional file name.
+                    target, span = f"section_{position:02d}.md", "N/A"
+                else:
+                    numbers = [m.get("section_number", k) for k, m in enumerate(members, 1)]
+                    low, high = min(numbers), max(numbers)
+                    span = f"Sections {low}-{high}"
+                    if single_category:
+                        target = f"{stem}.md" if stem else "main.md"
+                    else:
+                        target = f"{stem or 'section'}_s{low}-s{high}.md"
+                out.write(f"- [{category['title']}]({target}) ({len(members)} sections, {span})\n")
+
+            out.write("\n## Statistics\n\n")
+            data = self.extracted_data
+            out.write(f"- Total sections: {data.get('total_sections', 0)}\n")
+            out.write(f"- Code cells: {data.get('total_code_blocks', 0)}\n")
+            out.write(f"- Markdown cells: {data.get('total_markdown_cells', 0)}\n")
+            out.write(f"- Raw cells: {data.get('total_raw_cells', 0)}\n")
+            out.write(f"- Notebooks: {data.get('total_notebooks', 1)}\n")
+
+            nb_meta = data.get("metadata", {})
+            kernel = nb_meta.get("kernelspec", {})
+            if isinstance(kernel, dict) and kernel.get("display_name"):
+                out.write(f"- Kernel: {kernel['display_name']}\n")
+            lang_info = nb_meta.get("language_info", {})
+            if isinstance(lang_info, dict) and lang_info.get("version"):
+                out.write(f"- Language version: {lang_info.get('name', '')} {lang_info['version']}\n")
+
+            packages = data.get("imports", [])
+            if packages:
+                out.write(f"\n## Imported Packages ({len(packages)})\n\n")
+                out.writelines(f"- `{pkg}`\n" for pkg in packages[:30])
+                overflow = len(packages) - 30
+                if overflow > 0:
+                    out.write(f"- ... and {overflow} more\n")
+        print(f"   Generated: {index_path}")
+
+    def _generate_skill_md(self, categorized: dict[str, dict]) -> None:
+        """Write SKILL.md: frontmatter, overview, code examples, stats, and reference-file links."""
+        filename = f"{self.skill_dir}/SKILL.md"
+        skill_name = self.name.lower().replace("_", "-").replace(" ", "-")[:64]
+        desc = self.description[:1024]
+        ed = self.extracted_data
+
+        with open(filename, "w", encoding="utf-8") as f:
+            f.write(f"---\nname: {skill_name}\ndescription: {desc}\n---\n\n")
+            f.write(f"# {self.name.title()} Notebook Skill\n\n{self.description}\n\n")
+
+            # Notebook metadata (section is omitted when neither kernel nor language is known)
+            meta = ed.get("metadata", {})
+            ks = meta.get("kernelspec", {})
+            li = meta.get("language_info", {})
+            has_ks = isinstance(ks, dict) and ks.get("display_name")
+            has_li = isinstance(li, dict) and li.get("name")
+            if has_ks or has_li:
+                f.write("## 📋 Notebook Information\n\n")
+                if has_ks:
+                    f.write(f"**Kernel:** {ks['display_name']}\n\n")
+                if has_li:
+                    ver = li.get("version", "")
+                    f.write(f"**Language:** {li['name']}{' ' + ver if ver else ''}\n\n")
+
+            f.write("## 💡 When to Use This Skill\n\nUse this skill when you need to:\n")
+            f.write(f"- Understand {self.name} concepts and analysis workflow\n")
+            f.write("- Reference code examples and their outputs\n")
+            f.write("- Reproduce data analysis or computation steps\n")
+            f.write("- Review methodology, visualizations, and results\n")
+            f.write("- Find library usage patterns and best practices\n\n")
+
+            total_sections = ed.get("total_sections", 0)
+            f.write(f"## 📖 Section Overview\n\n**Total Sections:** {total_sections}\n\n")
+            f.write("**Content Breakdown:**\n\n")
+            for cd in categorized.values():
+                f.write(f"- **{cd['title']}**: {len(cd['pages'])} sections\n")
+            f.write("\n")
+
+            f.write(self._format_key_concepts())
+
+            imports = ed.get("imports", [])
+            if imports:
+                f.write(f"## 📦 Dependencies\n\n*{len(imports)} package(s) imported*\n\n")
+                for imp in imports[:20]:
+                    f.write(f"- `{imp}`\n")
+                if len(imports) > 20:
+                    f.write(f"- ... and {len(imports) - 20} more\n")
+                f.write("\n")
+
+            f.write("## ⚡ Quick Reference\n\n")
+            f.write(self._format_patterns_from_content())
+
+            # Top code examples: the 15 best-scored samples, at most 5 shown per language
+            all_code: list[dict] = []
+            for section in ed.get("pages", []):
+                all_code.extend(section.get("code_samples", []))
+            all_code.sort(key=lambda x: x.get("quality_score", 0), reverse=True)
+            top_code = all_code[:15]
+            if top_code:
+                f.write("## 📝 Code Examples\n\n*High-quality code cells from notebook*\n\n")
+                by_lang: dict[str, list] = {}
+                for c in top_code:
+                    by_lang.setdefault(c.get("language", "unknown"), []).append(c)
+                for lang in sorted(by_lang):
+                    examples = by_lang[lang]
+                    f.write(f"### {lang.title()} Examples ({len(examples)})\n\n")
+                    for i, c in enumerate(examples[:5], 1):
+                        quality = c.get("quality_score", 0)
+                        ec = c.get("execution_count")
+                        label = f"In [{ec}]" if ec else f"Example {i}"
+                        code_text = c.get("code", "")
+                        f.write(f"**{label}** (Quality: {quality:.1f}/10):\n\n```{lang}\n")
+                        f.write(code_text[:500] + ("\n..." if len(code_text) > 500 else ""))
+                        f.write("\n```\n\n")
+
+            f.write("## 📊 Notebook Statistics\n\n")
+            f.write(f"- **Total Sections**: {total_sections}\n")
+            f.write(f"- **Code Cells**: {ed.get('total_code_blocks', 0)}\n")
+            f.write(f"- **Markdown Cells**: {ed.get('total_markdown_cells', 0)}\n")
+            f.write(f"- **Raw Cells**: {ed.get('total_raw_cells', 0)}\n")
+            f.write(f"- **Notebooks**: {ed.get('total_notebooks', 1)}\n")
+            langs = ed.get("languages_detected", {})
+            if langs:
+                f.write(f"- **Programming Languages**: {len(langs)}\n\n")
+                f.write("**Language Breakdown:**\n\n")
+                for lang, count in sorted(langs.items(), key=lambda x: x[1], reverse=True):
+                    f.write(f"- {lang}: {count} code cells\n")
+                f.write("\n")
+
+            f.write("## 🗺️ Navigation\n\n**Reference Files:**\n\n")
+            for num, cd in enumerate(categorized.values(), 1):  # same numbering as _generate_index
+                ref_name = Path(self._ref_filename(cd["pages"], num, len(categorized))).name
+                f.write(f"- `references/{ref_name}` - {cd['title']}\n")
+            f.write("\nSee `references/index.md` for complete notebook structure.\n\n")
+            f.write("---\n\n**Generated by Skill Seeker** | Jupyter Notebook Scraper\n")
+
+        with open(filename, encoding="utf-8") as f:  # re-read only to report the line count
+            line_count = len(f.read().split("\n"))
+        print(f"   Generated: {filename} ({line_count} lines)")
+
+    # ------------------------------------------------------------------
+    # Formatting helpers
+    # ------------------------------------------------------------------
+
+    def _format_key_concepts(self) -> str:
+        """Build the "Key Concepts" block from h1/h2 headings found in the extracted sections."""
+        found: list[tuple[str, str]] = []
+        for sec in self.extracted_data.get("pages", []):
+            title = sec.get("heading", "").strip()
+            # Section-level headings only count when they come from markdown cells.
+            if len(title) > 3 and sec.get("cell_type") == "markdown":
+                found.append((sec.get("heading_level", "h1"), title))
+            nested = (
+                (h.get("level", "h3"), h.get("text", "").strip())
+                for h in sec.get("headings", [])
+            )
+            found.extend((lvl, txt) for lvl, txt in nested if len(txt) > 3)
+        if not found:
+            return ""
+        parts = ["## 🔑 Key Concepts\n\n*Main topics covered in this notebook*\n\n"]
+        groups = (("h1", "**Major Topics:**\n\n", 10), ("h2", "**Subtopics:**\n\n", 15))
+        for wanted, banner, limit in groups:
+            matching = [txt for lvl, txt in found if lvl == wanted]
+            if matching:
+                parts.append(banner + "".join(f"- {txt}\n" for txt in matching[:limit]) + "\n")
+        return "".join(parts)
+
+    def _format_patterns_from_content(self) -> str:
+        """Group section headings under the first known keyword that begins a word in them."""
+        pattern_keywords = [
+            "getting started",
+            "installation",
+            "configuration",
+            "usage",
+            "api",
+            "examples",
+            "tutorial",
+            "guide",
+            "best practices",
+            "troubleshooting",
+            "faq",
+            "data loading",
+            "preprocessing",
+            "modeling",
+            "evaluation",
+            "results",
+            "conclusion",
+            "summary",
+        ]
+        patterns: list[dict] = []
+        for section in self.extracted_data.get("pages", []):
+            heading_text = (section.get("heading") or "").lower()  # tolerate heading=None
+            sec_num = section.get("section_number", 0)
+            for kw in pattern_keywords:
+                if re.search(rf"\b{re.escape(kw)}", heading_text):  # "api" must not hit "rapid"
+                    patterns.append(
+                        {
+                            "type": kw.title(),
+                            "heading": section.get("heading", ""),
+                            "section": sec_num,
+                        }
+                    )
+                    break  # one pattern per section: the first keyword in list order wins
+        if not patterns:
+            return "*See reference files for detailed content*\n\n"
+        content = "*Common documentation patterns found:*\n\n"
+        by_type: dict[str, list] = {}
+        for p in patterns:
+            by_type.setdefault(p["type"], []).append(p)
+        for ptype in sorted(by_type):
+            items = by_type[ptype]
+            content += f"**{ptype}** ({len(items)} sections):\n"
+            for item in items[:3]:  # the count covers all matches; only three are listed
+                content += f"- {item['heading']} (section {item['section']})\n"
+            content += "\n"
+        return content
+
+    def _sanitize_filename(self, name: str) -> str:
+        """Lowercase ``name``, drop punctuation, and join the remaining words with underscores."""
+        stripped = re.sub(r"[^\w\s-]", "", name.lower())
+        return "_".join(re.split(r"[-\s]+", stripped))
+
+
+# ---------------------------------------------------------------------------
+# Module-level helpers
+# ---------------------------------------------------------------------------
+
+
+def _score_code_quality(code: str) -> float:
+    """Heuristic 0–10 score for a code block: start at 5.0, add/subtract per signal, then clamp."""
+    if not code:
+        return 0.0
+    score = 5.0
+    lines = code.strip().split("\n")
+    line_count = len(lines)
+    if line_count >= 10:
+        score += 2.0
+    elif line_count >= 5:
+        score += 1.0
+    if re.search(r"\b(def |class |function |func |fn )", code):  # definitions
+        score += 1.5
+    if re.search(r"\b(import |from .+ import|require\(|using )|#include", code):  # no \b before "#"
+        score += 0.5
+    if re.search(r"^    ", code, re.MULTILINE):  # at least one indented line
+        score += 0.5
+    if re.search(r"[=:{}()\[\]]", code):
+        score += 0.3
+    if re.search(r'""".*?"""|\'\'\'.*?\'\'\'', code, re.DOTALL):  # triple-quoted string
+        score += 0.3
+    if re.search(r"^%", code, re.MULTILINE):  # line starting with "%" (notebook magic)
+        score += 0.2
+    if len(code) < 30:
+        score -= 2.0
+    non_magic = [ln for ln in lines if ln.strip() and not ln.strip().startswith(("%", "!"))]
+    if line_count > 0 and not non_magic:  # nothing but "%"/"!" lines (or blank)
+        score -= 1.0
+    return min(10.0, max(0.0, score))
+
+
+# ---------------------------------------------------------------------------
+# CLI entry point
+# ---------------------------------------------------------------------------
+
+
+def main() -> int:
+    """Standalone CLI entry point; returns 0 on success, failures end via sys.exit/parser.error."""
+    from .arguments.jupyter import add_jupyter_arguments
+
+    parser = argparse.ArgumentParser(
+        description="Convert Jupyter Notebook (.ipynb) to skill",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    add_jupyter_arguments(parser)
+    args = parser.parse_args()
+
+    if getattr(args, "quiet", False):  # --quiet takes precedence over --verbose
+        logging.getLogger().setLevel(logging.WARNING)
+    elif getattr(args, "verbose", False):
+        logging.getLogger().setLevel(logging.DEBUG)
+
+    if getattr(args, "dry_run", False):  # report only; note this runs before source validation
+        source = getattr(args, "notebook", None) or getattr(args, "from_json", None) or "(none)"
+        print(f"\n{'=' * 60}")
+        print("DRY RUN: Jupyter Notebook Extraction")
+        print(f"{'=' * 60}")
+        print(f"Source:         {source}")
+        print(f"Name:           {getattr(args, 'name', None) or '(auto-detect)'}")
+        print(f"Enhance level:  {getattr(args, 'enhance_level', 0)}")
+        print(f"\n✅ Dry run complete")
+        return 0
+
+    if not (getattr(args, "notebook", None) or getattr(args, "from_json", None)):
+        parser.error("Must specify --notebook or --from-json")
+
+    if getattr(args, "from_json", None):  # rebuild from saved JSON; returns before workflows/enhancement
+        name = Path(args.from_json).stem.replace("_extracted", "")
+        config = {
+            "name": getattr(args, "name", None) or name,
+            "description": getattr(args, "description", None)
+            or f"Use when referencing {name} notebook documentation",
+        }
+        try:
+            converter = JupyterToSkillConverter(config)
+            converter.load_extracted_data(args.from_json)
+            converter.build_skill()
+        except Exception as e:
+            print(f"\n❌ Error: {e}", file=sys.stderr)
+            sys.exit(1)
+        return 0
+
+    # Direct notebook mode: extract, build the skill, then run workflows and optional enhancement
+    if not getattr(args, "name", None):
+        nb_path = Path(args.notebook)
+        args.name = nb_path.stem if nb_path.is_file() else (nb_path.name or "notebooks")
+
+    config = {
+        "name": args.name,
+        "notebook_path": args.notebook,
+        "description": getattr(args, "description", None),
+    }
+
+    try:
+        converter = JupyterToSkillConverter(config)
+        if not converter.extract_notebook():
+            print("\n❌ Notebook extraction failed - see error above", file=sys.stderr)
+            sys.exit(1)
+        converter.build_skill()
+
+        from skill_seekers.cli.workflow_runner import run_workflows
+
+        workflow_executed, workflow_names = run_workflows(args)
+        workflow_name = ", ".join(workflow_names) if workflow_names else None
+
+        if getattr(args, "enhance_level", 0) > 0:
+            api_key = getattr(args, "api_key", None) or os.environ.get("ANTHROPIC_API_KEY")
+            mode = "API" if api_key else "LOCAL"  # an available key selects API mode
+            print("\n" + "=" * 80)
+            print(f"🤖 Traditional AI Enhancement ({mode} mode, level {args.enhance_level})")
+            print("=" * 80)
+            if workflow_executed:
+                print(f"   Running after workflow: {workflow_name}")
+                print(
+                    "   (Workflow provides specialized analysis, "
+                    "enhancement provides general improvements)"
+                )
+            print("")
+            skill_dir = converter.skill_dir
+            if api_key:
+                try:
+                    from skill_seekers.cli.enhance_skill import enhance_skill_md
+
+                    enhance_skill_md(skill_dir, api_key)
+                    print("✅ API enhancement complete!")
+                except ImportError:  # only import failures trigger the LOCAL fallback
+                    print("❌ API enhancement not available. Falling back to LOCAL mode...")
+                    from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer
+
+                    enhancer = LocalSkillEnhancer(Path(skill_dir))
+                    enhancer.run(headless=True)
+                    print("✅ Local enhancement complete!")
+            else:
+                from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer
+
+                enhancer = LocalSkillEnhancer(Path(skill_dir))
+                enhancer.run(headless=True)
+                print("✅ Local enhancement complete!")
+
+    except RuntimeError as e:  # reported without a traceback
+        print(f"\n❌ Error: {e}", file=sys.stderr)
+        sys.exit(1)
+    except Exception as e:  # anything else also gets a traceback
+        print(f"\n❌ Unexpected error during Jupyter processing: {e}", file=sys.stderr)
+        import traceback
+
+        traceback.print_exc()
+        sys.exit(1)
+
+    return 0
+
+
+if __name__ == "__main__":  # executed directly as a script rather than imported
+    sys.exit(main())  # main()'s return value becomes the process exit status
diff --git a/src/skill_seekers/cli/main.py b/src/skill_seekers/cli/main.py
index f509b8b..f33c38e 100644
--- a/src/skill_seekers/cli/main.py
+++ b/src/skill_seekers/cli/main.py
@@ -15,7 +15,17 @@ Commands:
     word                 Extract from Word (.docx) file
     epub                 Extract from EPUB e-book (.epub)
     video                Extract from video (YouTube or local)
-    unified              Multi-source scraping (docs + GitHub + PDF)
+    jupyter              Extract from Jupyter Notebook (.ipynb)
+    html                 Extract from local HTML files
+    openapi              Extract from OpenAPI/Swagger spec
+    asciidoc             Extract from AsciiDoc documents (.adoc)
+    pptx                 Extract from PowerPoint (.pptx)
+    rss                  Extract from RSS/Atom feeds
+    manpage              Extract from man pages
+    confluence           Extract from Confluence wiki
+    notion               Extract from Notion pages
+    chat                 Extract from Slack/Discord chat exports
+    unified              Multi-source scraping (docs + GitHub + PDF + more)
     analyze              Analyze local codebase and extract code knowledge
     enhance              AI-powered enhancement (auto: API or LOCAL mode)
     enhance-status       Check enhancement status (for background/daemon modes)
@@ -70,6 +80,17 @@ COMMAND_MODULES = {
     "quality": "skill_seekers.cli.quality_metrics",
     "workflows": "skill_seekers.cli.workflows_command",
     "sync-config": "skill_seekers.cli.sync_config",
+    # New source types (v3.2.0+)
+    "jupyter": "skill_seekers.cli.jupyter_scraper",
+    "html": "skill_seekers.cli.html_scraper",
+    "openapi": "skill_seekers.cli.openapi_scraper",
+    "asciidoc": "skill_seekers.cli.asciidoc_scraper",
+    "pptx": "skill_seekers.cli.pptx_scraper",
+    "rss": "skill_seekers.cli.rss_scraper",
+    "manpage": "skill_seekers.cli.man_scraper",
+    "confluence": "skill_seekers.cli.confluence_scraper",
+    "notion": "skill_seekers.cli.notion_scraper",
+    "chat": "skill_seekers.cli.chat_scraper",
 }
 
 
diff --git a/src/skill_seekers/cli/man_scraper.py b/src/skill_seekers/cli/man_scraper.py
new file mode 100644
index 0000000..c48492d
--- /dev/null
+++ b/src/skill_seekers/cli/man_scraper.py
@@ -0,0 +1,1513 @@
+#!/usr/bin/env python3
+"""
+Man Page to Skill Converter
+
+Converts Unix/Linux man pages into AI-ready skills.  No external dependencies
+are required beyond the Python standard library -- extraction relies on
+``subprocess`` (to invoke ``man``) and ``re`` (to strip troff/groff formatting).
+
+Three extraction strategies are supported:
+
+1. **Live man command** -- run ``man <name>`` and capture stdout.
+2. **Directory scan** -- read ``.1`` -- ``.8`` / ``.man`` files directly from
+   a directory (useful when man pages are not installed system-wide).
+3. **Pre-extracted JSON** -- reload a previously saved intermediate JSON file
+   and jump straight to the skill-building phase.
+
+Usage:
+    skill-seekers manpage --man-names git,curl --name unix-tools
+    skill-seekers manpage --man-path /usr/share/man/man1 --name coreutils
+    skill-seekers manpage --from-json unix-tools_extracted.json
+"""
+
+import argparse
+import json
+import logging
+import os
+import re
+import subprocess
+import sys
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
+# ---------------------------------------------------------------------------
+# Standard man page section headings, upper-case, singular and plural forms (used for parsing)
+# ---------------------------------------------------------------------------
+STANDARD_SECTIONS = [
+    "NAME",
+    "SYNOPSIS",
+    "DESCRIPTION",
+    "OPTIONS",
+    "ARGUMENTS",
+    "COMMANDS",
+    "SUBCOMMANDS",
+    "ENVIRONMENT",
+    "ENVIRONMENT VARIABLES",
+    "EXIT STATUS",
+    "EXIT CODES",
+    "RETURN VALUE",
+    "RETURN VALUES",
+    "ERRORS",
+    "FILES",
+    "EXAMPLES",
+    "EXAMPLE",
+    "DIAGNOSTICS",
+    "COMPATIBILITY",
+    "STANDARDS",
+    "CONFORMING TO",
+    "NOTES",
+    "CAVEATS",
+    "BUGS",
+    "HISTORY",
+    "AUTHORS",
+    "AUTHOR",
+    "COPYRIGHT",
+    "LICENSE",
+    "SEE ALSO",
+    "REPORTING BUGS",
+    "SECURITY CONSIDERATIONS",
+    "CONFIGURATION",
+    "DEFAULTS",
+    "GIT",  # NOTE(review): presumably the trailing "GIT" section of git's own man pages -- confirm
+]
+
+# Manual section numbers covered by the scraper (range end is exclusive)
+MAN_SECTION_NUMBERS = list(range(1, 9))  # 1-8
+
+# File extensions recognised as man pages: .1-.8, plus .man and the POSIX-style .1p / .3p
+MAN_FILE_EXTENSIONS = {f".{n}" for n in MAN_SECTION_NUMBERS} | {".man", ".1p", ".3p"}
+
+
+def infer_description_from_manpages(
+    names: list[str] | None = None,
+    name_lines: list[str] | None = None,
+    skill_name: str = "",
+) -> str:
+    """Infer skill description from man page NAME lines or page names.
+
+    Args:
+        names: List of man page names (e.g. ["git", "curl"]).
+        name_lines: NAME section lines ("command - short description"); first usable one wins.
+        skill_name: Skill name for fallback.
+
+    Returns:
+        Description string suitable for "Use when..." format.
+    """
+    if name_lines:
+        # The separator is usually " - ", but some renderers emit a Unicode hyphen,
+        # en/em dash or minus sign instead (UTF-8 output), so accept those as well.
+        for line in name_lines:
+            parts = re.split(r" [-\u2010\u2013\u2014\u2212] ", line, maxsplit=1)
+            desc = parts[1].strip() if len(parts) == 2 else ""
+            if len(desc) > 20:  # shorter blurbs are skipped in favour of later lines/fallbacks
+                desc = desc[:147] + "..." if len(desc) > 150 else desc
+                return f"Use when {desc.lower()}"
+
+    if names:
+        joined = ", ".join(names[:5])
+        suffix = f" (and {len(names) - 5} more)" if len(names) > 5 else ""
+        return f"Use when referencing {joined}{suffix} command documentation"
+
+    return (
+        f"Use when referencing {skill_name} documentation"
+        if skill_name
+        else "Use when referencing this documentation"
+    )
+
+
+class ManPageToSkillConverter:
+    """Convert Unix man pages into a skill directory structure.
+
+    Supports extraction via the ``man`` command or by reading raw man-page
+    files from a directory.  Parsed content is saved as an intermediate JSON
+    file so that the (potentially slow) extraction step can be decoupled
+    from skill generation.
+    """
+
+    def __init__(self, config: dict) -> None:
+        """Set up converter state from ``config``; no extraction happens here.
+
+        Args:
+            config: Settings dictionary.  Recognised keys:
+                - ``name``        -- skill name (required; a missing key raises ``KeyError``)
+                - ``man_names``   -- man page names to look up, e.g. ``["git", "curl"]``
+                - ``man_path``    -- directory holding raw man page files
+                - ``sections``    -- manual section numbers to restrict to (empty = no filter)
+                - ``description`` -- explicit skill description (optional)
+                - ``categories``  -- keyword-based categorisation map (optional)
+        """
+        self.config = config
+        self.name: str = config["name"]
+
+        # Output locations are derived from the skill name.
+        self.skill_dir = f"output/{self.name}"
+        self.data_file = f"output/{self.name}_extracted.json"
+
+        # Extraction sources and filters (all optional).
+        self.man_names: list[str] = config.get("man_names", [])
+        self.man_path: str = config.get("man_path", "")
+        self.sections: list[int] = config.get("sections", [])
+        self.categories: dict = config.get("categories", {})
+
+        # Fall back to a generic description when none (or an empty one) is supplied.
+        explicit = config.get("description")
+        self.description: str = explicit or f"Use when referencing {self.name} documentation"
+        # Populated by ``extract_manpages``.
+        self.extracted_data: dict | None = None
+
+    # ------------------------------------------------------------------
+    # Extraction
+    # ------------------------------------------------------------------
+
+    def extract_manpages(self) -> bool:
+        """Collect man pages from the configured source and save them as intermediate JSON.
+
+        Steps:
+        1. Pick the source: files under ``man_path`` when it is set, otherwise
+           ``man <name>`` for every entry of ``man_names``.
+        2. Gather the NAME section of each parsed page and, unless the config
+           carries an explicit description, infer the skill description from them.
+        3. Total up options and examples and merge the SEE ALSO references.
+        4. Write the result to ``self.data_file`` and keep it on ``self.extracted_data``.
+
+        Returns:
+            ``True`` on success.
+
+        Raises:
+            FileNotFoundError: If ``man_path`` does not exist.
+            RuntimeError: If no source is configured or no page could be extracted.
+        """
+        print(f"\n🔍 Extracting man pages for skill: {self.name}")
+
+        # Fail fast when neither source is configured.
+        if not (self.man_path or self.man_names):
+            raise RuntimeError("No man page source specified.  Provide --man-names or --man-path.")
+
+        # A directory source takes precedence over page names.
+        pages: list[dict] = (
+            self._extract_from_directory(self.man_path)
+            if self.man_path
+            else self._extract_from_names(self.man_names)
+        )
+
+        if not pages:
+            raise RuntimeError("No man pages could be extracted.  Check names or path.")
+
+        # NAME sections feed description inference below.
+        raw_names = (page.get("sections", {}).get("NAME", "") for page in pages)
+        name_lines: list[str] = [text.strip() for text in raw_names if text]
+
+        # Only infer a description when the caller did not configure one.
+        if not self.config.get("description"):
+            self.description = infer_description_from_manpages(
+                names=self.man_names or None,
+                name_lines=name_lines or None,
+                skill_name=self.name,
+            )
+
+        # Aggregate per-page counts and cross references.
+        option_count = sum(len(page.get("options", [])) for page in pages)
+        example_count = sum(len(page.get("examples", [])) for page in pages)
+        cross_refs = {ref for page in pages for ref in page.get("see_also", [])}
+
+        # Key order here is the key order of the saved JSON.
+        summary = {
+            "source": self.man_path or "man command",
+            "total_pages": len(pages),
+            "total_options": option_count,
+            "total_examples": example_count,
+            "see_also": sorted(cross_refs),
+            "pages": pages,
+        }
+
+        # Persist before reporting; create the output directory on demand.
+        target_dir = os.path.dirname(self.data_file) or "."
+        os.makedirs(target_dir, exist_ok=True)
+        with open(self.data_file, "w", encoding="utf-8") as handle:
+            json.dump(summary, handle, indent=2, ensure_ascii=False)
+
+        print(f"\n💾 Saved extracted data to: {self.data_file}")
+        self.extracted_data = summary
+        report = (
+            f"✅ Extracted {len(pages)} man page(s), "
+            f"{option_count} options, "
+            f"{example_count} examples"
+        )
+        print(report)
+        return True
+
+    def _extract_from_names(self, names: list[str]) -> list[dict]:
+        """Look up each name with ``man`` and parse whatever comes back.
+
+        Every name is queried once per entry of ``self.sections`` (e.g.
+        ``man 3 printf``); with no sections configured a single lookup is made
+        without a section number.  Lookups that return ``None`` are skipped.
+
+        Args:
+            names: Man page names to look up.
+
+        Returns:
+            Parsed page dicts in lookup order.
+        """
+        # ``None`` (from a falsy section number) means "no explicit section".
+        wanted = [num or None for num in (self.sections or [0])]
+        collected: list[dict] = []
+
+        for page_name in names:
+            for sec in wanted:
+                raw_text = self._run_man_command(page_name, sec)
+                if raw_text is not None:
+                    cleaned = self._strip_troff_formatting(raw_text)
+                    collected.append(self._parse_man_output(cleaned, page_name, sec))
+                    suffix = f"({sec})" if sec else ""
+                    print(f"   Extracted: {page_name}{suffix}")
+        return collected
+
+    def _extract_from_directory(self, dir_path: str) -> list[dict]:
+        """Parse every man page file found under ``dir_path``, descending into subdirectories.
+
+        A file qualifies when its extension -- looked up beneath a trailing
+        ``.gz`` / ``.bz2`` / ``.xz`` if present -- is in ``MAN_FILE_EXTENSIONS``.
+        Reading is delegated to ``_read_man_file``; files for which it returns
+        ``None`` are skipped.
+
+        Args:
+            dir_path: Path to the directory containing man page files.
+
+        Returns:
+            Parsed page dicts, in sorted directory-entry order.
+
+        Raises:
+            FileNotFoundError: If ``dir_path`` does not exist.
+            ValueError: If ``dir_path`` exists but is not a directory.
+        """
+        root = Path(dir_path)
+        if not root.exists():
+            raise FileNotFoundError(f"Man page directory not found: {dir_path}")
+        if not root.is_dir():
+            raise ValueError(f"Path is not a directory: {dir_path}")
+
+        print(f"   Scanning directory: {dir_path}")
+
+        compression_suffixes = (".gz", ".bz2", ".xz")
+        found: list[dict] = []
+
+        for entry in sorted(root.iterdir()):
+            if entry.is_dir():
+                # Subdirectories (man1/, man2/, ...) are scanned recursively.
+                found.extend(self._extract_from_directory(str(entry)))
+                continue
+
+            # For "git.1.gz" the man extension is the one under ".gz".
+            if entry.suffix in compression_suffixes:
+                uncompressed_name = entry.with_suffix("")
+            else:
+                uncompressed_name = entry
+            man_ext = uncompressed_name.suffix
+            if man_ext not in MAN_FILE_EXTENSIONS:
+                continue
+
+            # Honour the section filter, if one is configured.
+            section = self._section_from_suffix(man_ext)
+            if self.sections and section not in self.sections:
+                continue
+
+            # The file itself is always read via its full (possibly compressed) path.
+            raw_text = self._read_man_file(str(entry))
+            if raw_text is None:
+                continue
+
+            cleaned = self._strip_troff_formatting(raw_text)
+
+            # Page name = file name minus man extension (and compression suffix).
+            page_name = uncompressed_name.stem
+            parsed = self._parse_man_output(cleaned, page_name, section)
+            found.append(parsed)
+            print(f"   Read file: {entry.name}")
+
+        return found
+
+    @staticmethod
+    def _section_from_suffix(suffix: str) -> int | None:
+        """Map a man file suffix to its numeric manual section.
+
+        Leading dots are ignored and only the first character is inspected,
+        so POSIX-style suffixes such as ``.1p`` or ``.3p`` resolve to ``1``
+        and ``3``.
+
+        Args:
+            suffix: File extension, e.g. ``.1``, ``.3p``, ``.man``.
+
+        Returns:
+            The section number, or ``None`` when the suffix does not start
+            with a digit (e.g. ``.man``).
+        """
+        bare = suffix.lstrip(".")
+        leading_digit = re.match(r"^(\d)", bare)
+        return int(leading_digit.group(1)) if leading_digit else None
+
+    # ------------------------------------------------------------------
+    # Man command execution
+    # ------------------------------------------------------------------
+
+    def _run_man_command(self, name: str, section: int | None = None) -> str | None:
+        """Execute ``man`` and capture its output as plain text.
+
+        ``MANWIDTH`` / ``COLUMNS`` are set to 999 to avoid unwanted line
+        wrapping.  The output is then piped through ``col -bx`` to strip
+        backspace-based overstriking; when ``col`` is missing, fails to start
+        or times out, a regex-based backspace removal is used instead so the
+        already-captured ``man`` output is never thrown away.
+
+        Args:
+            name: Man page name (e.g. ``"git"``).
+            section: Optional manual section number.
+
+        Returns:
+            Raw text output, or ``None`` when ``man`` is unavailable, fails,
+            times out, or prints nothing.
+        """
+        cmd: list[str] = ["man"]
+        if section:
+            cmd.append(str(section))
+        cmd.append(name)
+
+        env = os.environ.copy()
+        # Wide output avoids mid-word breaks
+        env["MANWIDTH"] = "999"
+        env["COLUMNS"] = "999"
+        # man-db retains formatting characters when MAN_KEEP_FORMATTING is set
+        # to ANY non-empty value (including "0"), so the variable has to be
+        # removed -- not zeroed -- to get plain-text rendering.
+        env.pop("MAN_KEEP_FORMATTING", None)
+
+        try:
+            result = subprocess.run(
+                cmd,
+                capture_output=True,
+                text=True,
+                timeout=30,
+                env=env,
+            )
+        except FileNotFoundError:
+            logger.warning("'man' command not found -- is it installed?")
+            return None
+        except subprocess.TimeoutExpired:
+            logger.warning("man %s timed out after 30 s", name)
+            return None
+        except OSError as exc:
+            logger.warning("Error running man %s: %s", name, exc)
+            return None
+
+        if result.returncode != 0:
+            section_label = f"({section}) " if section else ""
+            logger.debug(
+                "man %s%s returned exit code %d: %s",
+                section_label,
+                name,
+                result.returncode,
+                result.stderr.strip(),
+            )
+            return None
+
+        output = result.stdout
+        if not output.strip():
+            return None
+
+        # Pipe through ``col -bx`` to strip backspace overstriking.  This is a
+        # best-effort clean-up step: any failure here must not discard the
+        # man output we already have.
+        try:
+            col_result = subprocess.run(
+                ["col", "-bx"],
+                input=output,
+                capture_output=True,
+                text=True,
+                timeout=10,
+            )
+        except (OSError, subprocess.TimeoutExpired) as exc:
+            # ``col`` missing / not runnable / hung -- strip backspaces manually
+            logger.debug("col -bx unavailable for man %s (%s); using regex fallback", name, exc)
+            output = re.sub(r".\x08", "", output)
+        else:
+            if col_result.returncode == 0 and col_result.stdout.strip():
+                output = col_result.stdout
+
+        return output
+
+    # ------------------------------------------------------------------
+    # File reading
+    # ------------------------------------------------------------------
+
+    def _read_man_file(self, filepath: str) -> str | None:
+        """Read a man page file, handling optional compression.
+
+        Supports ``.gz``, ``.bz2``, and ``.xz`` compressed files as well as
+        plain text.  Content is decoded as UTF-8 with undecodable bytes
+        replaced.  Unreadable, truncated or corrupt files are logged and
+        reported as ``None`` so that one bad file cannot abort a whole
+        directory scan.
+
+        Args:
+            filepath: Absolute or relative path to the file.
+
+        Returns:
+            Raw file content as a string, or ``None`` on failure.
+        """
+        path = Path(filepath)
+        suffix = path.suffix
+
+        # Exceptions that mean "this file cannot be read".  Decompressors
+        # signal damaged input with errors that are NOT OSError subclasses
+        # (EOFError for truncated streams, zlib.error, lzma.LZMAError), so
+        # they are added per format below.
+        read_errors: tuple[type[BaseException], ...] = (OSError, EOFError)
+
+        if suffix == ".gz":
+            import gzip
+            import zlib
+
+            opener = gzip.open
+            read_errors += (zlib.error,)
+        elif suffix == ".bz2":
+            import bz2
+
+            opener = bz2.open
+        elif suffix == ".xz":
+            import lzma
+
+            opener = lzma.open
+            read_errors += (lzma.LZMAError,)
+        else:
+            opener = open
+
+        try:
+            with opener(path, "rt", encoding="utf-8", errors="replace") as f:
+                return f.read()
+        except read_errors as exc:
+            logger.warning("Could not read %s: %s", filepath, exc)
+            return None
+
+    # ------------------------------------------------------------------
+    # Troff/groff stripping
+    # ------------------------------------------------------------------
+
+    @staticmethod
+    def _strip_troff_formatting(text: str) -> str:
+        """Remove troff/groff formatting codes from raw man page text.
+
+        This handles:
+        - Backspace-based bold/underline overstriking (e.g. ``X\\bX``).
+        - ANSI escape sequences.
+        - Common roff macros (``.TH``, ``.SH``, ``.TP``, ``.PP``, etc.).
+        - Inline font switching (``\\fB``, ``\\fI``, ``\\fR``, ``\\fP``).
+        - String / macro-argument interpolation (``\\*x``, ``\\*(xx``,
+          ``\\*[name]``, ``\\$1``).
+        - Roff special characters (``\\-``, ``\\(aq``, ``\\(lq``, etc.).
+        - Comment lines starting with ``.\\"`` or ``'\\"``.
+
+        The goal is to produce clean, readable plain text suitable for
+        further section parsing.
+
+        Args:
+            text: Raw text potentially containing troff formatting.
+
+        Returns:
+            Cleaned plain-text string.
+        """
+        # Remove ANSI escape sequences
+        text = re.sub(r"\x1b\[[0-9;]*[a-zA-Z]", "", text)
+        text = re.sub(r"\x1b\([AB012]", "", text)
+
+        # Remove backspace overstriking (bold: X\bX, underline: _\bX)
+        text = re.sub(r".\x08", "", text)
+
+        # Remove troff comment lines
+        text = re.sub(r'^[.\']\\?".*$', "", text, flags=re.MULTILINE)
+        text = re.sub(r"^\.\\\s*.*$", "", text, flags=re.MULTILINE)
+
+        # Remove common roff macros at line start
+        # We keep .SH content as it becomes section headers
+        macro_pattern = re.compile(
+            r"^\.\s*(?:TH|PP|LP|IP|TP|HP|RS|RE|br|sp|ne|nf|fi|na|ad|in|ti|nh|hy|PD|IX"
+            r"|de|ft|nr|ds|rm|rn|if|ie|el|so|mso|am|ig)\b.*$",
+            re.MULTILINE,
+        )
+        text = macro_pattern.sub("", text)
+
+        # Convert .SH "SECTION" or .SH SECTION to plain section header
+        text = re.sub(
+            r'^\.\s*SH\s+"?([^"]*?)"?\s*$',
+            r"\1",
+            text,
+            flags=re.MULTILINE,
+        )
+        # Convert .SS subsection headers similarly
+        text = re.sub(
+            r'^\.\s*SS\s+"?([^"]*?)"?\s*$',
+            r"  \1",
+            text,
+            flags=re.MULTILINE,
+        )
+
+        # Remove .B / .I / .BI / .BR / .IR / .RB / .RI inline macros
+        # Keep their text arguments
+        text = re.sub(
+            r"^\.\s*(?:B|I|BI|BR|IR|RB|RI|SB|SM)\s+(.*)$",
+            r"\1",
+            text,
+            flags=re.MULTILINE,
+        )
+
+        # Remove inline font escapes (\fB, \fI, \fR, \fP, \f[...])
+        text = re.sub(r"\\f[BIRP1234]", "", text)
+        text = re.sub(r"\\f\[[^\]]*\]", "", text)
+
+        # Remove string / macro-argument interpolations.  In roff the "(" form
+        # names exactly TWO characters and has no closing parenthesis, so the
+        # pattern must not scan forward to the next ")" (that would delete
+        # ordinary text up to an unrelated parenthesis).
+        text = re.sub(r"\\[*$](?:\(\S\S|\[[^\]\n]*\]|\S)", "", text)
+
+        # Convert troff special characters to plain equivalents.
+        # NOTE: ``\e`` (literal backslash) is deliberately NOT in this table;
+        # it is converted last so the backslash it produces cannot be
+        # re-interpreted as the start of another escape by the steps below.
+        replacements = {
+            r"\-": "-",
+            r"\(aq": "'",
+            r"\(oq": "'",
+            r"\(cq": "'",
+            r"\(lq": '"',
+            r"\(rq": '"',
+            r"\(dq": '"',
+            r"\(ha": "^",
+            r"\(ti": "~",
+            r"\(bu": "*",
+            r"\(em": "--",
+            r"\(en": "-",
+            r"\(co": "(c)",
+            r"\(rg": "(R)",
+            r"\(tm": "(TM)",
+            r"\&": "",
+            r"\|": "",
+            r"\^": "",
+            r"\~": " ",
+            r"\ ": " ",
+            r"\0": " ",
+        }
+        for troff_seq, replacement in replacements.items():
+            text = text.replace(troff_seq, replacement)
+
+        # Remove remaining backslash escapes
+        text = re.sub(r"\\[(\[][a-zA-Z]{2,4}[\])]", "", text)
+
+        # Strip stray roff size/motion escapes  \s[+-]N, \v'...', \h'...'
+        text = re.sub(r"\\s[+-]?\d+", "", text)
+        text = re.sub(r"\\[vh]'[^']*'", "", text)
+
+        # Finally turn the escaped backslash into a real one
+        text = text.replace(r"\e", "\\")
+
+        # Collapse multiple blank lines into at most two
+        text = re.sub(r"\n{3,}", "\n\n", text)
+
+        return text.strip()
+
+    # ------------------------------------------------------------------
+    # Parsing
+    # ------------------------------------------------------------------
+
+    def _parse_man_output(
+        self,
+        text: str,
+        man_name: str,
+        section_num: int | None = None,
+    ) -> dict:
+        """Split cleaned man page text into its named sections.
+
+        A line is treated as a section heading when, stripped and
+        upper-cased, it equals one of the ``STANDARD_SECTIONS`` names and
+        the raw line does not start with a space.  Everything between two
+        headings becomes the body of the first one; text that precedes the
+        first heading is dropped.  OPTIONS, EXAMPLES (or EXAMPLE) and
+        SEE ALSO are additionally parsed into structured lists.
+
+        Args:
+            text: Cleaned man page text (troff already stripped).
+            man_name: Name of the man page.
+            section_num: Manual section number (1-8) if known.
+
+        Returns:
+            Dict with ``name``, ``section``, ``title``, ``synopsis``,
+            ``description`` (at most 2000 characters), ``sections``,
+            ``options``, ``examples``, ``see_also`` and ``raw_text`` keys.
+        """
+        heading_names = {s.upper() for s in STANDARD_SECTIONS}
+
+        sections: dict[str, str] = {}
+        active_heading: str | None = None
+        body: list[str] = []
+
+        for line in text.splitlines():
+            candidate = line.strip().upper()
+            is_heading = candidate in heading_names and not line.startswith(" ")
+            if not is_heading:
+                body.append(line)
+                continue
+            # A new heading closes whatever section was open before it
+            if active_heading is not None:
+                sections[active_heading] = "\n".join(body).strip()
+            active_heading = candidate
+            body = []
+
+        # Close the section that runs to the end of the text
+        if active_heading is not None:
+            sections[active_heading] = "\n".join(body).strip()
+
+        # Structured views of the well-known sections
+        options = self._extract_options(sections.get("OPTIONS", ""))
+        examples_source = sections.get("EXAMPLES", sections.get("EXAMPLE", ""))
+        examples = self._extract_examples(examples_source)
+        see_also = self._extract_see_also(sections.get("SEE ALSO", ""))
+
+        full_description = sections.get("DESCRIPTION", "").strip()
+
+        return {
+            "name": man_name,
+            "section": section_num,
+            "title": sections.get("NAME", man_name).strip(),
+            "synopsis": sections.get("SYNOPSIS", "").strip(),
+            # Slicing is a no-op for descriptions of 2000 characters or fewer
+            "description": full_description[:2000],
+            "sections": sections,
+            "options": options,
+            "examples": examples,
+            "see_also": see_also,
+            "raw_text": text,
+        }
+
+    def _extract_options(self, options_text: str) -> list[dict]:
+        """Turn the OPTIONS section into flag/description records.
+
+        A line opens a new option when, after at most seven leading
+        whitespace characters, it starts with one of:
+        - ``-f, --flag``
+        - ``-f value``
+        - ``--long-option=VALUE``
+
+        Any text after the flag on the same line, plus every following
+        non-blank line up to the next option line, forms the description.
+
+        Args:
+            options_text: Raw text of the OPTIONS section.
+
+        Returns:
+            List of dicts with ``flag`` and ``description`` keys, in the
+            order the options appear.
+        """
+        if not options_text.strip():
+            return []
+
+        # Pattern for option lines: starts with optional whitespace then a dash
+        option_re = re.compile(
+            r"^\s{0,7}(-[\w](?:[\w-]*)?(?:\s*,\s*--[\w][\w-]*(?:=\S+)?)?|"
+            r"--[\w][\w-]*(?:=\S+)?)"
+            r"(?:\s+(.*))?$"
+        )
+
+        # Each entry is (flag, description fragments); the last entry is the
+        # option currently being filled in.
+        entries: list[tuple[str, list[str]]] = []
+
+        for line in options_text.splitlines():
+            hit = option_re.match(line)
+            if hit:
+                inline_desc = hit.group(2) or ""
+                entries.append((hit.group(1), [inline_desc] if inline_desc else []))
+                continue
+            if not entries:
+                # Text before the first option line is ignored
+                continue
+            fragment = line.strip()
+            if fragment:
+                entries[-1][1].append(fragment)
+
+        return [
+            {
+                "flag": flag.strip(),
+                "description": " ".join(fragments).strip(),
+            }
+            for flag, fragments in entries
+        ]
+
+    def _extract_examples(self, examples_text: str) -> list[dict]:
+        """Split the EXAMPLES section into description/command pairs.
+
+        Lines indented by at least two whitespace characters (optionally
+        followed by a ``$``, ``#``, ``%`` or ``>`` prompt) are collected as
+        command lines; all other non-blank lines are descriptive prose.  A
+        command block ends at a blank line or at the next prose line, and is
+        paired with the prose gathered since the previous block.
+
+        Args:
+            examples_text: Raw text of the EXAMPLES (or EXAMPLE) section.
+
+        Returns:
+            List of dicts with ``description`` and ``command`` keys.  Prose
+            left over at the end is recorded with an empty ``command``.
+        """
+        if not examples_text.strip():
+            return []
+
+        # Patterns that indicate a command line
+        cmd_prefixes = re.compile(r"^\s{2,}[\$#%>]?\s*\S")
+        # A line that is indented and looks like code
+        code_indent = re.compile(r"^\s{4,}\S")
+
+        examples: list[dict] = []
+        prose: list[str] = []
+        commands: list[str] = []
+
+        def emit_block() -> None:
+            """Record the pending prose/command pair and reset both buffers."""
+            examples.append(
+                {
+                    "description": " ".join(prose).strip(),
+                    "command": "\n".join(commands).strip(),
+                }
+            )
+            prose.clear()
+            commands.clear()
+
+        for line in examples_text.splitlines():
+            content = line.strip()
+
+            if not content:
+                # Blank line terminates a command block, if one is open
+                if commands:
+                    emit_block()
+                continue
+
+            if cmd_prefixes.match(line) or code_indent.match(line):
+                commands.append(content)
+                continue
+
+            # Prose directly after commands starts the next example
+            if commands:
+                emit_block()
+            prose.append(content)
+
+        if commands:
+            emit_block()
+        elif prose:
+            # Trailing prose with no command -- still record it
+            examples.append(
+                {
+                    "description": " ".join(prose).strip(),
+                    "command": "",
+                }
+            )
+
+        return examples
+
+    def _extract_see_also(self, see_also_text: str) -> list[str]:
+        """Parse the SEE ALSO section into a list of referenced page names.
+
+        Typical format: ``git-log(1), git-diff(1), gitk(1)``.  Section
+        markers with a letter suffix, such as ``printf(3p)`` or
+        ``openssl(1ssl)``, are recognised as well.  When no ``name(section)``
+        reference is found at all, the text is split on commas and newlines
+        and the non-empty pieces are returned instead.
+
+        Args:
+            see_also_text: Raw text of the SEE ALSO section.
+
+        Returns:
+            Sorted de-duplicated list of referenced page names.
+        """
+        if not see_also_text.strip():
+            return []
+
+        # Match "name(N)" where N is a digit optionally followed by a suffix
+        # (3p, 1ssl, 3pm, ...).  Requiring only digits would silently drop
+        # those references whenever at least one plain "(N)" reference exists.
+        refs = re.findall(r"([\w.+-]+)\s*\(\d\w*\)", see_also_text)
+        # Also capture plain references (just names separated by commas)
+        if not refs:
+            refs = [r.strip() for r in re.split(r"[,\n]", see_also_text) if r.strip()]
+
+        return sorted(set(refs))
+
+    # ------------------------------------------------------------------
+    # Loading
+    # ------------------------------------------------------------------
+
+    def load_extracted_data(self, json_path: str) -> bool:
+        """Restore ``self.extracted_data`` from an intermediate JSON file.
+
+        The reported page count is the stored ``total_pages`` value, or the
+        length of the ``pages`` list when that key is absent.
+
+        Args:
+            json_path: Path to the intermediate JSON file.
+
+        Returns:
+            ``True`` on success.
+        """
+        print(f"\n📂 Loading extracted data from: {json_path}")
+        with open(json_path, encoding="utf-8") as handle:
+            self.extracted_data = json.load(handle)
+
+        loaded = self.extracted_data
+        counted_pages = len(loaded.get("pages", []))
+        total = loaded.get("total_pages", counted_pages)
+        print(f"✅ Loaded {total} man page(s)")
+        return True
+
+    # ------------------------------------------------------------------
+    # Categorisation
+    # ------------------------------------------------------------------
+
+    def categorize_content(self) -> dict[str, dict]:
+        """Group the extracted man pages into categories.
+
+        Three strategies are used, in this order:
+
+        1. ``self.categories`` maps keys to lists of page dicts: those lists
+           are taken over unchanged.
+        2. ``self.categories`` maps keys to keyword lists: each page goes to
+           the category whose keywords occur most often in its description
+           or title; pages with no hit land in an ``other`` category.
+        3. No explicit categories: pages are grouped by the part of their
+           name before the first ``-`` (``git-log`` -> ``git``) when that
+           yields fewer groups than pages, otherwise into a single
+           ``commands`` category.  Zero or one page yields one category
+           named after the page (``content`` when there is none).
+
+        Returns:
+            Dict mapping category keys to ``{"title": ..., "pages": [...]}``
+            dicts.
+        """
+        print("\n📋 Categorizing content...")
+
+        categorized: dict[str, dict] = {}
+        pages = self.extracted_data.get("pages", [])
+
+        def report() -> dict[str, dict]:
+            """Print the per-category summary and hand back the mapping."""
+            print(f"✅ Created {len(categorized)} categories")
+            for bucket in categorized.values():
+                print(f"   - {bucket['title']}: {len(bucket['pages'])} pages")
+            return categorized
+
+        if self.categories:
+            sample = next(iter(self.categories.values()), None)
+            pre_assigned = (
+                isinstance(sample, list) and sample and isinstance(sample[0], dict)
+            )
+
+            if pre_assigned:
+                # Values already are page lists -- adopt them as they are
+                for key, members in self.categories.items():
+                    categorized[key] = {
+                        "title": key.replace("_", " ").title(),
+                        "pages": members,
+                    }
+                return report()
+
+            # Keyword matching: start with one empty bucket per category
+            for key in self.categories:
+                categorized[key] = {
+                    "title": key.replace("_", " ").title(),
+                    "pages": [],
+                }
+
+            for page in pages:
+                text = page.get("description", "").lower()
+                title = page.get("title", "").lower()
+
+                scores: dict[str, int] = {}
+                for key, keywords in self.categories.items():
+                    if not isinstance(keywords, list):
+                        continue
+                    hits = sum(
+                        1
+                        for kw in keywords
+                        if isinstance(kw, str)
+                        and (kw.lower() in text or kw.lower() in title)
+                    )
+                    if hits > 0:
+                        scores[key] = hits
+
+                if scores:
+                    winner = max(scores, key=scores.get)
+                    categorized[winner]["pages"].append(page)
+                else:
+                    fallback = categorized.setdefault(
+                        "other", {"title": "Other", "pages": []}
+                    )
+                    fallback["pages"].append(page)
+
+            return report()
+
+        if len(pages) > 1:
+            # Auto-categorise by name prefix (e.g. git-log -> git)
+            by_prefix: dict[str, list[dict]] = {}
+            for page in pages:
+                name = page.get("name", "unknown")
+                prefix = name.split("-", 1)[0] if "-" in name else name
+                by_prefix.setdefault(prefix, []).append(page)
+
+            if len(by_prefix) < len(pages):
+                # Prefix grouping actually merged something -- use it
+                for prefix, members in by_prefix.items():
+                    cat_key = self._sanitize_filename(prefix)
+                    categorized[cat_key] = {
+                        "title": prefix.title(),
+                        "pages": members,
+                    }
+            else:
+                categorized["commands"] = {
+                    "title": "Commands",
+                    "pages": pages,
+                }
+        else:
+            # Zero or one man page: a single category named after it
+            page_name = pages[0].get("name", "content") if pages else "content"
+            categorized[self._sanitize_filename(page_name)] = {
+                "title": page_name,
+                "pages": pages,
+            }
+
+        return report()
+
+    # ------------------------------------------------------------------
+    # Build
+    # ------------------------------------------------------------------
+
+    def build_skill(self) -> None:
+        """Assemble the skill directory from the extracted data.
+
+        Creates the ``references``, ``scripts`` and ``assets`` directories
+        under ``self.skill_dir``, categorises the content, then writes one
+        reference file per category, the reference index and the main
+        SKILL.md.
+        """
+        print(f"\n🏗️  Building skill: {self.name}")
+
+        # Output layout
+        for subdir in ("references", "scripts", "assets"):
+            os.makedirs(f"{self.skill_dir}/{subdir}", exist_ok=True)
+
+        categorized = self.categorize_content()
+
+        # One reference file per category, numbered from 1
+        print("\n📝 Generating reference files...")
+        total_cats = len(categorized)
+        for position, (cat_key, cat_data) in enumerate(categorized.items(), start=1):
+            self._generate_reference_file(cat_key, cat_data, position, total_cats)
+
+        self._generate_index(categorized)
+        self._generate_skill_md(categorized)
+
+        print(f"\n✅ Skill built successfully: {self.skill_dir}/")
+        print(f"\n📦 Next step: Package with: skill-seekers package {self.skill_dir}/")
+
+    # ------------------------------------------------------------------
+    # Generation (private)
+    # ------------------------------------------------------------------
+
+    def _generate_reference_file(
+        self,
+        cat_key: str,
+        cat_data: dict,
+        cat_num: int,
+        total_cats: int,
+    ) -> None:
+        """Generate a reference markdown file for a category of man pages.
+
+        The file is written to ``<skill_dir>/references/<cat_key>.md`` when
+        there is a single category and to ``<cat_key>_<NN>.md`` otherwise.
+        For each page it contains the title, synopsis, description (up to
+        3000 characters), options, examples, SEE ALSO references and any
+        remaining non-empty sections (up to 1500 characters each).
+
+        Args:
+            cat_key: Category key (sanitised).
+            cat_data: Dict with ``title`` and ``pages``.
+            cat_num: 1-based index of this category.
+            total_cats: Total number of categories.
+        """
+        pages = cat_data["pages"]
+
+        if total_cats == 1:
+            filename = f"{self.skill_dir}/references/{cat_key}.md"
+        else:
+            filename = f"{self.skill_dir}/references/{cat_key}_{cat_num:02d}.md"
+
+        # Sections rendered explicitly below; everything else is "extra"
+        handled = {
+            "NAME",
+            "SYNOPSIS",
+            "DESCRIPTION",
+            "OPTIONS",
+            "EXAMPLES",
+            "EXAMPLE",
+            "SEE ALSO",
+        }
+
+        with open(filename, "w", encoding="utf-8") as f:
+            f.write(f"# {cat_data['title']}\n\n")
+
+            for page in pages:
+                man_name = page.get("name", "unknown")
+                section = page.get("section")
+                section_label = f"({section})" if section else ""
+                all_sections = page.get("sections", {})
+
+                f.write(f"---\n\n## {man_name}{section_label}\n\n")
+
+                # Title / NAME line
+                title = page.get("title", "")
+                if title and title != man_name:
+                    f.write(f"**{title}**\n\n")
+
+                # Synopsis
+                synopsis = page.get("synopsis", "")
+                if synopsis:
+                    f.write("### Synopsis\n\n")
+                    f.write(f"```\n{synopsis}\n```\n\n")
+
+                # Description.  ``page["description"]`` is already capped at
+                # 2000 characters by the parser, which would make the
+                # 3000-character limit (and its truncation marker) below
+                # unreachable, so prefer the full DESCRIPTION section text.
+                description = all_sections.get("DESCRIPTION", "").strip() or page.get(
+                    "description", ""
+                )
+                if description:
+                    f.write("### Description\n\n")
+                    # Keep a reasonable amount for the reference file
+                    if len(description) > 3000:
+                        f.write(f"{description[:3000]}\n\n*... (truncated)*\n\n")
+                    else:
+                        f.write(f"{description}\n\n")
+
+                # Options
+                options = page.get("options", [])
+                if options:
+                    f.write("### Options\n\n")
+                    for opt in options:
+                        flag = opt.get("flag", "")
+                        desc = opt.get("description", "")
+                        f.write(f"- `{flag}`")
+                        if desc:
+                            # Truncate very long option descriptions
+                            short_desc = desc[:200] + "..." if len(desc) > 200 else desc
+                            f.write(f" -- {short_desc}")
+                        f.write("\n")
+                    f.write("\n")
+
+                # Examples
+                examples = page.get("examples", [])
+                if examples:
+                    f.write("### Examples\n\n")
+                    for i, ex in enumerate(examples, 1):
+                        ex_desc = ex.get("description", "")
+                        ex_cmd = ex.get("command", "")
+                        if ex_desc:
+                            f.write(f"**Example {i}:** {ex_desc}\n\n")
+                        if ex_cmd:
+                            f.write(f"```bash\n{ex_cmd}\n```\n\n")
+
+                # SEE ALSO
+                see_also = page.get("see_also", [])
+                if see_also:
+                    f.write("### See Also\n\n")
+                    f.write(", ".join(f"`{ref}`" for ref in see_also) + "\n\n")
+
+                # Extra sections (non-standard ones we haven't explicitly handled)
+                for sec_name, sec_text in all_sections.items():
+                    if sec_name in handled or not sec_text.strip():
+                        continue
+                    f.write(f"### {sec_name.title()}\n\n")
+                    if len(sec_text) > 1500:
+                        f.write(f"{sec_text[:1500]}\n\n*... (truncated)*\n\n")
+                    else:
+                        f.write(f"{sec_text}\n\n")
+
+                f.write("---\n\n")
+
+        print(f"   Generated: {filename}")
+
+    def _generate_index(self, categorized: dict[str, dict]) -> None:
+        """Write ``references/index.md``, the table of contents of the skill.
+
+        The index lists every category with a link to its reference file
+        (using the same naming scheme as the reference files themselves),
+        every man page sorted by name, and the extraction statistics stored
+        in ``self.extracted_data``.
+
+        Args:
+            categorized: Category mapping produced by ``categorize_content()``.
+        """
+        filename = f"{self.skill_dir}/references/index.md"
+        single_category = len(categorized) == 1
+        data = self.extracted_data
+
+        with open(filename, "w", encoding="utf-8") as f:
+            f.write(f"# {self.name.title()} Man Pages Reference\n\n")
+
+            # Category links; multi-category files carry a two-digit ordinal
+            f.write("## Categories\n\n")
+            for position, (cat_key, cat_data) in enumerate(categorized.items(), start=1):
+                if single_category:
+                    link_filename = f"{cat_key}.md"
+                else:
+                    link_filename = f"{cat_key}_{position:02d}.md"
+                page_count = len(cat_data["pages"])
+                f.write(f"- [{cat_data['title']}]({link_filename}) ({page_count} man page(s))\n")
+
+            # Alphabetical list of all pages
+            f.write("\n## All Man Pages\n\n")
+            ordered_pages = sorted(data.get("pages", []), key=lambda p: p.get("name", ""))
+            for page in ordered_pages:
+                man_name = page.get("name", "unknown")
+                section = page.get("section")
+                section_label = f"({section})" if section else ""
+                title = page.get("title", "")
+                f.write(f"- **{man_name}{section_label}** -- {title}\n")
+
+            # Totals recorded at extraction time
+            f.write("\n## Statistics\n\n")
+            f.write(f"- Total man pages: {data.get('total_pages', 0)}\n")
+            f.write(f"- Total options: {data.get('total_options', 0)}\n")
+            f.write(f"- Total examples: {data.get('total_examples', 0)}\n")
+
+            see_also = data.get("see_also", [])
+            if see_also:
+                f.write(f"- Cross-references: {len(see_also)}\n")
+
+        print(f"   Generated: {filename}")
+
+    def _generate_skill_md(self, categorized: dict[str, dict]) -> None:
+        """Generate the main SKILL.md file.
+
+        Args:
+            categorized: Category mapping produced by ``categorize_content()``.
+        """
+        filename = f"{self.skill_dir}/SKILL.md"
+
+        skill_name = self.name.lower().replace("_", "-").replace(" ", "-")[:64]
+        desc = self.description[:1024] if len(self.description) > 1024 else self.description
+
+        with open(filename, "w", encoding="utf-8") as f:
+            # YAML frontmatter
+            f.write("---\n")
+            f.write(f"name: {skill_name}\n")
+            f.write(f"description: {desc}\n")
+            f.write("---\n\n")
+
+            f.write(f"# {self.name.title()} Man Pages Skill\n\n")
+            f.write(f"{self.description}\n\n")
+
+            # When to Use
+            f.write("## When to Use This Skill\n\n")
+            f.write("Use this skill when you need to:\n")
+            f.write(f"- Understand {self.name} command-line tools and their options\n")
+            f.write("- Look up command syntax and usage patterns\n")
+            f.write("- Find examples of common command invocations\n")
+            f.write("- Review available flags and environment variables\n")
+            f.write("- Explore related commands via SEE ALSO references\n\n")
+
+            # Quick command reference (synopses)
+            pages = self.extracted_data.get("pages", [])
+            synopses = [
+                (p.get("name", ""), p.get("synopsis", ""))
+                for p in pages
+                if p.get("synopsis", "").strip()
+            ]
+
+            if synopses:
+                f.write("## Quick Command Reference\n\n")
+                for cmd_name, synopsis in synopses[:20]:
+                    f.write(f"### {cmd_name}\n\n")
+                    f.write(f"```\n{synopsis.strip()}\n```\n\n")
+
+            # Page overview
+            f.write("## Man Page Overview\n\n")
+            total_pages = self.extracted_data.get("total_pages", 0)
+            f.write(f"**Total Man Pages:** {total_pages}\n\n")
+            f.write("**Content Breakdown:**\n\n")
+            for _cat_key, cat_data in categorized.items():
+                page_count = len(cat_data["pages"])
+                f.write(f"- **{cat_data['title']}**: {page_count} man page(s)\n")
+            f.write("\n")
+
+            # Key options (top options across all pages)
+            all_options: list[dict] = []
+            for page in pages:
+                for opt in page.get("options", []):
+                    all_options.append(
+                        {
+                            "command": page.get("name", ""),
+                            **opt,
+                        }
+                    )
+
+            if all_options:
+                f.write("## Common Options\n\n")
+                f.write(f"*{len(all_options)} options extracted across all man pages*\n\n")
+                # Show options for first few commands
+                shown_commands: set[str] = set()
+                for opt in all_options:
+                    cmd = opt.get("command", "")
+                    if cmd in shown_commands:
+                        continue
+                    if len(shown_commands) >= 5:
+                        break
+                    shown_commands.add(cmd)
+                    # Show first 5 options per command
+                    cmd_opts = [o for o in all_options if o.get("command") == cmd][:5]
+                    f.write(f"### {cmd}\n\n")
+                    for co in cmd_opts:
+                        flag = co.get("flag", "")
+                        flag_desc = co.get("description", "")
+                        short_desc = flag_desc[:120] + "..." if len(flag_desc) > 120 else flag_desc
+                        f.write(f"- `{flag}` -- {short_desc}\n")
+                    f.write("\n")
+
+            # Examples (top examples)
+            all_examples: list[dict] = []
+            for page in pages:
+                for ex in page.get("examples", []):
+                    all_examples.append(
+                        {
+                            "command_name": page.get("name", ""),
+                            **ex,
+                        }
+                    )
+
+            if all_examples:
+                f.write("## Examples\n\n")
+                f.write(f"*{len(all_examples)} example(s) extracted from man pages*\n\n")
+                for ex in all_examples[:15]:
+                    cmd_name = ex.get("command_name", "")
+                    ex_desc = ex.get("description", "")
+                    ex_cmd = ex.get("command", "")
+                    if cmd_name:
+                        f.write(f"### {cmd_name}\n\n")
+                    if ex_desc:
+                        f.write(f"{ex_desc}\n\n")
+                    if ex_cmd:
+                        f.write(f"```bash\n{ex_cmd}\n```\n\n")
+
+            # Cross-references
+            see_also = self.extracted_data.get("see_also", [])
+            if see_also:
+                f.write("## Related Commands (SEE ALSO)\n\n")
+                for ref in see_also[:30]:
+                    f.write(f"- `{ref}`\n")
+                if len(see_also) > 30:
+                    f.write(f"\n*... and {len(see_also) - 30} more*\n")
+                f.write("\n")
+
+            # Statistics
+            f.write("## Documentation Statistics\n\n")
+            f.write(f"- **Total Man Pages**: {total_pages}\n")
+            f.write(f"- **Total Options**: {self.extracted_data.get('total_options', 0)}\n")
+            f.write(f"- **Total Examples**: {self.extracted_data.get('total_examples', 0)}\n")
+            f.write(f"- **Cross-references**: {len(see_also)}\n\n")
+
+            # Navigation
+            f.write("## Navigation\n\n")
+            f.write("**Reference Files:**\n\n")
+            total_cats = len(categorized)
+            cat_num = 1
+            for _cat_key, cat_data in categorized.items():
+                ref_name = f"{_cat_key}.md" if total_cats == 1 else f"{_cat_key}_{cat_num:02d}.md"
+                f.write(f"- `references/{ref_name}` -- {cat_data['title']}\n")
+                cat_num += 1
+            f.write("\n")
+            f.write("See `references/index.md` for complete reference structure.\n\n")
+
+            # Footer
+            f.write("---\n\n")
+            f.write("**Generated by Skill Seekers** | Man Page Scraper\n")
+
+        # Report line count
+        with open(filename, encoding="utf-8") as f:
+            line_count = len(f.read().splitlines())
+        print(f"   Generated: {filename} ({line_count} lines)")
+
+    # ------------------------------------------------------------------
+    # Helpers
+    # ------------------------------------------------------------------
+
+    @staticmethod
+    def _sanitize_filename(name: str) -> str:
+        """Reduce an arbitrary label to characters that are safe in a filename.
+
+        Args:
+            name: Text to sanitize; it is lowercased before filtering.
+
+        Returns:
+            The text with everything except word characters, whitespace and
+            hyphens removed, and each hyphen/whitespace run replaced by ``_``.
+        """
+        stripped = re.sub(r"[^\w\s-]", "", name.lower())
+        return re.sub(r"[-\s]+", "_", stripped)
+
+
+# ---------------------------------------------------------------------------
+# CLI entry point
+# ---------------------------------------------------------------------------
+
+
+def main() -> int:
+    """CLI entry point for the man page scraper.
+
+    Supports three workflows:
+
+    1. ``--man-names git,curl`` -- extract named man pages via the ``man``
+       command.
+    2. ``--man-path /usr/share/man/man1`` -- read man page files from a
+       directory.
+    3. ``--from-json data.json`` -- reload previously extracted data and
+       rebuild the skill.
+
+    Returns:
+        0 on success; failures terminate via ``sys.exit(1)`` or ``parser.error``.
+    """
+    parser = argparse.ArgumentParser(
+        description="Convert Unix man pages to a skill",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog=(
+            "Examples:\n"
+            "  %(prog)s --man-names git,curl --name unix-tools\n"
+            "  %(prog)s --man-path /usr/share/man/man1 --name coreutils\n"
+            "  %(prog)s --from-json unix-tools_extracted.json\n"
+        ),
+    )
+
+    # Standard arguments (name, description, output, enhance-level, etc.)
+    from .arguments.common import add_all_standard_arguments
+
+    add_all_standard_arguments(parser)
+
+    # Override enhance-level default to 0 for man pages
+    for action in parser._actions:
+        if hasattr(action, "dest") and action.dest == "enhance_level":
+            action.default = 0
+            action.help = (
+                "AI enhancement level (auto-detects API vs LOCAL mode): "
+                "0=disabled (default for man), 1=SKILL.md only, "
+                "2=+architecture/config, 3=full enhancement. "
+                "Mode selection: uses API if ANTHROPIC_API_KEY is set, "
+                "otherwise LOCAL (Claude Code)"
+            )
+
+    # Man-specific arguments
+    parser.add_argument(
+        "--man-names",
+        type=str,
+        help="Comma-separated list of man page names (e.g. git,curl,grep)",
+        metavar="NAMES",
+    )
+    parser.add_argument(
+        "--man-path",
+        type=str,
+        help="Directory containing man page files (.1-.8, .man, .gz)",
+        metavar="DIR",
+    )
+    parser.add_argument(
+        "--sections",
+        type=str,
+        help="Comma-separated list of man section numbers to extract (e.g. 1,3,8)",
+        metavar="NUMS",
+    )
+    parser.add_argument(
+        "--from-json",
+        type=str,
+        help="Build skill from previously extracted JSON",
+        metavar="FILE",
+    )
+
+    args = parser.parse_args()
+
+    # Logging level
+    if getattr(args, "quiet", False):
+        logging.getLogger().setLevel(logging.WARNING)
+    elif getattr(args, "verbose", False):
+        logging.getLogger().setLevel(logging.DEBUG)
+
+    # Dry run: only report the parsed settings (runs before source validation)
+    if getattr(args, "dry_run", False):
+        source = (
+            getattr(args, "man_names", None)
+            or getattr(args, "man_path", None)
+            or getattr(args, "from_json", None)
+            or "(none)"
+        )
+        print(f"\n{'=' * 60}")
+        print("DRY RUN: Man Page Extraction")
+        print(f"{'=' * 60}")
+        print(f"Source:         {source}")
+        print(f"Name:           {getattr(args, 'name', None) or '(auto-detect)'}")
+        print(f"Sections:       {getattr(args, 'sections', None) or 'all'}")
+        print(f"Enhance level:  {getattr(args, 'enhance_level', 0)}")
+        print("\n✅ Dry run complete")
+        return 0
+
+    # Validate: must have at least one source
+    if not (
+        getattr(args, "man_names", None)
+        or getattr(args, "man_path", None)
+        or getattr(args, "from_json", None)
+    ):
+        parser.error("Must specify --man-names, --man-path, or --from-json")
+
+    # Parse section numbers
+    section_list: list[int] = []
+    if getattr(args, "sections", None):
+        try:
+            section_list = [int(s.strip()) for s in args.sections.split(",") if s.strip()]
+        except ValueError:
+            parser.error("--sections must be comma-separated integers (e.g. 1,3,8)")
+
+    # Parse man names
+    man_name_list: list[str] = []
+    if getattr(args, "man_names", None):
+        man_name_list = [n.strip() for n in args.man_names.split(",") if n.strip()]
+
+    # Build from JSON workflow
+    if getattr(args, "from_json", None):
+        name = Path(args.from_json).stem.replace("_extracted", "")
+        config = {
+            "name": getattr(args, "name", None) or name,
+            "description": getattr(args, "description", None)
+            or f"Use when referencing {name} documentation",
+        }
+        try:
+            converter = ManPageToSkillConverter(config)
+            converter.load_extracted_data(args.from_json)
+            converter.build_skill()
+        except Exception as e:
+            print(f"\n❌ Error: {e}", file=sys.stderr)
+            sys.exit(1)
+        return 0
+
+    # Auto-detect name; resolve() keeps "." / ".." from yielding an empty or dot-only name
+    if not getattr(args, "name", None):
+        if man_name_list:
+            args.name = man_name_list[0] if len(man_name_list) == 1 else "man-pages"
+        elif getattr(args, "man_path", None):
+            args.name = Path(args.man_path).resolve().name or "man-pages"
+        else:
+            args.name = "man-pages"
+
+    config = {
+        "name": args.name,
+        "man_names": man_name_list,
+        "man_path": getattr(args, "man_path", ""),
+        "sections": section_list,
+        "description": getattr(args, "description", None),
+    }
+
+    try:
+        converter = ManPageToSkillConverter(config)
+
+        # Extract
+        if not converter.extract_manpages():
+            print("\n❌ Man page extraction failed -- see error above", file=sys.stderr)
+            sys.exit(1)
+
+        # Build skill
+        converter.build_skill()
+
+        # Enhancement Workflow Integration
+        from skill_seekers.cli.workflow_runner import run_workflows
+
+        workflow_executed, workflow_names = run_workflows(args)
+        workflow_name = ", ".join(workflow_names) if workflow_names else None
+
+        # Traditional enhancement (complements workflow system)
+        if getattr(args, "enhance_level", 0) > 0:
+            api_key = getattr(args, "api_key", None) or os.environ.get("ANTHROPIC_API_KEY")
+            mode = "API" if api_key else "LOCAL"
+
+            print("\n" + "=" * 80)
+            print(f"🤖 Traditional AI Enhancement ({mode} mode, level {args.enhance_level})")
+            print("=" * 80)
+            if workflow_executed:
+                print(f"   Running after workflow: {workflow_name}")
+                print(
+                    "   (Workflow provides specialized analysis,"
+                    " enhancement provides general improvements)"
+                )
+            print("")
+
+            skill_dir = converter.skill_dir
+            if api_key:
+                try:
+                    from skill_seekers.cli.enhance_skill import enhance_skill_md
+
+                    enhance_skill_md(skill_dir, api_key)
+                    print("✅ API enhancement complete!")
+                except ImportError:
+                    print("❌ API enhancement not available. Falling back to LOCAL mode...")
+                    from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer
+
+                    enhancer = LocalSkillEnhancer(Path(skill_dir))
+                    enhancer.run(headless=True)
+                    print("✅ Local enhancement complete!")
+            else:
+                from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer
+
+                enhancer = LocalSkillEnhancer(Path(skill_dir))
+                enhancer.run(headless=True)
+                print("✅ Local enhancement complete!")
+
+    except RuntimeError as e:
+        print(f"\n❌ Error: {e}", file=sys.stderr)
+        sys.exit(1)
+    except Exception as e:
+        print(f"\n❌ Unexpected error during man page processing: {e}", file=sys.stderr)
+        import traceback
+
+        traceback.print_exc()
+        sys.exit(1)
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/src/skill_seekers/cli/notion_scraper.py b/src/skill_seekers/cli/notion_scraper.py
new file mode 100644
index 0000000..fa4ecd8
--- /dev/null
+++ b/src/skill_seekers/cli/notion_scraper.py
@@ -0,0 +1,1023 @@
+#!/usr/bin/env python3
+"""
+Notion Workspace to Skill Converter
+
+Converts Notion databases and pages into AI-ready skills. Two modes:
+
+1. **API mode** — Uses the Notion API via ``notion-client`` to fetch databases,
+   pages, and blocks in real time.  Requires an integration token.
+2. **Export mode** — Parses a Notion Markdown/CSV export directory downloaded
+   from Settings > Export.  No token required.
+
+Usage:
+    skill-seekers notion --database-id ID --token $NOTION_TOKEN --name myskill
+    skill-seekers notion --page-id ID --token $NOTION_TOKEN --name myskill
+    skill-seekers notion --export-path ./notion-export/ --name myskill
+    skill-seekers notion --from-json output/myskill_notion_data.json --name myskill
+"""
+
+import argparse
+import csv
+import json
+import logging
+import os
+import re
+import sys
+import time
+from pathlib import Path
+from typing import Any
+
+# Optional dependency guard — notion-client is not a core dependency
+try:
+    from notion_client import Client as NotionClient
+    from notion_client import APIResponseError
+
+    NOTION_AVAILABLE = True
+except ImportError:
+    NOTION_AVAILABLE = False
+
+logger = logging.getLogger(__name__)
+
+# Constants
+DEFAULT_MAX_PAGES = 500
+RATE_LIMIT_DELAY = 0.35  # seconds between API requests
+MAX_BLOCK_DEPTH = 5
+
+
+def _check_notion_deps() -> None:
+    """Raise RuntimeError if notion-client is not installed."""
+    if not NOTION_AVAILABLE:
+        raise RuntimeError(
+            "notion-client is required for Notion API mode.\n"
+            'Install with: pip install "skill-seekers[notion]"\n'
+            "Or: pip install notion-client"
+        )
+
+
+def infer_description_from_notion(metadata: dict | None = None, name: str = "") -> str:
+    """Infer a "Use when ..." description from metadata, then name, then a generic default."""
+    if metadata:
+        # Measure the stripped text so whitespace padding cannot pass the length gate.
+        desc_text = (metadata.get("description") or "").strip()
+        if len(desc_text) > 20:
+            return f"Use when {desc_text[:150].lower()}"
+        title_text = (metadata.get("title") or "").strip()
+        if len(title_text) > 10:
+            return f"Use when working with {title_text.lower()}"
+    return (
+        f"Use when referencing {name} documentation"
+        if name
+        else "Use when referencing this Notion workspace"
+    )
+
+
+class NotionToSkillConverter:
+    """Convert Notion workspace content (database or page tree) to a skill.
+
+    Args:
+        config: Dict with keys name, database_id, page_id, export_path,
+                token, description, max_pages.
+    """
+
+    def __init__(self, config: dict) -> None:
+        self.config = config
+        self.name: str = config["name"]
+        self.database_id: str | None = config.get("database_id")
+        self.page_id: str | None = config.get("page_id")
+        self.export_path: str | None = config.get("export_path")
+        self.token: str | None = config.get("token") or os.getenv("NOTION_TOKEN")
+        self.description: str = (
+            config.get("description") or f"Use when referencing {self.name} documentation"
+        )
+        self.max_pages: int = config.get("max_pages", DEFAULT_MAX_PAGES)
+        self.skill_dir: str = f"output/{self.name}"
+        self.data_file: str = f"output/{self.name}_notion_data.json"
+        self._client: Any = None
+        self.extracted_data: dict[str, Any] | None = None
+        self._pages_fetched: int = 0
+        self._blocks_fetched: int = 0
+
+    # -- Notion client ---------------------------------------------------
+
+    def _get_client(self) -> Any:
+        """Return a cached Notion API client, creating one if needed."""
+        _check_notion_deps()
+        if self._client is None:
+            if not self.token:
+                raise ValueError("Notion integration token required. Set NOTION_TOKEN or --token.")
+            self._client = NotionClient(auth=self.token)
+            logger.info("Notion API client initialised")
+        return self._client
+
+    # -- Public extraction -----------------------------------------------
+
+    def extract_notion(self) -> bool:
+        """Extract content from Notion (API or export mode). Saves JSON."""
+        print(f"\n--- Extracting Notion content for: {self.name}")
+
+        if self.export_path:
+            pages, source_mode = self._extract_from_export(), "export"
+        elif self.database_id or self.page_id:
+            pages, source_mode = self._extract_via_api(), "api"
+        else:
+            raise ValueError("Must specify --database-id, --page-id, or --export-path.")
+
+        metadata: dict[str, Any] = {
+            "title": self.name,
+            "source_mode": source_mode,
+            "database_id": self.database_id,
+            "page_id": self.page_id,
+            "export_path": self.export_path,
+        }
+        if not self.config.get("description"):
+            self.description = infer_description_from_notion(metadata, self.name)
+
+        result_data: dict[str, Any] = {
+            "metadata": metadata,
+            "total_pages": len(pages),
+            "pages_fetched": self._pages_fetched,
+            "blocks_fetched": self._blocks_fetched,
+            "pages": pages,
+        }
+        os.makedirs(os.path.dirname(self.data_file) or ".", exist_ok=True)
+        with open(self.data_file, "w", encoding="utf-8") as f:
+            json.dump(result_data, f, indent=2, ensure_ascii=False, default=str)
+        self.extracted_data = result_data
+        print(f"   Saved extracted data to: {self.data_file}")
+        print(f"   Extracted {len(pages)} pages, {self._blocks_fetched} blocks")
+        return True
+
+    # -- Load extracted data ---------------------------------------------
+
+    def load_extracted_data(self, json_path: str | None = None) -> bool:
+        """Load previously extracted Notion data from JSON."""
+        path = json_path or self.data_file
+        print(f"\n   Loading extracted data from: {path}")
+        if not os.path.exists(path):
+            raise FileNotFoundError(f"Data file not found: {path}")
+        with open(path, encoding="utf-8") as f:
+            self.extracted_data = json.load(f)
+        total = self.extracted_data.get("total_pages", len(self.extracted_data.get("pages", [])))
+        print(f"   Loaded {total} pages")
+        return True
+
+    # -- Categorisation --------------------------------------------------
+
+    def categorize_content(self) -> dict[str, dict[str, Any]]:
+        """Categorize pages by database properties or page hierarchy."""
+        if not self.extracted_data:
+            raise RuntimeError("No extracted data available.")
+        print("\n   Categorizing content...")
+        pages = self.extracted_data.get("pages", [])
+        categorized: dict[str, dict[str, Any]] = {}
+        for page in pages:
+            props = page.get("properties", {})
+            cat_key = self._resolve_category_key(props, page.get("parent_path", ""))
+            cat_title = cat_key.replace("_", " ").title()
+            categorized.setdefault(cat_key, {"title": cat_title, "pages": []})
+            categorized[cat_key]["pages"].append(page)
+        if list(categorized.keys()) == ["other"]:
+            categorized = {"content": {"title": "Content", "pages": pages}}
+        print(f"   Created {len(categorized)} categories")
+        for cat_data in categorized.values():
+            print(f"     - {cat_data['title']}: {len(cat_data['pages'])} pages")
+        return categorized
+
+    def _resolve_category_key(self, properties: dict[str, Any], parent_path: str) -> str:
+        """Choose the category key from known properties, falling back to the parent path."""
+        preferred = ("category", "Category", "tags", "Tags", "type", "Type", "status", "Status")
+        for prop_name in preferred:
+            candidate = properties.get(prop_name)
+            if isinstance(candidate, list) and candidate:
+                candidate = candidate[0]  # multi-value property: its first entry decides
+            if candidate and isinstance(candidate, str) and candidate.strip():
+                return self._sanitize_key(candidate)
+        top_segment = parent_path.strip("/").split("/")[0] if parent_path else ""
+        if top_segment:
+            return self._sanitize_key(top_segment)
+        return "other"
+
+    @staticmethod
+    def _sanitize_key(text: str) -> str:
+        """Convert text to safe lowercase underscore key."""
+        safe = re.sub(r"[^\w\s-]", "", text.lower())
+        return re.sub(r"[-\s]+", "_", safe).strip("_") or "other"
+
+    # -- Skill building --------------------------------------------------
+
+    def build_skill(self) -> None:
+        """Build complete skill directory (SKILL.md, references, index)."""
+        if not self.extracted_data:
+            raise RuntimeError("No extracted data available.")
+        print(f"\n   Building skill: {self.name}")
+        for subdir in ("references", "scripts", "assets"):
+            os.makedirs(f"{self.skill_dir}/{subdir}", exist_ok=True)
+        categorized = self.categorize_content()
+        print("\n   Generating reference files...")
+        total_cat = len(categorized)
+        for i, (cat_key, cat_data) in enumerate(categorized.items(), 1):
+            self._generate_reference_file(cat_key, cat_data, i, total_cat)
+        self._generate_index(categorized)
+        self._generate_skill_md(categorized)
+        print(f"\n   Skill built successfully: {self.skill_dir}/")
+        print(f"\n   Next step: Package with: skill-seekers package {self.skill_dir}/")
+
+    def _generate_reference_file(
+        self, cat_key: str, cat_data: dict[str, Any], section_num: int, total_sections: int
+    ) -> None:
+        """Generate a reference markdown file for one category."""
+        pages = cat_data["pages"]
+        filename = f"{self.skill_dir}/references/{cat_key}.md"
+        with open(filename, "w", encoding="utf-8") as f:
+            f.write(f"# {cat_data['title']}\n\n")
+            for page in pages:
+                title = page.get("title", "Untitled")
+                f.write(f"---\n\n## {title}\n\n")
+                if page.get("url"):
+                    f.write(f"*Source: [{page['url']}]({page['url']})*\n\n")
+                props = page.get("properties", {})
+                if props:
+                    f.write("**Properties:**\n\n")
+                    for pn, pv in props.items():
+                        pv = ", ".join(str(v) for v in pv) if isinstance(pv, list) else pv
+                        f.write(f"- **{pn}:** {pv}\n")
+                    f.write("\n")
+                if page.get("content"):
+                    f.write(f"{page['content']}\n\n")
+                for blk in page.get("code_blocks", []):
+                    if blk.get("caption"):
+                        f.write(f"*{blk['caption']}*\n\n")
+                    f.write(f"```{blk.get('language', '')}\n{blk.get('code', '')}\n```\n\n")
+        print(f"     Generated: {filename} ({len(pages)} pages)")
+
+    def _generate_index(self, categorized: dict[str, dict[str, Any]]) -> None:
+        """Generate references/index.md."""
+        filename = f"{self.skill_dir}/references/index.md"
+        with open(filename, "w", encoding="utf-8") as f:
+            f.write(f"# {self.name.title()} Reference Index\n\n## Categories\n\n")
+            for cat_key, cat_data in categorized.items():
+                f.write(f"- [{cat_data['title']}]({cat_key}.md) ({len(cat_data['pages'])} pages)\n")
+            f.write("\n## Statistics\n\n")
+            f.write(f"- Total pages: {self.extracted_data.get('total_pages', 0)}\n")
+            f.write(f"- Blocks fetched: {self.extracted_data.get('blocks_fetched', 0)}\n")
+            f.write(
+                f"- Source mode: {self.extracted_data.get('metadata', {}).get('source_mode', 'unknown')}\n"
+            )
+        print(f"     Generated: {filename}")
+
+    def _generate_skill_md(self, categorized: dict[str, dict[str, Any]]) -> None:
+        """Write SKILL.md (YAML frontmatter plus overview sections) into ``self.skill_dir``."""
+        filename = f"{self.skill_dir}/SKILL.md"
+        skill_name = self.name.lower().replace("_", "-").replace(" ", "-")[:64]
+        desc = self.description[:1024]
+        meta = self.extracted_data.get("metadata", {})
+        with open(filename, "w", encoding="utf-8") as f:
+            f.write(f"---\nname: {skill_name}\ndescription: {desc}\n---\n\n")
+            f.write(f"# {self.name.title()} Documentation Skill\n\n{self.description}\n\n")
+            # Source info: only the identifiers that were actually set are listed
+            f.write(
+                f"## Source Information\n\n**Source mode:** {meta.get('source_mode', 'unknown')}\n"
+            )
+            for key in ("database_id", "page_id", "export_path"):
+                if meta.get(key):
+                    f.write(f"**{key.replace('_', ' ').title()}:** `{meta[key]}`\n")
+            f.write("\n## When to Use This Skill\n\nUse this skill when you need to:\n")
+            f.write(f"- Understand {self.name} concepts and processes\n")
+            f.write("- Look up structured database entries and their properties\n")
+            f.write("- Find code examples and implementation notes\n")
+            f.write("- Review documentation and knowledge base articles\n")
+            f.write("- Explore the workspace hierarchy and relationships\n\n")
+            # Content overview: one bullet per category with its page count
+            f.write(
+                f"## Content Overview\n\n**Total Pages:** {self.extracted_data.get('total_pages', 0)}\n\n"
+            )
+            for cd in categorized.values():
+                f.write(f"- **{cd['title']}**: {len(cd['pages'])} pages\n")
+            f.write("\n")
+            # Key topics (capped at 20)
+            topics = self._collect_key_topics()
+            if topics:
+                f.write("## Key Topics\n\n")
+                for t in topics[:20]:
+                    f.write(f"- {t}\n")
+                f.write("\n")
+            # Code highlights: first 30 blocks grouped by language, 3 samples of <=500 chars each
+            all_code = self._collect_code_blocks()
+            if all_code:
+                f.write("## Code Examples\n\n")
+                by_lang: dict[str, list[dict[str, str]]] = {}
+                for blk in all_code[:30]:
+                    by_lang.setdefault(blk.get("language") or "plain text", []).append(blk)
+                for lang in sorted(by_lang):
+                    exs = by_lang[lang]
+                    f.write(f"### {lang.title()} ({len(exs)} examples)\n\n")
+                    for blk in exs[:3]:
+                        code = blk.get("code", "")[:500]
+                        f.write(f"```{lang}\n{code}\n```\n\n")
+            # Property summary: up to 5 sorted sample values per property
+            psummary = self._collect_property_summary()
+            if psummary:
+                f.write("## Database Properties\n\n")
+                for pn, vals in psummary.items():
+                    sample = ", ".join(sorted(vals)[:5])
+                    f.write(f"- **{pn}** ({len(vals)} unique): {sample}\n")
+                f.write("\n")
+            # Navigation: file names must match those written by _generate_reference_file
+            f.write("## Navigation\n\n")
+            for ck, cd in categorized.items():
+                f.write(f"- `references/{ck}.md` - {cd['title']}\n")
+            f.write("\nSee `references/index.md` for complete reference structure.\n\n")
+            f.write("---\n\n**Generated by Skill Seekers** | Notion Scraper\n")
+        with open(filename, encoding="utf-8") as f:
+            line_count = len(f.read().splitlines())
+        print(f"     Generated: {filename} ({line_count} lines)")
+
+    # -- SKILL.md helpers ------------------------------------------------
+
+    def _collect_key_topics(self) -> list[str]:
+        """Return page titles and heading texts (>3 chars), de-duplicated case-insensitively."""
+        topics: list[str] = []
+        seen: set[str] = set()
+        for page in self.extracted_data.get("pages", []):
+            candidates = [page.get("title"), *(h.get("text") for h in page.get("headings", []))]
+            for raw in candidates:
+                text = (raw or "").strip()  # JSON null titles/headings load as None
+                if len(text) > 3 and text.lower() not in seen:
+                    seen.add(text.lower())
+                    topics.append(text)
+        return topics
+
+    def _collect_code_blocks(self) -> list[dict[str, str]]:
+        """Collect all code blocks from extracted pages."""
+        return [
+            blk for p in self.extracted_data.get("pages", []) for blk in p.get("code_blocks", [])
+        ]
+
+    def _collect_property_summary(self) -> dict[str, set[str]]:
+        """Map each property name to the set of stringified values seen on any page."""
+        values_by_prop: dict[str, set[str]] = {}
+        for page in self.extracted_data.get("pages", []):
+            for prop_name, prop_value in page.get("properties", {}).items():
+                bucket = values_by_prop.setdefault(prop_name, set())
+                if isinstance(prop_value, list):
+                    bucket |= {str(item) for item in prop_value}
+                elif prop_value is not None:
+                    bucket.add(str(prop_value))
+        return {prop: vals for prop, vals in values_by_prop.items() if vals}
+
+    # ====================================================================
+    # API MODE
+    # ====================================================================
+
+    def _extract_via_api(self) -> list[dict[str, Any]]:
+        """Pull pages through the Notion API.
+
+        Uses a database query when ``self.database_id`` is set; otherwise walks the
+        page tree rooted at ``self.page_id``.
+        """
+        client = self._get_client()
+        if not self.database_id:
+            print(f"   Fetching page tree: {self.page_id}")
+            return self._extract_page_tree(client, self.page_id, parent_path="")
+        print(f"   Fetching database: {self.database_id}")
+        return self._extract_database_entries(client)
+
+    def _extract_database_entries(self, client: Any) -> list[dict[str, Any]]:
+        """Query a Notion database page by page and convert each entry to a page dict.
+
+        The loop ends when the API reports no further results, when
+        ``self.max_pages`` is reached, or on the first non-429 error. A 429 response
+        sleeps 10 seconds and retries with the same cursor.
+        """
+        collected: list[dict[str, Any]] = []
+        more_available = True
+        next_cursor = None
+
+        # The metadata is only used for a log line, so a failure here is non-fatal.
+        try:
+            metadata = client.databases.retrieve(database_id=self.database_id)
+            db_title = self._extract_rich_text(metadata.get("title", []))
+            logger.info("Database: %s", db_title or self.database_id)
+        except Exception as e:
+            logger.warning("Could not fetch database metadata: %s", e)
+
+        while more_available and self._pages_fetched < self.max_pages:
+            query_args: dict[str, Any] = {"database_id": self.database_id}
+            if next_cursor:
+                query_args["start_cursor"] = next_cursor
+            try:
+                response = client.databases.query(**query_args)
+                more_available = response.get("has_more", False)
+                next_cursor = response.get("next_cursor")
+                for entry in response.get("results", []):
+                    if self._pages_fetched >= self.max_pages:
+                        break
+                    page = self._process_database_entry(client, entry)
+                    if page:
+                        collected.append(page)
+                        self._pages_fetched += 1
+                    # Pause after every entry, including ones that failed to process.
+                    time.sleep(RATE_LIMIT_DELAY)
+                logger.info("   Fetched %d entries...", self._pages_fetched)
+            except APIResponseError as e:
+                if e.status != 429:
+                    logger.error("Notion API error: %s", e)
+                    break
+                # Rate limited: back off, then re-enter the loop with the same cursor.
+                time.sleep(10)
+            except Exception as e:
+                logger.error("Error querying database: %s", e)
+                break
+        return collected
+
+    def _process_database_entry(self, client: Any, entry: dict[str, Any]) -> dict[str, Any] | None:
+        """Process one database entry into a page dict.
+
+        Args:
+            client: Notion client, passed through to ``_fetch_page_blocks``.
+            entry: One element of a database query's ``results`` list.
+
+        Returns:
+            A page dict (id, title, url, properties, content, headings, code_blocks,
+            parent_path), or ``None`` if anything fails; the failure is logged as a
+            warning rather than raised.
+        """
+        try:
+            page_id, url = entry["id"], entry.get("url", "")
+            raw_props = entry.get("properties", {})
+            props = self._extract_properties(raw_props)
+            # The title column of a Notion database can carry any user-chosen name, so
+            # identify it by its property type instead of guessing "Name"/"Title".
+            title_key = next(
+                (
+                    name
+                    for name, data in raw_props.items()
+                    if isinstance(data, dict) and data.get("type") == "title"
+                ),
+                None,
+            )
+            title = (
+                (props.get(title_key, "") if title_key is not None else "")
+                or props.get("Name", "")
+                or props.get("Title", "")
+                or "Untitled"
+            )
+            if isinstance(title, list):
+                title = ", ".join(str(t) for t in title) or "Untitled"
+            content, headings, code_blocks = self._fetch_page_blocks(client, page_id)
+            return {
+                "id": page_id,
+                "title": title,
+                "url": url,
+                "properties": props,
+                "content": content,
+                "headings": headings,
+                "code_blocks": code_blocks,
+                "parent_path": "",
+            }
+        except Exception as e:
+            logger.warning("Failed to process entry %s: %s", entry.get("id", "?"), e)
+            return None
+
+    def _extract_properties(self, raw: dict[str, Any]) -> dict[str, Any]:
+        """Reduce Notion's typed property objects to a plain ``{name: value}`` mapping.
+
+        Properties whose extracted value is ``None`` are omitted. A property that
+        raises during extraction is logged at debug level and skipped.
+        """
+        flattened: dict[str, Any] = {}
+        for prop_name, payload in raw.items():
+            try:
+                value = self._extract_property_value(payload.get("type", ""), payload)
+            except Exception as e:
+                logger.debug("Could not extract property '%s': %s", prop_name, e)
+                continue
+            if value is not None:
+                flattened[prop_name] = value
+        return flattened
+
+    def _extract_property_value(self, ptype: str, data: dict[str, Any]) -> Any:
+        """Pull the plain value out of one Notion property object, keyed by ``ptype``.
+
+        Returns ``None`` for unset select/status/date values, for empty
+        relation/people/files lists, and for property types not handled here
+        (the latter are also logged at debug level).
+        """
+        if ptype in ("title", "rich_text"):
+            return self._extract_rich_text(data.get(ptype, []))
+        if ptype == "number":
+            return data.get("number")
+        if ptype in ("select", "status"):
+            option = data.get(ptype)
+            return option.get("name", "") if option else None
+        if ptype == "multi_select":
+            return [opt.get("name", "") for opt in data.get("multi_select", [])]
+        if ptype == "date":
+            span = data.get("date")
+            if not span:
+                return None
+            # A range is rendered "start - end"; a single date is just its start.
+            if span.get("end"):
+                return f"{span['start']} - {span['end']}"
+            return span.get("start")
+        if ptype == "checkbox":
+            return data.get("checkbox", False)
+        if ptype in ("url", "email", "phone_number", "created_time", "last_edited_time"):
+            return data.get(ptype)
+        if ptype == "relation":
+            related = data.get("relation", [])
+            return [r.get("id", "") for r in related] if related else None
+        if ptype in ("people", "files"):
+            return [item.get("name", "") for item in data.get(ptype, [])] or None
+        if ptype in ("formula", "rollup"):
+            # The computed value sits under the key named by the inner "type" field.
+            nested = data.get(ptype, {})
+            return nested.get(nested.get("type", ""))
+        logger.debug("Unsupported property type: %s", ptype)
+        return None
+
+    # -- Page tree (recursive) -------------------------------------------
+
+    def _extract_page_tree(
+        self, client: Any, page_id: str, parent_path: str, depth: int = 0
+    ) -> list[dict[str, Any]]:
+        """Recursively extract a page and its child pages.
+
+        Args:
+            client: Notion client used for ``pages.retrieve`` and block listing.
+            page_id: ID of the page to extract.
+            parent_path: Slash-joined titles of the ancestors ("" for the root).
+            depth: Current nesting level; children are followed only while
+                ``depth < MAX_BLOCK_DEPTH``.
+
+        Returns:
+            The page dict for ``page_id`` followed by those of its descendants, or
+            an empty list once ``self.max_pages`` has been reached. Errors are
+            logged as warnings and whatever was collected so far is returned; a 429
+            from the API sleeps 10 seconds and retries the same page.
+
+        NOTE(review): IDs of ``child_database`` blocks returned by
+        ``_get_child_pages`` are also passed to ``pages.retrieve`` here; confirm
+        that this endpoint accepts database IDs.
+        """
+        if self._pages_fetched >= self.max_pages:
+            return []
+        pages: list[dict[str, Any]] = []
+        try:
+            meta = client.pages.retrieve(page_id=page_id)
+            raw_props = meta.get("properties", {})
+            props = self._extract_properties(raw_props)
+            # Pages that live in a database name their title property after the
+            # database's title column, which is user-defined. Locate it by type and
+            # keep the fixed names only as a fallback.
+            title_key = next(
+                (
+                    name
+                    for name, data in raw_props.items()
+                    if isinstance(data, dict) and data.get("type") == "title"
+                ),
+                None,
+            )
+            title = (
+                (props.get(title_key, "") if title_key is not None else "")
+                or props.get("title", "")
+                or props.get("Name", "")
+                or props.get("Title", "")
+                or "Untitled"
+            )
+            if isinstance(title, list):
+                title = ", ".join(str(t) for t in title) or "Untitled"
+            current_path = f"{parent_path}/{title}" if parent_path else title
+            content, headings, code_blocks = self._fetch_page_blocks(client, page_id)
+            self._pages_fetched += 1
+            pages.append(
+                {
+                    "id": page_id,
+                    "title": title,
+                    "url": meta.get("url", ""),
+                    "properties": props,
+                    "content": content,
+                    "headings": headings,
+                    "code_blocks": code_blocks,
+                    "parent_path": parent_path,
+                    "depth": depth,
+                }
+            )
+            logger.info("   [%d] %s", self._pages_fetched, current_path)
+            time.sleep(RATE_LIMIT_DELAY)
+            if depth < MAX_BLOCK_DEPTH:
+                for child_id in self._get_child_pages(client, page_id):
+                    if self._pages_fetched >= self.max_pages:
+                        break
+                    pages.extend(self._extract_page_tree(client, child_id, current_path, depth + 1))
+        except APIResponseError as e:
+            if e.status == 429:
+                # Rate limited: wait, then start this page over.
+                time.sleep(10)
+                return self._extract_page_tree(client, page_id, parent_path, depth)
+            logger.warning("API error on page %s: %s", page_id, e)
+        except Exception as e:
+            logger.warning("Error extracting page %s: %s", page_id, e)
+        return pages
+
+    def _get_child_pages(self, client: Any, page_id: str) -> list[str]:
+        """Get IDs of child_page / child_database blocks within a page.
+
+        Pages through ``blocks.children.list`` for ``page_id``. A 429 response
+        sleeps 10 seconds and retries the same cursor, matching
+        ``_fetch_page_blocks``; any other error is logged at debug level and ends
+        the listing, returning the IDs gathered so far.
+        """
+        ids: list[str] = []
+        has_more, cursor = True, None
+        while has_more:
+            try:
+                params: dict[str, Any] = {"block_id": page_id}
+                if cursor:
+                    params["start_cursor"] = cursor
+                resp = client.blocks.children.list(**params)
+                has_more, cursor = resp.get("has_more", False), resp.get("next_cursor")
+                for b in resp.get("results", []):
+                    if b.get("type") in ("child_page", "child_database"):
+                        ids.append(b["id"])
+                time.sleep(RATE_LIMIT_DELAY)
+            except APIResponseError as e:
+                if e.status == 429:
+                    # Being rate limited must not silently drop the whole subtree.
+                    time.sleep(10)
+                    continue
+                logger.debug("Error listing children of %s: %s", page_id, e)
+                break
+            except Exception as e:
+                logger.debug("Error listing children of %s: %s", page_id, e)
+                break
+        return ids
+
+    # -- Block parsing ---------------------------------------------------
+
+    def _fetch_page_blocks(
+        self, client: Any, page_id: str, depth: int = 0
+    ) -> tuple[str, list[dict[str, str]], list[dict[str, str]]]:
+        """List every child block of ``page_id`` and render them as markdown.
+
+        Returns ``(markdown, headings, code_blocks)``, where the markdown is the
+        non-blank block renderings joined by blank lines. A 429 response sleeps
+        10 seconds and retries the same cursor; any other error is logged at debug
+        level and ends the listing with whatever was gathered.
+        """
+        markdown_parts: list[str] = []
+        found_headings: list[dict[str, str]] = []
+        found_code: list[dict[str, str]] = []
+        more, cursor = True, None
+        while more:
+            request: dict[str, Any] = {"block_id": page_id}
+            if cursor:
+                request["start_cursor"] = cursor
+            try:
+                listing = client.blocks.children.list(**request)
+                more = listing.get("has_more", False)
+                cursor = listing.get("next_cursor")
+                for child in listing.get("results", []):
+                    self._blocks_fetched += 1
+                    text, sub_headings, sub_code = self._parse_notion_blocks(client, child, depth)
+                    if text:
+                        markdown_parts.append(text)
+                    found_headings.extend(sub_headings)
+                    found_code.extend(sub_code)
+                time.sleep(RATE_LIMIT_DELAY)
+            except APIResponseError as e:
+                if e.status != 429:
+                    logger.debug("API error fetching blocks for %s: %s", page_id, e)
+                    break
+                # Rate limited: back off, then loop again with the same cursor.
+                time.sleep(10)
+            except Exception as e:
+                logger.debug("Error fetching blocks for %s: %s", page_id, e)
+                break
+        joined = "\n\n".join(part for part in markdown_parts if part.strip())
+        return joined, found_headings, found_code
+
+    def _parse_notion_blocks(
+        self, client: Any, block: dict[str, Any], depth: int = 0
+    ) -> tuple[str, list[dict[str, str]], list[dict[str, str]]]:
+        """Convert a Notion block to markdown, recursing into children.
+
+        Args:
+            client: Notion client used to list child blocks.
+            block: The block object to render.
+            depth: Current nesting level; children are fetched only while
+                ``depth < MAX_BLOCK_DEPTH``.
+
+        Returns:
+            ``(markdown, headings, code_blocks)`` for the block and its descendants.
+
+        Table rows are child blocks of the table, so they are fetched here and
+        handed to the renderer under the ``_table_rows`` key that
+        ``_handle_table_block`` reads. Toggle output is closed with ``</details>``
+        so the opening tag does not swallow the rest of the document.
+        """
+        btype = block.get("type", "")
+        is_table = btype == "table"
+        if (
+            is_table
+            and block.get("has_children")
+            and depth < MAX_BLOCK_DEPTH
+            and "_table_rows" not in block
+        ):
+            # Work on a shallow copy so the caller's block dict is left untouched.
+            block = {**block, "_table_rows": self._fetch_table_rows(client, block["id"])}
+        md, headings, code_blocks = self._handle_block_type(btype, block)
+        # A table's children are its rows, already rendered above; recursing into
+        # them would only add unhandled "table_row" blocks.
+        if block.get("has_children") and depth < MAX_BLOCK_DEPTH and not is_table:
+            child_md, ch, cc = self._fetch_page_blocks(client, block["id"], depth + 1)
+            if child_md:
+                if btype in ("toggle", "callout"):
+                    indented = "\n".join(f"  {line}" for line in child_md.split("\n"))
+                    md = f"{md}\n{indented}" if md else indented
+                else:
+                    md = f"{md}\n\n{child_md}" if md else child_md
+            headings.extend(ch)
+            code_blocks.extend(cc)
+        if btype == "toggle" and md and not md.rstrip().endswith("</details>"):
+            md = f"{md}\n</details>"
+        return md, headings, code_blocks
+
+    def _fetch_table_rows(self, client: Any, table_id: str) -> list[dict[str, Any]]:
+        """Return the ``table_row`` payloads (dicts holding ``cells``) of a table block."""
+        rows: list[dict[str, Any]] = []
+        has_more, cursor = True, None
+        while has_more:
+            try:
+                params: dict[str, Any] = {"block_id": table_id}
+                if cursor:
+                    params["start_cursor"] = cursor
+                resp = client.blocks.children.list(**params)
+                has_more, cursor = resp.get("has_more", False), resp.get("next_cursor")
+                for child in resp.get("results", []):
+                    if child.get("type") == "table_row":
+                        self._blocks_fetched += 1
+                        rows.append(child.get("table_row", {}))
+                time.sleep(RATE_LIMIT_DELAY)
+            except APIResponseError as e:
+                if e.status == 429:
+                    time.sleep(10)
+                    continue
+                logger.debug("API error fetching table rows for %s: %s", table_id, e)
+                break
+            except Exception as e:
+                logger.debug("Error fetching table rows for %s: %s", table_id, e)
+                break
+        return rows
+
+    def _handle_block_type(
+        self, btype: str, block: dict[str, Any]
+    ) -> tuple[str, list[dict[str, str]], list[dict[str, str]]]:
+        """Render one Notion block (without its children) as markdown.
+
+        Returns ``(markdown, headings, code_blocks)``. ``headings`` receives one entry
+        for a heading block with non-empty text and ``code_blocks`` one entry for a
+        code block with non-blank content; both are empty otherwise. Block types not
+        listed here are logged at debug level and render as an empty string.
+        """
+        found_headings: list[dict[str, str]] = []
+        found_code: list[dict[str, str]] = []
+        payload = block.get(btype, {})
+
+        def rich(field: str = "rich_text") -> str:
+            # Rich-text rendering of one field of this block's payload.
+            return self._extract_rich_text(payload.get(field, []))
+
+        def hosted_url() -> str:
+            # Media payloads keep their URL under the key named by their own "type".
+            source = payload.get("type", "")
+            if source not in ("external", "file"):
+                return ""
+            return payload.get(source, {}).get("url", "")
+
+        line_prefixes = {"quote": "> ", "bulleted_list_item": "- ", "numbered_list_item": "1. "}
+        layout_only = (
+            "column_list",
+            "column",
+            "synced_block",
+            "template",
+            "table_of_contents",
+            "breadcrumb",
+        )
+        rendered = ""
+
+        if btype == "paragraph":
+            rendered = rich()
+        elif btype in line_prefixes:
+            rendered = f"{line_prefixes[btype]}{rich()}"
+        elif btype in ("heading_1", "heading_2", "heading_3"):
+            depth = int(btype[-1])
+            heading_text = rich()
+            rendered = f"{'#' * depth} {heading_text}"
+            if heading_text:
+                found_headings.append({"level": f"h{depth}", "text": heading_text})
+        elif btype == "code":
+            language = payload.get("language", "plain text") or "plain text"
+            source_code = rich()
+            caption = rich("caption")
+            rendered = f"```{language}\n{source_code}\n```"
+            if source_code.strip():
+                found_code.append({"language": language, "code": source_code, "caption": caption})
+        elif btype == "callout":
+            icon = payload.get("icon", {})
+            emoji = icon.get("emoji", "") if icon else ""
+            body = rich()
+            lead = f"> {emoji} " if emoji else "> "
+            rendered = f"{lead}**Callout:** {body}"
+        elif btype == "toggle":
+            rendered = f"<details>\n<summary>{rich()}</summary>"
+        elif btype == "to_do":
+            task = rich()
+            mark = "x" if payload.get("checked") else " "
+            rendered = f"- [{mark}] {task}"
+        elif btype == "divider":
+            rendered = "---"
+        elif btype == "table":
+            rendered = self._handle_table_block(block)
+        elif btype == "image":
+            link = hosted_url()
+            alt = rich("caption")
+            rendered = f"![{alt or 'Image'}]({link})" if link else ""
+        elif btype in ("bookmark", "embed", "link_preview"):
+            link = payload.get("url", "")
+            label = "" if btype == "link_preview" else rich("caption")
+            rendered = f"[{label or link}]({link})" if link else ""
+        elif btype == "equation":
+            expression = payload.get("expression", "")
+            rendered = f"$$\n{expression}\n$$" if expression else ""
+        elif btype in ("child_page", "child_database"):
+            rendered = f"**Sub-{btype.split('_')[1]}: {payload.get('title', '')}**"
+        elif btype in ("pdf", "video", "audio", "file"):
+            link = hosted_url()
+            rendered = f"[{btype.title()}]({link})" if link else ""
+        elif btype == "link_to_page":
+            target = payload.get(payload.get("type", ""))
+            rendered = f"*[Link to page: {target}]*" if target else ""
+        elif btype in layout_only:
+            # Layout containers contribute nothing themselves; only the TOC leaves a marker.
+            rendered = "*[Table of Contents]*" if btype == "table_of_contents" else ""
+        else:
+            logger.debug("Unhandled block type: %s", btype)
+
+        return rendered, found_headings, found_code
+
+    def _handle_table_block(self, block: dict[str, Any]) -> str:
+        """Convert a Notion table block into a markdown table.
+
+        Rows are read from ``block["_table_rows"]``, each a dict with a ``cells``
+        list of rich-text lists. When that key is missing or empty a
+        ``*[Table: N columns]*`` placeholder is returned. A ``---`` separator row
+        follows the first row only when the table has a column header.
+        """
+        tdata = block.get("table", {})
+        has_header = tdata.get("has_column_header", False)
+        rows = block.get("_table_rows", [])
+        if not rows:
+            return f"*[Table: {tdata.get('table_width', 0)} columns]*"
+        lines = []
+        for i, row in enumerate(rows):
+            # A raw "|" would start a new column and a newline would end the row, so
+            # escape the former and fold the latter into <br>.
+            cells = [
+                self._extract_rich_text(c).replace("|", "\\|").replace("\n", "<br>")
+                for c in row.get("cells", [])
+            ]
+            lines.append("| " + " | ".join(cells) + " |")
+            if i == 0 and has_header:
+                lines.append("| " + " | ".join("---" for _ in cells) + " |")
+        return "\n".join(lines)
+
+    # -- Rich text -------------------------------------------------------
+
+    def _extract_rich_text(self, rich_text_list: list[dict[str, Any]]) -> str:
+        """Render a Notion rich-text array as inline markdown.
+
+        Each span's ``plain_text`` is wrapped according to its annotations, applied
+        innermost-first in the order code, bold, italic, strikethrough, underline,
+        and finally turned into a link when the span has an ``href``. Spans with
+        empty text are dropped; the rendered spans are concatenated without a
+        separator.
+        """
+        if not rich_text_list:
+            return ""
+        wrappers = (
+            ("code", "`", "`"),
+            ("bold", "**", "**"),
+            ("italic", "*", "*"),
+            ("strikethrough", "~~", "~~"),
+            ("underline", "<u>", "</u>"),
+        )
+        rendered: list[str] = []
+        for span in rich_text_list:
+            fragment = span.get("plain_text", "")
+            if not fragment:
+                continue
+            flags = span.get("annotations", {})
+            for flag, opener, closer in wrappers:
+                if flags.get(flag):
+                    fragment = f"{opener}{fragment}{closer}"
+            link = span.get("href")
+            if link:
+                fragment = f"[{fragment}]({link})"
+            rendered.append(fragment)
+        return "".join(rendered)
+
+    # ====================================================================
+    # EXPORT MODE
+    # ====================================================================
+
+    def _extract_from_export(self) -> list[dict[str, Any]]:
+        """Parse a Notion Markdown/CSV export directory.
+
+        Walks ``self.export_path`` top-down, turning each ``.md`` file into one page
+        and each ``.csv`` row into one page, until ``self.max_pages`` is reached.
+        Directories and files are visited in sorted order so the result, and which
+        pages survive the ``max_pages`` cap, does not depend on filesystem
+        enumeration order.
+
+        Raises:
+            ValueError: If ``export_path`` is unset or is not a directory.
+            FileNotFoundError: If ``export_path`` does not exist.
+        """
+        if not self.export_path:
+            raise ValueError("export_path is required for export mode.")
+        export_dir = Path(self.export_path)
+        if not export_dir.exists():
+            raise FileNotFoundError(f"Export directory not found: {self.export_path}")
+        if not export_dir.is_dir():
+            raise ValueError(f"Export path is not a directory: {self.export_path}")
+        print(f"   Parsing Notion export: {self.export_path}")
+        pages: list[dict[str, Any]] = []
+        for root, dirs, files in os.walk(export_dir):
+            # In a top-down walk, sorting the list in place fixes the visiting order.
+            dirs.sort()
+            rel = str(Path(root).relative_to(export_dir))
+            parent = "" if rel == "." else rel
+            for fn in sorted(files):
+                if self._pages_fetched >= self.max_pages:
+                    break
+                fp = Path(root) / fn
+                if fp.suffix.lower() == ".md":
+                    pd = self._parse_export_markdown(fp, parent)
+                    if pd:
+                        pages.append(pd)
+                        self._pages_fetched += 1
+                elif fp.suffix.lower() == ".csv":
+                    for pd in self._parse_export_csv(fp, parent):
+                        if self._pages_fetched >= self.max_pages:
+                            break
+                        pages.append(pd)
+                        self._pages_fetched += 1
+            if self._pages_fetched >= self.max_pages:
+                break
+        print(f"   Parsed {len(pages)} files from export directory")
+        return pages
+
+    def _parse_export_markdown(self, filepath: Path, parent_path: str) -> dict[str, Any] | None:
+        """Parse a single .md file from a Notion export.
+
+        Args:
+            filepath: Path of the markdown file.
+            parent_path: Export-relative directory, stored on the page unchanged.
+
+        Returns:
+            A page dict, or ``None`` when the file cannot be read or is blank.
+
+        Fenced code blocks are pulled out first, and the title, headings, and body
+        are then taken from the remaining prose, so ``#`` lines inside code (shell
+        or Python comments, for instance) are never mistaken for headings. The
+        title is the first ``# `` line, falling back to the cleaned file stem.
+        """
+        try:
+            content = filepath.read_text(encoding="utf-8", errors="ignore")
+        except Exception as e:
+            logger.warning("Could not read %s: %s", filepath, e)
+            return None
+        if not content.strip():
+            return None
+        # The info string may hold any characters except a newline or backtick
+        # ("c++", "objective-c", or a trailing "\r" from CRLF files). Limiting it to
+        # \w* made such an opening fence fail to match, so the closing fence was
+        # then taken as an opener and fences were paired wrongly.
+        fence_re = re.compile(r"```([^\n`]*)\n(.*?)```", re.DOTALL)
+        code_blocks = [
+            {"language": lang.strip() or "plain text", "code": code.strip(), "caption": ""}
+            for lang, code in fence_re.findall(content)
+            if code.strip()
+        ]
+        prose = fence_re.sub("", content)
+        prose_lines = prose.split("\n")
+        title = self._clean_notion_export_title(filepath.stem)
+        for line in prose_lines:
+            if line.startswith("# "):
+                title = line[2:].strip()
+                break
+        headings = [
+            {"level": f"h{len(m.group(1))}", "text": m.group(2).strip()}
+            for line in prose_lines
+            if (m := re.match(r"^(#{2,6})\s+(.+)$", line))
+        ]
+        # The counter still advances by the file's full line count plus its code blocks.
+        self._blocks_fetched += len(content.split("\n")) + len(code_blocks)
+        body = re.sub(r"^#\s+.+$", "", prose, count=1, flags=re.MULTILINE).strip()
+        return {
+            "id": str(filepath),
+            "title": title,
+            "url": "",
+            "properties": {},
+            "content": body,
+            "headings": headings,
+            "code_blocks": code_blocks,
+            "parent_path": parent_path,
+        }
+
+    def _parse_export_csv(self, filepath: Path, parent_path: str) -> list[dict[str, Any]]:
+        """Parse a CSV file from a Notion database export (one page per row).
+
+        The first column supplies each row's title (``Row N`` when empty). Every
+        non-empty cell becomes a property, and cells other than the title whose
+        text is longer than 10 characters are also rendered into the page body as
+        ``**column:** value`` paragraphs.
+
+        Returns:
+            The page dicts parsed so far; a read or parse error is logged as a
+            warning rather than raised.
+        """
+        pages: list[dict[str, Any]] = []
+        try:
+            # "utf-8-sig" drops a leading byte-order mark if one is present and reads
+            # plain UTF-8 unchanged; with "utf-8" the BOM stayed glued to the first
+            # column name and leaked into the property keys.
+            with open(filepath, encoding="utf-8-sig", errors="ignore", newline="") as f:
+                reader = csv.DictReader(f)
+                if not reader.fieldnames:
+                    return pages
+                title_col = reader.fieldnames[0]
+                for i, row in enumerate(reader):
+                    title = row.get(title_col, f"Row {i + 1}") or f"Row {i + 1}"
+                    props = {k: v for k, v in row.items() if k and v}
+                    body = "\n\n".join(
+                        f"**{k}:** {v}"
+                        for k, v in row.items()
+                        if k and v and k != title_col and len(str(v)) > 10
+                    )
+                    pages.append(
+                        {
+                            "id": f"{filepath}:row:{i}",
+                            "title": title,
+                            "url": "",
+                            "properties": props,
+                            "content": body,
+                            "headings": [],
+                            "code_blocks": [],
+                            "parent_path": parent_path,
+                        }
+                    )
+                    self._blocks_fetched += 1
+        except Exception as e:
+            logger.warning("Could not parse CSV %s: %s", filepath, e)
+        return pages
+
+    @staticmethod
+    def _clean_notion_export_title(stem: str) -> str:
+        """Drop a trailing whitespace-separated run of 16+ lowercase hex digits from ``stem``.
+
+        Falls back to the unmodified ``stem`` when nothing but whitespace would remain.
+        """
+        without_id = re.sub(r"\s+[0-9a-f]{16,}$", "", stem).strip()
+        return without_id if without_id else stem
+
+
+# ---------------------------------------------------------------------------
+# CLI entry point
+# ---------------------------------------------------------------------------
+
+
+def main() -> int:
+    """CLI entry point for the Notion scraper.
+
+    Parses the command line, then does one of three things:
+
+    * ``--dry-run``: prints the resolved source, name, and page limit, and returns.
+    * ``--from-json``: loads previously extracted data and only builds the skill.
+    * otherwise: extracts from Notion (API or export), builds the skill, then runs
+      the optional workflow and enhancement steps.
+
+    Returns:
+        0 on success. Failures do not return: argument problems go through
+        ``parser.error`` and runtime errors call ``sys.exit(1)``.
+    """
+    from .arguments.common import add_all_standard_arguments
+
+    parser = argparse.ArgumentParser(
+        description="Convert Notion workspace content to AI-ready skill",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog=(
+            "Examples:\n"
+            "  skill-seekers notion --database-id ID --token $NOTION_TOKEN --name myskill\n"
+            "  skill-seekers notion --page-id ID --token $NOTION_TOKEN --name myskill\n"
+            "  skill-seekers notion --export-path ./export/ --name myskill\n"
+            "  skill-seekers notion --from-json output/myskill_notion_data.json --name myskill"
+        ),
+    )
+    add_all_standard_arguments(parser)
+
+    # Override enhance-level default to 0 for Notion
+    # (done by patching the already-registered action, since the argument itself
+    # is added by add_all_standard_arguments above).
+    for action in parser._actions:
+        if hasattr(action, "dest") and action.dest == "enhance_level":
+            action.default = 0
+
+    # Notion-specific arguments
+    parser.add_argument(
+        "--database-id", type=str, help="Notion database ID (API mode)", metavar="ID"
+    )
+    parser.add_argument(
+        "--page-id", type=str, help="Notion page ID (API mode, recursive)", metavar="ID"
+    )
+    parser.add_argument(
+        "--export-path", type=str, help="Notion export directory (export mode)", metavar="PATH"
+    )
+    parser.add_argument(
+        "--token", type=str, help="Notion integration token (or NOTION_TOKEN env)", metavar="TOKEN"
+    )
+    parser.add_argument(
+        "--max-pages",
+        type=int,
+        default=DEFAULT_MAX_PAGES,
+        help=f"Maximum pages to extract (default: {DEFAULT_MAX_PAGES})",
+        metavar="N",
+    )
+    parser.add_argument(
+        "--from-json", type=str, help="Build from previously extracted JSON", metavar="FILE"
+    )
+
+    args = parser.parse_args()
+
+    # Logging: quiet wins over verbose; the default level is INFO.
+    level = (
+        logging.WARNING
+        if getattr(args, "quiet", False)
+        else (logging.DEBUG if getattr(args, "verbose", False) else logging.INFO)
+    )
+    logging.basicConfig(level=level, format="%(message)s", force=True)
+
+    # Dry run
+    # Handled before validation, so it succeeds even when no source was given
+    # (the source is then reported as "(none)").
+    if getattr(args, "dry_run", False):
+        source = (
+            getattr(args, "database_id", None)
+            or getattr(args, "page_id", None)
+            or getattr(args, "export_path", None)
+            or getattr(args, "from_json", None)
+            or "(none)"
+        )
+        print(f"\n{'=' * 60}\nDRY RUN: Notion Extraction\n{'=' * 60}")
+        print(
+            f"Source: {source}\nName: {getattr(args, 'name', None) or '(auto)'}\nMax pages: {args.max_pages}"
+        )
+        return 0
+
+    # Validate: at least one source is required.
+    has_source = any(
+        getattr(args, a, None) for a in ("database_id", "page_id", "export_path", "from_json")
+    )
+    if not has_source:
+        parser.error("Must specify --database-id, --page-id, --export-path, or --from-json")
+    # A missing --name is derived from the JSON file name or the export directory
+    # name; API mode has nothing to derive it from, so it is an error there.
+    if not getattr(args, "name", None):
+        if getattr(args, "from_json", None):
+            args.name = Path(args.from_json).stem.replace("_notion_data", "")
+        elif getattr(args, "export_path", None):
+            args.name = Path(args.export_path).stem
+        else:
+            parser.error("--name is required when using --database-id or --page-id")
+
+    # --from-json: build only
+    if getattr(args, "from_json", None):
+        config = {
+            "name": args.name,
+            "description": getattr(args, "description", None),
+            "max_pages": args.max_pages,
+        }
+        try:
+            conv = NotionToSkillConverter(config)
+            conv.load_extracted_data(args.from_json)
+            conv.build_skill()
+        except Exception as e:
+            print(f"\n   Error: {e}", file=sys.stderr)
+            sys.exit(1)  # noqa: E702
+        return 0
+
+    # Full extract + build
+    config: dict[str, Any] = {
+        "name": args.name,
+        "database_id": getattr(args, "database_id", None),
+        "page_id": getattr(args, "page_id", None),
+        "export_path": getattr(args, "export_path", None),
+        "token": getattr(args, "token", None),
+        "description": getattr(args, "description", None),
+        "max_pages": args.max_pages,
+    }
+    try:
+        conv = NotionToSkillConverter(config)
+        if not conv.extract_notion():
+            print("\n   Notion extraction failed", file=sys.stderr)
+            sys.exit(1)  # noqa: E702
+        conv.build_skill()
+
+        # Run enhancement workflows if specified
+        # Best-effort: an ImportError or AttributeError raised here is ignored.
+        try:
+            from skill_seekers.cli.workflow_runner import run_workflows
+
+            run_workflows(args)
+        except (ImportError, AttributeError):
+            pass
+
+        # Traditional AI enhancement
+        # With an API key (argument or ANTHROPIC_API_KEY) enhance_skill_md is used;
+        # without one, or if that module cannot be imported, LocalSkillEnhancer runs.
+        if getattr(args, "enhance_level", 0) > 0:
+            api_key = getattr(args, "api_key", None) or os.environ.get("ANTHROPIC_API_KEY")
+            skill_dir = conv.skill_dir
+            if api_key:
+                try:
+                    from skill_seekers.cli.enhance_skill import enhance_skill_md
+
+                    enhance_skill_md(skill_dir, api_key)
+                except ImportError:
+                    from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer
+
+                    LocalSkillEnhancer(Path(skill_dir)).run(headless=True)
+            else:
+                from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer
+
+                LocalSkillEnhancer(Path(skill_dir)).run(headless=True)
+    except RuntimeError as e:
+        print(f"\n   Error: {e}", file=sys.stderr)
+        sys.exit(1)  # noqa: E702
+    except Exception as e:
+        # Anything other than RuntimeError is unexpected, so print the traceback too.
+        print(f"\n   Unexpected error: {e}", file=sys.stderr)
+        import traceback
+
+        traceback.print_exc()
+        sys.exit(1)  # noqa: E702
+    return 0
+
+
+# Run the CLI when executed as a script; main()'s return value becomes the exit status.
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/src/skill_seekers/cli/openapi_scraper.py b/src/skill_seekers/cli/openapi_scraper.py
new file mode 100644
index 0000000..caf0ba3
--- /dev/null
+++ b/src/skill_seekers/cli/openapi_scraper.py
@@ -0,0 +1,1959 @@
+#!/usr/bin/env python3
+"""
+OpenAPI/Swagger Specification to Skill Converter
+
+Converts OpenAPI 2.0 (Swagger) and OpenAPI 3.0/3.1 specifications into AI-ready skills.
+Supports both YAML and JSON spec formats, and can load specs from local files or remote URLs.
+
+Extracts:
+- API info (title, description, version, contact, license)
+- Servers / host / basePath
+- All paths with their operations (GET, POST, PUT, DELETE, PATCH, etc.)
+- Parameters (path, query, header, cookie, body)
+- Request bodies and response schemas
+- Component schemas / definitions with properties, types, enums
+- Security schemes (apiKey, http, oauth2, openIdConnect)
+- Tags for endpoint grouping
+
+Usage:
+    skill-seekers openapi --spec petstore.yaml --name petstore-api
+    skill-seekers openapi --spec-url https://petstore3.swagger.io/api/v3/openapi.json --name petstore
+    skill-seekers openapi --from-json petstore_extracted.json
+    python3 -m skill_seekers.cli.openapi_scraper --spec spec.yaml --name my-api
+"""
+
+import argparse
+import copy
+import json
+import logging
+import os
+import re
+import sys
+from pathlib import Path
+from typing import Any
+
+# Optional dependency guard: PyYAML may not be installed. YAML_AVAILABLE records
+# whether the import succeeded; _check_yaml_deps() turns a False value into a
+# RuntimeError carrying install instructions.
+try:
+    import yaml
+
+    YAML_AVAILABLE = True
+except ImportError:
+    YAML_AVAILABLE = False
+
+# Module-level logger, named after this module.
+logger = logging.getLogger(__name__)
+
+# HTTP methods recognized in OpenAPI path items.
+# NOTE: this is a set, so iterating over it yields the methods in no guaranteed order.
+HTTP_METHODS = {"get", "post", "put", "delete", "patch", "head", "options", "trace"}
+
+# OpenAPI version detection patterns, matched against the stringified
+# "openapi" / "swagger" root field of a spec.
+# _OPENAPI_3_RE requires three numeric parts with major version 3 (e.g. "3.0.3");
+# a two-part value such as "3.0" does not match.
+# _SWAGGER_2_RE requires two numeric parts with major version 2 (e.g. "2.0").
+_OPENAPI_3_RE = re.compile(r"^3\.\d+\.\d+$")
+_SWAGGER_2_RE = re.compile(r"^2\.\d+$")
+
+
+def _check_yaml_deps():
+    """Ensure the optional PyYAML dependency was importable.
+
+    Raises:
+        RuntimeError: If ``YAML_AVAILABLE`` is false; the message contains
+            install instructions.
+    """
+    if YAML_AVAILABLE:
+        return
+
+    install_hint = (
+        "pyyaml is required for OpenAPI/Swagger YAML spec support.\n"
+        'Install with: pip install "skill-seekers[openapi]"\n'
+        "Or: pip install pyyaml"
+    )
+    raise RuntimeError(install_hint)
+
+
+def infer_description_from_spec(info: dict | None = None, name: str = "") -> str:
+    """Build a "Use when..." skill description from an OpenAPI info object.
+
+    Preference order:
+      1. The first sentence of ``info["description"]`` (lower-cased, capped at
+         150 characters) when the description is longer than 20 characters.
+      2. ``info["title"]`` when it is longer than 5 characters.
+      3. The skill ``name``, or a generic phrase when ``name`` is empty.
+
+    Args:
+        info: OpenAPI info object (title, description, ...); may be None or empty.
+        name: Skill name used for the fallback wording.
+
+    Returns:
+        Description string in "Use when working with ..." form.
+    """
+    fallback = f"Use when working with the {name} API" if name else "Use when working with this API"
+    if not info:
+        return fallback
+
+    spec_description = info.get("description", "")
+    if spec_description and len(spec_description) > 20:
+        # Everything before the first ". " counts as the first sentence.
+        lead, _, _ = spec_description.partition(". ")
+        if len(lead) > 150:
+            lead = lead[:147] + "..."
+        return f"Use when working with {lead.lower()}"
+
+    api_title = info.get("title", "")
+    if api_title and len(api_title) > 5:
+        return f"Use when working with the {api_title} API"
+
+    return fallback
+
+
+class OpenAPIToSkillConverter:
+    """Convert OpenAPI/Swagger specifications to AI-ready skills.
+
+    Supports OpenAPI 2.0 (Swagger), 3.0, and 3.1 specifications in both
+    YAML and JSON formats. Can load specs from local files or remote URLs.
+
+    The converter extracts endpoints, schemas, security schemes, and metadata,
+    then generates structured markdown reference files suitable for LLM consumption.
+
+    Attributes:
+        config: Configuration dictionary with name, spec_path, spec_url, description.
+        name: Skill name used for output directory and filenames.
+        spec_path: Local file path to the OpenAPI spec (mutually exclusive with spec_url).
+        spec_url: Remote URL to fetch the OpenAPI spec from.
+        description: Skill description for SKILL.md frontmatter.
+        skill_dir: Output directory for the generated skill.
+        data_file: Path to the extracted JSON data file.
+        spec_data: Raw parsed spec dictionary.
+        extracted_data: Structured extraction result with endpoints, schemas, etc.
+    """
+
+    def __init__(self, config: dict) -> None:
+        """Initialize the converter with configuration.
+
+        No validation of the spec source happens here; a missing source is only
+        reported when ``extract_spec`` is called, so the ``--from-json`` workflow
+        can construct a converter with neither ``spec_path`` nor ``spec_url``.
+
+        Args:
+            config: Dictionary with keys:
+                - name (str): Skill name (required)
+                - spec_path (str): Local file path to spec (optional)
+                - spec_url (str): Remote URL to fetch spec (optional)
+                - description (str): Skill description (optional)
+              Optional keys may be absent, ``None`` or empty; all three cases
+              are treated as "not provided".
+
+        Raises:
+            KeyError: If ``config`` has no ``name`` key.
+        """
+        self.config = config
+        self.name = config["name"]
+        # Use ``or`` instead of a ``.get`` default: callers that build the config
+        # from argparse pass explicit ``None`` values, which a ``.get`` default
+        # would let through unchanged.
+        self.spec_path: str = config.get("spec_path") or ""
+        self.spec_url: str = config.get("spec_url") or ""
+        self.description: str = (
+            config.get("description") or f"Use when working with the {self.name} API"
+        )
+
+        # Output paths
+        self.skill_dir = f"output/{self.name}"
+        self.data_file = f"output/{self.name}_extracted.json"
+
+        # Internal state
+        self.spec_data: dict[str, Any] = {}
+        self.extracted_data: dict[str, Any] = {}
+        self.openapi_version: str = ""
+
+    # ──────────────────────────────────────────────────────────────────────
+    # Spec loading
+    # ──────────────────────────────────────────────────────────────────────
+
+    def extract_spec(self) -> bool:
+        """Read and parse the OpenAPI specification from file or URL.
+
+        Loads the raw spec from ``spec_path`` (checked first) or ``spec_url``,
+        detects the OpenAPI version, delegates to the matching version-specific
+        parser, and writes the structured result to ``self.data_file`` as JSON.
+
+        Returns:
+            True once extraction has completed (failures are raised, not returned).
+
+        Raises:
+            RuntimeError: If PyYAML is unavailable, no spec source is configured,
+                or the spec cannot be loaded or parsed.
+            ValueError: If the version field is missing or the version is
+                unsupported.
+        """
+        _check_yaml_deps()
+        logger.info("\n  Extracting OpenAPI specification...")
+
+        # Load raw spec data
+        if self.spec_path:
+            self.spec_data = self._load_from_file(self.spec_path)
+        elif self.spec_url:
+            self.spec_data = self._load_from_url(self.spec_url)
+        else:
+            raise RuntimeError(
+                "No spec source provided. Use spec_path (local file) or spec_url (remote URL)."
+            )
+
+        # Detect version
+        self.openapi_version = self._detect_version(self.spec_data)
+        logger.info("  Detected OpenAPI version: %s", self.openapi_version)
+
+        # Parse according to version
+        if _SWAGGER_2_RE.match(self.openapi_version):
+            self.extracted_data = self._parse_swagger_2(self.spec_data)
+        elif _OPENAPI_3_RE.match(self.openapi_version):
+            self.extracted_data = self._parse_openapi_3(self.spec_data)
+        else:
+            raise ValueError(
+                f"Unsupported OpenAPI version: {self.openapi_version}. "
+                "Supported versions: 2.0 (Swagger), 3.0.x, 3.1.x"
+            )
+
+        # Infer the description from the spec unless the caller supplied one.
+        # A key that is present but None/empty (e.g. an unset CLI option) counts
+        # as "not supplied"; a bare membership test would skip inference for it.
+        if not self.config.get("description"):
+            info = self.extracted_data.get("info", {})
+            self.description = infer_description_from_spec(info, self.name)
+
+        # Persist extracted data. Create the directory the data file actually
+        # lives in, which differs from "output" when the name contains a slash.
+        output_dir = os.path.dirname(self.data_file)
+        if output_dir:
+            os.makedirs(output_dir, exist_ok=True)
+        with open(self.data_file, "w", encoding="utf-8") as f:
+            json.dump(self.extracted_data, f, indent=2, ensure_ascii=False)
+        logger.info("  Saved extracted data to: %s", self.data_file)
+
+        # Log summary
+        endpoints = self.extracted_data.get("endpoints", [])
+        schemas = self.extracted_data.get("schemas", {})
+        security = self.extracted_data.get("security_schemes", {})
+        logger.info(
+            "  Extracted %d endpoints, %d schemas, %d security schemes",
+            len(endpoints),
+            len(schemas),
+            len(security),
+        )
+
+        return True
+
+    def _load_from_file(self, path: str) -> dict[str, Any]:
+        """Load and parse a spec from a local file.
+
+        The file is read as UTF-8 (a leading byte-order mark is stripped) and the
+        text is handed to ``_parse_content``, which handles both JSON and YAML.
+
+        Args:
+            path: Path to the local spec file.
+
+        Returns:
+            Parsed spec as a dictionary.
+
+        Raises:
+            RuntimeError: If the file does not exist, cannot be read, is not
+                valid UTF-8, or cannot be parsed.
+        """
+        logger.info("  Loading spec from file: %s", path)
+
+        if not os.path.exists(path):
+            raise RuntimeError(f"Spec file not found: {path}")
+
+        try:
+            # "utf-8-sig" decodes plain UTF-8 and additionally drops a BOM, which
+            # json.loads would otherwise reject.
+            with open(path, encoding="utf-8-sig") as f:
+                content = f.read()
+        except (OSError, UnicodeDecodeError) as e:
+            # UnicodeDecodeError is a ValueError, not an OSError; catch it so a
+            # badly encoded file honours the documented RuntimeError contract.
+            raise RuntimeError(f"Failed to read spec file {path}: {e}") from e
+
+        return self._parse_content(content, path)
+
+    def _load_from_url(self, url: str) -> dict[str, Any]:
+        """Fetch and parse a spec from a remote URL.
+
+        Issues a GET request with a 30 second timeout and passes the response
+        text to ``_parse_content``.
+
+        Args:
+            url: URL to fetch the spec from.
+
+        Returns:
+            Parsed spec as a dictionary.
+
+        Raises:
+            RuntimeError: If ``requests`` is not installed, the URL cannot be
+                fetched (including non-2xx statuses), or the content cannot be
+                parsed.
+        """
+        logger.info("  Fetching spec from URL: %s", url)
+
+        try:
+            import requests
+        except ImportError as exc:
+            raise RuntimeError(
+                "requests library is required for fetching remote specs.\n"
+                "Install with: pip install requests"
+            ) from exc
+
+        try:
+            response = requests.get(
+                url,
+                timeout=30,
+                headers={
+                    "User-Agent": "SkillSeekers/OpenAPI-Scraper",
+                    "Accept": "application/json, application/yaml, text/yaml, */*",
+                },
+            )
+            response.raise_for_status()
+        except Exception as e:
+            raise RuntimeError(f"Failed to fetch spec from {url}: {e}") from e
+
+        # When the server declares no charset, requests assumes ISO-8859-1 for
+        # text/* bodies (e.g. text/yaml), which garbles non-ASCII characters in
+        # UTF-8 specs. In that case decode as UTF-8 ourselves; fall back to
+        # requests' own decoding only if the bytes are not valid UTF-8.
+        declared_type = response.headers.get("Content-Type", "") or ""
+        if "charset=" in declared_type.lower():
+            text = response.text
+        else:
+            try:
+                text = response.content.decode("utf-8-sig")
+            except UnicodeDecodeError:
+                text = response.text
+
+        return self._parse_content(text, url)
+
+    def _parse_content(self, content: str, source: str) -> dict[str, Any]:
+        """Parse raw content as JSON or YAML and require a mapping at the root.
+
+        JSON is attempted first when the source name ends in ``.json`` or the
+        content starts with ``{``; anything else, or a JSON parse failure, is
+        handed to the YAML parser (YAML also accepts JSON documents).
+
+        Args:
+            content: Raw text content.
+            source: Source path or URL (for error messages and format detection).
+
+        Returns:
+            Parsed dictionary.
+
+        Raises:
+            RuntimeError: If the content cannot be parsed, PyYAML is needed but
+                missing, or the parsed document is not a mapping.
+        """
+        data: Any = None
+        parsed = False
+
+        # Try JSON first if source looks like JSON
+        if source.endswith(".json") or content.lstrip().startswith("{"):
+            try:
+                data = json.loads(content)
+                parsed = True
+            except json.JSONDecodeError:
+                pass  # Fall through to YAML
+
+        if not parsed:
+            # Fail with install instructions instead of a NameError on ``yaml``
+            # when this method is reached without PyYAML installed.
+            _check_yaml_deps()
+            try:
+                data = yaml.safe_load(content)
+            except yaml.YAMLError as e:
+                raise RuntimeError(f"Failed to parse spec from {source}: {e}") from e
+
+        # Apply the mapping check to both parsers: a JSON array or scalar is
+        # valid JSON but not a spec, and would break every later ``.get`` call.
+        if not isinstance(data, dict):
+            raise RuntimeError(
+                f"Spec from {source} parsed but is not a mapping (got {type(data).__name__})"
+            )
+        return data
+
+    def _detect_version(self, spec: dict[str, Any]) -> str:
+        """Return the spec's declared OpenAPI/Swagger version as a string.
+
+        The root ``openapi`` field (OpenAPI 3.x) is checked before the root
+        ``swagger`` field (Swagger 2.0); the first one present wins.
+
+        Args:
+            spec: Parsed spec dictionary.
+
+        Returns:
+            Version string (e.g. "2.0", "3.0.3", "3.1.0").
+
+        Raises:
+            ValueError: If neither version field is present.
+        """
+        for version_field in ("openapi", "swagger"):
+            if version_field in spec:
+                return str(spec[version_field])
+
+        raise ValueError(
+            "Cannot determine spec version. Expected 'openapi' or 'swagger' field "
+            "at the root of the specification."
+        )
+
+    # ──────────────────────────────────────────────────────────────────────
+    # Data loading (from previously extracted JSON)
+    # ──────────────────────────────────────────────────────────────────────
+
+    def load_extracted_data(self, json_path: str | None = None) -> bool:
+        """Populate ``self.extracted_data`` from a previously saved JSON file.
+
+        Args:
+            json_path: Path to the JSON file. Falls back to ``self.data_file``
+                when omitted or empty.
+
+        Returns:
+            True if loading succeeded.
+
+        Raises:
+            FileNotFoundError: If the JSON file does not exist.
+        """
+        source = self.data_file if not json_path else json_path
+        logger.info("  Loading extracted data from: %s", source)
+
+        if os.path.exists(source):
+            with open(source, encoding="utf-8") as handle:
+                self.extracted_data = json.load(handle)
+
+            endpoint_count = len(self.extracted_data.get("endpoints", []))
+            schema_count = len(self.extracted_data.get("schemas", {}))
+            logger.info("  Loaded %d endpoints, %d schemas", endpoint_count, schema_count)
+            return True
+
+        raise FileNotFoundError(f"Extracted data file not found: {source}")
+
+    # ──────────────────────────────────────────────────────────────────────
+    # Version-specific parsers
+    # ──────────────────────────────────────────────────────────────────────
+
+    def _parse_openapi_3(self, spec: dict[str, Any]) -> dict[str, Any]:
+        """Build the structured extraction for an OpenAPI 3.x spec.
+
+        Collects info, servers, tags, endpoints, component schemas, security
+        schemes and the global security requirements.
+
+        Args:
+            spec: Parsed OpenAPI 3.x spec dictionary.
+
+        Returns:
+            Structured extraction dictionary.
+        """
+        logger.info("  Parsing OpenAPI 3.x specification...")
+
+        version_label = str(spec.get("openapi", "3.0.0"))
+        info = self._extract_info(spec)
+        external_docs = spec.get("externalDocs", {})
+
+        servers = [
+            {
+                "url": server.get("url", ""),
+                "description": server.get("description", ""),
+                "variables": server.get("variables", {}),
+            }
+            for server in spec.get("servers", [])
+        ]
+
+        tags = [
+            {
+                "name": tag.get("name", ""),
+                "description": tag.get("description", ""),
+                "external_docs": tag.get("externalDocs", {}),
+            }
+            for tag in spec.get("tags", [])
+        ]
+
+        endpoints = self._extract_endpoints(spec, version=3)
+
+        # Schemas and security schemes both live under "components" in 3.x.
+        components = spec.get("components", {})
+        schemas = self._extract_schemas(components.get("schemas", {}), spec)
+        security_schemes = self._extract_security(
+            components.get("securitySchemes", {}), version=3
+        )
+
+        # Key order is kept stable because the result is serialized to JSON.
+        return {
+            "openapi_version": version_label,
+            "info": info,
+            "servers": servers,
+            "endpoints": endpoints,
+            "schemas": schemas,
+            "security_schemes": security_schemes,
+            "tags": tags,
+            "external_docs": external_docs,
+            "global_security": spec.get("security", []),
+        }
+
+    def _parse_swagger_2(self, spec: dict[str, Any]) -> dict[str, Any]:
+        """Build the structured extraction for a Swagger 2.0 spec.
+
+        Collects info, pseudo-servers derived from host/basePath/schemes, tags,
+        endpoints, definitions, security definitions, global security and the
+        global consumes/produces lists.
+
+        Args:
+            spec: Parsed Swagger 2.0 spec dictionary.
+
+        Returns:
+            Structured extraction dictionary.
+        """
+        logger.info("  Parsing Swagger 2.0 specification...")
+
+        version_label = str(spec.get("swagger", "2.0"))
+        info = self._extract_info(spec)
+        external_docs = spec.get("externalDocs", {})
+
+        # Swagger 2.0 has no "servers"; synthesize one entry per scheme so the
+        # result has the same shape as the OpenAPI 3.x extraction. Without a
+        # host no server entries are produced.
+        host = spec.get("host", "")
+        base_path = spec.get("basePath", "/")
+        schemes = spec.get("schemes", ["https"])
+        servers = [
+            {
+                "url": f"{scheme}://{host}{base_path}",
+                "description": f"Swagger 2.0 server ({scheme})",
+                "variables": {},
+            }
+            for scheme in (schemes if host else [])
+        ]
+
+        tags = [
+            {
+                "name": tag.get("name", ""),
+                "description": tag.get("description", ""),
+                "external_docs": tag.get("externalDocs", {}),
+            }
+            for tag in spec.get("tags", [])
+        ]
+
+        endpoints = self._extract_endpoints(spec, version=2)
+
+        # "definitions" / "securityDefinitions" are the 2.0 counterparts of
+        # components.schemas / components.securitySchemes.
+        schemas = self._extract_schemas(spec.get("definitions", {}), spec)
+        security_schemes = self._extract_security(
+            spec.get("securityDefinitions", {}), version=2
+        )
+
+        # Key order is kept stable because the result is serialized to JSON.
+        return {
+            "openapi_version": version_label,
+            "info": info,
+            "servers": servers,
+            "endpoints": endpoints,
+            "schemas": schemas,
+            "security_schemes": security_schemes,
+            "tags": tags,
+            "external_docs": external_docs,
+            "global_security": spec.get("security", []),
+            "consumes": spec.get("consumes", []),
+            "produces": spec.get("produces", []),
+        }
+
+    # ──────────────────────────────────────────────────────────────────────
+    # Shared extraction helpers
+    # ──────────────────────────────────────────────────────────────────────
+
+    def _extract_info(self, spec: dict[str, Any]) -> dict[str, Any]:
+        """Extract the info object from a spec into a fixed-shape dictionary.
+
+        Missing fields are filled with defaults. ``info``, ``contact`` and
+        ``license`` are treated as empty when they are absent, null or not a
+        mapping (a YAML key with no value parses to None).
+
+        Args:
+            spec: The full spec dictionary.
+
+        Returns:
+            Normalized info dictionary with title, description, version,
+            terms_of_service, contact and license entries.
+        """
+
+        def _as_mapping(value: Any) -> dict[str, Any]:
+            # ``.get(key, {})`` only covers a missing key, not an explicit null.
+            return value if isinstance(value, dict) else {}
+
+        info = _as_mapping(spec.get("info"))
+        contact = _as_mapping(info.get("contact"))
+        license_info = _as_mapping(info.get("license"))
+
+        return {
+            "title": info.get("title", "Untitled API"),
+            "description": info.get("description", ""),
+            "version": info.get("version", ""),
+            "terms_of_service": info.get("termsOfService", ""),
+            "contact": {
+                "name": contact.get("name", ""),
+                "url": contact.get("url", ""),
+                "email": contact.get("email", ""),
+            },
+            "license": {
+                "name": license_info.get("name", ""),
+                "url": license_info.get("url", ""),
+            },
+        }
+
+    def _extract_endpoints(self, spec: dict[str, Any], version: int) -> list[dict[str, Any]]:
+        """Extract all API endpoints from the spec paths.
+
+        Walks every path item and, for each key that is a recognized HTTP method,
+        collects operation metadata, parameters, request body and responses.
+        Endpoints are returned in document order (path order, then the order in
+        which the methods appear in the path item), so repeated runs over the
+        same spec produce identical output.
+
+        Path-level parameters apply to every operation of the path; an
+        operation-level parameter with the same name and location replaces the
+        path-level one instead of being listed twice.
+
+        Args:
+            spec: The full spec dictionary.
+            version: OpenAPI major version (2 or 3).
+
+        Returns:
+            List of endpoint dictionaries.
+        """
+        endpoints: list[dict[str, Any]] = []
+        paths = spec.get("paths")
+        if not isinstance(paths, dict):
+            # Absent, null (e.g. an empty YAML key) or malformed "paths".
+            return endpoints
+
+        for path, path_item in paths.items():
+            if not isinstance(path_item, dict):
+                continue
+
+            # Path-level parameters apply to all operations
+            path_level_params = path_item.get("parameters") or []
+
+            # Iterate the path item itself rather than the HTTP_METHODS set: set
+            # iteration order is not stable between interpreter runs.
+            for method, operation in path_item.items():
+                if method not in HTTP_METHODS:
+                    continue
+                if not operation or not isinstance(operation, dict):
+                    continue
+
+                endpoint: dict[str, Any] = {
+                    "path": path,
+                    "method": method.upper(),
+                    "operation_id": operation.get("operationId", ""),
+                    "summary": operation.get("summary", ""),
+                    "description": operation.get("description", ""),
+                    "tags": operation.get("tags", []),
+                    "deprecated": operation.get("deprecated", False),
+                    "security": operation.get("security", []),
+                    "parameters": [],
+                    "request_body": {},
+                    "responses": {},
+                }
+
+                # Merge path-level and operation-level parameters. Keying by
+                # (name, location) lets a later operation-level entry overwrite
+                # the path-level one while keeping its original position.
+                merged_params: dict[tuple, dict[str, Any]] = {}
+                raw_params = [*path_level_params, *(operation.get("parameters") or [])]
+                for index, raw_param in enumerate(raw_params):
+                    resolved = self._resolve_ref(raw_param, spec)
+                    if not isinstance(resolved, dict):
+                        continue
+                    normalized = self._normalize_parameter(resolved, version, spec)
+                    name = normalized.get("name")
+                    if name:
+                        key = (str(name), str(normalized.get("location")))
+                    else:
+                        # Unnamed entries (e.g. unresolved references) must not
+                        # collapse into one another.
+                        key = ("", index)
+                    merged_params[key] = normalized
+                endpoint["parameters"] = list(merged_params.values())
+
+                # Request body (OpenAPI 3.x) or body parameter (Swagger 2.0)
+                if version >= 3:
+                    req_body = operation.get("requestBody", {})
+                    if req_body:
+                        resolved_body = self._resolve_ref(req_body, spec)
+                        endpoint["request_body"] = self._normalize_request_body_v3(
+                            resolved_body, spec
+                        )
+                else:
+                    # Swagger 2.0: body parameter is extracted alongside other params
+                    body_params = [p for p in endpoint["parameters"] if p.get("location") == "body"]
+                    if body_params:
+                        endpoint["request_body"] = {
+                            "description": body_params[0].get("description", ""),
+                            "required": body_params[0].get("required", False),
+                            "content": {
+                                "application/json": {"schema": body_params[0].get("schema", {})}
+                            },
+                        }
+
+                # Responses (status codes may be parsed as ints from YAML)
+                for status_code, response_obj in (operation.get("responses") or {}).items():
+                    resolved_resp = self._resolve_ref(response_obj, spec)
+                    endpoint["responses"][str(status_code)] = self._normalize_response(
+                        resolved_resp, version, spec
+                    )
+
+                endpoints.append(endpoint)
+
+        return endpoints
+
+    def _normalize_parameter(
+        self, param: dict[str, Any], version: int, spec: dict[str, Any]
+    ) -> dict[str, Any]:
+        """Normalize a parameter object across OpenAPI versions.
+
+        For Swagger 2.0 non-body parameters, the type information that sits
+        directly on the parameter is gathered into a ``schema`` dictionary.
+        Body parameters (2.0) and parameter schemas (3.x) are resolved and
+        flattened.
+
+        Args:
+            param: Raw parameter object (already resolved).
+            version: OpenAPI major version (2 or 3).
+            spec: Full spec for nested $ref resolution.
+
+        Returns:
+            Normalized parameter dictionary with name, location, description,
+            required, deprecated, schema and example entries. ``required``
+            defaults to True for path parameters and False otherwise.
+        """
+        location = param.get("in", "query")
+        schema = param.get("schema", {})
+
+        # Swagger 2.0 has type/format directly on the parameter
+        if version == 2 and not schema and location != "body":
+            candidate = {
+                "type": param.get("type", "string"),
+                "format": param.get("format", ""),
+                "enum": param.get("enum", []),
+                "items": param.get("items", {}),
+            }
+            # Remove empty values, including the empty ``items`` mapping that
+            # would otherwise be attached to every non-array parameter.
+            schema = {k: v for k, v in candidate.items() if v not in (None, "", [], {})}
+            # Keep a declared default even when it is an "empty" value such as
+            # "" or []: those are legitimate defaults, not missing data.
+            if "default" in param:
+                schema["default"] = param["default"]
+
+        # Swagger 2.0 body parameter
+        if version == 2 and location == "body":
+            body_schema = param.get("schema", {})
+            body_schema = self._resolve_ref(body_schema, spec)
+            schema = self._flatten_schema(body_schema, spec)
+
+        # OpenAPI 3.x parameter schema
+        if version >= 3 and schema:
+            schema = self._resolve_ref(schema, spec)
+            schema = self._flatten_schema(schema, spec)
+
+        return {
+            "name": param.get("name", ""),
+            "location": location,
+            "description": param.get("description", ""),
+            "required": param.get("required", location == "path"),
+            "deprecated": param.get("deprecated", False),
+            "schema": schema,
+            "example": param.get("example", param.get("x-example")),
+        }
+
+    def _normalize_request_body_v3(
+        self, body: dict[str, Any], spec: dict[str, Any]
+    ) -> dict[str, Any]:
+        """Normalize an OpenAPI 3.x requestBody object.
+
+        Each media type entry is reduced to its resolved and flattened schema
+        plus its ``example`` / ``examples`` values.
+
+        Args:
+            body: Raw requestBody object (already resolved).
+            spec: Full spec for nested $ref resolution.
+
+        Returns:
+            Dictionary with ``description``, ``required`` and ``content`` keys.
+        """
+
+        def _media_entry(media_obj: dict[str, Any]) -> dict[str, Any]:
+            # Resolve a possible reference first, then flatten the target.
+            resolved = self._resolve_ref(media_obj.get("schema", {}), spec)
+            return {
+                "schema": self._flatten_schema(resolved, spec),
+                "example": media_obj.get("example"),
+                "examples": media_obj.get("examples", {}),
+            }
+
+        media_types = body.get("content", {})
+        content_map: dict[str, Any] = {
+            media_type: _media_entry(media_obj) for media_type, media_obj in media_types.items()
+        }
+
+        return {
+            "description": body.get("description", ""),
+            "required": body.get("required", False),
+            "content": content_map,
+        }
+
+    def _normalize_response(
+        self,
+        response: dict[str, Any],
+        version: int,
+        spec: dict[str, Any],
+    ) -> dict[str, Any]:
+        """Normalize a response object across OpenAPI versions.
+
+        OpenAPI 3.x responses carry one schema per media type under ``content``.
+        A Swagger 2.0 response carries a single ``schema``, which is reported
+        under the ``application/json`` media type.
+
+        Args:
+            response: Raw response object (already resolved).
+            version: OpenAPI major version (2 or 3).
+            spec: Full spec for nested $ref resolution.
+
+        Returns:
+            Dictionary with ``description``, ``content`` and ``headers`` keys.
+        """
+        description = response.get("description", "")
+
+        def _flat(raw_schema: Any) -> Any:
+            # Resolve a possible reference first, then flatten the target.
+            return self._flatten_schema(self._resolve_ref(raw_schema, spec), spec)
+
+        content: dict[str, Any] = {}
+        if version >= 3:
+            for media_type, media_obj in response.get("content", {}).items():
+                content[media_type] = {"schema": _flat(media_obj.get("schema", {}))}
+        else:
+            legacy_schema = response.get("schema", {})
+            if legacy_schema:
+                content["application/json"] = {"schema": _flat(legacy_schema)}
+
+        headers: dict[str, Any] = {}
+        for header_name, header_obj in response.get("headers", {}).items():
+            header = self._resolve_ref(header_obj, spec)
+            # Headers without a "schema" fall back to their bare "type".
+            fallback_schema = {"type": header.get("type", "string")}
+            headers[header_name] = {
+                "description": header.get("description", ""),
+                "schema": header.get("schema", fallback_schema),
+            }
+
+        return {"description": description, "content": content, "headers": headers}
+
+    def _extract_schemas(
+        self, schemas_dict: dict[str, Any], spec: dict[str, Any]
+    ) -> dict[str, Any]:
+        """Resolve and flatten every named schema in a schemas/definitions mapping.
+
+        Args:
+            schemas_dict: The schemas/definitions mapping from the spec.
+            spec: Full spec for $ref resolution.
+
+        Returns:
+            Dictionary of schema name to flattened schema object.
+        """
+        flattened_by_name: dict[str, Any] = {
+            schema_name: self._flatten_schema(self._resolve_ref(schema_obj, spec), spec, depth=0)
+            for schema_name, schema_obj in schemas_dict.items()
+        }
+
+        logger.info("  Extracted %d schemas", len(flattened_by_name))
+        return flattened_by_name
+
+    def _flatten_schema(
+        self,
+        schema: dict[str, Any],
+        spec: dict[str, Any],
+        depth: int = 0,
+    ) -> dict[str, Any]:
+        """Flatten a schema by resolving references and simplifying structure.
+
+        Handles ``$ref`` plus ``allOf`` / ``oneOf`` / ``anyOf`` composition and
+        recurses into ``properties``, ``items`` and ``additionalProperties``.
+        A schema reached through ``$ref`` is annotated with ``_ref_name`` (the
+        last segment of the reference).
+
+        For ``allOf``, member properties and required lists are merged with the
+        composing schema's own; keywords written on the composing schema itself
+        take precedence over those inherited from members.
+
+        The returned dictionary is always a new object, so annotating it never
+        modifies the dictionaries held in ``spec``.
+
+        Args:
+            schema: Schema object to flatten.
+            spec: Full spec for $ref resolution.
+            depth: Current recursion depth; schemas deeper than 10 levels are
+                returned unexpanded, which bounds circular references.
+
+        Returns:
+            Flattened schema dictionary ({} for non-mapping input).
+        """
+        if not isinstance(schema, dict):
+            return {}
+        if not schema or depth > 10:
+            # Copy rather than return the caller's object: the $ref branch below
+            # writes "_ref_name" into the result, and the input usually is a
+            # dictionary that lives inside the parsed spec.
+            return dict(schema)
+
+        # Resolve top-level $ref
+        if "$ref" in schema:
+            ref_name = str(schema["$ref"]).split("/")[-1]
+            resolved = self._resolve_ref(schema, spec)
+            if resolved is schema:
+                # Could not resolve — return stub
+                return {"type": "object", "$ref": schema["$ref"], "_ref_name": ref_name}
+            result = self._flatten_schema(resolved, spec, depth + 1)
+            result["_ref_name"] = ref_name
+            return result
+
+        result = dict(schema)
+
+        # Handle allOf composition
+        if "allOf" in result:
+            members = result.pop("allOf")
+            inherited: dict[str, Any] = {}
+            merged_properties: dict[str, Any] = {}
+            merged_required: list[str] = []
+
+            for sub_schema in members if isinstance(members, list) else []:
+                flat = self._flatten_schema(sub_schema, spec, depth + 1)
+                member_props = flat.get("properties")
+                if isinstance(member_props, dict):
+                    merged_properties.update(member_props)
+                member_required = flat.get("required")
+                if isinstance(member_required, list):
+                    merged_required.extend(member_required)
+                for k, v in flat.items():
+                    # "_ref_name" names the member, not the composed schema.
+                    if k not in ("properties", "required", "_ref_name"):
+                        inherited[k] = v
+
+            # Keywords declared next to allOf belong to the composed schema:
+            # merge its own properties/required last so they are not lost.
+            own_props = result.get("properties")
+            if isinstance(own_props, dict):
+                merged_properties.update(own_props)
+            own_required = result.get("required")
+            if isinstance(own_required, list):
+                merged_required.extend(own_required)
+
+            # Inherited metadata (description, type, ...) only fills gaps.
+            for k, v in inherited.items():
+                result.setdefault(k, v)
+
+            result["properties"] = merged_properties
+            if merged_required:
+                # dict.fromkeys de-duplicates while preserving order.
+                result["required"] = list(dict.fromkeys(merged_required))
+            if "type" not in result and merged_properties:
+                result["type"] = "object"
+
+        # Handle oneOf / anyOf — keep as list of flattened schemas
+        for combinator in ("oneOf", "anyOf"):
+            if isinstance(result.get(combinator), list):
+                result[combinator] = [
+                    self._flatten_schema(s, spec, depth + 1) for s in result[combinator]
+                ]
+
+        # Flatten nested properties
+        if isinstance(result.get("properties"), dict):
+            result["properties"] = {
+                prop_name: self._flatten_schema(prop_schema, spec, depth + 1)
+                for prop_name, prop_schema in result["properties"].items()
+            }
+
+        # Flatten items (for array types)
+        if "items" in result and isinstance(result["items"], dict):
+            result["items"] = self._flatten_schema(result["items"], spec, depth + 1)
+
+        # Flatten additionalProperties
+        if "additionalProperties" in result and isinstance(result["additionalProperties"], dict):
+            result["additionalProperties"] = self._flatten_schema(
+                result["additionalProperties"], spec, depth + 1
+            )
+
+        return result
+
+    def _extract_security(self, security_dict: dict[str, Any], version: int) -> dict[str, Any]:
+        """Normalize the security schemes declared by the specification.
+
+        Every scheme becomes a dict with ``type`` and ``description`` plus the
+        fields relevant to that type; the oauth2 shape depends on ``version``
+        (``flows`` for 3.x, flat flow/URL/scope keys for Swagger 2.0). Types
+        not handled below keep only the two base keys.
+
+        Args:
+            security_dict: securitySchemes (v3) or securityDefinitions (v2) mapping.
+            version: OpenAPI major version (2 or 3).
+
+        Returns:
+            Dictionary of scheme name to normalized security scheme.
+        """
+        schemes: dict[str, Any] = {}
+
+        for name, raw in security_dict.items():
+            kind = raw.get("type", "")
+            entry: dict[str, Any] = {"type": kind, "description": raw.get("description", "")}
+
+            if kind == "apiKey":
+                entry.update(name=raw.get("name", ""), location=raw.get("in", "header"))
+            elif kind in ("http", "basic"):
+                entry.update(
+                    scheme=raw.get("scheme", "basic"),
+                    bearer_format=raw.get("bearerFormat", ""),
+                )
+            elif kind == "oauth2" and version >= 3:
+                entry["flows"] = raw.get("flows", {})
+            elif kind == "oauth2":
+                # Swagger 2.0 OAuth2: flow fields are read straight from the scheme object
+                entry.update(
+                    flow=raw.get("flow", ""),
+                    authorization_url=raw.get("authorizationUrl", ""),
+                    token_url=raw.get("tokenUrl", ""),
+                    scopes=raw.get("scopes", {}),
+                )
+            elif kind == "openIdConnect":
+                entry["openid_connect_url"] = raw.get("openIdConnectUrl", "")
+
+            schemes[name] = entry
+
+        return schemes
+
+    def _resolve_ref(self, obj: dict[str, Any], spec: dict[str, Any]) -> dict[str, Any]:
+        """Resolve a local $ref reference within the specification.
+
+        Follows a URI-fragment JSON Pointer (e.g. "#/components/schemas/Pet"):
+        the fragment is percent-decoded, split on "/", and each token has the
+        "~1" and "~0" escapes undone before it is looked up as a dict key.
+        Only targets reached through nested dicts are supported.
+
+        Args:
+            obj: Object that may contain a "$ref" key.
+            spec: The full spec to resolve against.
+
+        Returns:
+            A shallow copy of the referenced dict. ``obj`` itself is returned
+            when it has no "$ref", the reference is external or not a string,
+            or the target is missing or not a dict.
+        """
+        from urllib.parse import unquote
+
+        if not isinstance(obj, dict) or "$ref" not in obj:
+            return obj
+
+        ref_path = obj["$ref"]
+        if not isinstance(ref_path, str) or not ref_path.startswith("#/"):
+            # External (or non-string) references are not supported — return as-is
+            logger.debug("  External $ref not supported: %s", ref_path)
+            return obj
+
+        current: Any = spec
+        # Percent-decode first: sequences like "%7B" are URI encoding, not key text
+        for part in unquote(ref_path[2:]).split("/"):
+            if not isinstance(current, dict):
+                logger.warning("  Could not resolve $ref: %s", ref_path)
+                return obj
+            # Undo JSON Pointer escaping; "~1" has to be handled before "~0"
+            current = current.get(part.replace("~1", "/").replace("~0", "~"))
+            if current is None:
+                logger.warning("  $ref target not found: %s", ref_path)
+                return obj
+
+        # Shallow copy: top-level edits by the caller do not reach the spec
+        return copy.copy(current) if isinstance(current, dict) else obj
+
+    # ──────────────────────────────────────────────────────────────────────
+    # Categorization
+    # ──────────────────────────────────────────────────────────────────────
+
+    def categorize_content(self) -> dict[str, list[dict[str, Any]]]:
+        """Bucket the extracted endpoints into named categories.
+
+        An endpoint with tags is filed under its first tag. An endpoint
+        without tags is filed under the first path segment that is neither
+        empty nor a "{param}" placeholder, or under "root" when the path has
+        no such segment.
+
+        Returns:
+            Dictionary mapping category name to list of endpoint dicts.
+        """
+        logger.info("  Categorizing endpoints...")
+
+        grouped: dict[str, list[dict[str, Any]]] = {}
+
+        for ep in self.extracted_data.get("endpoints", []):
+            ep_tags = ep.get("tags", [])
+
+            if ep_tags:
+                # Only the first tag decides the category
+                key = ep_tags[0]
+            else:
+                # Fall back to the leading literal path segment
+                literal = [
+                    seg
+                    for seg in ep.get("path", "/").split("/")
+                    if seg and not seg.startswith("{")
+                ]
+                key = literal[0] if literal else "root"
+
+            # Create the bucket on first use, then file the endpoint
+            grouped.setdefault(key, []).append(ep)
+
+        # Per-category counts for the log
+        for label, members in grouped.items():
+            logger.info("    %s: %d endpoints", label, len(members))
+
+        return grouped
+
+    # ──────────────────────────────────────────────────────────────────────
+    # Skill building
+    # ──────────────────────────────────────────────────────────────────────
+
+    def build_skill(self) -> None:
+        """Write the full skill directory from the extracted data.
+
+        Ensures the references/, scripts/ and assets/ folders exist, then
+        emits one reference file per endpoint category, the schemas and
+        security references when that data is present, the reference index,
+        and finally SKILL.md.
+        """
+        logger.info("\n  Building skill: %s", self.name)
+
+        # Output folders; exist_ok keeps re-runs from failing on existing ones
+        os.makedirs(f"{self.skill_dir}/references", exist_ok=True)
+        os.makedirs(f"{self.skill_dir}/scripts", exist_ok=True)
+        os.makedirs(f"{self.skill_dir}/assets", exist_ok=True)
+
+        # Group endpoints once; the same mapping feeds every generator below
+        grouped = self.categorize_content()
+
+        # One markdown file per category
+        logger.info("  Generating reference files...")
+        for label, members in grouped.items():
+            self._generate_reference_file(label, members)
+
+        # Schemas file, only when extracted_data holds any schemas
+        schema_map = self.extracted_data.get("schemas", {})
+        if schema_map:
+            self._generate_schemas_reference(schema_map)
+
+        # Security file, only when extracted_data holds any schemes
+        scheme_map = self.extracted_data.get("security_schemes", {})
+        if scheme_map:
+            self._generate_security_reference(scheme_map)
+
+        # Reference index, then the top-level manifest
+        self._generate_index(grouped)
+        self._generate_skill_md(grouped)
+
+        logger.info("\n  Skill built successfully: %s/", self.skill_dir)
+        logger.info("  Next step: Package with: skill-seekers package %s/", self.skill_dir)
+
+    def _generate_reference_file(self, cat_name: str, endpoints: list[dict[str, Any]]) -> None:
+        """Write the markdown reference for one endpoint category.
+
+        The file holds a title, the tag description when one exists, the
+        endpoint count, and one rendered section per endpoint.
+
+        Args:
+            cat_name: Category name (tag or path group).
+            endpoints: List of endpoint dicts belonging to this category.
+        """
+        stem = self._sanitize_filename(cat_name)
+        target = f"{self.skill_dir}/references/{stem}.md"
+
+        # Title, optional tag description, endpoint count, separator
+        doc: list[str] = [f"# {cat_name} Endpoints\n"]
+        blurb = self._get_tag_description(cat_name)
+        if blurb:
+            doc.append(f"{blurb}\n")
+        doc += [f"**Endpoints:** {len(endpoints)}\n", "---\n"]
+
+        # Each endpoint section is followed by a horizontal rule
+        for ep in endpoints:
+            doc.append(self._format_endpoint_md(ep))
+            doc.append("\n---\n")
+
+        with open(target, "w", encoding="utf-8") as out:
+            out.write("\n".join(doc))
+
+        logger.info("    Generated: %s", target)
+
+    def _generate_schemas_reference(self, schemas: dict[str, Any]) -> None:
+        """Write references/schemas.md covering every component schema.
+
+        Args:
+            schemas: Dictionary mapping schema name to schema object.
+        """
+        target = f"{self.skill_dir}/references/schemas.md"
+
+        doc: list[str] = [
+            "# Data Models / Schemas\n",
+            "Component schemas (data models) defined in the API specification.\n",
+            f"**Total schemas:** {len(schemas)}\n",
+            "---\n",
+        ]
+
+        # Schemas appear in sorted name order, each followed by a horizontal rule
+        for name in sorted(schemas):
+            doc += [self._format_schema_md(name, schemas[name]), "\n---\n"]
+
+        with open(target, "w", encoding="utf-8") as out:
+            out.write("\n".join(doc))
+
+        logger.info("    Generated: %s", target)
+
+    def _generate_security_reference(self, security_schemes: dict[str, Any]) -> None:
+        """Write references/security.md describing each security scheme.
+
+        For oauth2 a "flows" key selects the OpenAPI 3.x layout; without it
+        the flat Swagger 2.0 keys are rendered.
+
+        Args:
+            security_schemes: Dictionary mapping scheme name to scheme object.
+        """
+        target = f"{self.skill_dir}/references/security.md"
+
+        def scope_block(scopes: dict[str, Any]) -> list[str]:
+            # Bold heading plus one bullet per scope; nothing when there are none
+            if not scopes:
+                return []
+            return ["\n**Scopes:**\n"] + [f"- `{n}`: {d}" for n, d in scopes.items()]
+
+        doc: list[str] = [
+            "# Security Schemes\n",
+            "Authentication and authorization schemes defined in the API specification.\n",
+            f"**Total schemes:** {len(security_schemes)}\n",
+            "---\n",
+        ]
+
+        for label, scheme in security_schemes.items():
+            doc.append(f"## {label}\n")
+            doc.append(f"**Type:** `{scheme.get('type', 'unknown')}`\n")
+
+            if scheme.get("description"):
+                doc.append(f"{scheme['description']}\n")
+
+            kind = scheme.get("type", "")
+
+            if kind == "apiKey":
+                doc.append(f"- **Parameter name:** `{scheme.get('name', '')}`")
+                doc.append(f"- **Location:** `{scheme.get('location', 'header')}`\n")
+
+            elif kind in ("http", "basic"):
+                doc.append(f"- **Scheme:** `{scheme.get('scheme', 'basic')}`")
+                if scheme.get("bearer_format"):
+                    doc.append(f"- **Bearer format:** `{scheme['bearer_format']}`")
+                doc.append("")
+
+            elif kind == "oauth2" and "flows" in scheme:
+                # OpenAPI 3.x: one sub-section per flow
+                for flow_name, flow in scheme["flows"].items():
+                    doc.append(f"### Flow: {flow_name}\n")
+                    if flow.get("authorizationUrl"):
+                        doc.append(f"- **Authorization URL:** `{flow['authorizationUrl']}`")
+                    if flow.get("tokenUrl"):
+                        doc.append(f"- **Token URL:** `{flow['tokenUrl']}`")
+                    if flow.get("refreshUrl"):
+                        doc.append(f"- **Refresh URL:** `{flow['refreshUrl']}`")
+                    doc.extend(scope_block(flow.get("scopes", {})))
+                    doc.append("")
+
+            elif kind == "oauth2":
+                # Swagger 2.0: flow details are read directly from the scheme
+                if scheme.get("authorization_url"):
+                    doc.append(f"- **Authorization URL:** `{scheme['authorization_url']}`")
+                if scheme.get("token_url"):
+                    doc.append(f"- **Token URL:** `{scheme['token_url']}`")
+                if scheme.get("flow"):
+                    doc.append(f"- **Flow:** `{scheme['flow']}`")
+                doc.extend(scope_block(scheme.get("scopes", {})))
+                doc.append("")
+
+            elif kind == "openIdConnect":
+                doc.append(
+                    f"- **OpenID Connect URL:** `{scheme.get('openid_connect_url', '')}`\n"
+                )
+
+            doc.append("")
+
+        with open(target, "w", encoding="utf-8") as out:
+            out.write("\n".join(doc))
+
+        logger.info("    Generated: %s", target)
+
+    def _generate_index(self, categories: dict[str, list[dict[str, Any]]]) -> None:
+        """Generate the reference index file (references/index.md).
+
+        Lists the endpoint categories (sorted by name) with links and counts,
+        links to the schemas/security references when present, and servers.
+
+        Args:
+            categories: Categorized endpoints mapping.
+        """
+        filepath = f"{self.skill_dir}/references/index.md"
+
+        lines: list[str] = []
+        lines.append(f"# {self.name.title()} API Reference Index\n")
+
+        info = self.extracted_data.get("info", {})
+        if info.get("version"):
+            lines.append(f"**API Version:** {info['version']}\n")
+
+        lines.append("## Endpoint Categories\n")
+        for cat_name, cat_endpoints in sorted(categories.items()):
+            safe_name = self._sanitize_filename(cat_name)
+            lines.append(f"- [{cat_name}]({safe_name}.md) ({len(cat_endpoints)} endpoints)")
+        total_endpoints = sum(len(eps) for eps in categories.values())
+        lines.append(f"\n**Total endpoints:** {total_endpoints}\n")
+
+        # Schemas and security links
+        schemas = self.extracted_data.get("schemas", {})
+        security = self.extracted_data.get("security_schemes", {})
+
+        # Emit the heading only when at least one link follows it; otherwise
+        # the index would carry an empty "Additional References" section
+        if schemas or security:
+            lines.append("## Additional References\n")
+        if schemas:
+            lines.append(f"- [Data Models / Schemas](schemas.md) ({len(schemas)} schemas)")
+        if security:
+            lines.append(f"- [Security Schemes](security.md) ({len(security)} schemes)")
+
+        # Servers
+        servers = self.extracted_data.get("servers", [])
+        if servers:
+            lines.append("\n## Servers\n")
+            for server in servers:
+                url = server.get("url", "")
+                desc = server.get("description", "")
+                lines.append(f"- `{url}` - {desc}" if desc else f"- `{url}`")
+
+        lines.append("")
+
+        with open(filepath, "w", encoding="utf-8") as f:
+            f.write("\n".join(lines))
+
+        logger.info("    Generated: %s", filepath)
+
+    def _generate_skill_md(self, categories: dict[str, list[dict[str, Any]]]) -> None:
+        """Generate the main SKILL.md file.
+
+        Creates a comprehensive skill manifest with API overview, endpoint summary,
+        authentication info, quick reference, and navigation links.
+
+        Args:
+            categories: Categorized endpoints mapping.
+        """
+        filepath = f"{self.skill_dir}/SKILL.md"
+
+        info = self.extracted_data.get("info", {})
+        api_title = info.get("title", self.name.title())
+        api_version = info.get("version", "")
+        api_description = info.get("description", "")
+
+        # Skill name for frontmatter (lowercase, hyphens, max 64 chars)
+        skill_name = self.name.lower().replace("_", "-").replace(" ", "-")[:64]
+
+        # Truncate description (slicing leaves shorter strings unchanged)
+        desc = self.description[:1024]
+
+        lines: list[str] = []
+
+        # YAML frontmatter
+        # NOTE(review): values are written unquoted; a description containing
+        # ": " or a newline would need YAML quoting — confirm it is sanitized
+        lines.append("---")
+        lines.append(f"name: {skill_name}")
+        lines.append(f"description: {desc}")
+        lines.append("---\n")
+
+        # Header
+        lines.append(f"# {api_title}\n")
+        lines.append(f"{self.description}\n")
+
+        if api_version:
+            lines.append(f"**API Version:** {api_version}\n")
+
+        if api_description:
+            # Truncate long descriptions for SKILL.md summary
+            summary_desc = api_description
+            if len(summary_desc) > 500:
+                summary_desc = summary_desc[:497] + "..."
+            lines.append(f"{summary_desc}\n")
+
+        # When to use
+        lines.append("## When to Use This Skill\n")
+        lines.append("Use this skill when you need to:\n")
+        lines.append(f"- Understand the {api_title} endpoints and operations")
+        lines.append(f"- Look up request/response schemas for {api_title}")
+        lines.append("- Find authentication and authorization requirements")
+        lines.append("- Construct API requests with correct parameters")
+        lines.append("- Review available data models and their properties")
+        lines.append("- Check endpoint paths, methods, and status codes\n")
+
+        # Servers
+        servers = self.extracted_data.get("servers", [])
+        if servers:
+            lines.append("## Servers\n")
+            for server in servers:
+                url = server.get("url", "")
+                server_desc = server.get("description", "")
+                if server_desc:
+                    lines.append(f"- `{url}` - {server_desc}")
+                else:
+                    lines.append(f"- `{url}`")
+            lines.append("")
+
+        # Authentication summary
+        security_schemes = self.extracted_data.get("security_schemes", {})
+        if security_schemes:
+            lines.append("## Authentication\n")
+            for scheme_name, scheme in security_schemes.items():
+                scheme_type = scheme.get("type", "")
+                if scheme_type == "apiKey":
+                    location = scheme.get("location", "header")
+                    param_name = scheme.get("name", "")
+                    lines.append(
+                        f"- **{scheme_name}**: API Key in `{location}` (parameter: `{param_name}`)"
+                    )
+                elif scheme_type in ("http", "basic"):
+                    auth_scheme = scheme.get("scheme", "basic")
+                    lines.append(f"- **{scheme_name}**: HTTP `{auth_scheme}`")
+                elif scheme_type == "oauth2":
+                    lines.append(f"- **{scheme_name}**: OAuth 2.0")
+                elif scheme_type == "openIdConnect":
+                    lines.append(f"- **{scheme_name}**: OpenID Connect")
+                else:
+                    lines.append(f"- **{scheme_name}**: `{scheme_type}`")
+            lines.append("")
+
+        # Endpoint overview by category
+        lines.append("## API Endpoints Overview\n")
+        total_endpoints = sum(len(eps) for eps in categories.values())
+        lines.append(f"**Total endpoints:** {total_endpoints}\n")
+
+        for cat_name in sorted(categories.keys()):
+            cat_endpoints = categories[cat_name]
+            tag_desc = self._get_tag_description(cat_name)
+            header = f"### {cat_name}"
+            if tag_desc:
+                header += f" - {tag_desc}"
+            lines.append(header + "\n")
+
+            for ep in cat_endpoints:
+                method = ep.get("method", "GET")
+                path = ep.get("path", "/")
+                summary = ep.get("summary", "")
+                deprecated = " *(deprecated)*" if ep.get("deprecated") else ""
+                line = f"- `{method} {path}`"
+                if summary:
+                    line += f" - {summary}"
+                line += deprecated
+                lines.append(line)
+            lines.append("")
+
+        # Data models summary
+        schemas = self.extracted_data.get("schemas", {})
+        if schemas:
+            lines.append("## Data Models\n")
+            lines.append(f"**Total schemas:** {len(schemas)}\n")
+            for schema_name in sorted(schemas.keys()):
+                schema = schemas[schema_name]
+                schema_desc = schema.get("description", "")
+                schema_type = schema.get("type", "object")
+                line = f"- **{schema_name}** (`{schema_type}`)"
+                if schema_desc:
+                    short_desc = schema_desc
+                    if len(short_desc) > 80:
+                        short_desc = short_desc[:77] + "..."
+                    line += f" - {short_desc}"
+                lines.append(line)
+            lines.append("")
+
+        # Quick reference: most common endpoints
+        lines.append("## Quick Reference\n")
+        lines.append("### Common Operations\n")
+        # Show up to 5 endpoints per HTTP method, in method_order
+        all_endpoints = self.extracted_data.get("endpoints", [])
+        by_method: dict[str, list[dict[str, Any]]] = {}
+        for ep in all_endpoints:
+            by_method.setdefault(ep.get("method", "GET"), []).append(ep)
+
+        method_order = ["GET", "POST", "PUT", "PATCH", "DELETE", "HEAD", "OPTIONS"]
+        for method in method_order:
+            eps = by_method.get(method, [])
+            if not eps:
+                continue
+            lines.append(f"**{method}:**\n")
+            for ep in eps[:5]:
+                path = ep.get("path", "/")
+                summary = ep.get("summary", "")
+                if summary:
+                    lines.append(f"- `{path}` - {summary}")
+                else:
+                    lines.append(f"- `{path}`")
+            if len(eps) > 5:
+                lines.append(f"- *...and {len(eps) - 5} more*")
+            lines.append("")
+
+        # Reference file navigation
+        lines.append("## Reference Files\n")
+        lines.append("Detailed API documentation is organized in `references/`:\n")
+        lines.append("- `references/index.md` - Complete reference index")
+        for cat_name in sorted(categories.keys()):
+            safe_name = self._sanitize_filename(cat_name)
+            count = len(categories[cat_name])
+            lines.append(f"- `references/{safe_name}.md` - {cat_name} ({count} endpoints)")
+        if schemas:
+            lines.append(f"- `references/schemas.md` - Data models ({len(schemas)} schemas)")
+        if security_schemes:
+            lines.append(
+                f"- `references/security.md` - Security schemes ({len(security_schemes)} schemes)"
+            )
+        lines.append("")
+
+        # Contact info
+        contact = info.get("contact", {})
+        license_info = info.get("license", {})
+        # Gate on every field rendered below, not only url/email/license
+        has_contact = any(contact.get(k) for k in ("name", "email", "url"))
+        if has_contact or license_info.get("name") or info.get("terms_of_service"):
+            lines.append("## API Info\n")
+            if contact.get("name"):
+                lines.append(f"- **Contact:** {contact['name']}")
+            if contact.get("email"):
+                lines.append(f"- **Email:** {contact['email']}")
+            if contact.get("url"):
+                lines.append(f"- **URL:** {contact['url']}")
+            if license_info.get("name"):
+                license_line = f"- **License:** {license_info['name']}"
+                if license_info.get("url"):
+                    license_line += f" ([link]({license_info['url']}))"
+                lines.append(license_line)
+            if info.get("terms_of_service"):
+                lines.append(f"- **Terms of Service:** {info['terms_of_service']}")
+            lines.append("")
+
+        # Footer
+        lines.append("---\n")
+        lines.append("**Generated by Skill Seekers** | OpenAPI/Swagger Specification Scraper\n")
+
+        with open(filepath, "w", encoding="utf-8") as f:
+            f.write("\n".join(lines))
+
+        logger.info("    Generated: %s (%d lines)", filepath, len(lines))
+
+    # ──────────────────────────────────────────────────────────────────────
+    # Markdown formatting helpers
+    # ──────────────────────────────────────────────────────────────────────
+
+    def _format_endpoint_md(self, endpoint: dict[str, Any]) -> str:
+        """Format a single endpoint as a markdown section.
+
+        Generates a comprehensive markdown block including method, path, summary,
+        description, parameters table, request body schema, and response schemas.
+
+        Args:
+            endpoint: Normalized endpoint dictionary.
+
+        Returns:
+            Markdown string for the endpoint.
+        """
+        lines: list[str] = []
+
+        method = endpoint.get("method", "GET")
+        path = endpoint.get("path", "/")
+        summary = endpoint.get("summary", "")
+        description = endpoint.get("description", "")
+        operation_id = endpoint.get("operation_id", "")
+        deprecated = endpoint.get("deprecated", False)
+
+        # Header
+        header = f"## `{method} {path}`"
+        if deprecated:
+            header += " *(DEPRECATED)*"
+        lines.append(header + "\n")
+
+        if summary:
+            lines.append(f"**{summary}**\n")
+
+        if description:
+            lines.append(f"{description}\n")
+
+        if operation_id:
+            lines.append(f"**Operation ID:** `{operation_id}`\n")
+
+        # Tags
+        tags = endpoint.get("tags", [])
+        if tags:
+            lines.append(f"**Tags:** {', '.join(f'`{t}`' for t in tags)}\n")
+
+        # Security requirements
+        security = endpoint.get("security", [])
+        if security:
+            scheme_names = [name for req in security for name in req]
+            if scheme_names:
+                lines.append(f"**Security:** {', '.join(f'`{s}`' for s in scheme_names)}\n")
+
+        # Parameters
+        params = endpoint.get("parameters", [])
+        # Exclude body params (handled in request body section)
+        non_body_params = [p for p in params if p.get("location") != "body"]
+
+        if non_body_params:
+            lines.append("### Parameters\n")
+            lines.append("| Name | Location | Type | Required | Description |")
+            lines.append("|------|----------|------|----------|-------------|")
+
+            for param in non_body_params:
+                name = param.get("name", "")
+                location = param.get("location", "query")
+                schema = param.get("schema", {})
+                param_type = self._schema_type_string(schema)
+                required = "Yes" if param.get("required") else "No"
+                desc = param.get("description", "").replace("\n", " ")
+                if len(desc) > 100:
+                    desc = desc[:97] + "..."
+                desc = desc.replace("|", "\\|")  # a raw pipe would end the table cell
+
+                deprecated_mark = " *(deprecated)*" if param.get("deprecated") else ""
+                lines.append(
+                    f"| `{name}`{deprecated_mark} | {location} "
+                    f"| `{param_type}` | {required} | {desc} |"
+                )
+            lines.append("")
+
+        # Request body
+        request_body = endpoint.get("request_body", {})
+        if request_body and request_body.get("content"):
+            lines.append("### Request Body\n")
+            if request_body.get("description"):
+                lines.append(f"{request_body['description']}\n")
+            required = "Required" if request_body.get("required") else "Optional"
+            lines.append(f"**{required}**\n")
+
+            for media_type, media_obj in request_body["content"].items():
+                lines.append(f"**Content-Type:** `{media_type}`\n")
+                schema = media_obj.get("schema", {})
+                if schema:
+                    lines.append(self._render_schema_block(schema, indent=0))
+                    lines.append("")
+
+        # Responses
+        responses = endpoint.get("responses", {})
+        if responses:
+            lines.append("### Responses\n")
+
+            # key=str keeps sorting safe if codes arrive as ints next to "default"
+            for status_code in sorted(responses, key=str):
+                resp = responses[status_code]
+                resp_desc = resp.get("description", "")
+                lines.append(f"**`{status_code}`** - {resp_desc}\n")
+
+                for media_type, media_obj in resp.get("content", {}).items():
+                    lines.append(f"Content-Type: `{media_type}`\n")
+                    schema = media_obj.get("schema", {})
+                    if schema:
+                        lines.append(self._render_schema_block(schema, indent=0))
+                        lines.append("")
+
+                # Response headers
+                headers = resp.get("headers", {})
+                if headers:
+                    lines.append("**Headers:**\n")
+                    for hdr_name, hdr_obj in headers.items():
+                        hdr_desc = hdr_obj.get("description", "")
+                        hdr_schema = hdr_obj.get("schema", {})
+                        hdr_type = self._schema_type_string(hdr_schema)
+                        lines.append(f"- `{hdr_name}` (`{hdr_type}`): {hdr_desc}")
+                    lines.append("")
+
+        return "\n".join(lines)
+
+    def _format_schema_md(self, schema_name: str, schema: dict[str, Any]) -> str:
+        """Format a component schema as a markdown section.
+
+        Renders the schema name, type, description, properties table, enum values,
+        and composition (allOf/oneOf/anyOf).
+
+        Args:
+            schema_name: Name of the schema.
+            schema: Flattened schema dictionary.
+
+        Returns:
+            Markdown string for the schema.
+        """
+        lines: list[str] = []
+
+        schema_type = schema.get("type", "object")
+        lines.append(f"## {schema_name}\n")
+        lines.append(f"**Type:** `{schema_type}`\n")
+
+        if schema.get("description"):
+            lines.append(f"{schema['description']}\n")
+
+        # Enum values
+        enum_values = schema.get("enum", [])
+        if enum_values:
+            lines.append("**Enum values:**\n")
+            lines.extend(f"- `{val}`" for val in enum_values)
+            lines.append("")
+
+        # Properties (for object types)
+        properties = schema.get("properties", {})
+        required_fields = schema.get("required", [])
+
+        if properties:
+            lines.append("### Properties\n")
+            lines.append("| Property | Type | Required | Description |")
+            lines.append("|----------|------|----------|-------------|")
+
+            for prop_name in sorted(properties.keys()):
+                prop = properties[prop_name]
+                prop_type = self._schema_type_string(prop)
+                is_required = "Yes" if prop_name in required_fields else "No"
+                prop_desc = prop.get("description", "").replace("\n", " ")
+                if len(prop_desc) > 100:
+                    prop_desc = prop_desc[:97] + "..."
+
+                # Add enum info inline
+                prop_enum = prop.get("enum", [])
+                if prop_enum:
+                    enum_str = ", ".join(f"`{v}`" for v in prop_enum[:5])
+                    if len(prop_enum) > 5:
+                        enum_str += f", +{len(prop_enum) - 5} more"
+                    prop_desc += f" Enum: [{enum_str}]"
+
+                prop_desc = prop_desc.replace("|", "\\|")  # a raw pipe would split the cell
+                lines.append(f"| `{prop_name}` | `{prop_type}` | {is_required} | {prop_desc} |")
+            lines.append("")
+
+        # Array items (dict-valued only; other shapes of "items" are skipped)
+        if schema_type == "array" and isinstance(schema.get("items"), dict):
+            items = schema["items"]
+            items_type = self._schema_type_string(items)
+            lines.append(f"**Items type:** `{items_type}`\n")
+            if items.get("properties"):
+                lines.append(self._render_schema_block(items, indent=0))
+                lines.append("")
+
+        # Composition types
+        for combinator in ("oneOf", "anyOf"):
+            variants = schema.get(combinator, [])
+            if variants:
+                lines.append(f"### {combinator}\n")
+                for i, variant in enumerate(variants, 1):
+                    variant_type = self._schema_type_string(variant)
+                    ref_name = variant.get("_ref_name", "")
+                    if ref_name:
+                        lines.append(f"{i}. `{ref_name}` ({variant_type})")
+                    else:
+                        lines.append(f"{i}. `{variant_type}`")
+                lines.append("")
+
+        # Additional properties
+        addl = schema.get("additionalProperties")
+        if isinstance(addl, dict) and addl:
+            addl_type = self._schema_type_string(addl)
+            lines.append(f"**Additional properties:** `{addl_type}`\n")
+
+        return "\n".join(lines)
+
+    def _render_schema_block(self, schema: dict[str, Any], indent: int = 0) -> str:
+        """Render a schema as a compact, indented property listing.
+
+        Used for inline schema rendering in endpoint request/response sections.
+        Property descriptions are flattened to one line so that a multi-line
+        description cannot break out of its markdown list item.
+
+        Args:
+            schema: Schema dictionary.
+            indent: Indentation level (two spaces per level).
+
+        Returns:
+            Formatted schema string, one entry per line.
+        """
+        lines: list[str] = []
+        prefix = "  " * indent
+
+        schema_type = schema.get("type", "object")
+        ref_name = schema.get("_ref_name", "")
+
+        if ref_name:
+            lines.append(f"{prefix}Schema: `{ref_name}` ({schema_type})")
+        else:
+            lines.append(f"{prefix}Schema: `{schema_type}`")
+
+        # Show properties for objects (sorted for deterministic output)
+        properties = schema.get("properties", {})
+        required_fields = schema.get("required", [])
+
+        for prop_name in sorted(properties or {}):
+            prop = properties[prop_name]
+            entry = f"{prefix}- `{prop_name}`: `{self._schema_type_string(prop)}`"
+            if prop_name in required_fields:
+                entry += " *(required)*"
+            # Flatten newlines before truncating so the entry stays on one line
+            prop_desc = (prop.get("description") or "").replace("\n", " ")
+            if len(prop_desc) > 60:
+                prop_desc = prop_desc[:57] + "..."
+            if prop_desc:
+                entry += f" - {prop_desc}"
+            lines.append(entry)
+
+        # Show enum values (first 8 only, with a count of the remainder)
+        enum_values = schema.get("enum", [])
+        if enum_values:
+            enum_str = ", ".join(f"`{v}`" for v in enum_values[:8])
+            if len(enum_values) > 8:
+                enum_str += f", +{len(enum_values) - 8} more"
+            lines.append(f"{prefix}Enum: [{enum_str}]")
+
+        # Show array items type
+        if schema_type == "array" and "items" in schema:
+            items_type = self._schema_type_string(schema["items"])
+            lines.append(f"{prefix}Items: `{items_type}`")
+
+        return "\n".join(lines)
+
+    def _schema_type_string(self, schema: dict[str, Any]) -> str:
+        """Generate a human-readable type string for a schema.
+
+        Handles primitive types, arrays, objects, refs, enums, formats, and
+        OpenAPI 3.1 type lists such as ``["string", "null"]``.
+
+        Args:
+            schema: Schema dictionary (anything else is reported as "any").
+
+        Returns:
+            Type string like "string", "integer(int64)", "array[Pet]", etc.
+        """
+        if not schema or not isinstance(schema, dict):
+            return "any"
+
+        ref_name = schema.get("_ref_name", "")
+        schema_type = schema.get("type", "")
+        schema_format = schema.get("format", "")
+
+        # OpenAPI 3.1 / JSON Schema allow "type" to be a list of type names
+        if isinstance(schema_type, list):
+            schema_type = " | ".join(str(t) for t in schema_type)
+
+        # Array type (checked before the ref name so the item type is shown)
+        if schema_type == "array":
+            items_type = self._schema_type_string(schema.get("items", {}))
+            return f"array[{items_type}]"
+
+        # Any other referenced schema is shown by its name
+        if ref_name:
+            return ref_name
+
+        # Primitive with format; a bare format is shown alone, not as "(fmt)"
+        if schema_format:
+            return f"{schema_type}({schema_format})" if schema_type else schema_format
+
+        # Enum
+        if schema.get("enum") and not schema_type:
+            return "enum"
+
+        # Composition types
+        for combinator in ("oneOf", "anyOf"):
+            variants = schema.get(combinator, [])
+            if variants:
+                type_strs = [self._schema_type_string(v) for v in variants[:3]]
+                result = " | ".join(type_strs)
+                if len(variants) > 3:
+                    result += " | ..."
+                return result
+
+        return schema_type or "object"
+
+    def _get_tag_description(self, tag_name: str) -> str:
+        """Return the description of the first spec tag named ``tag_name``.
+
+        Args:
+            tag_name: Tag name to search for.
+
+        Returns:
+            Tag description string, or empty string if not found.
+        """
+        all_tags = self.extracted_data.get("tags", [])
+        matching = (entry for entry in all_tags if entry.get("name") == tag_name)
+        found = next(matching, None)
+        return "" if found is None else found.get("description", "")
+
+    def _sanitize_filename(self, name: str) -> str:
+        """Turn an arbitrary label into a lowercase, filesystem-friendly name.
+
+        Characters other than word characters, whitespace, and hyphens are
+        dropped; each run of whitespace and/or hyphens becomes one underscore.
+
+        Args:
+            name: Input string.
+
+        Returns:
+            Sanitized filename string.
+        """
+        lowered = name.lower()
+        stripped = re.sub(r"[^\w\s-]", "", lowered)
+        return re.sub(r"[-\s]+", "_", stripped)
+
+
+# ──────────────────────────────────────────────────────────────────────────────
+# CLI entry point
+# ──────────────────────────────────────────────────────────────────────────────
+
+
+def main() -> int:
+    """CLI entry point for the OpenAPI scraper.
+
+    Supports three input modes:
+    1. Local spec file: --spec path/to/spec.yaml
+    2. Remote spec URL: --spec-url https://example.com/openapi.json
+    3. Pre-extracted JSON: --from-json extracted.json
+
+    Standard arguments (--name, --description, --verbose, --quiet, --dry-run)
+    come from the shared argument system. Returns 0 on success; failures in
+    the --spec/--spec-url path call sys.exit(1) instead of returning.
+    """
+    _check_yaml_deps()
+
+    parser = argparse.ArgumentParser(
+        description="Convert OpenAPI/Swagger specifications to AI-ready skills",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  %(prog)s --spec petstore.yaml --name petstore-api
+  %(prog)s --spec-url https://petstore3.swagger.io/api/v3/openapi.json --name petstore
+  %(prog)s --from-json petstore_extracted.json
+        """,
+    )
+
+    # Standard shared arguments
+    from .arguments.common import add_all_standard_arguments
+
+    add_all_standard_arguments(parser)
+
+    # Override enhance-level default to 0 for OpenAPI
+    for action in parser._actions:
+        if hasattr(action, "dest") and action.dest == "enhance_level":
+            action.default = 0
+            action.help = (
+                "AI enhancement level (auto-detects API vs LOCAL mode): "
+                "0=disabled (default for OpenAPI), 1=SKILL.md only, "
+                "2=+architecture/config, 3=full enhancement. "
+                "Mode selection: uses API if ANTHROPIC_API_KEY is set, "
+                "otherwise LOCAL (Claude Code)"
+            )
+
+    # OpenAPI-specific arguments
+    parser.add_argument(
+        "--spec",
+        type=str,
+        help="Local path to OpenAPI/Swagger spec file (YAML or JSON)",
+        metavar="PATH",
+    )
+    parser.add_argument(
+        "--spec-url",
+        type=str,
+        help="Remote URL to fetch OpenAPI/Swagger spec from",
+        metavar="URL",
+    )
+    parser.add_argument(
+        "--from-json",
+        type=str,
+        help="Build skill from previously extracted JSON data",
+        metavar="FILE",
+    )
+
+    args = parser.parse_args()
+
+    # Setup logging
+    if getattr(args, "quiet", False):
+        logging.basicConfig(level=logging.WARNING, format="%(message)s")
+    elif getattr(args, "verbose", False):
+        logging.basicConfig(level=logging.DEBUG, format="%(levelname)s: %(message)s")
+    else:
+        logging.basicConfig(level=logging.INFO, format="%(message)s")
+
+    # Handle --dry-run (reports what would be processed, then stops)
+    if getattr(args, "dry_run", False):
+        source = args.spec or args.spec_url or args.from_json or "(none)"
+        print(f"\n{'=' * 60}")
+        print("DRY RUN: OpenAPI Specification Extraction")
+        print(f"{'=' * 60}")
+        print(f"Source:         {source}")
+        print(f"Name:           {getattr(args, 'name', None) or '(auto-detect)'}")
+        print(f"Enhance level:  {getattr(args, 'enhance_level', 0)}")
+        print("\n  Dry run complete")
+        return 0
+
+    # Validate inputs
+    if not (args.spec or args.spec_url or args.from_json):
+        parser.error("Must specify --spec (file path), --spec-url (URL), or --from-json")
+
+    # Build from pre-extracted JSON
+    if args.from_json:
+        name = args.name or Path(args.from_json).stem.replace("_extracted", "")
+        config: dict[str, Any] = {
+            "name": name,
+            "description": (args.description or f"Use when working with the {name} API"),
+        }
+        converter = OpenAPIToSkillConverter(config)
+        converter.load_extracted_data(args.from_json)
+        converter.build_skill()
+        return 0
+
+    # Determine name
+    if not args.name:
+        if args.spec:
+            name = Path(args.spec).stem or "api"
+        elif args.spec_url:
+            # Derive name from URL path; "api" when the path has no file stem
+            from urllib.parse import urlparse
+
+            url_path = urlparse(args.spec_url).path
+            name = Path(url_path).stem or "api"
+        else:
+            name = "api"
+    else:
+        name = args.name
+
+    # Build config
+    config = {
+        "name": name,
+        "spec_path": args.spec or "",
+        "spec_url": args.spec_url or "",
+    }
+    if args.description:
+        config["description"] = args.description
+
+    # Create converter and run
+    try:
+        converter = OpenAPIToSkillConverter(config)
+
+        if not converter.extract_spec():
+            print("\n  OpenAPI extraction failed", file=sys.stderr)
+            sys.exit(1)
+
+        converter.build_skill()
+
+        # Enhancement workflow integration
+        if getattr(args, "enhance_level", 0) > 0:
+            api_key = getattr(args, "api_key", None) or os.environ.get("ANTHROPIC_API_KEY")
+            mode = "API" if api_key else "LOCAL"
+
+            print(f"\n{'=' * 80}")
+            print(f"  AI Enhancement ({mode} mode, level {args.enhance_level})")
+            print("=" * 80)
+
+            skill_dir = converter.skill_dir
+            # LOCAL mode runs when no API key is set, or as a fallback when the
+            # API path raises ImportError.
+            use_local = not api_key
+            if api_key:
+                try:
+                    from skill_seekers.cli.enhance_skill import enhance_skill_md
+
+                    enhance_skill_md(skill_dir, api_key)
+                    print("  API enhancement complete!")
+                except ImportError:
+                    print("  API enhancement not available. Falling back to LOCAL mode...")
+                    use_local = True
+            if use_local:
+                from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer
+
+                enhancer = LocalSkillEnhancer(Path(skill_dir))
+                enhancer.run(headless=True)
+                print("  Local enhancement complete!")
+
+    except (ValueError, RuntimeError) as e:
+        print(f"\n  Error: {e}", file=sys.stderr)
+        sys.exit(1)
+    except Exception as e:
+        print(f"\n  Unexpected error during OpenAPI processing: {e}", file=sys.stderr)
+        import traceback
+
+        traceback.print_exc()
+        sys.exit(1)
+
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/src/skill_seekers/cli/parsers/__init__.py b/src/skill_seekers/cli/parsers/__init__.py
index 06bf6b4..ddf04ee 100644
--- a/src/skill_seekers/cli/parsers/__init__.py
+++ b/src/skill_seekers/cli/parsers/__init__.py
@@ -33,6 +33,18 @@ from .quality_parser import QualityParser
 from .workflows_parser import WorkflowsParser
 from .sync_config_parser import SyncConfigParser
 
+# New source type parsers (v3.2.0+)
+from .jupyter_parser import JupyterParser
+from .html_parser import HtmlParser
+from .openapi_parser import OpenAPIParser
+from .asciidoc_parser import AsciiDocParser
+from .pptx_parser import PptxParser
+from .rss_parser import RssParser
+from .manpage_parser import ManPageParser
+from .confluence_parser import ConfluenceParser
+from .notion_parser import NotionParser
+from .chat_parser import ChatParser
+
 # Registry of all parsers (in order of usage frequency)
 PARSERS = [
     CreateParser(),  # NEW: Unified create command (placed first for prominence)
@@ -60,6 +72,17 @@ PARSERS = [
     QualityParser(),
     WorkflowsParser(),
     SyncConfigParser(),
+    # New source types (v3.2.0+)
+    JupyterParser(),
+    HtmlParser(),
+    OpenAPIParser(),
+    AsciiDocParser(),
+    PptxParser(),
+    RssParser(),
+    ManPageParser(),
+    ConfluenceParser(),
+    NotionParser(),
+    ChatParser(),
 ]
 
 
diff --git a/src/skill_seekers/cli/parsers/asciidoc_parser.py b/src/skill_seekers/cli/parsers/asciidoc_parser.py
new file mode 100644
index 0000000..6a42372
--- /dev/null
+++ b/src/skill_seekers/cli/parsers/asciidoc_parser.py
@@ -0,0 +1,32 @@
+"""AsciiDoc subcommand parser.
+
+Uses shared argument definitions from arguments.asciidoc to ensure
+consistency with the standalone asciidoc_scraper module.
+"""
+
+from .base import SubcommandParser
+from skill_seekers.cli.arguments.asciidoc import add_asciidoc_arguments
+
+
+class AsciiDocParser(SubcommandParser):
+    """Subcommand parser for ``asciidoc``."""
+
+    @property
+    def name(self) -> str:
+        """Name of the subcommand this parser handles."""
+        return "asciidoc"
+
+    @property
+    def help(self) -> str:
+        """Short help text for the subcommand."""
+        return "Extract from AsciiDoc documents (.adoc)"
+
+    @property
+    def description(self) -> str:
+        """Longer description of the subcommand."""
+        return "Extract content from AsciiDoc documents (.adoc) and generate skill"
+
+    def add_arguments(self, parser):
+        """Attach the shared asciidoc argument definitions to ``parser``."""
+        # Same definitions as asciidoc_scraper.py (standalone scraper), for consistency.
+        add_asciidoc_arguments(parser)
diff --git a/src/skill_seekers/cli/parsers/chat_parser.py b/src/skill_seekers/cli/parsers/chat_parser.py
new file mode 100644
index 0000000..dd8441b
--- /dev/null
+++ b/src/skill_seekers/cli/parsers/chat_parser.py
@@ -0,0 +1,32 @@
+"""Chat subcommand parser.
+
+Uses shared argument definitions from arguments.chat to ensure
+consistency with the standalone chat_scraper module.
+"""
+
+from .base import SubcommandParser
+from skill_seekers.cli.arguments.chat import add_chat_arguments
+
+
+class ChatParser(SubcommandParser):
+    """Subcommand parser for ``chat``."""
+
+    @property
+    def name(self) -> str:
+        """Name of the subcommand this parser handles."""
+        return "chat"
+
+    @property
+    def help(self) -> str:
+        """Short help text for the subcommand."""
+        return "Extract from Slack/Discord chat exports"
+
+    @property
+    def description(self) -> str:
+        """Longer description of the subcommand."""
+        return "Extract content from Slack/Discord chat exports and generate skill"
+
+    def add_arguments(self, parser):
+        """Attach the shared chat argument definitions to ``parser``."""
+        # Same definitions as chat_scraper.py (standalone scraper), for consistency.
+        add_chat_arguments(parser)
diff --git a/src/skill_seekers/cli/parsers/confluence_parser.py b/src/skill_seekers/cli/parsers/confluence_parser.py
new file mode 100644
index 0000000..b01f456
--- /dev/null
+++ b/src/skill_seekers/cli/parsers/confluence_parser.py
@@ -0,0 +1,32 @@
+"""Confluence subcommand parser.
+
+Uses shared argument definitions from arguments.confluence to ensure
+consistency with the standalone confluence_scraper module.
+"""
+
+from .base import SubcommandParser
+from skill_seekers.cli.arguments.confluence import add_confluence_arguments
+
+
+class ConfluenceParser(SubcommandParser):
+    """Subcommand parser for ``confluence``."""
+
+    @property
+    def name(self) -> str:
+        """Name of the subcommand this parser handles."""
+        return "confluence"
+
+    @property
+    def help(self) -> str:
+        """Short help text for the subcommand."""
+        return "Extract from Confluence wiki"
+
+    @property
+    def description(self) -> str:
+        """Longer description of the subcommand."""
+        return "Extract content from Confluence wiki and generate skill"
+
+    def add_arguments(self, parser):
+        """Attach the shared confluence argument definitions to ``parser``."""
+        # Same definitions as confluence_scraper.py (standalone scraper), for consistency.
+        add_confluence_arguments(parser)
diff --git a/src/skill_seekers/cli/parsers/html_parser.py b/src/skill_seekers/cli/parsers/html_parser.py
new file mode 100644
index 0000000..d6f01e9
--- /dev/null
+++ b/src/skill_seekers/cli/parsers/html_parser.py
@@ -0,0 +1,32 @@
+"""HTML subcommand parser.
+
+Uses shared argument definitions from arguments.html to ensure
+consistency with the standalone html_scraper module.
+"""
+
+from .base import SubcommandParser
+from skill_seekers.cli.arguments.html import add_html_arguments
+
+
+class HtmlParser(SubcommandParser):
+    """Subcommand parser for ``html``."""
+
+    @property
+    def name(self) -> str:
+        """Name of the subcommand this parser handles."""
+        return "html"
+
+    @property
+    def help(self) -> str:
+        """Short help text for the subcommand."""
+        return "Extract from local HTML files (.html/.htm)"
+
+    @property
+    def description(self) -> str:
+        """Longer description of the subcommand."""
+        return "Extract content from local HTML files (.html/.htm) and generate skill"
+
+    def add_arguments(self, parser):
+        """Attach the shared html argument definitions to ``parser``."""
+        # Same definitions as html_scraper.py (standalone scraper), for consistency.
+        add_html_arguments(parser)
diff --git a/src/skill_seekers/cli/parsers/jupyter_parser.py b/src/skill_seekers/cli/parsers/jupyter_parser.py
new file mode 100644
index 0000000..2cb7841
--- /dev/null
+++ b/src/skill_seekers/cli/parsers/jupyter_parser.py
@@ -0,0 +1,32 @@
+"""Jupyter Notebook subcommand parser.
+
+Uses shared argument definitions from arguments.jupyter to ensure
+consistency with the standalone jupyter_scraper module.
+"""
+
+from .base import SubcommandParser
+from skill_seekers.cli.arguments.jupyter import add_jupyter_arguments
+
+
+class JupyterParser(SubcommandParser):
+    """Subcommand parser for ``jupyter``."""
+
+    @property
+    def name(self) -> str:
+        """Name of the subcommand this parser handles."""
+        return "jupyter"
+
+    @property
+    def help(self) -> str:
+        """Short help text for the subcommand."""
+        return "Extract from Jupyter Notebook (.ipynb)"
+
+    @property
+    def description(self) -> str:
+        """Longer description of the subcommand."""
+        return "Extract content from Jupyter Notebook (.ipynb) and generate skill"
+
+    def add_arguments(self, parser):
+        """Attach the shared jupyter argument definitions to ``parser``."""
+        # Same definitions as jupyter_scraper.py (standalone scraper), for consistency.
+        add_jupyter_arguments(parser)
diff --git a/src/skill_seekers/cli/parsers/manpage_parser.py b/src/skill_seekers/cli/parsers/manpage_parser.py
new file mode 100644
index 0000000..b5bd75c
--- /dev/null
+++ b/src/skill_seekers/cli/parsers/manpage_parser.py
@@ -0,0 +1,32 @@
+"""Man page subcommand parser.
+
+Uses shared argument definitions from arguments.manpage to ensure
+consistency with the standalone man_scraper module.
+"""
+
+from .base import SubcommandParser
+from skill_seekers.cli.arguments.manpage import add_manpage_arguments
+
+
+class ManPageParser(SubcommandParser):
+    """Subcommand parser for ``manpage``."""
+
+    @property
+    def name(self) -> str:
+        """Name of the subcommand this parser handles."""
+        return "manpage"
+
+    @property
+    def help(self) -> str:
+        """Short help text for the subcommand."""
+        return "Extract from man pages"
+
+    @property
+    def description(self) -> str:
+        """Longer description of the subcommand."""
+        return "Extract content from man pages and generate skill"
+
+    def add_arguments(self, parser):
+        """Attach the shared manpage argument definitions to ``parser``."""
+        # Same definitions as man_scraper.py (standalone scraper), for consistency.
+        add_manpage_arguments(parser)
diff --git a/src/skill_seekers/cli/parsers/notion_parser.py b/src/skill_seekers/cli/parsers/notion_parser.py
new file mode 100644
index 0000000..f495d5d
--- /dev/null
+++ b/src/skill_seekers/cli/parsers/notion_parser.py
@@ -0,0 +1,32 @@
+"""Notion subcommand parser.
+
+Uses shared argument definitions from arguments.notion to ensure
+consistency with the standalone notion_scraper module.
+"""
+
+from .base import SubcommandParser
+from skill_seekers.cli.arguments.notion import add_notion_arguments
+
+
+class NotionParser(SubcommandParser):
+    """Subcommand parser for ``notion``."""
+
+    @property
+    def name(self) -> str:
+        """Name of the subcommand this parser handles."""
+        return "notion"
+
+    @property
+    def help(self) -> str:
+        """Short help text for the subcommand."""
+        return "Extract from Notion pages"
+
+    @property
+    def description(self) -> str:
+        """Longer description of the subcommand."""
+        return "Extract content from Notion pages and generate skill"
+
+    def add_arguments(self, parser):
+        """Attach the shared notion argument definitions to ``parser``."""
+        # Same definitions as notion_scraper.py (standalone scraper), for consistency.
+        add_notion_arguments(parser)
diff --git a/src/skill_seekers/cli/parsers/openapi_parser.py b/src/skill_seekers/cli/parsers/openapi_parser.py
new file mode 100644
index 0000000..3c0e27b
--- /dev/null
+++ b/src/skill_seekers/cli/parsers/openapi_parser.py
@@ -0,0 +1,32 @@
+"""OpenAPI subcommand parser.
+
+Uses shared argument definitions from arguments.openapi to ensure
+consistency with the standalone openapi_scraper module.
+"""
+
+from .base import SubcommandParser
+from skill_seekers.cli.arguments.openapi import add_openapi_arguments
+
+
+class OpenAPIParser(SubcommandParser):
+    """Subcommand parser for ``openapi``."""
+
+    @property
+    def name(self) -> str:
+        """Name of the subcommand this parser handles."""
+        return "openapi"
+
+    @property
+    def help(self) -> str:
+        """Short help text for the subcommand."""
+        return "Extract from OpenAPI/Swagger spec"
+
+    @property
+    def description(self) -> str:
+        """Longer description of the subcommand."""
+        return "Extract content from OpenAPI/Swagger spec and generate skill"
+
+    def add_arguments(self, parser):
+        """Attach the shared openapi argument definitions to ``parser``."""
+        # Same definitions as openapi_scraper.py (standalone scraper), for consistency.
+        add_openapi_arguments(parser)
diff --git a/src/skill_seekers/cli/parsers/pptx_parser.py b/src/skill_seekers/cli/parsers/pptx_parser.py
new file mode 100644
index 0000000..fbd1bdc
--- /dev/null
+++ b/src/skill_seekers/cli/parsers/pptx_parser.py
@@ -0,0 +1,32 @@
+"""PPTX subcommand parser.
+
+Uses shared argument definitions from arguments.pptx to ensure
+consistency with the standalone pptx_scraper module.
+"""
+
+from .base import SubcommandParser
+from skill_seekers.cli.arguments.pptx import add_pptx_arguments
+
+
+class PptxParser(SubcommandParser):
+    """Subcommand parser for ``pptx``."""
+
+    @property
+    def name(self) -> str:
+        """Name of the subcommand this parser handles."""
+        return "pptx"
+
+    @property
+    def help(self) -> str:
+        """Short help text for the subcommand."""
+        return "Extract from PowerPoint presentations (.pptx)"
+
+    @property
+    def description(self) -> str:
+        """Longer description of the subcommand."""
+        return "Extract content from PowerPoint presentations (.pptx) and generate skill"
+
+    def add_arguments(self, parser):
+        """Attach the shared pptx argument definitions to ``parser``."""
+        # Same definitions as pptx_scraper.py (standalone scraper), for consistency.
+        add_pptx_arguments(parser)
diff --git a/src/skill_seekers/cli/parsers/rss_parser.py b/src/skill_seekers/cli/parsers/rss_parser.py
new file mode 100644
index 0000000..2d3e384
--- /dev/null
+++ b/src/skill_seekers/cli/parsers/rss_parser.py
@@ -0,0 +1,32 @@
+"""RSS subcommand parser.
+
+Uses shared argument definitions from arguments.rss to ensure
+consistency with the standalone rss_scraper module.
+"""
+
+from .base import SubcommandParser
+from skill_seekers.cli.arguments.rss import add_rss_arguments
+
+
+class RssParser(SubcommandParser):
+    """Subcommand parser for ``rss``."""
+
+    @property
+    def name(self) -> str:
+        """Name of the subcommand this parser handles."""
+        return "rss"
+
+    @property
+    def help(self) -> str:
+        """Short help text for the subcommand."""
+        return "Extract from RSS/Atom feeds"
+
+    @property
+    def description(self) -> str:
+        """Longer description of the subcommand."""
+        return "Extract content from RSS/Atom feeds and generate skill"
+
+    def add_arguments(self, parser):
+        """Attach the shared rss argument definitions to ``parser``."""
+        # Same definitions as rss_scraper.py (standalone scraper), for consistency.
+        add_rss_arguments(parser)
diff --git a/src/skill_seekers/cli/pptx_scraper.py b/src/skill_seekers/cli/pptx_scraper.py
new file mode 100644
index 0000000..725299e
--- /dev/null
+++ b/src/skill_seekers/cli/pptx_scraper.py
@@ -0,0 +1,1821 @@
+#!/usr/bin/env python3
+"""
+PowerPoint (.pptx) Presentation to Skill Converter
+
+Converts PowerPoint presentations into AI-ready skills.
+Uses python-pptx to extract slide content including text, tables, speaker notes,
+images, and code blocks. Supports single files and directories of .pptx files.
+
+Slides are grouped into sections based on layout type (section/title layouts act
+as section breaks). Each section becomes a reference file in the output skill.
+
+Usage:
+    skill-seekers pptx --pptx presentation.pptx --name myskill
+    skill-seekers pptx --pptx ./slides_dir/ --name myskill
+    skill-seekers pptx --from-json presentation_extracted.json
+"""
+
+import argparse
+import json
+import logging
+import os
+import re
+import sys
+from pathlib import Path
+
+# Optional dependency guard
+try:
+    from pptx import Presentation
+    from pptx.enum.text import PP_ALIGN  # noqa: F401
+    from pptx.util import Emu  # noqa: F401
+
+    PPTX_AVAILABLE = True
+except ImportError:
+    PPTX_AVAILABLE = False
+
+logger = logging.getLogger(__name__)
+
+# ---------------------------------------------------------------------------
+# Monospace / code font families used for code-block detection (all lowercase)
+# ---------------------------------------------------------------------------
+MONOSPACE_FONTS = frozenset(
+    {
+        "courier",
+        "courier new",
+        "consolas",
+        "menlo",
+        "monaco",
+        "lucida console",
+        "lucida sans typewriter",
+        "dejavu sans mono",
+        "liberation mono",
+        "source code pro",
+        "fira code",
+        "fira mono",
+        "jetbrains mono",
+        "roboto mono",
+        "ubuntu mono",
+        "inconsolata",
+        "hack",
+        "cascadia code",
+        "cascadia mono",
+        "sf mono",
+        "andale mono",
+        "ibm plex mono",
+        "droid sans mono",
+        "noto mono",
+        "pt mono",
+        "overpass mono",
+    }
+)
+
+# Lowercase layout names that typically signal a section/title divider slide
+SECTION_LAYOUT_NAMES = frozenset(
+    {
+        "section header",
+        "section",
+        "title slide",
+        "title only",
+        "title and content",
+        "blank",
+    }
+)
+
+# Strong section-break indicators; every entry is also in SECTION_LAYOUT_NAMES
+TITLE_ONLY_LAYOUTS = frozenset(
+    {
+        "section header",
+        "section",
+        "title slide",
+        "title only",
+    }
+)
+
+
+def _check_pptx_deps() -> None:
+    """Fail fast with install instructions when python-pptx is missing."""
+    if PPTX_AVAILABLE:
+        return
+    required = "python-pptx is required for PowerPoint support.\n"
+    via_extra = 'Install with: pip install "skill-seekers[pptx]"\n'
+    via_pip = "Or: pip install python-pptx"
+    raise RuntimeError(required + via_extra + via_pip)
+
+
+def infer_description_from_pptx(
+    metadata: dict | None = None,
+    name: str = "",
+) -> str:
+    """Build a "Use when..." description from presentation metadata.
+
+    Candidates are tried in order, and the first usable one wins:
+    1. The subject field, if longer than 20 characters (cut to 150)
+    2. The title field, if longer than 10 characters and not a .pptx filename
+    3. A generic template built from the skill name
+
+    The chosen subject or title is lowercased when embedded in the result.
+
+    Args:
+        metadata: Presentation metadata dict with title, subject, author, etc.
+        name: Skill name for fallback
+
+    Returns:
+        Description string suitable for "Use when..." format
+    """
+    info = metadata or {}
+
+    # Subject first: it often already reads like a description
+    subject = str(info["subject"]).strip() if info.get("subject") else ""
+    if len(subject) > 20:
+        if len(subject) > 150:
+            subject = subject[:147] + "..."
+        return f"Use when {subject.lower()}"
+
+    # Then the title, unless it is short or just the file name
+    title = str(info["title"]).strip() if info.get("title") else ""
+    if len(title) > 10 and not title.lower().endswith(".pptx"):
+        return f"Use when working with {title.lower()}"
+
+    # Nothing usable in the metadata: fall back to a name-based template
+    if name:
+        return f"Use when referencing {name} presentation"
+    return "Use when referencing this presentation"
+
+
+# ---------------------------------------------------------------------------
+# Main converter class
+# ---------------------------------------------------------------------------
+
+
+class PptxToSkillConverter:
+    """Convert PowerPoint presentation (.pptx) to an AI-ready skill.
+
+    Follows the same pipeline pattern as the Word, EPUB, and PDF scrapers:
+    extract -> categorize -> build_skill (reference files + index + SKILL.md).
+
+    The extraction phase uses python-pptx to read slides, extracting:
+    - Slide titles, body text, and speaker notes
+    - Tables (converted to markdown)
+    - Image counts and descriptions
+    - Code blocks (detected via monospace font usage)
+    - Presentation-level metadata (title, author, subject, etc.)
+    - Slide layout information for section grouping
+
+    Supports both single .pptx files and directories containing multiple
+    .pptx files (merged into a single skill).
+    """
+
+    def __init__(self, config: dict) -> None:
+        """Set up converter state from a configuration dictionary.
+
+        Args:
+            config: Configuration dict with keys:
+                - name (str): Skill name (required)
+                - pptx_path (str): Path to .pptx file or directory (optional)
+                - description (str): Skill description (optional; a template
+                  built from the name is used here when absent or empty)
+                - categories (dict): Manual category assignments (optional)
+        """
+        self.config = config
+        skill_name: str = config["name"]
+        fallback_description = f"Use when referencing {skill_name} presentation"
+
+        self.name: str = skill_name
+        self.pptx_path: str = config.get("pptx_path", "")
+        self.description: str = config.get("description") or fallback_description
+        self.categories: dict = config.get("categories", {})
+
+        # Output locations, both derived from the skill name
+        self.skill_dir: str = f"output/{skill_name}"
+        self.data_file: str = f"output/{skill_name}_extracted.json"
+
+        # Extracted data (populated by extract_pptx or load_extracted_data)
+        self.extracted_data: dict | None = None
+
+    # ------------------------------------------------------------------
+    # Extraction
+    # ------------------------------------------------------------------
+
+    def extract_pptx(self) -> bool:
+        """Extract content from PowerPoint file(s) using python-pptx.
+
+        Handles both single .pptx files and directories containing multiple
+        .pptx files. For directories, files are processed in sorted order
+        (Office "~$" lock files are skipped) and slides are concatenated.
+
+        Workflow:
+        1. Check dependencies (python-pptx)
+        2. Resolve input path (single file vs. directory)
+        3. For each .pptx file:
+           a. Open with python-pptx Presentation class
+           b. Extract presentation-level metadata
+           c. Iterate slides, extracting text, notes, tables, images, code
+        4. Detect section breaks from slide layouts
+        5. Group slides into sections
+        6. Detect code languages via LanguageDetector
+        7. Save intermediate JSON to {name}_extracted.json
+
+        Returns:
+            True on successful extraction.
+
+        Raises:
+            FileNotFoundError: If the pptx_path does not exist.
+            ValueError: If a directory has no .pptx files or a file is not .pptx.
+            RuntimeError: If a presentation file cannot be opened.
+        """
+        _check_pptx_deps()
+
+        print(f"\n🔍 Extracting from PowerPoint: {self.pptx_path}")
+
+        pptx_path = Path(self.pptx_path)
+        if not pptx_path.exists():
+            raise FileNotFoundError(f"PowerPoint path not found: {self.pptx_path}")
+
+        # Collect .pptx file(s); skip "~$" Office lock files left by open decks
+        pptx_files: list[Path] = []
+        if pptx_path.is_dir():
+            pptx_files = sorted(p for p in pptx_path.glob("*.pptx") if not p.name.startswith("~$"))
+            if not pptx_files:
+                raise ValueError(f"No .pptx files found in directory: {self.pptx_path}")
+            print(f"   Found {len(pptx_files)} .pptx file(s) in directory")
+        else:
+            if not str(pptx_path).lower().endswith(".pptx"):
+                raise ValueError(f"Not a PowerPoint file (expected .pptx): {self.pptx_path}")
+            pptx_files = [pptx_path]
+
+        # Accumulate slides across all files
+        all_slides: list[dict] = []
+        merged_metadata: dict = {}
+        total_image_count = 0
+        slide_offset = 0
+
+        for file_path in pptx_files:
+            print(f"   Processing: {file_path.name}")
+            try:
+                prs = Presentation(str(file_path))
+            except Exception as e:
+                raise RuntimeError(f"Failed to open PowerPoint file: {file_path}\n{e}") from e
+
+            # Extract metadata from first (or only) file
+            if not merged_metadata:
+                merged_metadata = self._extract_presentation_metadata(prs)
+                if merged_metadata.get("title"):
+                    print(f"   Title: {merged_metadata['title']}")
+                if merged_metadata.get("author"):
+                    print(f"   Author: {merged_metadata['author']}")
+
+            # Extract each slide
+            for slide_idx, slide in enumerate(prs.slides):
+                slide_number = slide_offset + slide_idx + 1
+                slide_data = self._extract_slide(slide, slide_number)
+
+                # Track source file for multi-file scenarios
+                if len(pptx_files) > 1:
+                    slide_data["source_file"] = file_path.name
+
+                all_slides.append(slide_data)
+                total_image_count += slide_data.get("image_count", 0)
+
+            slide_offset += len(prs.slides)
+
+        print(f"   Total slides extracted: {len(all_slides)}")
+
+        # Update description from metadata if not explicitly set
+        if not self.config.get("description"):
+            self.description = infer_description_from_pptx(merged_metadata, self.name)
+
+        # Group slides into sections based on layout and section breaks
+        sections = self._group_slides_into_sections(all_slides)
+
+        # Detect code languages using LanguageDetector
+        languages_detected, total_code_blocks = self._detect_languages(sections)
+
+        # Count total tables
+        total_tables = sum(
+            len(slide.get("tables", []))
+            for section in sections
+            for slide in section.get("slides", [])
+        )
+
+        result_data = {
+            "source_file": self.pptx_path,
+            "metadata": merged_metadata,
+            "total_slides": len(all_slides),
+            "total_sections": len(sections),
+            "total_code_blocks": total_code_blocks,
+            "total_images": total_image_count,
+            "total_tables": total_tables,
+            "languages_detected": languages_detected,
+            "pages": sections,  # "pages" key for pipeline compatibility
+        }
+
+        # Save extracted data
+        os.makedirs(os.path.dirname(self.data_file) or ".", exist_ok=True)
+        with open(self.data_file, "w", encoding="utf-8") as f:
+            json.dump(result_data, f, indent=2, ensure_ascii=False, default=str)
+
+        print(f"\n💾 Saved extracted data to: {self.data_file}")
+        self.extracted_data = result_data
+        print(
+            f"✅ Extracted {len(sections)} sections ({len(all_slides)} slides), "
+            f"{total_code_blocks} code blocks, "
+            f"{total_image_count} images, "
+            f"{total_tables} tables"
+        )
+        return True
+
+    def _extract_presentation_metadata(self, prs) -> dict:
+        """Collect presentation-level metadata from the core properties.
+
+        Text properties that are unset come back as empty strings, the
+        created/modified timestamps are stringified, and ``revision`` is
+        None when falsy. Slide count and slide dimensions are included too.
+
+        Args:
+            prs: python-pptx Presentation object
+
+        Returns:
+            Dictionary of metadata fields.
+        """
+        core = prs.core_properties
+        created, modified, revision = core.created, core.modified, core.revision
+
+        # Plain text properties fall back to "" when unset
+        metadata: dict = {
+            field: getattr(core, field) or ""
+            for field in ("title", "author", "subject", "category", "comments", "keywords")
+        }
+        metadata["created"] = str(created) if created else ""
+        metadata["modified"] = str(modified) if modified else ""
+        metadata["last_modified_by"] = core.last_modified_by or ""
+        metadata["revision"] = revision if revision else None
+        metadata["slide_count"] = len(prs.slides)
+        metadata["slide_width"] = prs.slide_width
+        metadata["slide_height"] = prs.slide_height
+        return metadata
+
+    def _extract_slide(self, slide, slide_number: int) -> dict:
+        """Extract all content from a single slide.
+
+        Processes the slide's shapes to extract:
+        - Title text (from the title placeholder)
+        - Body text (from all text frames, excluding title)
+        - Speaker notes
+        - Tables (as structured data)
+        - Image count and descriptions
+        - Code blocks (detected by monospace font usage)
+        - Layout name and type information
+
+        Args:
+            slide: python-pptx Slide object
+            slide_number: 1-based slide number in the presentation
+
+        Returns:
+            Dictionary with all extracted slide data.
+        """
+        layout_name = ""
+        if slide.slide_layout:
+            layout_name = slide.slide_layout.name or ""
+
+        # Determine if this is a section/title slide
+        is_section_slide = layout_name.lower() in TITLE_ONLY_LAYOUTS
+
+        # Extract title
+        title = ""
+        if slide.shapes.title:
+            title = slide.shapes.title.text.strip()
+
+        # Extract body text from all text frames (excluding title placeholder)
+        body_parts: list[str] = []
+        code_blocks: list[dict] = []
+        image_count = 0
+        tables: list[dict] = []
+
+        for shape in slide.shapes:
+            # Skip the title placeholder (already extracted)
+            if shape.has_text_frame and shape == slide.shapes.title:
+                continue
+
+            # Process grouped shapes recursively (duck-typed, as in the group walker)
+            if hasattr(shape, "shapes"):
+                group_text, group_codes, group_images = self._extract_group_shapes(shape)
+                body_parts.extend(group_text)
+                code_blocks.extend(group_codes)
+                image_count += group_images
+                continue
+
+            # Tables
+            if shape.has_table:
+                table_data = self._extract_tables(shape.table)
+                if table_data:
+                    tables.append(table_data)
+                continue
+
+            # Images
+            if self._is_image_shape(shape):
+                image_count += 1
+                continue
+
+            # Text frames: a frame may mix prose and code, so keep both parts
+            if shape.has_text_frame:
+                frame_text, frame_codes = self._process_text_frame(shape.text_frame)
+                if frame_codes:
+                    code_blocks.extend(frame_codes)
+                if frame_text:
+                    body_parts.append(frame_text)
+
+        # Extract speaker notes
+        speaker_notes = self._extract_speaker_notes(slide)
+
+        # Extract image info summary
+        image_info = self._extract_images_info(slide)
+
+        return {
+            "slide_number": slide_number,
+            "layout_name": layout_name,
+            "is_section_slide": is_section_slide,
+            "title": title,
+            "body_text": "\n\n".join(body_parts),
+            "speaker_notes": speaker_notes,
+            "tables": tables,
+            "code_blocks": code_blocks,
+            "image_count": image_count,
+            "image_info": image_info,
+        }
+
+    def _extract_group_shapes(self, group_shape) -> tuple[list[str], list[dict], int]:
+        """Recursively extract content from grouped shapes.
+
+        PowerPoint allows shapes to be grouped together. This method walks
+        the group hierarchy and extracts text, code blocks, and image counts
+        from all nested shapes.
+
+        Args:
+            group_shape: python-pptx GroupShape object
+
+        Returns:
+            Tuple of (text_parts, code_blocks, image_count)
+        """
+        text_parts: list[str] = []
+        code_blocks: list[dict] = []
+        image_count = 0
+
+        for shape in group_shape.shapes:
+            # Nested groups
+            if hasattr(shape, "shapes"):
+                sub_text, sub_codes, sub_images = self._extract_group_shapes(shape)
+                text_parts.extend(sub_text)
+                code_blocks.extend(sub_codes)
+                image_count += sub_images
+                continue
+
+            # Tables in groups
+            if shape.has_table:
+                # Tables in groups are rare but possible; skip for text extraction
+                continue
+
+            # Images in groups
+            if self._is_image_shape(shape):
+                image_count += 1
+                continue
+
+            # Text frames in groups: keep prose even when the frame also has code
+            if shape.has_text_frame:
+                frame_text, frame_codes = self._process_text_frame(shape.text_frame)
+                if frame_codes:
+                    code_blocks.extend(frame_codes)
+                if frame_text:
+                    text_parts.append(frame_text)
+
+        return text_parts, code_blocks, image_count
+
+    def _process_text_frame(self, text_frame) -> tuple[str, list[dict]]:
+        """Split a text frame's paragraphs into prose and code blocks.
+
+        Each non-empty paragraph is classified via _detect_code_blocks().
+        Consecutive code paragraphs are merged into one block; a blank
+        paragraph or a prose paragraph closes the block being collected.
+        Prose paragraphs are stripped, code paragraphs keep their raw text.
+
+        Args:
+            text_frame: python-pptx TextFrame object
+
+        Returns:
+            Tuple of (plain_text, code_blocks): the prose paragraphs joined
+            with newlines, and the dicts produced by _finalize_code_block()
+            ('code', 'language', and 'quality_score' keys).
+        """
+        prose_lines: list[str] = []
+        pending_code: list[str] = []
+        finished_blocks: list[dict] = []
+
+        def flush_pending() -> None:
+            # Close the code block being collected, if there is one
+            if pending_code:
+                finished_blocks.append(self._finalize_code_block(list(pending_code)))
+                pending_code.clear()
+
+        for para in text_frame.paragraphs:
+            stripped = para.text.strip()
+
+            if not stripped:
+                # A blank paragraph ends the current code block
+                flush_pending()
+                continue
+
+            if self._detect_code_blocks(para):
+                # Keep the unstripped text so code indentation survives
+                pending_code.append(para.text)
+            else:
+                # Prose interrupts any code block in progress
+                flush_pending()
+                prose_lines.append(stripped)
+
+        # A code block may run to the end of the frame
+        flush_pending()
+
+        return "\n".join(prose_lines), finished_blocks
+
+    def _finalize_code_block(self, code_parts: list[str]) -> dict:
+        """Assemble accumulated code lines into a code block record.
+
+        The lines are joined with newlines and scored for quality; the
+        language is left empty here (see _detect_languages()).
+
+        Args:
+            code_parts: List of code line strings
+
+        Returns:
+            Dict with 'code', 'language', and 'quality_score' keys.
+        """
+        joined = "\n".join(code_parts)
+        block: dict = {"code": joined, "language": ""}
+        block["quality_score"] = _score_code_quality(joined)
+        return block
+
+    def _extract_tables(self, table) -> dict | None:
+        """Convert a python-pptx Table into a headers/rows structure.
+
+        Every cell is reduced to the stripped text of its paragraphs joined
+        by newlines. The first table row becomes the header row and all
+        remaining rows become data rows.
+
+        Args:
+            table: python-pptx Table object
+
+        Returns:
+            Dict with 'headers' (list[str]) and 'rows' (list[list[str]]) keys,
+            or None if the table is empty.
+        """
+        if not table.rows:
+            return None
+
+        def cell_text(cell) -> str:
+            # Strip each paragraph, then strip the joined result as a whole
+            lines = [para.text.strip() for para in cell.text_frame.paragraphs]
+            return "\n".join(lines).strip()
+
+        # Walk the table row by row, cell by cell
+        grid: list[list[str]] = [
+            [cell_text(cell) for cell in row.cells] for row in table.rows
+        ]
+        if not grid:
+            return None
+
+        # Row 0 supplies the column headers; the rest is the table body
+        header_row, *body_rows = grid
+
+        return {"headers": header_row, "rows": body_rows}
+
+    def _extract_images_info(self, slide) -> list[dict]:
+        """Extract descriptive information about images on a slide.
+
+        Does not extract image binary data (to keep JSON output manageable).
+        Instead, records each top-level image's name, size, and alt text.
+
+        Args:
+            slide: python-pptx Slide object
+
+        Returns:
+            List of dicts with index, name, width, height and optional alt_text.
+        """
+        images: list[dict] = []
+
+        for shape in slide.shapes:
+            if not self._is_image_shape(shape):
+                continue
+
+            info: dict = {
+                "index": len(images),
+                "name": shape.name or "",
+                "width": shape.width if hasattr(shape, "width") else 0,
+                "height": shape.height if hasattr(shape, "height") else 0,
+            }
+
+            # Try to get alt text (accessibility description)
+            try:
+                # Slide shapes keep alt text on <p:cNvPr descr="..."> (PresentationML)
+                desc_elem = shape._element.find(
+                    ".//{http://schemas.openxmlformats.org/presentationml/2006/main}cNvPr"
+                )
+                if desc_elem is not None:
+                    info["alt_text"] = desc_elem.get("descr", "")
+                else:
+                    # Fall back to any descendant carrying a descr attribute
+                    for child in shape._element.iter():
+                        descr = child.get("descr")
+                        if descr:
+                            info["alt_text"] = descr
+                            break
+            except Exception:
+                logger.debug(f"Could not read alt text for image shape {info['name']!r}")
+
+            images.append(info)
+
+        return images
+
+    def _detect_code_blocks(self, paragraph) -> bool:
+        """Detect whether a paragraph contains code based on font properties.
+
+        Only runs with non-whitespace text are considered. A run counts as
+        monospace when its lower-cased font name is in MONOSPACE_FONTS.
+        The paragraph is classified as code when:
+        1. at least 60% of its characters are in monospace runs, or
+        2. at least 30% are and the text passes _text_looks_like_code().
+
+        Runs with no font name set are treated as non-monospace.
+
+        Args:
+            paragraph: python-pptx Paragraph object
+
+        Returns:
+            True if the paragraph appears to contain code.
+        """
+        if not paragraph.runs:
+            return False
+
+        # Weigh monospace text by character count, not run count, so a short
+        # inline identifier cannot outvote a long prose run
+        total_runs = 0
+        total_chars = 0
+        mono_chars = 0
+
+        for run in paragraph.runs:
+            run_text = run.text
+            if not run_text.strip():
+                continue
+
+            total_runs += 1
+            char_count = len(run_text)
+            total_chars += char_count
+
+            # A missing font name is mapped to "" for the lookup below
+            font_name = ""
+            if run.font and run.font.name:
+                font_name = run.font.name.lower()
+
+            if font_name in MONOSPACE_FONTS:
+                mono_chars += char_count
+
+        if total_runs == 0 or total_chars == 0:
+            return False
+
+        # If majority of characters are in monospace font, it's code
+        mono_ratio = mono_chars / total_chars
+        if mono_ratio >= 0.6:
+            return True
+
+        # Also check the paragraph text for code-like patterns
+        text = paragraph.text.strip()
+        return mono_ratio >= 0.3 and self._text_looks_like_code(text)
+
+    def _text_looks_like_code(self, text: str) -> bool:
+        """Guess whether a piece of text is source code.
+
+        The text counts as code when any of the regexes below matches
+        (multi-line, case-insensitive), or when it is longer than 10
+        characters and more than 8% of them are punctuation typical of code.
+
+        Args:
+            text: The text content to check
+
+        Returns:
+            True if the text appears to be source code.
+        """
+        if not text:
+            return False
+
+        # Any single structural hint is enough to classify the text as code
+        flags = re.MULTILINE | re.IGNORECASE
+        indicators = (
+            r"^\s*(def |class |function |func |fn |pub fn )",
+            r"^\s*(import |from .+ import|require\(|#include|using )",
+            r"^\s*(if |else:|elif |for |while |switch |case )",
+            r"^\s*(return |yield |raise |throw )",
+            r"^\s*(const |let |var |int |float |str |bool )",
+            r"[{}\[\]();]",
+            r"^\s*#\s*\w+",  # preprocessor or comment
+            r"=>|->|\|\||&&",  # operators
+            r"^\s*@\w+",  # decorators
+            r'^\s*\w+\s*=\s*["\'\d\[\{]',  # assignment
+            r"^\s*\$\w+",  # shell/PHP variables
+            r"^\s*(SELECT|INSERT|UPDATE|DELETE|CREATE|FROM|WHERE)\s",  # SQL
+        )
+        if any(re.search(regex, text, flags) for regex in indicators):
+            return True
+
+        # Fall back to punctuation density for text without a pattern hit
+        if len(text) <= 10:
+            return False
+
+        symbols = "{}[]();=<>|&!@#$%^*~`"
+        symbol_total = sum(text.count(ch) for ch in symbols)
+        return symbol_total / len(text) > 0.08
+
+    def _extract_speaker_notes(self, slide) -> str:
+        """Return the stripped speaker-notes text for a slide.
+
+        Looks up the slide's notes slide and its notes text frame. An empty
+        string is returned when the slide has no notes slide, when the notes
+        text frame is missing, or when reading the notes raises (the failure
+        is logged at debug level).
+
+        Args:
+            slide: python-pptx Slide object
+
+        Returns:
+            Speaker notes text string.
+        """
+        try:
+            if not slide.has_notes_slide:
+                return ""
+
+            notes_slide = slide.notes_slide
+            frame = notes_slide.notes_text_frame if notes_slide else None
+            # A notes slide may exist without a notes text frame
+            return frame.text.strip() if frame else ""
+        except Exception:
+            logger.debug(f"Could not extract speaker notes from slide {slide.slide_id}")
+            return ""
+
+    def _is_image_shape(self, shape) -> bool:
+        """Tell whether a shape should be counted as an image.
+
+        A shape qualifies when its shape_type equals 13 (PICTURE) or when it
+        exposes an ``image`` attribute. Any exception raised while probing
+        the shape is swallowed and reported as "not an image".
+
+        Args:
+            shape: python-pptx Shape object
+
+        Returns:
+            True if the shape contains an image.
+        """
+        try:
+            # 13 is MSO_SHAPE_TYPE.PICTURE in python-pptx
+            kind = getattr(shape, "shape_type", None)
+            if kind is not None and kind == 13:
+                return True
+
+            # Shapes exposing an ``image`` attribute are treated as pictures too
+            return hasattr(shape, "image")
+        except Exception:
+            return False
+
+    # ------------------------------------------------------------------
+    # Section grouping
+    # ------------------------------------------------------------------
+
+    def _group_slides_into_sections(self, slides: list[dict]) -> list[dict]:
+        """Group slides into sections based on section-break slides.
+
+        A slide starts a new section when it uses a section/title-only layout
+        (is_section_slide=True) and has a non-empty title. Slides before the
+        first break form their own leading section; without any break the
+        whole presentation becomes a single section.
+
+        Each section is built by _build_section_from_slides() and contains:
+        - section_number: 1-based index
+        - heading: Section title (falls back to the skill name, "Introduction"
+          or "Section N" when the relevant slide has no title)
+        - heading_level: always 'h1'
+        - text: Combined body text (speaker notes appended)
+        - slides: List of raw slide dicts
+        - code_samples / tables: Aggregated from the slides
+        - image_count: Total images in the section
+
+        Args:
+            slides: List of slide dicts from _extract_slide()
+
+        Returns:
+            List of section dicts compatible with the pipeline format.
+        """
+        if not slides:
+            return []
+
+        # Identify section break points
+        section_breaks: list[int] = []
+        for i, slide in enumerate(slides):
+            if slide.get("is_section_slide") and slide.get("title"):
+                section_breaks.append(i)
+
+        # If no explicit section breaks, treat the entire presentation as one section
+        if not section_breaks:
+            section = self._build_section_from_slides(
+                section_number=1,
+                heading=slides[0].get("title") or self.name,
+                heading_level="h1",
+                slide_list=slides,
+            )
+            return [section]
+
+        # Build sections from break points
+        sections: list[dict] = []
+        section_number = 0
+
+        # Handle slides before the first section break
+        if section_breaks[0] > 0:
+            pre_section_slides = slides[: section_breaks[0]]
+            section_number += 1
+            section = self._build_section_from_slides(
+                section_number=section_number,
+                heading=pre_section_slides[0].get("title") or "Introduction",
+                heading_level="h1",
+                slide_list=pre_section_slides,
+            )
+            sections.append(section)
+
+        # Process each section
+        for idx, break_idx in enumerate(section_breaks):
+            section_number += 1
+            section_slide = slides[break_idx]
+            heading = section_slide.get("title") or f"Section {section_number}"
+
+            # Determine end of this section
+            end_idx = section_breaks[idx + 1] if idx + 1 < len(section_breaks) else len(slides)
+
+            section_slides = slides[break_idx:end_idx]
+
+            section = self._build_section_from_slides(
+                section_number=section_number,
+                heading=heading,
+                heading_level="h1",
+                slide_list=section_slides,
+            )
+            sections.append(section)
+
+        return sections
+
+    def _build_section_from_slides(
+        self,
+        section_number: int,
+        heading: str,
+        heading_level: str,
+        slide_list: list[dict],
+    ) -> dict:
+        """Merge a run of slides into one section dict.
+
+        Slide titles that differ from the section heading become 'h3'
+        sub-headings. Non-empty body texts are joined with blank lines, and
+        speaker notes (tagged with their slide number) are appended under a
+        "### Speaker Notes" heading. Code blocks, tables and image counts
+        are accumulated across the slides.
+
+        Args:
+            section_number: 1-based section index
+            heading: Section heading text
+            heading_level: 'h1' or 'h2'
+            slide_list: List of slide dicts to include
+
+        Returns:
+            Section dict with the keys section_number, heading, heading_level,
+            text, headings, code_samples, tables, slides, image_count and
+            slide_range ("first-last" slide numbers, or "" for no slides).
+        """
+        sub_headings: list[dict] = []
+        body_chunks: list[str] = []
+        note_chunks: list[str] = []
+        code_samples: list[dict] = []
+        all_tables: list[dict] = []
+        image_total = 0
+
+        # One pass over the slides gathers every aggregate
+        for entry in slide_list:
+            number = entry.get("slide_number", "?")
+
+            # A slide title that differs from the section heading becomes an h3
+            title = entry.get("title", "")
+            if title and title != heading:
+                sub_headings.append(
+                    {"level": "h3", "text": f"Slide {number}: {title}"}
+                )
+
+            # Body text, skipping slides that have none
+            body = entry.get("body_text", "").strip()
+            if body:
+                body_chunks.append(body)
+
+            # Speaker notes are tagged with the slide they came from
+            notes = entry.get("speaker_notes", "").strip()
+            if notes:
+                note_chunks.append(f"[Slide {number}] {notes}")
+
+            # Code blocks, tables and image counts are simply accumulated
+            code_samples += entry.get("code_blocks", [])
+            all_tables += entry.get("tables", [])
+            image_total += entry.get("image_count", 0)
+
+        # Notes, when present, follow the body text under their own heading
+        combined = "\n\n".join(body_chunks)
+        if note_chunks:
+            combined = combined + "\n\n### Speaker Notes\n\n" + "\n\n".join(note_chunks)
+
+        # Range label such as "3-7"; empty when the section has no slides
+        if slide_list:
+            first_no = slide_list[0]["slide_number"]
+            last_no = slide_list[-1]["slide_number"]
+            slide_range = f"{first_no}-{last_no}"
+        else:
+            slide_range = ""
+
+        return {
+            "section_number": section_number,
+            "heading": heading,
+            "heading_level": heading_level,
+            "text": combined,
+            "headings": sub_headings,
+            "code_samples": code_samples,
+            "tables": all_tables,
+            "slides": slide_list,
+            "image_count": image_total,
+            "slide_range": slide_range,
+        }
+
+    # ------------------------------------------------------------------
+    # Language detection
+    # ------------------------------------------------------------------
+
+    def _detect_languages(
+        self,
+        sections: list[dict],
+    ) -> tuple[dict[str, int], int]:
+        """Tally code-block languages across all sections.
+
+        Blocks that already carry a language are counted as-is. Otherwise the
+        LanguageDetector (if importable) guesses; confidence >= 0.3 is kept.
+
+        Args:
+            sections: List of section dicts with code_samples
+
+        Returns:
+            Tuple of (languages_detected dict, total_code_blocks count)
+        """
+        try:
+            from skill_seekers.cli.language_detector import LanguageDetector
+
+            detector = LanguageDetector(min_confidence=0.15)
+        except ImportError:
+            detector = None
+            logger.debug("LanguageDetector not available, skipping language detection")
+
+        counts: dict[str, int] = {}
+        block_total = 0
+        samples = (s for sec in sections for s in sec.get("code_samples", []))
+
+        for sample in samples:
+            block_total += 1
+            known = sample.get("language", "")
+            if known:
+                counts[known] = counts.get(known, 0) + 1
+                continue
+            if not detector:
+                continue
+            source = sample.get("code", "")
+            if not source:
+                continue
+            guess, confidence = detector.detect_from_code(source)
+            if guess and confidence >= 0.3:
+                sample["language"] = guess
+                counts[guess] = counts.get(guess, 0) + 1
+
+        return counts, block_total
+
+    # ------------------------------------------------------------------
+    # Load / Categorize / Build
+    # ------------------------------------------------------------------
+
+    def load_extracted_data(self, json_path: str) -> bool:
+        """Read a previously saved extraction JSON into extracted_data.
+
+        Args:
+            json_path: Path to the extracted JSON file
+
+        Returns:
+            True on success.
+        """
+        print(f"\n📂 Loading extracted data from: {json_path}")
+        with open(json_path, encoding="utf-8") as handle:
+            loaded = self.extracted_data = json.load(handle)
+        section_total = loaded.get("total_sections", len(loaded.get("pages", [])))
+        print(f"✅ Loaded {section_total} sections")
+        return True
+
+    def categorize_content(self) -> dict[str, dict]:
+        """Categorize sections based on the source path or configured categories.
+
+        When pptx_path is set, all sections go into one category named after
+        the path's stem. Otherwise the categories config is used (pre-built
+        page lists or keyword scoring), falling back to a single "content" one.
+
+        Returns:
+            Dict mapping category keys to category dicts with 'title' and
+            'pages' (list of sections).
+        """
+        print("\n📋 Categorizing content...")
+
+        categorized: dict[str, dict] = {}
+        sections = self.extracted_data.get("pages", [])
+
+        # pptx_path set (file or directory): one category holding every section
+        if self.pptx_path:
+            pptx_basename = Path(self.pptx_path).stem
+            category_key = self._sanitize_filename(pptx_basename)
+            categorized[category_key] = {
+                "title": pptx_basename,
+                "pages": sections,
+            }
+            print("✅ Created 1 category (single PowerPoint source)")
+            print(f"   - {pptx_basename}: {len(sections)} sections")
+            return categorized
+
+        # pptx_path empty: fall back to the categories config, if any
+        if self.categories:
+            first_value = next(iter(self.categories.values()), None)
+            if isinstance(first_value, list) and first_value and isinstance(first_value[0], dict):
+                # First value is a list of dicts: values are used as page lists
+                for cat_key, pages in self.categories.items():
+                    categorized[cat_key] = {
+                        "title": cat_key.replace("_", " ").title(),
+                        "pages": pages,
+                    }
+            else:
+                # Keyword lists: start every category empty, then score sections
+                for cat_key in self.categories:
+                    categorized[cat_key] = {
+                        "title": cat_key.replace("_", " ").title(),
+                        "pages": [],
+                    }
+
+                for section in sections:
+                    text = section.get("text", "").lower()
+                    heading_text = section.get("heading", "").lower()
+
+                    scores: dict[str, int] = {}
+                    for cat_key, keywords in self.categories.items():
+                        if isinstance(keywords, list):
+                            score = sum(
+                                1
+                                for kw in keywords
+                                if isinstance(kw, str)
+                                and (kw.lower() in text or kw.lower() in heading_text)
+                            )
+                        else:
+                            score = 0
+                        if score > 0:
+                            scores[cat_key] = score
+
+                    if scores:
+                        best_cat = max(scores, key=scores.get)
+                        categorized[best_cat]["pages"].append(section)
+                    else:
+                        if "other" not in categorized:
+                            categorized["other"] = {"title": "Other", "pages": []}
+                        categorized["other"]["pages"].append(section)
+        else:
+            # No categories configured: everything goes into "content"
+            categorized["content"] = {"title": "Content", "pages": sections}
+
+        print(f"✅ Created {len(categorized)} categories")
+        for _cat_key, cat_data in categorized.items():
+            print(f"   - {cat_data['title']}: {len(cat_data['pages'])} sections")
+
+        return categorized
+
+    def build_skill(self) -> None:
+        """Write the complete skill directory tree from the extracted data.
+
+        Output layout under ``self.skill_dir``:
+        - references/ — one markdown file per category
+        - references/index.md — category index with statistics
+        - SKILL.md — main skill file with frontmatter and overview
+        - scripts/ — empty (reserved for future use)
+        - assets/ — empty (reserved for image export)
+
+        Order of work: create directories, categorize sections, write one
+        reference file per category, then the index and SKILL.md.
+        """
+        print(f"\n🏗️  Building skill: {self.name}")
+
+        # Make sure the three output sub-directories exist
+        for subdir in ("references", "scripts", "assets"):
+            os.makedirs(f"{self.skill_dir}/{subdir}", exist_ok=True)
+
+        # Group sections into categories before writing anything
+        groups = self.categorize_content()
+        group_count = len(groups)
+
+        # One reference file per category; positions are 1-based
+        print("\n📝 Generating reference files...")
+        for position, (cat_key, cat_data) in enumerate(groups.items(), start=1):
+            self._generate_reference_file(cat_key, cat_data, position, group_count)
+
+        # Index of the reference files
+        self._generate_index(groups)
+
+        # Top-level SKILL.md
+        self._generate_skill_md(groups)
+
+        print(f"\n✅ Skill built successfully: {self.skill_dir}/")
+        print(f"\n📦 Next step: Package with: skill-seekers package {self.skill_dir}/")
+
+    # ------------------------------------------------------------------
+    # Output generation (private)
+    # ------------------------------------------------------------------
+
+    def _generate_reference_file(
+        self,
+        _cat_key: str,
+        cat_data: dict,
+        section_num: int,
+        total_sections: int,
+    ) -> None:
+        """Generate a reference markdown file for a category of sections.
+
+        Each section is rendered as markdown with its heading, slide titles,
+        body text, code examples, tables, and an image count. A heading level
+        that is not of the form 'hN' falls back to a default depth.
+
+        Args:
+            _cat_key: Category key (unused, for interface consistency)
+            cat_data: Category dict with 'title' and 'pages' keys
+            section_num: 1-based index among all categories
+            total_sections: Total number of categories being generated
+        """
+        sections = cat_data["pages"]
+
+        def _md_prefix(level: object, fallback: str) -> str:
+            """Map an 'hN' level to N+1 '#' chars; fallback if malformed."""
+            digit = str(level or "")[1:2]
+            return "#" * (int(digit) + 1) if digit.isdecimal() else fallback
+
+        pptx_basename = Path(self.pptx_path).stem if self.pptx_path else ""
+        if sections:
+            section_nums = [s.get("section_number", i + 1) for i, s in enumerate(sections)]
+            if total_sections == 1:
+                filename = (
+                    f"{self.skill_dir}/references/{pptx_basename}.md"
+                    if pptx_basename
+                    else f"{self.skill_dir}/references/main.md"
+                )
+            else:
+                sec_range = f"s{min(section_nums)}-s{max(section_nums)}"
+                base_name = pptx_basename if pptx_basename else "section"
+                filename = f"{self.skill_dir}/references/{base_name}_{sec_range}.md"
+        else:
+            filename = f"{self.skill_dir}/references/section_{section_num:02d}.md"
+
+        with open(filename, "w", encoding="utf-8") as f:
+            f.write(f"# {cat_data['title']}\n\n")
+
+            for section in sections:
+                sec_num = section.get("section_number", "?")
+                heading = section.get("heading", "")
+                heading_level = section.get("heading_level", "h1")
+                slide_range = section.get("slide_range", "")
+
+                f.write(f"---\n\n**📄 Source: Section {sec_num}**")
+                if slide_range:
+                    f.write(f" (Slides {slide_range})")
+                f.write("\n\n")
+
+                # Section heading ('hN' is rendered with N+1 '#' characters)
+                if heading:
+                    md_level = _md_prefix(heading_level, "##")
+                    f.write(f"{md_level} {heading}\n\n")
+
+                # Sub-headings (individual slide titles)
+                for sub_heading in section.get("headings", []):
+                    sub_text = sub_heading.get("text", "")
+                    if sub_text:
+                        sub_md = _md_prefix(sub_heading.get("level", "h3"), "###")
+                        f.write(f"{sub_md} {sub_text}\n\n")
+
+                # Body text
+                text = section.get("text", "").strip()
+                if text:
+                    f.write(f"{text}\n\n")
+
+                # Code samples ('code' read with .get so one bad sample cannot abort)
+                code_list = section.get("code_samples", [])
+                if code_list:
+                    f.write("### Code Examples\n\n")
+                    for code in code_list:
+                        lang = code.get("language", "")
+                        f.write(f"```{lang}\n{code.get('code', '')}\n```\n\n")
+
+                # Tables as markdown
+                tables = section.get("tables", [])
+                if tables:
+                    f.write("### Tables\n\n")
+                    for table in tables:
+                        headers = table.get("headers", [])
+                        rows = table.get("rows", [])
+                        if headers:
+                            f.write("| " + " | ".join(str(h) for h in headers) + " |\n")
+                            f.write("| " + " | ".join("---" for _ in headers) + " |\n")
+                        for row in rows:
+                            f.write("| " + " | ".join(str(c) for c in row) + " |\n")
+                        f.write("\n")
+
+                # Image count summary
+                img_count = section.get("image_count", 0)
+                if img_count > 0:
+                    f.write(f"### Images\n\n*{img_count} image(s) in this section*\n\n")
+
+                f.write("---\n\n")
+
+        print(f"   Generated: {filename}")
+
+    def _generate_index(self, categorized: dict[str, dict]) -> None:
+        """Write references/index.md: a link per category plus statistics.
+
+        Link targets are rebuilt here with the same naming rules that
+        _generate_reference_file applies to the files it writes. Categories
+        without sections link to section_NN.md and show "N/A" as their range.
+
+        Args:
+            categorized: Dict of category key -> category data
+        """
+        filename = f"{self.skill_dir}/references/index.md"
+        stem = Path(self.pptx_path).stem if self.pptx_path else ""
+        single_category = len(categorized) == 1
+
+        with open(filename, "w", encoding="utf-8") as out:
+            out.write(f"# {self.name.title()} Presentation Reference\n\n")
+            out.write("## Categories\n\n")
+
+            for position, cat_data in enumerate(categorized.values(), start=1):
+                pages = cat_data["pages"]
+
+                if not pages:
+                    # Empty category: file is named by its 1-based position
+                    target = f"section_{position:02d}.md"
+                    span = "N/A"
+                else:
+                    # Range uses explicit section numbers, else 1-based position
+                    numbers = [s.get("section_number", i + 1) for i, s in enumerate(pages)]
+                    low, high = min(numbers), max(numbers)
+                    span = f"Sections {low}-{high}"
+                    if single_category:
+                        target = f"{stem}.md" if stem else "main.md"
+                    else:
+                        prefix = stem if stem else "section"
+                        target = f"{prefix}_s{low}-s{high}.md"
+
+                entry = f"- [{cat_data['title']}]({target}) ({len(pages)} sections, {span})\n"
+                out.write(entry)
+
+            # Corpus-wide counters; each defaults to 0 when absent
+            out.write("\n## Statistics\n\n")
+            counters = (
+                ("Total slides", "total_slides"),
+                ("Total sections", "total_sections"),
+                ("Code blocks", "total_code_blocks"),
+                ("Images", "total_images"),
+                ("Tables", "total_tables"),
+            )
+            for label, key in counters:
+                out.write(f"- {label}: {self.extracted_data.get(key, 0)}\n")
+
+            # Optional presentation metadata, written only when non-empty
+            metadata = self.extracted_data.get("metadata", {})
+            for label, key in (("Author", "author"), ("Created", "created")):
+                if metadata.get(key):
+                    out.write(f"- {label}: {metadata[key]}\n")
+
+        print(f"   Generated: {filename}")
+
+    def _generate_skill_md(self, categorized: dict[str, dict]) -> None:
+        """Generate main SKILL.md file with YAML frontmatter and overview.
+
+        Creates a comprehensive skill file with:
+        - YAML frontmatter (name, description)
+        - Document information (from metadata)
+        - "When to Use" section
+        - Section overview with per-category section counts
+        - Key concepts from headings
+        - Quick reference patterns
+        - Top code examples grouped by language
+        - Table summary
+        - Documentation statistics
+        - Navigation links to the reference files
+
+        Navigation entries are derived with the same naming rules that
+        _generate_reference_file uses, so they point at files that exist.
+
+        Args:
+            categorized: Dict of category key -> category data
+        """
+        filename = f"{self.skill_dir}/SKILL.md"
+
+        skill_name = self.name.lower().replace("_", "-").replace(" ", "-")[:64]
+        desc = self.description[:1024]  # slicing alone caps the length
+
+        with open(filename, "w", encoding="utf-8") as f:
+            # YAML frontmatter
+            f.write("---\n")
+            f.write(f"name: {skill_name}\n")
+            f.write(f"description: {desc}\n")
+            f.write("---\n\n")
+
+            f.write(f"# {self.name.title()} Presentation Skill\n\n")
+            f.write(f"{self.description}\n\n")
+
+            # Document metadata (only non-empty fields are written)
+            metadata = self.extracted_data.get("metadata", {})
+            if any(v for v in metadata.values() if v):
+                f.write("## 📋 Presentation Information\n\n")
+                for key in ("title", "author", "subject", "category", "created", "modified"):
+                    if metadata.get(key):
+                        f.write(f"**{key.title()}:** {metadata[key]}\n\n")
+                if metadata.get("slide_count"):
+                    f.write(f"**Slides:** {metadata['slide_count']}\n\n")
+
+            # When to Use
+            f.write("## 💡 When to Use This Skill\n\n")
+            f.write("Use this skill when you need to:\n")
+            f.write(f"- Understand {self.name} concepts and fundamentals\n")
+            f.write("- Review presentation content and key points\n")
+            f.write("- Find code examples and implementation patterns\n")
+            f.write("- Access speaker notes and additional context\n")
+            f.write("- Reference tables and data from the presentation\n\n")
+
+            # Section Overview
+            total_slides = self.extracted_data.get("total_slides", 0)
+            total_sections = self.extracted_data.get("total_sections", 0)
+            f.write("## 📖 Section Overview\n\n")
+            f.write(f"**Total Slides:** {total_slides}\n\n")
+            f.write(f"**Total Sections:** {total_sections}\n\n")
+            f.write("**Content Breakdown:**\n\n")
+            for _cat_key, cat_data in categorized.items():
+                section_count = len(cat_data["pages"])
+                f.write(f"- **{cat_data['title']}**: {section_count} sections\n")
+            f.write("\n")
+
+            # Key Concepts from headings
+            f.write(self._format_key_concepts())
+
+            # Quick Reference patterns
+            f.write("## ⚡ Quick Reference\n\n")
+            f.write(self._format_patterns_from_content())
+
+            # Code examples (top 15, grouped by language)
+            all_code: list[dict] = []
+            for section in self.extracted_data.get("pages", []):
+                all_code.extend(section.get("code_samples", []))
+
+            all_code.sort(key=lambda x: x.get("quality_score", 0), reverse=True)
+            top_code = all_code[:15]
+
+            if top_code:
+                f.write("## 📝 Code Examples\n\n")
+                f.write("*High-quality examples extracted from presentation*\n\n")
+
+                by_lang: dict[str, list] = {}
+                for code in top_code:
+                    lang = code.get("language", "unknown")
+                    by_lang.setdefault(lang, []).append(code)
+
+                for lang in sorted(by_lang.keys()):
+                    examples = by_lang[lang]
+                    f.write(f"### {lang.title()} Examples ({len(examples)})\n\n")
+                    for i, code in enumerate(examples[:5], 1):
+                        quality = code.get("quality_score", 0)
+                        code_text = code.get("code", "")
+                        f.write(f"**Example {i}** (Quality: {quality:.1f}/10):\n\n")
+                        f.write(f"```{lang}\n")
+                        f.write(code_text if len(code_text) <= 500 else code_text[:500] + "\n...")
+                        f.write("\n```\n\n")
+
+            # Table Summary (first 5 tables)
+            all_tables: list[tuple[str, dict]] = []
+            for section in self.extracted_data.get("pages", []):
+                for table in section.get("tables", []):
+                    all_tables.append((section.get("heading", ""), table))
+
+            if all_tables:
+                f.write("## 📊 Table Summary\n\n")
+                f.write(f"*{len(all_tables)} table(s) found in presentation*\n\n")
+                for section_heading, table in all_tables[:5]:
+                    if section_heading:
+                        f.write(f"**From section: {section_heading}**\n\n")
+                    headers = table.get("headers", [])
+                    rows = table.get("rows", [])
+                    if headers:
+                        f.write("| " + " | ".join(str(h) for h in headers) + " |\n")
+                        f.write("| " + " | ".join("---" for _ in headers) + " |\n")
+                        for row in rows[:5]:
+                            f.write("| " + " | ".join(str(c) for c in row) + " |\n")
+                        f.write("\n")
+
+            # Statistics
+            f.write("## 📊 Presentation Statistics\n\n")
+            f.write(f"- **Total Slides**: {total_slides}\n")
+            f.write(f"- **Total Sections**: {total_sections}\n")
+            f.write(f"- **Code Blocks**: {self.extracted_data.get('total_code_blocks', 0)}\n")
+            f.write(f"- **Images/Diagrams**: {self.extracted_data.get('total_images', 0)}\n")
+            f.write(f"- **Tables**: {self.extracted_data.get('total_tables', 0)}\n")
+
+            langs = self.extracted_data.get("languages_detected", {})
+            if langs:
+                f.write(f"- **Programming Languages**: {len(langs)}\n\n")
+                f.write("**Language Breakdown:**\n\n")
+                for lang, count in sorted(langs.items(), key=lambda x: x[1], reverse=True):
+                    f.write(f"- {lang}: {count} examples\n")
+                f.write("\n")
+
+            # Navigation: list the files _generate_reference_file really writes,
+            # not sanitized titles, which can name files that do not exist.
+            f.write("## 🗺️ Navigation\n\n")
+            f.write("**Reference Files:**\n\n")
+            stem = Path(self.pptx_path).stem if self.pptx_path else ""
+            for position, cat_data in enumerate(categorized.values(), start=1):
+                pages = cat_data["pages"]
+                nums = [s.get("section_number", i + 1) for i, s in enumerate(pages)]
+                if not nums:
+                    cat_file = f"section_{position:02d}"
+                elif len(categorized) == 1:
+                    cat_file = stem or "main"
+                else:
+                    cat_file = f"{stem or 'section'}_s{min(nums)}-s{max(nums)}"
+                f.write(f"- `references/{cat_file}.md` - {cat_data['title']}\n")
+            f.write("\n")
+            f.write("See `references/index.md` for complete presentation structure.\n\n")
+
+            # Footer
+            f.write("---\n\n")
+            f.write("**Generated by Skill Seeker** | PowerPoint Presentation Scraper\n")
+
+        with open(filename, encoding="utf-8") as f:
+            line_count = len(f.read().split("\n"))
+        print(f"   Generated: {filename} ({line_count} lines)")
+
+    # ------------------------------------------------------------------
+    # Content analysis helpers
+    # ------------------------------------------------------------------
+
+    def _format_key_concepts(self) -> str:
+        """Build the "Key Concepts" markdown block from collected headings.
+
+        Section headings and slide sub-headings longer than three characters
+        are gathered, then listed by level: h1 as "Major Sections" (first 10),
+        h2 as "Subsections" (first 15), and h3 as "Slide Topics" (first 15,
+        only when there are no h2 headings).
+
+        Returns:
+            Markdown string with key concepts section, or empty string
+            if no headings are found.
+        """
+        found: list[tuple[str, str]] = []
+
+        for section in self.extracted_data.get("pages", []):
+            # The section's own heading comes first ...
+            title = section.get("heading", "").strip()
+            if len(title) > 3:
+                found.append((section.get("heading_level", "h1"), title))
+            # ... followed by the titles of its individual slides
+            for sub in section.get("headings", []):
+                sub_title = sub.get("text", "").strip()
+                if len(sub_title) > 3:
+                    found.append((sub.get("level", "h3"), sub_title))
+
+        if not found:
+            return ""
+
+        def _titles(level: str) -> list[str]:
+            """Headings recorded at exactly this level, in document order."""
+            return [text for lvl, text in found if lvl == level]
+
+        def _block(label: str, items: list[str], limit: int) -> str:
+            """Bold label, at most `limit` bullets, then a blank line."""
+            bullets = "".join(f"- {item}\n" for item in items[:limit])
+            return f"**{label}:**\n\n{bullets}\n"
+
+        major = _titles("h1")
+        subsections = _titles("h2")
+        slide_topics = _titles("h3")
+
+        parts = ["## 🔑 Key Concepts\n\n", "*Main topics covered in this presentation*\n\n"]
+        if major:
+            parts.append(_block("Major Sections", major, 10))
+        if subsections:
+            parts.append(_block("Subsections", subsections, 15))
+        # Slide topics are listed only when there are no h2 subsections
+        if slide_topics and not subsections:
+            parts.append(_block("Slide Topics", slide_topics, 15))
+
+        return "".join(parts)
+
+    def _format_patterns_from_content(self) -> str:
+        """Summarize which common presentation section types appear.
+
+        Each section heading is checked, case-insensitively and by substring,
+        against a fixed keyword list ("introduction", "agenda", "demo", ...).
+        Only the first keyword that matches counts for a section. Matches are
+        grouped by keyword, and up to three headings are listed per group.
+
+        Returns:
+            Markdown string describing found patterns, or a one-line pointer
+            to the reference files when no heading matches.
+        """
+        # Checked in this order; the first hit wins for a section
+        pattern_keywords = (
+            "introduction",
+            "overview",
+            "agenda",
+            "objectives",
+            "getting started",
+            "demo",
+            "demonstration",
+            "examples",
+            "architecture",
+            "design",
+            "implementation",
+            "best practices",
+            "summary",
+            "conclusion",
+            "q&a",
+            "questions",
+            "next steps",
+            "resources",
+            "references",
+            "appendix",
+        )
+
+        # Keyword title -> [(original heading, section number), ...]
+        grouped: dict[str, list[tuple]] = {}
+        for section in self.extracted_data.get("pages", []):
+            heading = section.get("heading", "")
+            lowered = heading.lower()
+
+            hit = next(
+                (keyword for keyword in pattern_keywords if keyword in lowered),
+                None,
+            )
+            if hit is None:
+                continue
+            number = section.get("section_number", 0)
+            grouped.setdefault(hit.title(), []).append((heading, number))
+
+        # Nothing matched: point the reader at the reference files
+        if not grouped:
+            return "*See reference files for detailed content*\n\n"
+
+        # Groups in sorted order, at most three headings each
+        chunks = ["*Common presentation patterns found:*\n\n"]
+        for ptype in sorted(grouped):
+            entries = grouped[ptype]
+            chunks.append(f"**{ptype}** ({len(entries)} sections):\n")
+            for heading, number in entries[:3]:
+                chunks.append(f"- {heading} (section {number})\n")
+            chunks.append("\n")
+
+        return "".join(chunks)
+
+    def _sanitize_filename(self, name: str) -> str:
+        """Turn an arbitrary string into a lowercase, filesystem-safe name.
+
+        Characters other than word characters, whitespace and hyphens are
+        dropped; each run of hyphens/whitespace then becomes one underscore.
+
+        Args:
+            name: Input string to sanitize
+
+        Returns:
+            Safe filename string
+        """
+        lowered = name.lower()
+        kept = re.sub(r"[^\w\s-]", "", lowered)
+        return re.sub(r"[-\s]+", "_", kept)
+
+
+# ---------------------------------------------------------------------------
+# Module-level helpers
+# ---------------------------------------------------------------------------
+
+
+def _score_code_quality(code: str) -> float:
+    """Score code quality on a 0-10 scale using heuristics.
+
+    Starts at 5.0 and adds points for length, definitions, imports or
+    includes, four-space indentation and common syntax characters; very
+    short snippets are penalized. Empty or blank input scores 0.0.
+
+    Args:
+        code: Source code text to score
+
+    Returns:
+        Float quality score between 0.0 and 10.0
+    """
+    if not code or not code.strip():
+        return 0.0
+
+    score = 5.0
+    line_count = len(code.strip().split("\n"))
+
+    # More lines = more substantial
+    if line_count >= 10:
+        score += 2.0
+    elif line_count >= 5:
+        score += 1.0
+
+    # Has function/class definitions
+    if re.search(r"\b(def |class |function |func |fn )", code):
+        score += 1.5
+
+    # Has imports/require. "#include" is kept outside the \b group because a
+    # word boundary cannot match right before "#" at the start of a line.
+    if re.search(r"\b(?:import |from .+ import|require\(|using )|#include", code):
+        score += 0.5
+
+    # Has indentation (common in Python, JS, etc.)
+    if re.search(r"^    ", code, re.MULTILINE):
+        score += 0.5
+
+    # Has assignment, operators, or common code syntax
+    if re.search(r"[=:{}()\[\]]", code):
+        score += 0.3
+
+    # Very short snippets get penalized
+    if len(code) < 30:
+        score -= 2.0
+
+    return min(10.0, max(0.0, score))
+
+
+# ---------------------------------------------------------------------------
+# CLI entry point
+# ---------------------------------------------------------------------------
+
+
+def main() -> int:
+    """CLI entry point for the PowerPoint scraper.
+
+    Parses arguments, then either prints a dry-run summary, rebuilds a skill
+    from previously extracted JSON, or extracts the --pptx source and builds
+    the skill, optionally followed by workflows and AI enhancement.
+
+    Returns:
+        0 on success. Failures exit via sys.exit(1) or parser.error() instead.
+    """
+    from skill_seekers.cli.arguments.pptx import add_pptx_arguments
+
+    parser = argparse.ArgumentParser(
+        description="Convert PowerPoint presentation (.pptx) to skill",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+
+    add_pptx_arguments(parser)
+
+    args = parser.parse_args()
+
+    # Set logging level on the root logger (quiet takes precedence over verbose)
+    if getattr(args, "quiet", False):
+        logging.getLogger().setLevel(logging.WARNING)
+    elif getattr(args, "verbose", False):
+        logging.getLogger().setLevel(logging.DEBUG)
+
+    # Handle --dry-run (checked before input validation, so no source is required)
+    if getattr(args, "dry_run", False):
+        source = getattr(args, "pptx", None) or getattr(args, "from_json", None) or "(none)"
+        print(f"\n{'=' * 60}")
+        print("DRY RUN: PowerPoint Extraction")
+        print(f"{'=' * 60}")
+        print(f"Source:         {source}")
+        print(f"Name:           {getattr(args, 'name', None) or '(auto-detect)'}")
+        print(f"Enhance level:  {getattr(args, 'enhance_level', 0)}")
+        print(f"\n✅ Dry run complete")
+        return 0
+
+    # Validate inputs: at least one of the two sources must be given
+    if not (getattr(args, "pptx", None) or getattr(args, "from_json", None)):
+        parser.error("Must specify --pptx or --from-json")
+
+    # Build from JSON workflow (takes priority over --pptx when both are set)
+    if getattr(args, "from_json", None):
+        name = Path(args.from_json).stem.replace("_extracted", "")
+        config = {
+            "name": getattr(args, "name", None) or name,
+            "description": getattr(args, "description", None)
+            or f"Use when referencing {name} presentation",
+        }
+        try:
+            converter = PptxToSkillConverter(config)
+            converter.load_extracted_data(args.from_json)
+            converter.build_skill()
+        except Exception as e:
+            print(f"\n❌ Error: {e}", file=sys.stderr)
+            sys.exit(1)
+        return 0  # this path runs neither workflows nor enhancement
+
+    # Direct PPTX mode
+    if not getattr(args, "name", None):
+        # Auto-detect name from filename or directory name
+        pptx_path = Path(args.pptx)
+        args.name = pptx_path.stem if pptx_path.is_file() else pptx_path.name
+
+    config = {
+        "name": args.name,
+        "pptx_path": args.pptx,
+        # Pass None so extract_pptx() can infer from presentation metadata
+        "description": getattr(args, "description", None),
+    }
+
+    try:
+        converter = PptxToSkillConverter(config)
+
+        # Extract (a falsy result is treated as a failure)
+        if not converter.extract_pptx():
+            print(
+                "\n❌ PowerPoint extraction failed - see error above",
+                file=sys.stderr,
+            )
+            sys.exit(1)  # SystemExit is not an Exception, so the handlers below let it through
+
+        # Build skill
+        converter.build_skill()
+
+        # Enhancement Workflow Integration (imported lazily, after the build)
+        from skill_seekers.cli.workflow_runner import run_workflows
+
+        workflow_executed, workflow_names = run_workflows(args)
+        workflow_name = ", ".join(workflow_names) if workflow_names else None
+
+        # Traditional enhancement (complements workflow system)
+        if getattr(args, "enhance_level", 0) > 0:
+            api_key = getattr(args, "api_key", None) or os.environ.get("ANTHROPIC_API_KEY")
+            mode = "API" if api_key else "LOCAL"  # an available key selects API mode
+
+            print("\n" + "=" * 80)
+            print(f"🤖 Traditional AI Enhancement ({mode} mode, level {args.enhance_level})")
+            print("=" * 80)
+            if workflow_executed:
+                print(f"   Running after workflow: {workflow_name}")
+                print(
+                    "   (Workflow provides specialized analysis,"
+                    " enhancement provides general improvements)"
+                )
+            print("")
+
+            skill_dir = converter.skill_dir
+            if api_key:
+                try:
+                    from skill_seekers.cli.enhance_skill import enhance_skill_md
+
+                    enhance_skill_md(skill_dir, api_key)
+                    print("✅ API enhancement complete!")
+                except ImportError:  # raised by the import above or from inside the call
+                    print("❌ API enhancement not available. Falling back to LOCAL mode...")
+                    from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer
+
+                    enhancer = LocalSkillEnhancer(Path(skill_dir))
+                    enhancer.run(headless=True)
+                    print("✅ Local enhancement complete!")
+            else:
+                from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer
+
+                enhancer = LocalSkillEnhancer(Path(skill_dir))
+                enhancer.run(headless=True)
+                print("✅ Local enhancement complete!")
+
+    except (FileNotFoundError, ValueError) as e:
+        print(f"\n❌ Input error: {e}", file=sys.stderr)
+        sys.exit(1)
+    except RuntimeError as e:
+        print(f"\n❌ Error: {e}", file=sys.stderr)
+        sys.exit(1)
+    except Exception as e:  # anything else: report with a traceback, then exit
+        print(
+            f"\n❌ Unexpected error during PowerPoint processing: {e}",
+            file=sys.stderr,
+        )
+        import traceback
+
+        traceback.print_exc()
+        sys.exit(1)
+
+    return 0
+
+
+if __name__ == "__main__":  # run only when executed as a script, not on import
+    sys.exit(main())  # main()'s return value becomes the process exit status
diff --git a/src/skill_seekers/cli/rss_scraper.py b/src/skill_seekers/cli/rss_scraper.py
new file mode 100644
index 0000000..ce6837b
--- /dev/null
+++ b/src/skill_seekers/cli/rss_scraper.py
@@ -0,0 +1,1087 @@
+#!/usr/bin/env python3
+"""
+RSS/Atom Feed to Skill Converter
+
+Converts RSS 2.0, RSS 1.0 (RDF), and Atom feeds into AI-ready skills.
+Uses feedparser for feed parsing, optionally follows article links to scrape
+full content using requests + BeautifulSoup.
+
+Supports both remote feed URLs and local feed XML files. Extracts article
+metadata (title, author, published date, categories), feed-level metadata
+(title, description, link, language), and optionally the full article text
+from linked pages.
+
+Usage:
+    skill-seekers rss --feed-url https://example.com/feed.xml --name myblog
+    skill-seekers rss --feed-path ./feed.xml --name myblog
+    skill-seekers rss --feed-url https://example.com/rss --no-follow-links --name myblog
+    skill-seekers rss --from-json myblog_extracted.json
+    python3 -m skill_seekers.cli.rss_scraper --feed-url https://example.com/atom.xml --name myblog
+"""
+
+import argparse
+import hashlib
+import json
+import logging
+import os
+import re
+import sys
+import time
+from datetime import datetime
+from pathlib import Path
+from typing import Any
+
+# Optional dependency guard — feedparser is not in core deps
+try:
+    import feedparser  # noqa: F401
+
+    FEEDPARSER_AVAILABLE = True
+except ImportError:
+    FEEDPARSER_AVAILABLE = False
+
+# BeautifulSoup is a core dependency (always available)
+from bs4 import BeautifulSoup, Comment, Tag
+
+logger = logging.getLogger(__name__)
+
+# Feed type constants
+FEED_TYPE_RSS_20 = "RSS 2.0"
+FEED_TYPE_RSS_10 = "RSS 1.0 (RDF)"
+FEED_TYPE_ATOM = "Atom"
+FEED_TYPE_UNKNOWN = "Unknown"
+
+# Default request headers for scraping article pages
+_DEFAULT_HEADERS = {
+    "User-Agent": "SkillSeekers/RSS-Scraper (https://github.com/skill-seekers)",
+    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+    "Accept-Language": "en-US,en;q=0.5",
+}
+
+# Tags to strip from scraped article HTML
+_STRIP_TAGS = {"script", "style", "nav", "footer", "header", "aside", "noscript", "iframe"}
+
+# Maximum length for a single article's scraped text (characters)
+_MAX_ARTICLE_TEXT_LENGTH = 50_000
+
+# Delay between HTTP requests when following links (seconds)
+_REQUEST_DELAY = 1.0
+
+
+def _check_feedparser_deps() -> None:
+    """Guard for the optional feedparser dependency.
+
+    Raises:
+        RuntimeError: If the module-level import of feedparser failed
+            (``FEEDPARSER_AVAILABLE`` is False). The message carries
+            install instructions.
+    """
+    if FEEDPARSER_AVAILABLE:
+        return
+    install_hint = (
+        "feedparser is required for RSS/Atom feed support.\n"
+        'Install with: pip install "skill-seekers[rss]"\n'
+        "Or: pip install feedparser"
+    )
+    raise RuntimeError(install_hint)
+
+
+def infer_description_from_feed(
+    feed_meta: dict[str, Any] | None = None,
+    name: str = "",
+) -> str:
+    """Derive a "Use when..." skill description from feed metadata.
+
+    Preference order:
+      1. The feed description, when longer than 20 characters. If it
+         exceeds 150 characters it is cut to 147 plus "...", and the
+         result is lower-cased.
+      2. The feed title, when longer than 5 characters.
+      3. A generic sentence built from ``name``, or a fixed fallback when
+         ``name`` is empty.
+
+    Args:
+        feed_meta: Feed metadata dict; only "description" and "title" are read.
+        name: Skill name used for the generic fallback sentence.
+
+    Returns:
+        A description string starting with "Use when referencing".
+    """
+    meta = feed_meta or {}
+
+    summary = meta.get("description", "")
+    if summary and len(summary) > 20:
+        shortened = summary if len(summary) <= 150 else summary[:147] + "..."
+        return f"Use when referencing {shortened.lower()}"
+
+    feed_title = meta.get("title", "")
+    if feed_title and len(feed_title) > 5:
+        return f"Use when referencing articles from {feed_title}"
+
+    if name:
+        return f"Use when referencing {name} feed content"
+    return "Use when referencing this feed content"
+
+
+class RssToSkillConverter:
+    """Convert RSS/Atom feeds to AI-ready skills.
+
+    Parses RSS 2.0, RSS 1.0 (RDF), and Atom feeds using feedparser.
+    Optionally follows article links to scrape full page content via
+    requests + BeautifulSoup.
+    """
+
+    def __init__(self, config: dict[str, Any]) -> None:
+        """Set up converter state from a configuration mapping.
+
+        Args:
+            config: Settings dict. ``name`` is required (a missing key
+                raises KeyError). Optional keys and their defaults:
+                ``feed_url`` (""), ``feed_path`` (""), ``follow_links``
+                (True), ``max_articles`` (50) and ``description`` (a
+                generic sentence built from the name).
+        """
+        self.config = config
+        self.name: str = config["name"]
+
+        option = config.get
+        self.feed_url: str = option("feed_url", "")
+        self.feed_path: str = option("feed_path", "")
+        self.follow_links: bool = option("follow_links", True)
+        self.max_articles: int = option("max_articles", 50)
+
+        fallback_description = f"Use when referencing {self.name} feed content"
+        self.description: str = option("description", fallback_description)
+
+        # Locations written by extract_feed() and build_skill()
+        self.skill_dir: str = f"output/{self.name}"
+        self.data_file: str = f"output/{self.name}_extracted.json"
+
+        # Populated by extract_feed() or load_extracted_data()
+        self.extracted_data: dict[str, Any] | None = None
+
+    # ──────────────────────────────────────────────────────────────────────
+    # Public API
+    # ──────────────────────────────────────────────────────────────────────
+
+    def extract_feed(self) -> bool:
+        """Run the full extraction pipeline for the configured feed.
+
+        Steps, in order: verify feedparser is installed, parse the feed,
+        detect its type, read feed-level metadata, collect the article
+        entries, optionally fetch each article's page for its full text,
+        then write everything to ``self.data_file`` as JSON and keep the
+        same dict on ``self.extracted_data``.
+
+        Returns:
+            True on success.
+
+        Raises:
+            RuntimeError: If feedparser is not installed. Errors raised by
+                the parsing helpers propagate unchanged.
+        """
+        _check_feedparser_deps()
+
+        source = self.feed_url or self.feed_path
+        print(f"\n🔍 Extracting RSS/Atom feed: {source}")
+
+        parsed = self._parse_feed()
+
+        feed_type = self._detect_feed_type(parsed)
+        print(f"   Feed type: {feed_type}")
+
+        feed_meta = self._extract_feed_metadata(parsed)
+        print(f"   Title: {feed_meta.get('title', 'Unknown')}")
+        print(f"   Link: {feed_meta.get('link', 'N/A')}")
+        print(f"   Language: {feed_meta.get('language', 'N/A')}")
+
+        # An explicit description in the config always wins over inference
+        if "description" not in self.config:
+            self.description = infer_description_from_feed(feed_meta, self.name)
+
+        articles = self._extract_articles(parsed)
+        total = len(articles)
+        print(f"   Articles found: {total}")
+
+        if not self.follow_links:
+            print("   Skipping link following (--no-follow-links)")
+        else:
+            print(f"\n🌐 Following article links (max {total})...")
+            fetched = 0
+            for position, article in enumerate(articles, start=1):
+                url = article.get("link", "")
+                if not url:
+                    continue
+                print(f"   [{position}/{total}] {url[:80]}...")
+                body = self._scrape_article_content(url)
+                if body:
+                    article["full_text"] = body
+                    fetched += 1
+                # Pause between requests, but not after the final article
+                if position < total:
+                    time.sleep(_REQUEST_DELAY)
+            print(f"   Scraped full content for {fetched}/{total} articles")
+
+        all_categories = self._collect_all_categories(articles)
+
+        result_data: dict[str, Any] = {
+            "source": source,
+            "feed_type": feed_type,
+            "feed_metadata": feed_meta,
+            "total_articles": total,
+            "followed_links": self.follow_links,
+            "all_categories": sorted(all_categories),
+            "articles": articles,
+        }
+
+        # dirname() is "" for a bare filename, hence the "." fallback
+        target_dir = os.path.dirname(self.data_file) or "."
+        os.makedirs(target_dir, exist_ok=True)
+        with open(self.data_file, "w", encoding="utf-8") as handle:
+            json.dump(result_data, handle, indent=2, ensure_ascii=False, default=str)
+
+        print(f"\n💾 Saved extracted data to: {self.data_file}")
+        self.extracted_data = result_data
+        print(
+            f"✅ Extracted {total} articles ({len(all_categories)} unique categories/tags)"
+        )
+        return True
+
+    def load_extracted_data(self, json_path: str) -> bool:
+        """Restore ``self.extracted_data`` from a JSON file on disk.
+
+        Args:
+            json_path: Path to a JSON file, e.g. one written by extract_feed().
+
+        Returns:
+            True once the file has been loaded.
+
+        Raises:
+            FileNotFoundError: If ``json_path`` does not exist.
+        """
+        print(f"\n📂 Loading extracted data from: {json_path}")
+        if not os.path.exists(json_path):
+            raise FileNotFoundError(f"Extracted data file not found: {json_path}")
+
+        with open(json_path, encoding="utf-8") as handle:
+            payload = json.load(handle)
+        self.extracted_data = payload
+
+        # Prefer the stored count; otherwise count the article list
+        counted = len(payload.get("articles", []))
+        article_count = payload.get("total_articles", counted)
+        print(f"✅ Loaded {article_count} articles")
+        return True
+
+    def categorize_content(self) -> dict[str, dict[str, Any]]:
+        """Group extracted articles by their feed categories/tags.
+
+        Each category name is normalized with ``_sanitize_filename`` to form
+        the group key; the first name seen for a key becomes the group
+        title. Articles without categories go to an "uncategorized" group.
+        An article is added to a group at most once, even when several of
+        its tags normalize to the same key, and two articles sharing the
+        same non-empty identifier (``id``, else ``link``) are not both added
+        to one group. Articles that have no identifier at all are always
+        kept rather than being collapsed into one.
+
+        Returns:
+            Mapping of group key to ``{"title": str, "articles": list}``.
+            When there are no articles, a single empty "all_articles" group
+            is returned.
+
+        Raises:
+            RuntimeError: If no extracted data has been loaded.
+        """
+        print("\n📋 Categorizing content by feed tags...")
+
+        if not self.extracted_data:
+            raise RuntimeError("No extracted data available. Call extract_feed() first.")
+
+        articles = self.extracted_data.get("articles", [])
+        categorized: dict[str, dict[str, Any]] = {}
+        # Identifiers already filed per group: O(1) duplicate checks instead
+        # of rebuilding the id set for every article/tag pair.
+        seen_ids: dict[str, set[Any]] = {}
+
+        for article in articles:
+            cats = article.get("categories") or ["uncategorized"]
+            # Fall back to the link when the id is missing *or* empty
+            article_id = article.get("id") or article.get("link") or ""
+            # Group keys this particular article has already been added to
+            filed_under: set[str] = set()
+
+            for cat in cats:
+                cat_key = self._sanitize_filename(cat)
+                if cat_key not in categorized:
+                    categorized[cat_key] = {"title": cat, "articles": []}
+                    seen_ids[cat_key] = set()
+
+                # Two tags of the same article mapped to the same key
+                if cat_key in filed_under:
+                    continue
+                filed_under.add(cat_key)
+
+                # Only de-duplicate across articles on a real identifier;
+                # an empty one would otherwise swallow unrelated articles.
+                if article_id:
+                    if article_id in seen_ids[cat_key]:
+                        continue
+                    seen_ids[cat_key].add(article_id)
+
+                categorized[cat_key]["articles"].append(article)
+
+        # Only reachable when there are no articles at all
+        if not categorized:
+            categorized["all_articles"] = {
+                "title": "All Articles",
+                "articles": articles,
+            }
+
+        print(f"✅ Created {len(categorized)} categories")
+        for cat_data in categorized.values():
+            print(f"   - {cat_data['title']}: {len(cat_data['articles'])} articles")
+
+        return categorized
+
+    def build_skill(self) -> None:
+        """Write the complete skill directory from ``self.extracted_data``.
+
+        Creates the ``references``, ``scripts`` and ``assets`` folders under
+        ``self.skill_dir``, then writes one reference file per category,
+        the reference index and SKILL.md.
+
+        Raises:
+            RuntimeError: If no extracted data is available.
+        """
+        print(f"\n🏗️  Building skill: {self.name}")
+
+        if not self.extracted_data:
+            raise RuntimeError("No extracted data available. Call extract_feed() first.")
+
+        for subdir in ("references", "scripts", "assets"):
+            os.makedirs(f"{self.skill_dir}/{subdir}", exist_ok=True)
+
+        groups = self.categorize_content()
+
+        print("\n📝 Generating reference files...")
+        for key in groups:
+            self._generate_reference_file(key, groups[key])
+
+        self._generate_index(groups)
+        self._generate_skill_md(groups)
+
+        print(f"\n✅ Skill built successfully: {self.skill_dir}/")
+        print(f"\n📦 Next step: Package with: skill-seekers package {self.skill_dir}/")
+
+    # ──────────────────────────────────────────────────────────────────────
+    # Feed parsing internals
+    # ──────────────────────────────────────────────────────────────────────
+
+    def _parse_feed(self) -> "feedparser.FeedParserDict":
+        """Run feedparser on the configured source.
+
+        A local ``feed_path`` takes precedence over ``feed_url`` when both
+        are set.
+
+        Returns:
+            The object returned by ``feedparser.parse``.
+
+        Raises:
+            FileNotFoundError: If ``feed_path`` is set but does not exist.
+            RuntimeError: If neither source is configured, or if feedparser
+                flagged the feed as malformed (``bozo``) and produced no
+                entries.
+        """
+        import feedparser as fp
+
+        if not (self.feed_path or self.feed_url):
+            raise RuntimeError(
+                "No feed source provided. Use feed_url (remote URL) or feed_path (local file)."
+            )
+
+        if self.feed_path:
+            if not os.path.exists(self.feed_path):
+                raise FileNotFoundError(f"Feed file not found: {self.feed_path}")
+            logger.info("Parsing feed from local file: %s", self.feed_path)
+            parsed = fp.parse(self.feed_path)
+        else:
+            logger.info("Fetching feed from URL: %s", self.feed_url)
+            parsed = fp.parse(
+                self.feed_url,
+                agent="SkillSeekers/RSS-Scraper",
+            )
+
+        # A malformed feed that still yielded entries is tolerated
+        unusable = parsed.bozo and not parsed.entries
+        if unusable:
+            reason = parsed.get("bozo_exception", "Unknown parse error")
+            raise RuntimeError(f"Failed to parse feed: {reason}")
+
+        return parsed
+
+    def _detect_feed_type(self, parsed: "feedparser.FeedParserDict") -> str:
+        """Map a parsed feed to one of the FEED_TYPE_* labels.
+
+        The lower-cased ``parsed.version`` string is checked first:
+        anything containing "atom" is Atom, "rss20" is RSS 2.0, "rss10" or
+        "rdf" is RSS 1.0, and any other value starting with "rss" is
+        reported as RSS 2.0. If the version gives no answer, the feed
+        dict's ``xmlns`` and ``rss_version`` entries are consulted.
+
+        Returns:
+            A FEED_TYPE_* constant; FEED_TYPE_UNKNOWN when nothing matches.
+        """
+        version_lower = (getattr(parsed, "version", "") or "").lower()
+
+        # (substrings, label) pairs, checked in priority order
+        keyword_rules = (
+            (("atom",), FEED_TYPE_ATOM),
+            (("rss20",), FEED_TYPE_RSS_20),
+            (("rss10", "rdf"), FEED_TYPE_RSS_10),
+        )
+        for keywords, label in keyword_rules:
+            if any(word in version_lower for word in keywords):
+                return label
+        if version_lower.startswith("rss"):
+            return FEED_TYPE_RSS_20
+
+        # Fallback heuristic: look for version clues in the feed dict
+        feed = parsed.get("feed", {})
+        if feed.get("xmlns", "").startswith("http://www.w3.org/2005/Atom"):
+            return FEED_TYPE_ATOM
+        return FEED_TYPE_RSS_20 if feed.get("rss_version") else FEED_TYPE_UNKNOWN
+
+    def _extract_feed_metadata(self, parsed: "feedparser.FeedParserDict") -> dict[str, Any]:
+        """Collect feed-level fields into a plain dict.
+
+        Returns:
+            Dict with the keys title, description, link, language, author,
+            published, generator, image_url and rights. Missing values are
+            empty strings, except the title, which defaults to
+            "Untitled Feed".
+        """
+        feed = parsed.get("feed", {})
+
+        def text(key: str) -> Any:
+            """Return feed[key], or "" when the key is absent."""
+            return feed.get(key, "")
+
+        # The image may be a dict (href or url) or a bare string
+        image = feed.get("image", {})
+        if isinstance(image, dict):
+            image_url = image.get("href", "") or image.get("url", "")
+        elif isinstance(image, str):
+            image_url = image
+        else:
+            image_url = ""
+
+        return {
+            "title": feed.get("title", "Untitled Feed"),
+            # subtitle is preferred, description is the fallback
+            "description": text("subtitle") or text("description"),
+            "link": text("link"),
+            "language": text("language"),
+            "author": text("author"),
+            "published": text("published") or text("updated"),
+            "generator": text("generator"),
+            "image_url": image_url,
+            "rights": text("rights"),
+        }
+
+    def _extract_articles(self, parsed: "feedparser.FeedParserDict") -> list[dict[str, Any]]:
+        """Turn the first ``max_articles`` feed entries into plain article dicts.
+
+        Returns:
+            A list of dicts with the keys id, title, link, summary, content,
+            published, published_iso, author and categories. Summary and
+            content are passed through ``_html_to_text``.
+        """
+        articles: list[dict[str, Any]] = []
+
+        for entry in parsed.entries[: self.max_articles]:
+            # Identifier: entry id, else link, else a short hash of the title
+            entry_id = entry.get("id", "") or entry.get("link", "")
+            if not entry_id:
+                title_bytes = entry.get("title", "").encode("utf-8")
+                entry_id = hashlib.sha256(title_bytes).hexdigest()[:16]
+
+            # Raw date string plus an ISO form built from the parsed tuple
+            published = entry.get("published", "") or entry.get("updated", "")
+            time_struct = entry.get("published_parsed") or entry.get("updated_parsed")
+            published_iso = ""
+            if time_struct:
+                try:
+                    published_iso = datetime(*time_struct[:6]).isoformat()
+                except (TypeError, ValueError):
+                    # Unusable tuple: keep the raw string instead
+                    published_iso = published
+
+            categories: list[str] = [
+                tag.get("term", "") for tag in entry.get("tags", []) if tag.get("term", "")
+            ]
+
+            raw_summary = entry.get("summary", "") or entry.get("description", "")
+            summary_text = self._html_to_text(raw_summary) if raw_summary else ""
+
+            # Inline content blocks, each converted to text and joined
+            content_text = ""
+            blocks = entry.get("content", [])
+            if blocks and isinstance(blocks, list):
+                rendered = [
+                    self._html_to_text(block.get("value", "")) + "\n\n"
+                    for block in blocks
+                    if block.get("value", "")
+                ]
+                content_text = "".join(rendered).strip()
+
+            # Single author string, else the names from the authors list
+            author = entry.get("author", "")
+            if not author:
+                people = entry.get("authors", [])
+                if people:
+                    author = ", ".join(p.get("name", "") for p in people if p.get("name"))
+
+            articles.append(
+                {
+                    "id": entry_id,
+                    "title": entry.get("title", "Untitled"),
+                    "link": entry.get("link", ""),
+                    "summary": summary_text,
+                    "content": content_text,
+                    "published": published,
+                    "published_iso": published_iso,
+                    "author": author,
+                    "categories": categories,
+                }
+            )
+
+        return articles
+
+    def _scrape_article_content(self, url: str) -> str:
+        """Fetch an article page and return its cleaned text.
+
+        This is best-effort: a missing ``requests`` package, any error
+        while fetching, a non-2xx status or a non-HTML/XML response all
+        yield an empty string instead of raising.
+
+        Args:
+            url: Article URL to download.
+
+        Returns:
+            Text produced by ``_extract_article_text``, or "" on failure.
+        """
+        try:
+            import requests
+        except ImportError:
+            logger.warning(
+                "requests library not available — cannot follow article links. "
+                "Install with: pip install requests"
+            )
+            return ""
+
+        try:
+            response = requests.get(
+                url,
+                headers=_DEFAULT_HEADERS,
+                timeout=15,
+                allow_redirects=True,
+            )
+            response.raise_for_status()
+        except Exception as e:
+            # Deliberately broad: one bad link must not abort the whole run
+            logger.debug("Failed to fetch %s: %s", url, e)
+            return ""
+
+        content_type = response.headers.get("Content-Type", "")
+        content_type_lower = content_type.lower()
+        if "html" not in content_type_lower and "xml" not in content_type_lower:
+            logger.debug("Skipping non-HTML content at %s (type: %s)", url, content_type)
+            return ""
+
+        # Without an explicit charset, requests decodes text/* responses as
+        # ISO-8859-1, which garbles UTF-8 pages. Use the encoding detected
+        # from the body instead.
+        if "charset" not in content_type_lower:
+            response.encoding = response.apparent_encoding
+
+        return self._extract_article_text(response.text)
+
+    def _extract_article_text(self, html: str) -> str:
+        """Convert an article page to lightly structured markdown-like text.
+
+        Tags listed in ``_STRIP_TAGS`` and HTML comments are removed first.
+        The main container is then chosen in this order: ``<article>``,
+        ``<main>``, ``role="main"``, an id or class matching
+        content/article/post/entry, ``<body>``, and finally the whole
+        document. Headings, paragraphs, code, list items and blockquotes
+        inside it are rendered; other elements contribute no text of their
+        own.
+
+        Each rendered element already includes the text of everything nested
+        in it, so its descendants are skipped. This stops ``<pre><code>``,
+        inline ``<code>`` inside a paragraph, or ``<li><p>`` from being
+        emitted twice.
+
+        Args:
+            html: Raw page markup.
+
+        Returns:
+            Extracted text, truncated to ``_MAX_ARTICLE_TEXT_LENGTH``
+            characters plus a "[Content truncated]" marker when longer.
+        """
+        soup = BeautifulSoup(html, "html.parser")
+
+        # Remove unwanted elements
+        for tag_name in _STRIP_TAGS:
+            for element in soup.find_all(tag_name):
+                element.decompose()
+        for comment in soup.find_all(string=lambda t: isinstance(t, Comment)):
+            comment.extract()
+
+        # Try to find the main article container
+        main_content = (
+            soup.find("article")
+            or soup.find("main")
+            or soup.find(attrs={"role": "main"})
+            or soup.find(attrs={"id": re.compile(r"(content|article|post|entry)", re.I)})
+            or soup.find(attrs={"class": re.compile(r"(content|article|post|entry)", re.I)})
+        )
+
+        if not main_content:
+            main_content = soup.find("body") or soup
+
+        text_parts: list[str] = []
+        # id()s of elements whose complete text has already been emitted.
+        # The soup keeps every element alive, so the ids stay unique here.
+        emitted: set[int] = set()
+
+        for element in main_content.descendants:
+            if not isinstance(element, Tag):
+                continue
+            # Already covered by an enclosing element that was rendered
+            if any(id(parent) in emitted for parent in element.parents):
+                continue
+
+            name = element.name
+            if name in ("h1", "h2", "h3", "h4", "h5", "h6"):
+                level = int(name[1])
+                heading_text = element.get_text(strip=True)
+                if heading_text:
+                    text_parts.append(f"\n{'#' * level} {heading_text}\n")
+                    emitted.add(id(element))
+            elif name == "p":
+                para_text = element.get_text(separator=" ", strip=True)
+                if para_text:
+                    text_parts.append(f"\n{para_text}\n")
+                    emitted.add(id(element))
+            elif name in ("pre", "code"):
+                code_text = element.get_text()
+                if code_text and code_text.strip():
+                    # The language class may sit on the element itself or,
+                    # for <pre>, on a <code> nested inside it.
+                    candidates = [element]
+                    if name == "pre":
+                        nested_code = element.find("code")
+                        if nested_code is not None:
+                            candidates.append(nested_code)
+                    lang = ""
+                    for candidate in candidates:
+                        for cls in candidate.get("class", []) or []:
+                            if isinstance(cls, str) and cls.startswith(("language-", "lang-")):
+                                lang = cls.split("-", 1)[1]
+                                break
+                        if lang:
+                            break
+                    text_parts.append(f"\n```{lang}\n{code_text.strip()}\n```\n")
+                    emitted.add(id(element))
+            elif name == "li":
+                li_text = element.get_text(separator=" ", strip=True)
+                if li_text:
+                    text_parts.append(f"- {li_text}")
+                    emitted.add(id(element))
+            elif name == "blockquote":
+                bq_text = element.get_text(separator=" ", strip=True)
+                if bq_text:
+                    text_parts.append(f"\n> {bq_text}\n")
+                    emitted.add(id(element))
+
+        text = "\n".join(text_parts).strip()
+
+        # Collapse excessive whitespace
+        text = re.sub(r"\n{4,}", "\n\n\n", text)
+
+        # Truncate if too long
+        if len(text) > _MAX_ARTICLE_TEXT_LENGTH:
+            text = text[:_MAX_ARTICLE_TEXT_LENGTH] + "\n\n[Content truncated]"
+
+        return text
+
+    # ──────────────────────────────────────────────────────────────────────
+    # Categorization helpers
+    # ──────────────────────────────────────────────────────────────────────
+
+    def _collect_all_categories(self, articles: list[dict[str, Any]]) -> set[str]:
+        """Return the set of non-empty category/tag values found across ``articles``."""
+        return {
+            tag
+            for article in articles
+            for tag in article.get("categories", [])
+            if tag
+        }
+
+    def _html_to_text(self, html_fragment: str) -> str:
+        """Strip all markup from an HTML fragment and return single-spaced text.
+
+        An empty (falsy) fragment returns "" without being parsed.
+        """
+        if html_fragment:
+            parsed_fragment = BeautifulSoup(html_fragment, "html.parser")
+            plain = parsed_fragment.get_text(separator=" ", strip=True)
+            # Every run of whitespace becomes one space
+            return re.sub(r"\s+", " ", plain).strip()
+        return ""
+
+    # ──────────────────────────────────────────────────────────────────────
+    # Skill generation — reference files
+    # ──────────────────────────────────────────────────────────────────────
+
+    def _generate_reference_file(self, cat_key: str, cat_data: dict[str, Any]) -> None:
+        """Write ``references/<sanitized title>.md`` for one category.
+
+        The file starts with the category title and article count. Each
+        article then gets a heading, its non-empty metadata lines (author,
+        published, link, tags), and up to three sections: Summary, Content
+        (skipped when identical to the summary) and Full Article.
+
+        Args:
+            cat_key: Category key (not used here; the file name is derived
+                from the category title).
+            cat_data: Dict with "title" and "articles".
+        """
+        safe_name = self._sanitize_filename(cat_data["title"])
+        filepath = f"{self.skill_dir}/references/{safe_name}.md"
+        articles = cat_data["articles"]
+
+        # (label, article key) pairs for the one-line metadata entries
+        simple_fields = (("Author", "author"), ("Published", "published"), ("Link", "link"))
+
+        with open(filepath, "w", encoding="utf-8") as out:
+            out.write(f"# {cat_data['title']}\n\n")
+            out.write(f"**Articles:** {len(articles)}\n\n")
+            out.write("---\n\n")
+
+            for article in articles:
+                out.write(f"## {article.get('title', 'Untitled')}\n\n")
+
+                for label, key in simple_fields:
+                    if article.get(key):
+                        out.write(f"**{label}:** {article[key]}\n\n")
+                if article.get("categories"):
+                    joined_tags = ", ".join(article["categories"])
+                    out.write(f"**Tags:** {joined_tags}\n\n")
+
+                summary = article.get("summary", "")
+                inline_content = article.get("content", "")
+                full_text = article.get("full_text", "")
+                # (heading, body, include?) in output order
+                sections = (
+                    ("Summary", summary, bool(summary)),
+                    ("Content", inline_content, bool(inline_content) and inline_content != summary),
+                    ("Full Article", full_text, bool(full_text)),
+                )
+                for heading, body, wanted in sections:
+                    if wanted:
+                        out.write(f"### {heading}\n\n")
+                        out.write(f"{body}\n\n")
+
+                out.write("---\n\n")
+
+        print(f"   Generated: {filepath}")
+
+    def _generate_index(self, categorized: dict[str, dict[str, Any]]) -> None:
+        """Write ``references/index.md``: category links, statistics and top authors.
+
+        Args:
+            categorized: Mapping of category key to a dict with "title" and
+                "articles", as returned by categorize_content().
+        """
+        filepath = f"{self.skill_dir}/references/index.md"
+        data = self.extracted_data
+
+        with open(filepath, "w", encoding="utf-8") as out:
+            out.write(f"# {self.name.title()} Feed Reference Index\n\n")
+
+            feed_meta = data.get("feed_metadata", {})
+            if feed_meta.get("title"):
+                out.write(f"**Feed:** {feed_meta['title']}\n\n")
+            if feed_meta.get("link"):
+                out.write(f"**Source:** {feed_meta['link']}\n\n")
+
+            out.write("## Categories\n\n")
+
+            # One link per category, in key order, with a running total
+            listed_total = 0
+            for key in sorted(categorized):
+                group = categorized[key]
+                target = self._sanitize_filename(group["title"])
+                size = len(group["articles"])
+                listed_total += size
+                out.write(f"- [{group['title']}]({target}.md) ({size} articles)\n")
+
+            out.write(f"\n**Total articles:** {listed_total}\n\n")
+
+            out.write("## Statistics\n\n")
+            out.write(f"- Total articles: {data.get('total_articles', 0)}\n")
+            out.write(f"- Feed type: {data.get('feed_type', FEED_TYPE_UNKNOWN)}\n")
+            links_followed = "Yes" if data.get("followed_links") else "No"
+            out.write(f"- Links followed: {links_followed}\n")
+
+            unique_tags = data.get("all_categories", [])
+            if unique_tags:
+                out.write(f"- Unique tags: {len(unique_tags)}\n")
+
+            # Up to 20 authors, highest article count first
+            author_counts = self._count_authors()
+            if author_counts:
+                out.write(f"\n## Authors ({len(author_counts)})\n\n")
+                ranked = sorted(author_counts.items(), key=lambda pair: pair[1], reverse=True)
+                for author, count in ranked[:20]:
+                    out.write(f"- {author}: {count} articles\n")
+
+        print(f"   Generated: {filepath}")
+
+    def _generate_skill_md(self, categorized: dict[str, dict[str, Any]]) -> None:
+        """Generate the main SKILL.md file with feed overview and navigation."""
+        filepath = f"{self.skill_dir}/SKILL.md"
+
+        feed_meta = self.extracted_data.get("feed_metadata", {})
+        feed_title = feed_meta.get("title", self.name.title())
+        feed_type = self.extracted_data.get("feed_type", FEED_TYPE_UNKNOWN)
+
+        # Skill name for frontmatter (lowercase, hyphens, max 64 chars)
+        skill_name = self.name.lower().replace("_", "-").replace(" ", "-")[:64]
+
+        # Truncate description
+        desc = self.description[:1024] if len(self.description) > 1024 else self.description
+
+        with open(filepath, "w", encoding="utf-8") as f:
+            # YAML frontmatter
+            f.write("---\n")
+            f.write(f"name: {skill_name}\n")
+            f.write(f"description: {desc}\n")
+            f.write("---\n\n")
+
+            # Header
+            f.write(f"# {feed_title} Feed Skill\n\n")
+            f.write(f"{self.description}\n\n")
+
+            # Feed Information
+            f.write("## 📡 Feed Information\n\n")
+            f.write(f"**Feed Title:** {feed_title}\n\n")
+            f.write(f"**Feed Type:** {feed_type}\n\n")
+            if feed_meta.get("link"):
+                f.write(f"**Website:** {feed_meta['link']}\n\n")
+            if feed_meta.get("language"):
+                f.write(f"**Language:** {feed_meta['language']}\n\n")
+            if feed_meta.get("description"):
+                feed_desc = feed_meta["description"]
+                if len(feed_desc) > 300:
+                    feed_desc = feed_desc[:297] + "..."
+                f.write(f"**Description:** {feed_desc}\n\n")
+            if feed_meta.get("generator"):
+                f.write(f"**Generator:** {feed_meta['generator']}\n\n")
+            if feed_meta.get("rights"):
+                f.write(f"**Rights:** {feed_meta['rights']}\n\n")
+
+            # When to Use
+            f.write("## 💡 When to Use This Skill\n\n")
+            f.write("Use this skill when you need to:\n")
+            f.write(f"- Reference articles and content from {feed_title}\n")
+            f.write("- Look up specific topics covered in the feed\n")
+            f.write("- Find author perspectives and expert analysis\n")
+            f.write("- Review recent posts and updates on the subject\n")
+            f.write("- Explore categorized content by tags or topics\n\n")
+
+            # Article Overview
+            total_articles = self.extracted_data.get("total_articles", 0)
+            f.write("## 📖 Article Overview\n\n")
+            f.write(f"**Total Articles:** {total_articles}\n\n")
+
+            # Category breakdown
+            f.write("**Content by Category:**\n\n")
+            for cat_key, cat_data in sorted(categorized.items()):
+                count = len(cat_data["articles"])
+                f.write(f"- **{cat_data['title']}**: {count} articles\n")
+            f.write("\n")
+
+            # Recent articles (top 10 by date or order)
+            articles = self.extracted_data.get("articles", [])
+            recent = articles[:10]
+            if recent:
+                f.write("## 📰 Recent Articles\n\n")
+                for article in recent:
+                    title = article.get("title", "Untitled")
+                    published = article.get("published", "")
+                    author = article.get("author", "")
+                    link = article.get("link", "")
+
+                    f.write(f"### {title}\n\n")
+                    meta_parts: list[str] = []
+                    if published:
+                        meta_parts.append(f"**Published:** {published}")
+                    if author:
+                        meta_parts.append(f"**Author:** {author}")
+                    if meta_parts:
+                        f.write(" | ".join(meta_parts) + "\n\n")
+
+                    summary = article.get("summary", "")
+                    if summary:
+                        # Show first 200 chars of summary
+                        short = summary[:200] + "..." if len(summary) > 200 else summary
+                        f.write(f"{short}\n\n")
+
+                    if link:
+                        f.write(f"[Read more]({link})\n\n")
+
+            # Authors
+            author_counts = self._count_authors()
+            if author_counts:
+                f.write(f"## ✍️ Authors ({len(author_counts)})\n\n")
+                for author, count in sorted(
+                    author_counts.items(), key=lambda x: x[1], reverse=True
+                )[:15]:
+                    f.write(f"- **{author}**: {count} articles\n")
+                f.write("\n")
+
+            # All categories/tags
+            all_cats = self.extracted_data.get("all_categories", [])
+            if all_cats:
+                f.write(f"## 🏷️ Tags ({len(all_cats)})\n\n")
+                f.write(", ".join(f"`{cat}`" for cat in all_cats[:50]))
+                if len(all_cats) > 50:
+                    f.write(f" ... and {len(all_cats) - 50} more")
+                f.write("\n\n")
+
+            # Statistics
+            f.write("## 📊 Feed Statistics\n\n")
+            f.write(f"- **Total Articles**: {total_articles}\n")
+            f.write(f"- **Feed Type**: {feed_type}\n")
+            f.write(f"- **Categories/Tags**: {len(all_cats)}\n")
+            f.write(f"- **Authors**: {len(author_counts)}\n")
+            followed = self.extracted_data.get("followed_links", False)
+            f.write(f"- **Full Content Scraped**: {'Yes' if followed else 'No'}\n\n")
+
+            # Date range
+            date_range = self._get_date_range()
+            if date_range:
+                f.write(f"- **Date Range**: {date_range[0]} to {date_range[1]}\n\n")
+
+            # Navigation
+            f.write("## 🗺️ Navigation\n\n")
+            f.write("**Reference Files:**\n\n")
+            for cat_key, cat_data in sorted(categorized.items()):
+                safe_name = self._sanitize_filename(cat_data["title"])
+                f.write(
+                    f"- `references/{safe_name}.md` - {cat_data['title']}"
+                    f" ({len(cat_data['articles'])} articles)\n"
+                )
+            f.write("\n")
+            f.write("See `references/index.md` for complete feed structure.\n\n")
+
+            # Footer
+            f.write("---\n\n")
+            f.write("**Generated by Skill Seeker** | RSS/Atom Feed Scraper\n")
+
+        with open(filepath, encoding="utf-8") as f:
+            line_count = len(f.read().split("\n"))
+        print(f"   Generated: {filepath} ({line_count} lines)")
+
+    # ──────────────────────────────────────────────────────────────────────
+    # Utility helpers
+    # ──────────────────────────────────────────────────────────────────────
+
+    def _count_authors(self) -> dict[str, int]:
+        """Count articles per author, ignoring missing, null, or blank author values."""
+        if not self.extracted_data:
+            return {}
+        counts: dict[str, int] = {}
+        for article in self.extracted_data.get("articles", []):
+            author = (article.get("author") or "").strip()  # "author": null must not crash
+            if author:
+                counts[author] = counts.get(author, 0) + 1
+        return counts
+
+    def _get_date_range(self) -> tuple[str, str] | None:
+        """Return (earliest, latest) ``published_iso`` values cut to 10 chars, or None."""
+        if not self.extracted_data:
+            return None
+        # Keep only truthy timestamps; ordering below is plain string comparison
+        stamps = sorted(
+            entry.get("published_iso", "")
+            for entry in self.extracted_data.get("articles", [])
+            if entry.get("published_iso", "")
+        )
+        if not stamps:
+            return None
+        return (stamps[0][:10], stamps[-1][:10])
+
+    def _sanitize_filename(self, name: str) -> str:
+        """Lowercase ``name``, drop non-word punctuation, and join the parts with underscores."""
+        stripped = re.sub(r"[^\w\s-]", "", name.lower())
+        collapsed = re.sub(r"[-\s]+", "_", stripped)
+        return collapsed if collapsed else "unnamed"
+
+
+# ──────────────────────────────────────────────────────────────────────────
+# CLI entry point
+# ──────────────────────────────────────────────────────────────────────────
+
+
+def main() -> int:
+    """Run the RSS/Atom feed scraper CLI; returns 0 on success, failures call sys.exit()."""
+    from .arguments.common import add_all_standard_arguments
+
+    parser = argparse.ArgumentParser(
+        description="Convert RSS/Atom feed to AI-ready skill",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog=(
+            "Examples:\n"
+            "  %(prog)s --feed-url https://example.com/feed.xml --name myblog\n"
+            "  %(prog)s --feed-path ./feed.xml --name myblog\n"
+            "  %(prog)s --feed-url https://example.com/rss --no-follow-links --name myblog\n"
+            "  %(prog)s --from-json myblog_extracted.json\n"
+        ),
+    )
+
+    # Standard arguments (name, description, output, enhance-level, etc.)
+    add_all_standard_arguments(parser)
+
+    # Override the shared enhance-level default to 0 for RSS and reword its help to match
+    for action in parser._actions:
+        if hasattr(action, "dest") and action.dest == "enhance_level":
+            action.default = 0
+            action.help = (
+                "AI enhancement level (auto-detects API vs LOCAL mode): "
+                "0=disabled (default for RSS), 1=SKILL.md only, "
+                "2=+architecture/config, 3=full enhancement. "
+                "Mode selection: uses API if ANTHROPIC_API_KEY is set, "
+                "otherwise LOCAL (Claude Code)"
+            )
+
+    # RSS-specific arguments
+    parser.add_argument(
+        "--feed-url",
+        type=str,
+        help="URL of the RSS/Atom feed to scrape",
+        metavar="URL",
+    )
+    parser.add_argument(
+        "--feed-path",
+        type=str,
+        help="Local file path to an RSS/Atom XML file",
+        metavar="PATH",
+    )
+    parser.add_argument(
+        "--follow-links",
+        action="store_true",
+        default=True,
+        dest="follow_links",
+        help="Follow article links to scrape full content (default: enabled)",
+    )
+    parser.add_argument(
+        "--no-follow-links",
+        action="store_false",
+        dest="follow_links",
+        help="Do not follow article links — use feed content only",
+    )
+    parser.add_argument(
+        "--max-articles",
+        type=int,
+        default=50,
+        metavar="N",
+        help="Maximum number of articles to process (default: 50)",
+    )
+    parser.add_argument(
+        "--from-json",
+        type=str,
+        help="Build skill from previously extracted JSON file",
+        metavar="FILE",
+    )
+
+    args = parser.parse_args()
+
+    # Set root logging level (--quiet takes precedence over --verbose)
+    if getattr(args, "quiet", False):
+        logging.getLogger().setLevel(logging.WARNING)
+    elif getattr(args, "verbose", False):
+        logging.getLogger().setLevel(logging.DEBUG)
+
+    # Handle --dry-run: print the resolved settings and stop before validation or scraping
+    if getattr(args, "dry_run", False):
+        source = (
+            getattr(args, "feed_url", None)
+            or getattr(args, "feed_path", None)
+            or getattr(args, "from_json", None)
+            or "(none)"
+        )
+        print(f"\n{'=' * 60}")
+        print("DRY RUN: RSS/Atom Feed Extraction")
+        print(f"{'=' * 60}")
+        print(f"Source:          {source}")
+        print(f"Name:            {getattr(args, 'name', None) or '(auto-detect)'}")
+        print(f"Follow links:    {getattr(args, 'follow_links', True)}")
+        print(f"Max articles:    {getattr(args, 'max_articles', 50)}")
+        print(f"Enhance level:   {getattr(args, 'enhance_level', 0)}")
+        print("\n✅ Dry run complete")
+        return 0
+
+    # Validate inputs
+    has_source = (
+        getattr(args, "feed_url", None)
+        or getattr(args, "feed_path", None)
+        or getattr(args, "from_json", None)
+    )
+    if not has_source:
+        parser.error("Must specify --feed-url, --feed-path, or --from-json")
+
+    # Build from JSON workflow (returns before feed extraction and enhancement)
+    if getattr(args, "from_json", None):
+        name = Path(args.from_json).stem.removesuffix("_extracted")  # strip the suffix only
+        config: dict[str, Any] = {
+            "name": getattr(args, "name", None) or name,
+            "description": getattr(args, "description", None)
+            or f"Use when referencing {name} feed content",
+        }
+        try:
+            converter = RssToSkillConverter(config)
+            converter.load_extracted_data(args.from_json)
+            converter.build_skill()
+        except Exception as e:
+            print(f"\n❌ Error: {e}", file=sys.stderr)
+            sys.exit(1)
+        return 0
+
+    # Feed extraction workflow
+    if not getattr(args, "name", None):
+        # Auto-detect name from the URL hostname (dots -> dashes) or the file path's stem
+        if getattr(args, "feed_url", None):
+            from urllib.parse import urlparse
+
+            parsed_url = urlparse(args.feed_url)
+            args.name = parsed_url.hostname.replace(".", "-") if parsed_url.hostname else "feed"
+        elif getattr(args, "feed_path", None):
+            args.name = Path(args.feed_path).stem
+
+    config = {
+        "name": args.name,
+        "feed_url": getattr(args, "feed_url", "") or "",
+        "feed_path": getattr(args, "feed_path", "") or "",
+        "follow_links": getattr(args, "follow_links", True),
+        "max_articles": getattr(args, "max_articles", 50),
+        "description": getattr(args, "description", None),
+    }
+
+    try:
+        converter = RssToSkillConverter(config)
+
+        # Extract feed
+        if not converter.extract_feed():
+            print("\n❌ Feed extraction failed — see error above", file=sys.stderr)
+            sys.exit(1)
+
+        # Build skill
+        converter.build_skill()
+
+        # Enhancement Workflow Integration
+        from skill_seekers.cli.workflow_runner import run_workflows
+
+        workflow_executed, workflow_names = run_workflows(args)
+        workflow_name = ", ".join(workflow_names) if workflow_names else None
+
+        # Traditional enhancement (complements workflow system)
+        if getattr(args, "enhance_level", 0) > 0:
+            api_key = getattr(args, "api_key", None) or os.environ.get("ANTHROPIC_API_KEY")
+            mode = "API" if api_key else "LOCAL"
+
+            print("\n" + "=" * 80)
+            print(f"🤖 Traditional AI Enhancement ({mode} mode, level {args.enhance_level})")
+            print("=" * 80)
+            if workflow_executed:
+                print(f"   Running after workflow: {workflow_name}")
+                print(
+                    "   (Workflow provides specialized analysis, "
+                    "enhancement provides general improvements)"
+                )
+            print("")
+
+            skill_dir = converter.skill_dir
+            if api_key:
+                try:
+                    from skill_seekers.cli.enhance_skill import enhance_skill_md
+
+                    enhance_skill_md(skill_dir, api_key)
+                    print("✅ API enhancement complete!")
+                except ImportError:
+                    print("❌ API enhancement not available. Falling back to LOCAL mode...")
+                    from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer
+
+                    enhancer = LocalSkillEnhancer(Path(skill_dir))
+                    enhancer.run(headless=True)
+                    print("✅ Local enhancement complete!")
+            else:
+                from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer
+
+                enhancer = LocalSkillEnhancer(Path(skill_dir))
+                enhancer.run(headless=True)
+                print("✅ Local enhancement complete!")
+
+    except RuntimeError as e:
+        print(f"\n❌ Error: {e}", file=sys.stderr)
+        sys.exit(1)
+    except Exception as e:
+        print(f"\n❌ Unexpected error during feed processing: {e}", file=sys.stderr)
+        import traceback
+
+        traceback.print_exc()
+        sys.exit(1)
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/src/skill_seekers/cli/source_detector.py b/src/skill_seekers/cli/source_detector.py
index 572d753..1858fcc 100644
--- a/src/skill_seekers/cli/source_detector.py
+++ b/src/skill_seekers/cli/source_detector.py
@@ -1,7 +1,12 @@
 """Source type detection for unified create command.
 
-Auto-detects whether a source is a web URL, GitHub repository,
-local directory, PDF file, or config file based on patterns.
+Auto-detects source type from user input — supports web URLs, GitHub repos,
+local directories, and 14+ file types (PDF, DOCX, EPUB, IPYNB, HTML, YAML/OpenAPI,
+AsciiDoc, PPTX, RSS/Atom, man pages, video files, and config JSON).
+
+Note: Confluence, Notion, and Slack/Discord chat sources are API/export-based
+and cannot be auto-detected from a single argument. Use their dedicated
+subcommands (``skill-seekers confluence``, ``notion``, ``chat``) instead.
 """
 
 import os
@@ -66,11 +71,49 @@ class SourceDetector:
         if source.endswith(".epub"):
             return cls._detect_epub(source)
 
+        if source.lower().endswith(".ipynb"):
+            return cls._detect_jupyter(source)
+
+        if source.lower().endswith((".html", ".htm")) and "://" not in source:
+            return cls._detect_html(source)
+
+        if source.lower().endswith(".pptx"):
+            return cls._detect_pptx(source)
+
+        if source.lower().endswith((".adoc", ".asciidoc")):
+            return cls._detect_asciidoc(source)
+
+        # Man page file extensions (.1 through .8, .man)
+        # Only match if the basename looks like a man page (e.g., "git.1", not "access.log.1")
+        # URLs and existing directories (e.g. "https://host/v2.1", "./release-1.2") are excluded
+        if source.lower().endswith(".man"):
+            return cls._detect_manpage(source)
+        MAN_SECTION_EXTENSIONS = (".1", ".2", ".3", ".4", ".5", ".6", ".7", ".8")
+        if source.lower().endswith(MAN_SECTION_EXTENSIONS) and "://" not in source:
+            # Heuristic: man pages have a simple basename (no dots before extension)
+            # e.g., "git.1" is a man page, "access.log.1" is not
+            basename_no_ext = os.path.splitext(os.path.basename(source))[0]
+            if "." not in basename_no_ext and not os.path.isdir(source):
+                return cls._detect_manpage(source)
+
         # Video file extensions
         VIDEO_EXTENSIONS = (".mp4", ".mkv", ".avi", ".mov", ".webm", ".flv", ".wmv")
         if source.lower().endswith(VIDEO_EXTENSIONS):
             return cls._detect_video_file(source)
 
+        # RSS/Atom feed file extensions (only .rss and .atom — .xml is too generic)
+        if source.lower().endswith((".rss", ".atom")):
+            return cls._detect_rss(source)
+
+        # OpenAPI/Swagger spec detection (YAML files with OpenAPI content)
+        # Sniff file content for 'openapi:' or 'swagger:' keys before committing
+        if (
+            source.lower().endswith((".yaml", ".yml"))
+            and os.path.isfile(source)
+            and cls._looks_like_openapi(source)
+        ):
+            return cls._detect_openapi(source)
+
         # 2. Video URL detection (before directory check)
         video_url_info = cls._detect_video_url(source)
         if video_url_info:
@@ -97,15 +140,22 @@ class SourceDetector:
         raise ValueError(
             f"Cannot determine source type for: {source}\n\n"
             "Examples:\n"
-            "  Web:    skill-seekers create https://docs.react.dev/\n"
-            "  GitHub: skill-seekers create facebook/react\n"
-            "  Local:  skill-seekers create ./my-project\n"
-            "  PDF:    skill-seekers create tutorial.pdf\n"
-            "  DOCX:   skill-seekers create document.docx\n"
-            "  EPUB:   skill-seekers create ebook.epub\n"
-            "  Video:  skill-seekers create https://youtube.com/watch?v=...\n"
-            "  Video:  skill-seekers create recording.mp4\n"
-            "  Config: skill-seekers create configs/react.json"
+            "  Web:        skill-seekers create https://docs.react.dev/\n"
+            "  GitHub:     skill-seekers create facebook/react\n"
+            "  Local:      skill-seekers create ./my-project\n"
+            "  PDF:        skill-seekers create tutorial.pdf\n"
+            "  DOCX:       skill-seekers create document.docx\n"
+            "  EPUB:       skill-seekers create ebook.epub\n"
+            "  Jupyter:    skill-seekers create notebook.ipynb\n"
+            "  HTML:       skill-seekers create page.html\n"
+            "  OpenAPI:    skill-seekers create openapi.yaml\n"
+            "  AsciiDoc:   skill-seekers create document.adoc\n"
+            "  PowerPoint: skill-seekers create presentation.pptx\n"
+            "  RSS:        skill-seekers create feed.rss\n"
+            "  Man page:   skill-seekers create command.1\n"
+            "  Video:      skill-seekers create https://youtube.com/watch?v=...\n"
+            "  Video:      skill-seekers create recording.mp4\n"
+            "  Config:     skill-seekers create configs/react.json"
         )
 
     @classmethod
@@ -140,6 +190,90 @@ class SourceDetector:
             type="epub", parsed={"file_path": source}, suggested_name=name, raw_input=source
         )
 
+    @classmethod
+    def _detect_jupyter(cls, source: str) -> SourceInfo:
+        """Describe ``source`` as a ``jupyter`` input named after its basename sans extension."""
+        stem, _ = os.path.splitext(os.path.basename(source))
+        return SourceInfo(
+            type="jupyter", suggested_name=stem, raw_input=source, parsed={"file_path": source}
+        )
+
+    @classmethod
+    def _detect_html(cls, source: str) -> SourceInfo:
+        """Describe ``source`` as an ``html`` input named after its basename sans extension."""
+        stem, _ = os.path.splitext(os.path.basename(source))
+        return SourceInfo(
+            type="html", suggested_name=stem, raw_input=source, parsed={"file_path": source}
+        )
+
+    @classmethod
+    def _detect_pptx(cls, source: str) -> SourceInfo:
+        """Describe ``source`` as a ``pptx`` input named after its basename sans extension."""
+        stem, _ = os.path.splitext(os.path.basename(source))
+        return SourceInfo(
+            type="pptx", suggested_name=stem, raw_input=source, parsed={"file_path": source}
+        )
+
+    @classmethod
+    def _detect_asciidoc(cls, source: str) -> SourceInfo:
+        """Describe ``source`` as an ``asciidoc`` input named after its basename sans extension."""
+        stem, _ = os.path.splitext(os.path.basename(source))
+        return SourceInfo(
+            type="asciidoc", suggested_name=stem, raw_input=source, parsed={"file_path": source}
+        )
+
+    @classmethod
+    def _detect_manpage(cls, source: str) -> SourceInfo:
+        """Describe ``source`` as a ``manpage`` input named after its basename sans extension."""
+        stem, _ = os.path.splitext(os.path.basename(source))
+        return SourceInfo(
+            type="manpage", suggested_name=stem, raw_input=source, parsed={"file_path": source}
+        )
+
+    @classmethod
+    def _detect_rss(cls, source: str) -> SourceInfo:
+        """Describe ``source`` as an ``rss`` input named after its basename sans extension."""
+        stem, _ = os.path.splitext(os.path.basename(source))
+        return SourceInfo(
+            type="rss", suggested_name=stem, raw_input=source, parsed={"file_path": source}
+        )
+
+    @classmethod
+    def _looks_like_openapi(cls, source: str) -> bool:
+        """Sniff the first lines of a file for an OpenAPI/Swagger version key.
+
+        At most 20 lines are read. Each line is stripped and lowercased, then tested
+        for a YAML-style (``openapi:``) or quoted (``"openapi"``) key prefix.
+
+        Args:
+            source: Path to the file
+
+        Returns:
+            True on the first matching line; False if none match or an OSError occurs
+        """
+        # Unquoted prefixes cover YAML keys, quoted prefixes cover JSON-style keys
+        yaml_keys = ("openapi:", "swagger:")
+        json_keys = ('"openapi"', '"swagger"')
+        try:
+            with open(source, encoding="utf-8", errors="replace") as handle:
+                # Pairing with range(20) ends the scan without reading a 21st line
+                for _, raw_line in zip(range(20), handle):
+                    candidate = raw_line.strip().lower()
+                    if candidate.startswith(yaml_keys) or candidate.startswith(json_keys):
+                        return True
+        except OSError:
+            # Unreadable files are treated as "not OpenAPI" rather than an error
+            pass
+        return False
+
+    @classmethod
+    def _detect_openapi(cls, source: str) -> SourceInfo:
+        """Describe ``source`` as an ``openapi`` input named after its basename sans extension."""
+        stem, _ = os.path.splitext(os.path.basename(source))
+        return SourceInfo(
+            type="openapi", suggested_name=stem, raw_input=source, parsed={"file_path": source}
+        )
+
     @classmethod
     def _detect_video_file(cls, source: str) -> SourceInfo:
         """Detect local video file source."""
@@ -312,5 +446,19 @@ class SourceDetector:
             if not os.path.isfile(config_path):
                 raise ValueError(f"Path is not a file: {config_path}")
 
-        # For web and github, validation happens during scraping
-        # (URL accessibility, repo existence)
+        elif source_info.type in ("jupyter", "html", "pptx", "asciidoc", "manpage", "openapi"):
+            file_path = source_info.parsed.get("file_path", "")
+            if file_path:
+                type_label = source_info.type.upper()
+                if not os.path.exists(file_path):
+                    raise ValueError(f"{type_label} file does not exist: {file_path}")
+                if not os.path.isfile(file_path) and not os.path.isdir(file_path):
+                    raise ValueError(f"Path is not a file or directory: {file_path}")
+
+        elif source_info.type == "rss":
+            file_path = source_info.parsed.get("file_path", "")
+            if file_path and not os.path.exists(file_path):
+                raise ValueError(f"RSS/Atom file does not exist: {file_path}")
+
+        # For web, github, confluence, notion, chat, rss (URL), validation happens
+        # during scraping (URL accessibility, API auth, etc.)
diff --git a/src/skill_seekers/cli/unified_scraper.py b/src/skill_seekers/cli/unified_scraper.py
index 81f7ed3..c413176 100644
--- a/src/skill_seekers/cli/unified_scraper.py
+++ b/src/skill_seekers/cli/unified_scraper.py
@@ -76,6 +76,17 @@ class UnifiedScraper:
             "word": [],  # List of word sources
             "video": [],  # List of video sources
             "local": [],  # List of local sources (docs or code)
+            "epub": [],  # List of epub sources
+            "jupyter": [],  # List of Jupyter notebook sources
+            "html": [],  # List of local HTML sources
+            "openapi": [],  # List of OpenAPI/Swagger spec sources
+            "asciidoc": [],  # List of AsciiDoc sources
+            "pptx": [],  # List of PowerPoint sources
+            "confluence": [],  # List of Confluence wiki sources
+            "notion": [],  # List of Notion page sources
+            "rss": [],  # List of RSS/Atom feed sources
+            "manpage": [],  # List of man page sources
+            "chat": [],  # List of Slack/Discord chat sources
         }
 
         # Track source index for unique naming (multi-source support)
@@ -86,6 +97,17 @@ class UnifiedScraper:
             "word": 0,
             "video": 0,
             "local": 0,
+            "epub": 0,
+            "jupyter": 0,
+            "html": 0,
+            "openapi": 0,
+            "asciidoc": 0,
+            "pptx": 0,
+            "confluence": 0,
+            "notion": 0,
+            "rss": 0,
+            "manpage": 0,
+            "chat": 0,
         }
 
         # Output paths - cleaner organization
@@ -166,6 +188,28 @@ class UnifiedScraper:
                     self._scrape_video(source)
                 elif source_type == "local":
                     self._scrape_local(source)
+                elif source_type == "epub":
+                    self._scrape_epub(source)
+                elif source_type == "jupyter":
+                    self._scrape_jupyter(source)
+                elif source_type == "html":
+                    self._scrape_html(source)
+                elif source_type == "openapi":
+                    self._scrape_openapi(source)
+                elif source_type == "asciidoc":
+                    self._scrape_asciidoc(source)
+                elif source_type == "pptx":
+                    self._scrape_pptx(source)
+                elif source_type == "confluence":
+                    self._scrape_confluence(source)
+                elif source_type == "notion":
+                    self._scrape_notion(source)
+                elif source_type == "rss":
+                    self._scrape_rss(source)
+                elif source_type == "manpage":
+                    self._scrape_manpage(source)
+                elif source_type == "chat":
+                    self._scrape_chat(source)
                 else:
                     logger.warning(f"Unknown source type: {source_type}")
             except Exception as e:
@@ -571,6 +615,7 @@ class UnifiedScraper:
             {
                 "docx_path": docx_path,
                 "docx_id": docx_id,
+                "word_id": docx_id,  # Alias for generic reference generation
                 "idx": idx,
                 "data": word_data,
                 "data_file": cache_word_data,
@@ -788,6 +833,595 @@ class UnifiedScraper:
             logger.debug(f"Traceback: {traceback.format_exc()}")
             raise
 
+    # ------------------------------------------------------------------
+    # New source type handlers (v3.2.0+)
+    # ------------------------------------------------------------------
+
+    def _scrape_epub(self, source: dict[str, Any]):
+        """Extract one EPUB source, cache its extracted JSON, and register it in scraped_data."""
+        try:
+            from skill_seekers.cli.epub_scraper import EpubToSkillConverter
+        except ImportError:
+            logger.error(
+                "EPUB scraper dependencies not installed.\n"
+                "  Install with: pip install skill-seekers[epub]"
+            )
+            return
+
+        # Reserve this source's slot in the per-type counter (keeps generated names unique)
+        idx = self._source_counters["epub"]
+        self._source_counters["epub"] = idx + 1
+        book_path = source["path"]
+        book_id = os.path.splitext(os.path.basename(book_path))[0]
+
+        converter_settings = {
+            "name": f"{self.name}_epub_{idx}_{book_id}",
+            "epub_path": source["path"],
+            "description": source.get("description", f"{book_id} e-book"),
+        }
+
+        logger.info(f"Scraping EPUB: {source['path']}")
+        converter = EpubToSkillConverter(converter_settings)
+        converter.extract_epub()
+
+        # Load the converter's output, then keep a copy in the unified data directory
+        extracted_file = converter.data_file
+        with open(extracted_file, encoding="utf-8") as fh:
+            book_data = json.load(fh)
+        cached_file = os.path.join(self.data_dir, f"epub_data_{idx}_{book_id}.json")
+        shutil.copy(extracted_file, cached_file)
+
+        self.scraped_data["epub"].append(
+            {
+                "epub_path": book_path,
+                "epub_id": book_id,
+                "idx": idx,
+                "data": book_data,
+                "data_file": cached_file,
+            }
+        )
+
+        # A failed standalone build is only a warning; the extracted data stays registered
+        try:
+            converter.build_skill()
+            logger.info("✅ EPUB: Standalone SKILL.md created")
+        except Exception as build_err:
+            logger.warning(f"⚠️  Failed to build standalone EPUB SKILL.md: {build_err}")
+        logger.info(f"✅ EPUB: {len(book_data.get('chapters', []))} chapters extracted")
+
+    def _scrape_jupyter(self, source: dict[str, Any]):
+        """Extract one .ipynb source, cache its extracted JSON, and register it in scraped_data."""
+        try:
+            from skill_seekers.cli.jupyter_scraper import JupyterToSkillConverter
+        except ImportError:
+            logger.error(
+                "Jupyter scraper dependencies not installed.\n"
+                "  Install with: pip install skill-seekers[jupyter]"
+            )
+            return
+
+        # Reserve this source's slot in the per-type counter (keeps generated names unique)
+        idx = self._source_counters["jupyter"]
+        self._source_counters["jupyter"] = idx + 1
+        notebook_path = source["path"]
+        notebook_id = os.path.splitext(os.path.basename(notebook_path))[0]
+
+        converter_settings = {
+            "name": f"{self.name}_jupyter_{idx}_{notebook_id}",
+            "notebook_path": source["path"],
+            "description": source.get("description", f"{notebook_id} notebook"),
+        }
+
+        logger.info(f"Scraping Jupyter Notebook: {source['path']}")
+        converter = JupyterToSkillConverter(converter_settings)
+        converter.extract_notebook()
+
+        # Load the converter's output, then keep a copy in the unified data directory
+        extracted_file = converter.data_file
+        with open(extracted_file, encoding="utf-8") as fh:
+            notebook_data = json.load(fh)
+        cached_file = os.path.join(self.data_dir, f"jupyter_data_{idx}_{notebook_id}.json")
+        shutil.copy(extracted_file, cached_file)
+
+        self.scraped_data["jupyter"].append(
+            {
+                "notebook_path": notebook_path,
+                "notebook_id": notebook_id,
+                "idx": idx,
+                "data": notebook_data,
+                "data_file": cached_file,
+            }
+        )
+
+        # A failed standalone build is only a warning; the extracted data stays registered
+        try:
+            converter.build_skill()
+            logger.info("✅ Jupyter: Standalone SKILL.md created")
+        except Exception as build_err:
+            logger.warning(f"⚠️  Failed to build standalone Jupyter SKILL.md: {build_err}")
+        logger.info(f"✅ Jupyter: {len(notebook_data.get('cells', []))} cells extracted")
+
+    def _scrape_html(self, source: dict[str, Any]):
+        """Extract a local HTML source, cache its extracted JSON, and register it in scraped_data."""
+        try:
+            from skill_seekers.cli.html_scraper import HtmlToSkillConverter
+        except ImportError:
+            logger.error("html_scraper.py not found")
+            return
+
+        # Reserve this source's slot in the per-type counter (keeps generated names unique)
+        idx = self._source_counters["html"]
+        self._source_counters["html"] = idx + 1
+        page_path = source["path"]
+        # A trailing "/" is dropped first so the last path component is what names the source
+        page_id = os.path.splitext(os.path.basename(page_path.rstrip("/")))[0]
+        converter_settings = {
+            "name": f"{self.name}_html_{idx}_{page_id}",
+            "html_path": source["path"],
+            "description": source.get("description", f"{page_id} HTML content"),
+        }
+
+        logger.info(f"Scraping local HTML: {source['path']}")
+        converter = HtmlToSkillConverter(converter_settings)
+        converter.extract_html()
+
+        # Load the converter's output, then keep a copy in the unified data directory
+        extracted_file = converter.data_file
+        with open(extracted_file, encoding="utf-8") as fh:
+            page_data = json.load(fh)
+        cached_file = os.path.join(self.data_dir, f"html_data_{idx}_{page_id}.json")
+        shutil.copy(extracted_file, cached_file)
+
+        self.scraped_data["html"].append(
+            {
+                "html_path": page_path,
+                "html_id": page_id,
+                "idx": idx,
+                "data": page_data,
+                "data_file": cached_file,
+            }
+        )
+
+        # A failed standalone build is only a warning; the extracted data stays registered
+        try:
+            converter.build_skill()
+            logger.info("✅ HTML: Standalone SKILL.md created")
+        except Exception as build_err:
+            logger.warning(f"⚠️  Failed to build standalone HTML SKILL.md: {build_err}")
+        logger.info(f"✅ HTML: {len(page_data.get('pages', []))} pages extracted")
+
+    def _scrape_openapi(self, source: dict[str, Any]):
+        """Extract an OpenAPI/Swagger spec given by ``path`` or ``url`` and register its data."""
+        try:
+            from skill_seekers.cli.openapi_scraper import OpenAPIToSkillConverter
+        except ImportError:
+            logger.error("openapi_scraper.py not found")
+            return
+
+        idx = self._source_counters["openapi"]
+        self._source_counters["openapi"] += 1
+
+        # Prefer "path", else "url"; an empty/None path no longer hides the URL
+        spec_path = source.get("path") or source.get("url") or ""
+        spec_id = os.path.splitext(os.path.basename(spec_path.rstrip("/")))[0] or f"spec_{idx}"
+        # spec_id falls back to spec_<idx> when no usable basename exists (e.g. "https://host/")
+        openapi_config = {
+            "name": f"{self.name}_openapi_{idx}_{spec_id}",
+            "spec_path": source.get("path"),
+            "spec_url": source.get("url"),
+            "description": source.get("description", f"{spec_id} API spec"),
+        }
+
+        logger.info(f"Scraping OpenAPI spec: {spec_path}")
+        converter = OpenAPIToSkillConverter(openapi_config)
+        converter.extract_spec()
+
+        # Load the converter's output, then keep a copy in the unified data directory
+        api_data_file = converter.data_file
+        with open(api_data_file, encoding="utf-8") as f:
+            api_data = json.load(f)
+        cache_api_data = os.path.join(self.data_dir, f"openapi_data_{idx}_{spec_id}.json")
+        shutil.copy(api_data_file, cache_api_data)
+
+        self.scraped_data["openapi"].append(
+            {
+                "spec_path": spec_path,
+                "spec_id": spec_id,
+                "idx": idx,
+                "data": api_data,
+                "data_file": cache_api_data,
+            }
+        )
+
+        # A failed standalone build is only a warning; the extracted data stays registered
+        try:
+            converter.build_skill()
+            logger.info("✅ OpenAPI: Standalone SKILL.md created")
+        except Exception as e:
+            logger.warning(f"⚠️  Failed to build standalone OpenAPI SKILL.md: {e}")
+        logger.info(f"✅ OpenAPI: {len(api_data.get('endpoints', []))} endpoints extracted")
+
+    def _scrape_asciidoc(self, source: dict[str, Any]):
+        """Extract AsciiDoc document(s) at ``source["path"]`` and record the cached result."""
+        try:
+            from skill_seekers.cli.asciidoc_scraper import AsciiDocToSkillConverter
+        except ImportError:
+            logger.error(
+                "AsciiDoc scraper dependencies not installed.\n"
+                "  Install with: pip install skill-seekers[asciidoc]"
+            )
+            return
+
+        position = self._source_counters["asciidoc"]
+        self._source_counters["asciidoc"] = position + 1
+
+        doc_path = source["path"]
+        doc_stem = os.path.splitext(os.path.basename(doc_path.rstrip("/")))[0]
+
+        settings = {
+            "name": f"{self.name}_asciidoc_{position}_{doc_stem}",
+            "asciidoc_path": doc_path,
+            "description": source.get("description", f"{doc_stem} AsciiDoc content"),
+        }
+
+        logger.info(f"Scraping AsciiDoc: {doc_path}")
+        converter = AsciiDocToSkillConverter(settings)
+        converter.extract_asciidoc()
+
+        # Load the converter's JSON output, then keep a copy in the unified data directory.
+        extracted_file = converter.data_file
+        with open(extracted_file, encoding="utf-8") as handle:
+            extracted = json.load(handle)
+        cached_file = os.path.join(self.data_dir, f"asciidoc_data_{position}_{doc_stem}.json")
+        shutil.copy(extracted_file, cached_file)
+
+        entry = {
+            "asciidoc_path": doc_path,
+            "asciidoc_id": doc_stem,
+            "idx": position,
+            "data": extracted,
+            "data_file": cached_file,
+        }
+        self.scraped_data["asciidoc"].append(entry)
+
+        # A failed standalone build is only logged: the extracted data is already recorded.
+        try:
+            converter.build_skill()
+            logger.info("✅ AsciiDoc: Standalone SKILL.md created")
+        except Exception as exc:
+            logger.warning(f"⚠️  Failed to build standalone AsciiDoc SKILL.md: {exc}")
+
+        logger.info(f"✅ AsciiDoc: {len(extracted.get('sections', []))} sections extracted")
+
+    def _scrape_pptx(self, source: dict[str, Any]):
+        """Extract the .pptx deck at ``source["path"]`` and record the cached result."""
+        try:
+            from skill_seekers.cli.pptx_scraper import PptxToSkillConverter
+        except ImportError:
+            logger.error(
+                "PowerPoint scraper dependencies not installed.\n"
+                "  Install with: pip install skill-seekers[pptx]"
+            )
+            return
+
+        position = self._source_counters["pptx"]
+        self._source_counters["pptx"] = position + 1
+
+        deck_path = source["path"]
+        deck_stem = os.path.splitext(os.path.basename(deck_path))[0]
+
+        settings = {
+            "name": f"{self.name}_pptx_{position}_{deck_stem}",
+            "pptx_path": deck_path,
+            "description": source.get("description", f"{deck_stem} presentation"),
+        }
+
+        logger.info(f"Scraping PowerPoint: {deck_path}")
+        converter = PptxToSkillConverter(settings)
+        converter.extract_pptx()
+
+        # Load the converter's JSON output, then keep a copy in the unified data directory.
+        extracted_file = converter.data_file
+        with open(extracted_file, encoding="utf-8") as handle:
+            extracted = json.load(handle)
+        cached_file = os.path.join(self.data_dir, f"pptx_data_{position}_{deck_stem}.json")
+        shutil.copy(extracted_file, cached_file)
+
+        entry = {
+            "pptx_path": deck_path,
+            "pptx_id": deck_stem,
+            "idx": position,
+            "data": extracted,
+            "data_file": cached_file,
+        }
+        self.scraped_data["pptx"].append(entry)
+
+        # A failed standalone build is only logged: the extracted data is already recorded.
+        try:
+            converter.build_skill()
+            logger.info("✅ PowerPoint: Standalone SKILL.md created")
+        except Exception as exc:
+            logger.warning(f"⚠️  Failed to build standalone PowerPoint SKILL.md: {exc}")
+
+        logger.info(f"✅ PowerPoint: {len(extracted.get('slides', []))} slides extracted")
+
+    def _scrape_confluence(self, source: dict[str, Any]):
+        """Extract Confluence content (API or exported HTML/XML) and cache the result."""
+        try:
+            from skill_seekers.cli.confluence_scraper import ConfluenceToSkillConverter
+        except ImportError:
+            logger.error(
+                "Confluence scraper dependencies not installed.\n"
+                "  Install with: pip install skill-seekers[confluence]"
+            )
+            return
+
+        position = self._source_counters["confluence"]
+        self._source_counters["confluence"] = position + 1
+
+        wiki_id = source.get("space_key", source.get("path", f"confluence_{position}"))
+        if isinstance(wiki_id, str) and "/" in wiki_id:
+            wiki_id = os.path.basename(wiki_id.rstrip("/"))
+
+        settings = {
+            "name": f"{self.name}_confluence_{position}_{wiki_id}",
+            "base_url": source.get("base_url", source.get("url")),
+            "space_key": source.get("space_key"),
+            "export_path": source.get("path"),
+            "username": source.get("username"),
+            "token": source.get("token"),
+            "description": source.get("description", f"{wiki_id} Confluence content"),
+            "max_pages": source.get("max_pages", 500),
+        }
+
+        logger.info(f"Scraping Confluence: {wiki_id}")
+        converter = ConfluenceToSkillConverter(settings)
+        converter.extract_confluence()
+
+        # Load the converter's JSON output, then keep a copy in the unified data directory.
+        extracted_file = converter.data_file
+        with open(extracted_file, encoding="utf-8") as handle:
+            extracted = json.load(handle)
+        cached_file = os.path.join(self.data_dir, f"confluence_data_{position}_{wiki_id}.json")
+        shutil.copy(extracted_file, cached_file)
+
+        entry = {
+            "source_id": wiki_id,
+            "idx": position,
+            "data": extracted,
+            "data_file": cached_file,
+        }
+        self.scraped_data["confluence"].append(entry)
+
+        # A failed standalone build is only logged: the extracted data is already recorded.
+        try:
+            converter.build_skill()
+            logger.info("✅ Confluence: Standalone SKILL.md created")
+        except Exception as exc:
+            logger.warning(f"⚠️  Failed to build standalone Confluence SKILL.md: {exc}")
+
+        logger.info(f"✅ Confluence: {len(extracted.get('pages', []))} pages extracted")
+
+    def _scrape_notion(self, source: dict[str, Any]):
+        """Extract Notion content (API or exported Markdown) and cache the result."""
+        try:
+            from skill_seekers.cli.notion_scraper import NotionToSkillConverter
+        except ImportError:
+            logger.error(
+                "Notion scraper dependencies not installed.\n"
+                "  Install with: pip install skill-seekers[notion]"
+            )
+            return
+
+        position = self._source_counters["notion"]
+        self._source_counters["notion"] = position + 1
+
+        # Prefer database_id, then page_id, then the export path, else an index-based id.
+        fallback_id = source.get("page_id", source.get("path", f"notion_{position}"))
+        notion_id = source.get("database_id", fallback_id)
+        if isinstance(notion_id, str) and "/" in notion_id:
+            notion_id = os.path.basename(notion_id.rstrip("/"))
+
+        settings = {
+            "name": f"{self.name}_notion_{position}_{notion_id}",
+            "database_id": source.get("database_id"),
+            "page_id": source.get("page_id"),
+            "export_path": source.get("path"),
+            "token": source.get("token"),
+            "description": source.get("description", f"{notion_id} Notion content"),
+            "max_pages": source.get("max_pages", 500),
+        }
+
+        logger.info(f"Scraping Notion: {notion_id}")
+        converter = NotionToSkillConverter(settings)
+        converter.extract_notion()
+
+        # Load the converter's JSON output, then keep a copy in the unified data directory.
+        extracted_file = converter.data_file
+        with open(extracted_file, encoding="utf-8") as handle:
+            extracted = json.load(handle)
+        cached_file = os.path.join(self.data_dir, f"notion_data_{position}_{notion_id}.json")
+        shutil.copy(extracted_file, cached_file)
+
+        entry = {
+            "source_id": notion_id,
+            "idx": position,
+            "data": extracted,
+            "data_file": cached_file,
+        }
+        self.scraped_data["notion"].append(entry)
+
+        # A failed standalone build is only logged: the extracted data is already recorded.
+        try:
+            converter.build_skill()
+            logger.info("✅ Notion: Standalone SKILL.md created")
+        except Exception as exc:
+            logger.warning(f"⚠️  Failed to build standalone Notion SKILL.md: {exc}")
+
+        logger.info(f"✅ Notion: {len(extracted.get('pages', []))} pages extracted")
+
+    def _scrape_rss(self, source: dict[str, Any]):
+        """Scrape an RSS/Atom feed from source["url"] or source["path"] and cache it."""
+        try:
+            from skill_seekers.cli.rss_scraper import RssToSkillConverter
+        except ImportError:
+            logger.error(
+                "RSS scraper dependencies not installed.\n"
+                "  Install with: pip install skill-seekers[rss]"
+            )
+            return
+
+        idx = self._source_counters["rss"]
+        self._source_counters["rss"] += 1
+        # Drop any query string and trailing "/" (e.g. ".../feed/") so feed_id is never empty.
+        feed_url = source.get("url") or source.get("path") or ""
+        feed_id = feed_url.split("?")[0].rstrip("/").split("/")[-1].split(".")[0] or f"feed_{idx}"
+
+        rss_config = {
+            "name": f"{self.name}_rss_{idx}_{feed_id}",
+            "feed_url": source.get("url"),
+            "feed_path": source.get("path"),
+            "follow_links": source.get("follow_links", True),
+            "max_articles": source.get("max_articles", 50),
+            "description": source.get("description", f"{feed_id} RSS/Atom feed"),
+        }
+
+        logger.info(f"Scraping RSS/Atom feed: {feed_url}")
+        converter = RssToSkillConverter(rss_config)
+        converter.extract_feed()
+
+        rss_data_file = converter.data_file
+        with open(rss_data_file, encoding="utf-8") as f:
+            rss_data = json.load(f)
+
+        cache_rss_data = os.path.join(self.data_dir, f"rss_data_{idx}_{feed_id}.json")
+        shutil.copy(rss_data_file, cache_rss_data)
+
+        self.scraped_data["rss"].append(
+            {
+                "feed_url": feed_url,
+                "feed_id": feed_id,
+                "idx": idx,
+                "data": rss_data,
+                "data_file": cache_rss_data,
+            }
+        )
+
+        try:
+            converter.build_skill()
+            logger.info("✅ RSS: Standalone SKILL.md created")
+        except Exception as e:
+            logger.warning(f"⚠️  Failed to build standalone RSS SKILL.md: {e}")
+
+        logger.info(f"✅ RSS: {len(rss_data.get('articles', []))} articles extracted")
+
+    def _scrape_manpage(self, source: dict[str, Any]):
+        """Scrape man page(s) given by source["names"] and/or source["path"] and cache them."""
+        try:
+            from skill_seekers.cli.man_scraper import ManPageToSkillConverter
+        except ImportError:
+            logger.error("man_scraper.py not found")
+            return
+
+        idx = self._source_counters["manpage"]
+        self._source_counters["manpage"] += 1
+
+        man_names = source.get("names") or []
+        man_path = source.get("path") or ""
+        man_id = man_names[0] if man_names else os.path.basename(man_path.rstrip("/"))
+        man_id = man_id or f"manpage_{idx}"  # never empty (same fallback idea as _scrape_chat)
+
+        man_config = {
+            "name": f"{self.name}_manpage_{idx}_{man_id}",
+            "man_names": man_names,
+            "man_path": man_path,
+            "sections": source.get("sections", []),
+            "description": source.get("description", f"{man_id} man pages"),
+        }
+
+        logger.info(f"Scraping man pages: {man_id}")
+        converter = ManPageToSkillConverter(man_config)
+        converter.extract_manpages()
+
+        man_data_file = converter.data_file
+        with open(man_data_file, encoding="utf-8") as f:
+            man_data = json.load(f)
+        cache_man_data = os.path.join(self.data_dir, f"manpage_data_{idx}_{man_id}.json")
+        shutil.copy(man_data_file, cache_man_data)
+
+        self.scraped_data["manpage"].append(
+            {
+                "man_id": man_id,
+                "idx": idx,
+                "data": man_data,
+                "data_file": cache_man_data,
+            }
+        )
+
+        try:
+            converter.build_skill()
+            logger.info("✅ Man pages: Standalone SKILL.md created")
+        except Exception as e:
+            logger.warning(f"⚠️  Failed to build standalone man page SKILL.md: {e}")
+
+        logger.info(f"✅ Man pages: {len(man_data.get('pages', []))} man pages extracted")
+
+    def _scrape_chat(self, source: dict[str, Any]):
+        """Extract a Slack/Discord chat (export or API) and record the cached result."""
+        try:
+            from skill_seekers.cli.chat_scraper import ChatToSkillConverter
+        except ImportError:
+            logger.error(
+                "Chat scraper dependencies not installed.\n"
+                "  Install with: pip install skill-seekers[chat]"
+            )
+            return
+
+        position = self._source_counters["chat"]
+        self._source_counters["chat"] = position + 1
+
+        export_dir = source.get("path", "")
+        channel = source.get("channel", source.get("channel_id", ""))
+        chat_label = channel or os.path.basename(export_dir.rstrip("/")) or f"chat_{position}"
+        platform = source.get("platform", "slack")
+
+        settings = {
+            "name": f"{self.name}_chat_{position}_{chat_label}",
+            "export_path": source.get("path"),
+            "platform": platform,
+            "token": source.get("token"),
+            "channel": channel,
+            "max_messages": source.get("max_messages", 10000),
+            "description": source.get("description", f"{chat_label} chat export"),
+        }
+
+        logger.info(f"Scraping chat: {chat_label}")
+        converter = ChatToSkillConverter(settings)
+        converter.extract_chat()
+
+        extracted_file = converter.data_file
+        with open(extracted_file, encoding="utf-8") as handle:
+            extracted = json.load(handle)
+        cached_file = os.path.join(self.data_dir, f"chat_data_{position}_{chat_label}.json")
+        shutil.copy(extracted_file, cached_file)
+
+        entry = {
+            "chat_id": chat_label,
+            "platform": platform,
+            "idx": position,
+            "data": extracted,
+            "data_file": cached_file,
+        }
+        self.scraped_data["chat"].append(entry)
+
+        # A failed standalone build is only logged: the extracted data is already recorded.
+        try:
+            converter.build_skill()
+            logger.info("✅ Chat: Standalone SKILL.md created")
+        except Exception as exc:
+            logger.warning(f"⚠️  Failed to build standalone chat SKILL.md: {exc}")
+
+        logger.info(f"✅ Chat: {len(extracted.get('messages', []))} messages extracted")
+
     def _load_json(self, file_path: Path) -> dict:
         """
         Load JSON file safely.
@@ -1297,14 +1931,33 @@ Examples:
     if args.dry_run:
         logger.info("🔍 DRY RUN MODE - Preview only, no scraping will occur")
         logger.info(f"\nWould scrape {len(scraper.config.get('sources', []))} sources:")
+        # Source type display config: type -> (label, key for detail)
+        _SOURCE_DISPLAY = {
+            "documentation": ("Documentation", "base_url"),
+            "github": ("GitHub", "repo"),
+            "pdf": ("PDF", "path"),
+            "word": ("Word", "path"),
+            "epub": ("EPUB", "path"),
+            "video": ("Video", "url"),
+            "local": ("Local Codebase", "path"),
+            "jupyter": ("Jupyter Notebook", "path"),
+            "html": ("HTML", "path"),
+            "openapi": ("OpenAPI Spec", "path"),
+            "asciidoc": ("AsciiDoc", "path"),
+            "pptx": ("PowerPoint", "path"),
+            "confluence": ("Confluence", "base_url"),
+            "notion": ("Notion", "page_id"),
+            "rss": ("RSS/Atom Feed", "url"),
+            "manpage": ("Man Page", "names"),
+            "chat": ("Chat Export", "path"),
+        }
         for idx, source in enumerate(scraper.config.get("sources", []), 1):
             source_type = source.get("type", "unknown")
-            if source_type == "documentation":
-                logger.info(f"  {idx}. Documentation: {source.get('base_url', 'N/A')}")
-            elif source_type == "github":
-                logger.info(f"  {idx}. GitHub: {source.get('repo', 'N/A')}")
-            elif source_type == "pdf":
-                logger.info(f"  {idx}. PDF: {source.get('pdf_path', 'N/A')}")
+            label, key = _SOURCE_DISPLAY.get(source_type, (source_type.title(), "path"))
+            detail = source.get(key, "N/A")
+            if isinstance(detail, list):
+                detail = ", ".join(str(d) for d in detail)
+            logger.info(f"  {idx}. {label}: {detail}")
         logger.info(f"\nOutput directory: {scraper.output_dir}")
         logger.info(f"Merge mode: {scraper.merge_mode}")
         return
diff --git a/src/skill_seekers/cli/unified_skill_builder.py b/src/skill_seekers/cli/unified_skill_builder.py
index 1f4a606..28a29cd 100644
--- a/src/skill_seekers/cli/unified_skill_builder.py
+++ b/src/skill_seekers/cli/unified_skill_builder.py
@@ -136,6 +136,44 @@ class UnifiedSkillBuilder:
             skill_mds["pdf"] = "\n\n---\n\n".join(pdf_sources)
             logger.debug(f"Combined {len(pdf_sources)} PDF SKILL.md files")
 
+        # Load additional source types using generic glob pattern
+        # Each source type uses: {name}_{type}_{idx}_*/ or {name}_{type}_*/
+        _extra_types = [
+            "word",
+            "epub",
+            "video",
+            "jupyter",
+            "html",
+            "openapi",
+            "asciidoc",
+            "pptx",
+            "confluence",
+            "notion",
+            "rss",
+            "manpage",
+            "chat",
+        ]
+        for source_type in _extra_types:
+            type_sources = []
+            for type_dir in sources_dir.glob(f"{self.name}_{source_type}_*"):
+                type_skill_path = type_dir / "SKILL.md"
+                if type_skill_path.exists():
+                    try:
+                        content = type_skill_path.read_text(encoding="utf-8")
+                        type_sources.append(content)
+                        logger.debug(
+                            f"Loaded {source_type} SKILL.md from {type_dir.name} "
+                            f"({len(content)} chars)"
+                        )
+                    except OSError as e:
+                        logger.warning(
+                            f"Failed to read {source_type} SKILL.md from {type_dir.name}: {e}"
+                        )
+
+            if type_sources:
+                skill_mds[source_type] = "\n\n---\n\n".join(type_sources)
+                logger.debug(f"Combined {len(type_sources)} {source_type} SKILL.md files")
+
         logger.info(f"Loaded {len(skill_mds)} source SKILL.md files")
         return skill_mds
 
@@ -477,6 +515,18 @@ This skill synthesizes knowledge from multiple sources:
             logger.info("Using PDF SKILL.md as-is")
             content = skill_mds["pdf"]
 
+        # Generic merge for additional source types not covered by pairwise methods
+        if not content and skill_mds:
+            # At least one source SKILL.md exists but not docs/github/pdf
+            logger.info(f"Generic merge for source types: {list(skill_mds.keys())}")
+            content = self._generic_merge(skill_mds)
+        elif content and len(skill_mds) > (int(has_docs) + int(has_github) + int(has_pdf)):
+            # Pairwise synthesis handled the core types; append additional sources
+            extra_types = set(skill_mds.keys()) - {"documentation", "github", "pdf"}
+            if extra_types:
+                logger.info(f"Appending additional sources: {extra_types}")
+                content = self._append_extra_sources(content, skill_mds, extra_types)
+
         # Fallback: generate minimal SKILL.md (legacy behavior)
         if not content:
             logger.warning("No source SKILL.md files found, generating minimal SKILL.md (legacy)")
@@ -574,6 +624,165 @@ This skill synthesizes knowledge from multiple sources:
 
         return "\n".join(lines)
 
+    # ------------------------------------------------------------------
+    # Generic merge system for any combination of source types (v3.2.0+)
+    # ------------------------------------------------------------------
+
+    # Display label for each source-type key; lookups fall back to key.title() when absent.
+    _SOURCE_LABELS: dict[str, str] = {
+        "documentation": "Documentation",
+        "github": "GitHub Repository",
+        "pdf": "PDF Document",
+        "word": "Word Document",
+        "epub": "EPUB E-book",
+        "video": "Video",
+        "local": "Local Codebase",
+        "jupyter": "Jupyter Notebook",
+        "html": "HTML Document",
+        "openapi": "OpenAPI/Swagger Spec",
+        "asciidoc": "AsciiDoc Document",
+        "pptx": "PowerPoint Presentation",
+        "confluence": "Confluence Wiki",
+        "notion": "Notion Page",
+        "rss": "RSS/Atom Feed",
+        "manpage": "Man Page",
+        "chat": "Chat Export",
+    }
+
+    def _generic_merge(self, skill_mds: dict[str, str]) -> str:
+        """Generic merge for any combination of source types.
+
+        Each source SKILL.md is parsed into named sections, which are emitted in
+        first-seen order (sources are visited in ``skill_mds`` order):
+
+        - a section found in one source is written under ``## <section>`` with a
+          ``*From <label>*`` attribution line;
+        - a section found in several sources is written once under ``## <section>``
+          with one ``### From <label>`` subsection per contributing source.
+
+        The output starts with YAML frontmatter (name capped at 64 characters,
+        description capped at 1024), a title and the list of merged source labels,
+        and ends with a ``---`` separator followed by a footer line.
+
+        Args:
+            skill_mds: Dict mapping source type to SKILL.md content
+
+        Returns:
+            Merged SKILL.md content string
+        """
+
+        def label_for(source_type: str) -> str:
+            # Unknown types fall back to a title-cased version of their key.
+            return self._SOURCE_LABELS.get(source_type, source_type.title())
+
+        skill_name = self.name.lower().replace("_", "-").replace(" ", "-")[:64]
+        # Slicing leaves shorter strings untouched, so no explicit length check is needed.
+        desc = self.description[:1024]
+
+        # Parse all source SKILL.md files into sections
+        all_sections: dict[str, dict[str, str]] = {
+            source_type: self._parse_skill_md_sections(content)
+            for source_type, content in skill_mds.items()
+        }
+
+        # Unique section names in first-seen order. Dict keys keep insertion order and
+        # de-duplicate in O(1), unlike repeated membership tests against a list.
+        section_order: dict[str, None] = {}
+        for sections in all_sections.values():
+            section_order.update(dict.fromkeys(sections))
+
+        # Build merged content
+        source_labels = ", ".join(label_for(t) for t in skill_mds)
+        lines = [
+            "---",
+            f"name: {skill_name}",
+            f"description: {desc}",
+            "---",
+            "",
+            f"# {self.name.replace('_', ' ').title()}",
+            "",
+            f"{self.description}",
+            "",
+            f"*Merged from: {source_labels}*",
+            "",
+        ]
+
+        # Emit each section, merging content from all sources that have it
+        for section_name in section_order:
+            contributing_sources = [
+                (label_for(stype), sections[section_name])
+                for stype, sections in all_sections.items()
+                if section_name in sections
+            ]
+
+            lines.append(f"## {section_name}")
+            lines.append("")
+            # One contributor: italic attribution line. Several: a subsection per source.
+            if len(contributing_sources) == 1:
+                attribution = "*From {}*"
+            else:
+                attribution = "### From {}"
+            for label, content in contributing_sources:
+                lines.append(attribution.format(label))
+                lines.append("")
+                lines.append(content)
+                lines.append("")
+
+        lines.append("---")
+        lines.append("")
+        lines.append("*Generated by Skill Seeker's unified multi-source scraper*")
+
+        return "\n".join(lines)
+
+    def _append_extra_sources(
+        self,
+        base_content: str,
+        skill_mds: dict[str, str],
+        extra_types: set[str],
+    ) -> str:
+        """Append additional source content to existing pairwise-synthesized SKILL.md.
+
+        Extra sections go before the last ``---`` separator in the body, or at the end
+        if there is none. Frontmatter delimiters are never used as the insertion point,
+        so content cannot end up inside the YAML header.
+
+        Args:
+            base_content: Already-synthesized SKILL.md content
+            skill_mds: All source SKILL.md files
+            extra_types: Set of extra source type keys to append
+
+        Returns:
+            Extended SKILL.md content
+        """
+        lines = base_content.split("\n")
+
+        # Skip the YAML frontmatter (opening "---" on the first line up to its closing "---").
+        body_start = 0
+        if lines[0].strip() == "---":
+            closing = next((i for i in range(1, len(lines)) if lines[i].strip() == "---"), 0)
+            body_start = closing + 1
+
+        # Find the final separator (---) in the body, or fall back to end of file
+        insertion_index = len(lines)
+        for i in range(len(lines) - 1, body_start - 1, -1):
+            if lines[i].strip() == "---":
+                insertion_index = i
+                break
+
+        # Build extra content
+        extra_lines = [""]
+        for source_type in sorted(extra_types):
+            if source_type not in skill_mds:
+                continue
+            label = self._SOURCE_LABELS.get(source_type, source_type.title())
+            sections = self._parse_skill_md_sections(skill_mds[source_type])
+            extra_lines += [f"## {label} Content", ""]
+            for section_name, content in sections.items():
+                extra_lines += [f"### {section_name}", "", content, ""]
+
+        lines[insertion_index:insertion_index] = extra_lines
+        return "\n".join(lines)
+
     def _generate_minimal_skill_md(self) -> str:
         """Generate minimal SKILL.md (legacy fallback behavior).
 
@@ -597,18 +806,42 @@ This skill combines knowledge from multiple sources:
 
 """
 
+        # Source type display keys: type -> (label, primary_key, extra_keys)
+        _source_detail_map = {
+            "documentation": ("Documentation", "base_url", [("Pages", "max_pages", "unlimited")]),
+            "github": (
+                "GitHub Repository",
+                "repo",
+                [("Code Analysis", "code_analysis_depth", "surface"), ("Issues", "max_issues", 0)],
+            ),
+            "pdf": ("PDF Document", "path", []),
+            "word": ("Word Document", "path", []),
+            "epub": ("EPUB E-book", "path", []),
+            "video": ("Video", "url", []),
+            "local": ("Local Codebase", "path", [("Analysis Depth", "analysis_depth", "surface")]),
+            "jupyter": ("Jupyter Notebook", "path", []),
+            "html": ("HTML Document", "path", []),
+            "openapi": ("OpenAPI Spec", "path", []),
+            "asciidoc": ("AsciiDoc Document", "path", []),
+            "pptx": ("PowerPoint", "path", []),
+            "confluence": ("Confluence Wiki", "base_url", []),
+            "notion": ("Notion Page", "page_id", []),
+            "rss": ("RSS/Atom Feed", "url", []),
+            "manpage": ("Man Page", "names", []),
+            "chat": ("Chat Export", "path", []),
+        }
+
         # List sources
         for source in self.config.get("sources", []):
             source_type = source["type"]
-            if source_type == "documentation":
-                content += f"- ✅ **Documentation**: {source.get('base_url', 'N/A')}\n"
-                content += f"  - Pages: {source.get('max_pages', 'unlimited')}\n"
-            elif source_type == "github":
-                content += f"- ✅ **GitHub Repository**: {source.get('repo', 'N/A')}\n"
-                content += f"  - Code Analysis: {source.get('code_analysis_depth', 'surface')}\n"
-                content += f"  - Issues: {source.get('max_issues', 0)}\n"
-            elif source_type == "pdf":
-                content += f"- ✅ **PDF Document**: {source.get('path', 'N/A')}\n"
+            display = _source_detail_map.get(source_type, (source_type.title(), "path", []))
+            label, primary_key, extras = display
+            primary_val = source.get(primary_key, "N/A")
+            if isinstance(primary_val, list):
+                primary_val = ", ".join(str(v) for v in primary_val)
+            content += f"- ✅ **{label}**: {primary_val}\n"
+            for extra_label, extra_key, extra_default in extras:
+                content += f"  - {extra_label}: {source.get(extra_key, extra_default)}\n"
 
         # C3.x Architecture & Code Analysis section (if available)
         github_data = self.scraped_data.get("github", {})
@@ -796,6 +1029,27 @@ This skill combines knowledge from multiple sources:
         if pdf_list:
             self._generate_pdf_references(pdf_list)
 
+        # Generate references for all additional source types
+        _extra_source_types = [
+            "word",
+            "epub",
+            "video",
+            "jupyter",
+            "html",
+            "openapi",
+            "asciidoc",
+            "pptx",
+            "confluence",
+            "notion",
+            "rss",
+            "manpage",
+            "chat",
+        ]
+        for source_type in _extra_source_types:
+            source_list = self.scraped_data.get(source_type, [])
+            if source_list:
+                self._generate_generic_references(source_type, source_list)
+
         # Generate merged API reference if available
         if self.merged_data:
             self._generate_merged_api_reference()
@@ -977,6 +1231,63 @@ This skill combines knowledge from multiple sources:
 
         logger.info(f"Created PDF references ({len(pdf_list)} sources)")
 
+    def _generate_generic_references(self, source_type: str, source_list: list[dict]):
+        """Generate a references/<source_type>/ directory for any source type.
+
+        Writes an index.md with one section per source and, best-effort, copies
+        each source's data file (when it exists on disk) next to the index.
+
+        Args:
+            source_type: The source type key (e.g., 'epub', 'jupyter')
+            source_list: List of scraped source dicts for this type
+        """
+        if not source_list:
+            return
+
+        label = self._SOURCE_LABELS.get(source_type, source_type.title())
+        type_dir = os.path.join(self.skill_dir, "references", source_type)
+        os.makedirs(type_dir, exist_ok=True)
+
+        # Create index
+        index_path = os.path.join(type_dir, "index.md")
+        with open(index_path, "w", encoding="utf-8") as f:
+            f.write(f"# {label} References\n\n")
+            f.write(f"Reference from {len(source_list)} {label} source(s).\n\n")
+
+            for i, source_data in enumerate(source_list):
+                # Try common ID fields
+                source_id = (
+                    source_data.get("source_id")
+                    or source_data.get(f"{source_type}_id")
+                    or source_data.get("notebook_id")
+                    or source_data.get("spec_id")
+                    or source_data.get("feed_id")
+                    or source_data.get("man_id")
+                    or source_data.get("chat_id")
+                    or f"source_{i}"
+                )
+                f.write(f"## {source_id}\n\n")
+
+                # Write summary of extracted data (only non-empty string fields)
+                data = source_data.get("data", {})
+                if isinstance(data, dict):
+                    for key in ["title", "description", "metadata"]:
+                        val = data.get(key)
+                        if isinstance(val, str) and val:
+                            f.write(f"**{key.title()}:** {val}\n\n")
+
+                # Copy data file if available (separators in the ID must not escape type_dir)
+                data_file = source_data.get("data_file")
+                if data_file and os.path.isfile(data_file):
+                    safe_id = str(source_id).replace("/", "_").replace("\\", "_")
+                    dest = os.path.join(type_dir, f"{safe_id}_data.json")
+                    try:
+                        shutil.copy(data_file, dest)
+                    except OSError as e:  # best-effort: keep going, but leave a trace
+                        logger.warning(f"Could not copy {data_file} to {dest}: {e}")
+
+        logger.info(f"Created {label} references ({len(source_list)} sources)")
+
     def _generate_merged_api_reference(self):
         """Generate merged API reference file."""
         api_dir = os.path.join(self.skill_dir, "references", "api")
diff --git a/src/skill_seekers/mcp/server_fastmcp.py b/src/skill_seekers/mcp/server_fastmcp.py
index f955c33..6d8bf3e 100644
--- a/src/skill_seekers/mcp/server_fastmcp.py
+++ b/src/skill_seekers/mcp/server_fastmcp.py
@@ -3,16 +3,16 @@
 Skill Seeker MCP Server (FastMCP Implementation)
 
 Modern, decorator-based MCP server using FastMCP for simplified tool registration.
-Provides 33 tools for generating Claude AI skills from documentation.
+Provides 34 tools for generating Claude AI skills from documentation.
 
 This is a streamlined alternative to server.py (2200 lines → 708 lines, 68% reduction).
 All tool implementations are delegated to modular tool files in tools/ directory.
 
 **Architecture:**
 - FastMCP server with decorator-based tool registration
-- 33 tools organized into 7 categories:
+- 34 tools organized into 7 categories:
   * Config tools (3): generate_config, list_configs, validate_config
-  * Scraping tools (10): estimate_pages, scrape_docs, scrape_github, scrape_pdf, scrape_video, scrape_codebase, detect_patterns, extract_test_examples, build_how_to_guides, extract_config_patterns
+  * Scraping tools (11): estimate_pages, scrape_docs, scrape_github, scrape_pdf, scrape_video, scrape_codebase, detect_patterns, extract_test_examples, build_how_to_guides, extract_config_patterns, scrape_generic
   * Packaging tools (4): package_skill, upload_skill, enhance_skill, install_skill
   * Splitting tools (2): split_config, generate_router
   * Source tools (5): fetch_config, submit_config, add_config_source, list_config_sources, remove_config_source
@@ -97,6 +97,7 @@ try:
         remove_config_source_impl,
         scrape_codebase_impl,
         scrape_docs_impl,
+        scrape_generic_impl,
         scrape_github_impl,
         scrape_pdf_impl,
         scrape_video_impl,
@@ -141,6 +142,7 @@ except ImportError:
         remove_config_source_impl,
         scrape_codebase_impl,
         scrape_docs_impl,
+        scrape_generic_impl,
         scrape_github_impl,
         scrape_pdf_impl,
         scrape_video_impl,
@@ -301,7 +303,7 @@ async def sync_config(
 
 
 # ============================================================================
-# SCRAPING TOOLS (10 tools)
+# SCRAPING TOOLS (11 tools)
 # ============================================================================
 
 
@@ -823,6 +825,50 @@ async def extract_config_patterns(
     return str(result)
 
 
+@safe_tool_decorator(
+    description="Scrape content from new source types: jupyter, html, openapi, asciidoc, pptx, confluence, notion, rss, manpage, chat. A generic entry point that delegates to the appropriate CLI scraper module."
+)
+async def scrape_generic(
+    source_type: str,
+    name: str,
+    path: str | None = None,
+    url: str | None = None,
+) -> str:
+    """
+    Build a skill from one of the newer source types via a single MCP tool.
+
+    This is a thin wrapper: it packs the arguments into a dict and awaits
+    scrape_generic_impl, which selects and runs the matching CLI scraper.
+
+    Supply 'path' for sources that live on disk (jupyter, html, openapi,
+    asciidoc, pptx, manpage, chat) and 'url' for remote sources
+    (confluence, notion, rss).
+
+    Args:
+        source_type: Source type to scrape. One of: jupyter, html, openapi,
+            asciidoc, pptx, confluence, notion, rss, manpage, chat.
+        name: Skill name for the output
+        path: File or directory path (for file-based sources like jupyter, html, pptx)
+        url: URL (for URL-based sources like confluence, notion, rss)
+
+    Returns:
+        For a non-empty list result, the first item's text (or its string form
+        if it has no 'text' attribute); otherwise the string form of the result.
+    """
+    # Only forward the optional inputs the caller actually supplied.
+    supplied = {key: value for key, value in (("path", path), ("url", url)) if value}
+    args = {"source_type": source_type, "name": name, **supplied}
+
+    result = await scrape_generic_impl(args)
+    # Anything other than a non-empty list is stringified as-is.
+    if not (isinstance(result, list) and result):
+        return str(result)
+
+    # Prefer the first item's .text attribute when it has one.
+    first = result[0]
+    return first.text if hasattr(first, "text") else str(first)
+
+
 # ============================================================================
 # PACKAGING TOOLS (4 tools)
 # ============================================================================
diff --git a/src/skill_seekers/mcp/tools/__init__.py b/src/skill_seekers/mcp/tools/__init__.py
index 6783c9d..0d7c5a9 100644
--- a/src/skill_seekers/mcp/tools/__init__.py
+++ b/src/skill_seekers/mcp/tools/__init__.py
@@ -63,6 +63,9 @@ from .scraping_tools import (
 from .scraping_tools import (
     scrape_pdf_tool as scrape_pdf_impl,
 )
+from .scraping_tools import (
+    scrape_generic_tool as scrape_generic_impl,
+)
 from .scraping_tools import (
     scrape_video_tool as scrape_video_impl,
 )
@@ -135,6 +138,7 @@ __all__ = [
     "extract_test_examples_impl",
     "build_how_to_guides_impl",
     "extract_config_patterns_impl",
+    "scrape_generic_impl",
     # Packaging tools
     "package_skill_impl",
     "upload_skill_impl",
diff --git a/src/skill_seekers/mcp/tools/config_tools.py b/src/skill_seekers/mcp/tools/config_tools.py
index 67d363d..8e0c873 100644
--- a/src/skill_seekers/mcp/tools/config_tools.py
+++ b/src/skill_seekers/mcp/tools/config_tools.py
@@ -205,6 +205,18 @@ async def validate_config(args: dict) -> list[TextContent]:
                         )
                     elif source["type"] == "pdf":
                         result += f"    Path: {source.get('path', 'N/A')}\n"
+                    elif source["type"] in (
+                        "jupyter",
+                        "html",
+                        "openapi",
+                        "asciidoc",
+                        "pptx",
+                        "manpage",
+                        "chat",
+                    ):
+                        result += f"    Path: {source.get('path', 'N/A')}\n"
+                    elif source["type"] in ("confluence", "notion", "rss"):
+                        result += f"    URL: {source.get('url', 'N/A')}\n"
 
                 # Show merge settings if applicable
                 if validator.needs_api_merge():
diff --git a/src/skill_seekers/mcp/tools/scraping_tools.py b/src/skill_seekers/mcp/tools/scraping_tools.py
index 2853adc..c8f99f2 100644
--- a/src/skill_seekers/mcp/tools/scraping_tools.py
+++ b/src/skill_seekers/mcp/tools/scraping_tools.py
@@ -7,6 +7,8 @@ This module contains all scraping-related MCP tool implementations:
 - scrape_github_tool: Scrape GitHub repositories
 - scrape_pdf_tool: Scrape PDF documentation
 - scrape_codebase_tool: Analyze local codebase and extract code knowledge
+- scrape_generic_tool: Generic scraper for new source types (jupyter, html,
+  openapi, asciidoc, pptx, confluence, notion, rss, manpage, chat)
 
 Extracted from server.py for better modularity and organization.
 """
@@ -1005,3 +1007,140 @@ async def extract_config_patterns_tool(args: dict) -> list[TextContent]:
         return [TextContent(type="text", text=output_text)]
     else:
         return [TextContent(type="text", text=f"{output_text}\n\n❌ Error:\n{stderr}")]
+
+
+# Valid source types for the generic scraper
+GENERIC_SOURCE_TYPES = (
+    "jupyter",
+    "html",
+    "openapi",
+    "asciidoc",
+    "pptx",
+    "confluence",
+    "notion",
+    "rss",
+    "manpage",
+    "chat",
+)
+
+# Source types whose primary input is a URL (or remote ID) rather than a local path.
+_URL_BASED_TYPES = {"confluence", "notion", "rss"}
+
+# CLI module per source type; types not listed here use "<type>_scraper".
+_MODULE_NAMES = {
+    "manpage": "man_scraper",
+}
+
+# Each scraper has its own flag name for file/path input — a generic --path would fail.
+_PATH_FLAGS: dict[str, str] = {
+    "jupyter": "--notebook",
+    "html": "--html-path",
+    "openapi": "--spec",
+    "asciidoc": "--asciidoc-path",
+    "pptx": "--pptx",
+    "manpage": "--man-path",
+    "confluence": "--export-path",
+    "notion": "--export-path",
+    "rss": "--feed-path",
+    "chat": "--export-path",
+}
+
+# Per-type CLI flag for URL (or remote ID) input; types without an entry fall back to --url.
+_URL_FLAGS: dict[str, str] = {
+    "confluence": "--base-url",
+    "notion": "--page-id",
+    "rss": "--feed-url",
+    "openapi": "--spec-url",
+}
+
+# Friendly emoji labels per source type
+_SOURCE_EMOJIS = {
+    "jupyter": "📓",
+    "html": "🌐",
+    "openapi": "📡",
+    "asciidoc": "📄",
+    "pptx": "📊",
+    "confluence": "🏢",
+    "notion": "📝",
+    "rss": "📰",
+    "manpage": "📖",
+    "chat": "💬",
+}
+
+
+async def scrape_generic_tool(args: dict) -> list[TextContent]:
+    """
+    Run the CLI scraper for one of the generic source types in a subprocess.
+
+    Validates the arguments, builds a `python -m skill_seekers.cli.<module>`
+    command using the scraper-specific input flag, runs it with a 10 minute
+    timeout and returns the captured output.
+
+    Supported source types: jupyter, html, openapi, asciidoc, pptx,
+    confluence, notion, rss, manpage, chat.
+
+    Input selection: confluence, notion and rss use 'url' whenever it is
+    given; every other type uses 'path' when given and falls back to 'url'.
+
+    Args:
+        args: Dictionary containing:
+            - source_type (str): One of the supported source types
+            - path (str, optional): File or directory path (for file-based sources)
+            - url (str, optional): URL (for URL-based sources like confluence, notion, rss)
+            - name (str): Skill name for the output
+
+    Returns:
+        List[TextContent]: A single text item holding either a validation
+        error, or the progress header plus the subprocess stdout (with
+        stderr appended when the exit code is non-zero).
+    """
+    source_type = args.get("source_type", "")
+    path, url, name = args.get("path"), args.get("url"), args.get("name")
+
+    # Validate in a fixed order; the first failing check decides the message.
+    error_text = None
+    if source_type not in GENERIC_SOURCE_TYPES:
+        error_text = (
+            f"❌ Error: Unknown source_type '{source_type}'. "
+            f"Must be one of: {', '.join(GENERIC_SOURCE_TYPES)}"
+        )
+    elif not (path or url):
+        error_text = "❌ Error: Must specify either 'path' (file/directory) or 'url'"
+    elif not name:
+        error_text = "❌ Error: 'name' parameter is required"
+    if error_text is not None:
+        return [TextContent(type="text", text=error_text)]
+
+    module_name = _MODULE_NAMES.get(source_type, f"{source_type}_scraper")
+    cmd = [sys.executable, "-m", f"skill_seekers.cli.{module_name}"]
+
+    # NOTE(review): the "--url" default is only reached for types that have no
+    # entry in _URL_FLAGS — confirm those scrapers really accept a --url flag.
+    prefer_url = bool(url) and (source_type in _URL_BASED_TYPES or not path)
+    if prefer_url:
+        input_flag, input_value = _URL_FLAGS.get(source_type, "--url"), url
+    else:
+        input_flag, input_value = _PATH_FLAGS.get(source_type, "--path"), path
+    cmd.extend([input_flag, input_value])
+    cmd.extend(["--name", name])
+
+    timeout = 600  # seconds, i.e. 10 minutes
+    emoji = _SOURCE_EMOJIS.get(source_type, "🔧")
+
+    # Progress header shown ahead of the scraper's own output.
+    header_parts = [f"{emoji} Scraping {source_type} source...\n"]
+    if path:
+        header_parts.append(f"📁 Path: {path}\n")
+    if url:
+        header_parts.append(f"🔗 URL: {url}\n")
+    header_parts.append(f"📛 Name: {name}\n")
+    header_parts.append(f"⏱️ Maximum time: {timeout // 60} minutes\n\n")
+    progress_msg = "".join(header_parts)
+
+    stdout, stderr, returncode = run_subprocess_with_streaming(cmd, timeout=timeout)
+    output = progress_msg + stdout
+
+    # On failure, append stderr so the caller can see why the scraper failed.
+    if returncode != 0:
+        output = f"{output}\n\n❌ Error:\n{stderr}"
+    return [TextContent(type="text", text=output)]
diff --git a/src/skill_seekers/mcp/tools/splitting_tools.py b/src/skill_seekers/mcp/tools/splitting_tools.py
index c67ea15..2cda4eb 100644
--- a/src/skill_seekers/mcp/tools/splitting_tools.py
+++ b/src/skill_seekers/mcp/tools/splitting_tools.py
@@ -106,7 +106,9 @@ async def split_config(args: dict) -> list[TextContent]:
 
     Supports both documentation and unified (multi-source) configs:
     - Documentation configs: Split by categories, size, or create router skills
-    - Unified configs: Split by source type (documentation, github, pdf)
+    - Unified configs: Split by source type (documentation, github, pdf,
+      jupyter, html, openapi, asciidoc, pptx, confluence, notion, rss,
+      manpage, chat)
 
     For large documentation sites (10K+ pages), this tool splits the config into
     multiple smaller configs. For unified configs with multiple sources, splits
diff --git a/src/skill_seekers/workflows/complex-merge.yaml b/src/skill_seekers/workflows/complex-merge.yaml
new file mode 100644
index 0000000..1d0c429
--- /dev/null
+++ b/src/skill_seekers/workflows/complex-merge.yaml
@@ -0,0 +1,222 @@
+name: complex-merge
+description: Intelligent multi-source merging with conflict resolution, priority rules, and gap analysis
+version: "1.0"
+author: Skill Seekers
+tags:
+  - merge
+  - multi-source
+  - conflict-resolution
+  - synthesis
+applies_to:
+  - doc_scraping
+  - codebase_analysis
+  - github_analysis
+variables:
+  merge_strategy: priority
+  source_priority_order: "official_docs,code,community"
+  conflict_resolution: highest_priority
+  min_sources_for_consensus: 2
+stages:
+  - name: source_inventory
+    type: custom
+    target: inventory
+    uses_history: false
+    enabled: true
+    prompt: >
+      Catalog every source that contributed content to this skill extraction.
+      For each source, classify its type and assess its characteristics.
+
+      For each source, determine:
+      1. Source type (official_docs, codebase, github_repo, pdf, video, community, blog)
+      2. Content scope — what topics or areas does this source cover?
+      3. Freshness — how recent is the content? Look for version numbers, dates, deprecation notices
+      4. Authority level — is this an official maintainer, core contributor, or third party?
+      5. Content density — roughly how much substantive information does this source provide?
+      6. Format characteristics — prose, code samples, API reference, tutorial, etc.
+
+      Output JSON with:
+      - "sources": array of {id, type, scope_summary, topics_covered, freshness_estimate, authority, density, format}
+      - "source_type_distribution": count of sources by type
+      - "total_topics_identified": number of unique topics across all sources
+      - "coverage_summary": brief overview of what the combined sources cover
+
+  - name: cross_reference
+    type: custom
+    target: cross_references
+    uses_history: true
+    enabled: true
+    prompt: >
+      Using the source inventory, identify overlapping topics across sources.
+      Find where multiple sources discuss the same concept, API, feature, or pattern.
+
+      For each overlapping topic:
+      1. List which sources cover it and how deeply
+      2. Note whether sources agree, complement each other, or diverge
+      3. Identify the richest source for that topic (most detail, best examples)
+      4. Flag any terminology differences across sources for the same concept
+
+      Output JSON with:
+      - "overlapping_topics": array of {topic, sources_covering, agreement_level, richest_source, terminology_variants}
+      - "high_overlap_topics": topics covered by 3+ sources
+      - "complementary_pairs": pairs of sources that cover different aspects of the same topic well
+      - "terminology_map": dictionary mapping variant terms to a canonical term
+
+  - name: conflict_detection
+    type: custom
+    target: conflicts
+    uses_history: true
+    enabled: true
+    prompt: >
+      Examine the cross-referenced topics and identify genuine contradictions
+      between sources. Distinguish between true conflicts and superficial differences.
+
+      Categories of conflict to detect:
+      1. Factual contradictions — sources state opposite things about the same feature
+      2. Version mismatches — sources describe different versions of an API or behavior
+      3. Best practice disagreements — sources recommend conflicting approaches
+      4. Deprecated vs current — one source shows deprecated usage another shows current
+      5. Scope conflicts — sources disagree on what a feature can or cannot do
+
+      For each conflict:
+      - Identify the specific claim from each source
+      - Assess which source is more likely correct and why
+      - Recommend a resolution strategy
+
+      Output JSON with:
+      - "conflicts": array of {topic, type, source_a_claim, source_b_claim, likely_correct, resolution_rationale}
+      - "conflict_count_by_type": breakdown of conflicts by category
+      - "high_severity_conflicts": conflicts that would mislead users if unresolved
+      - "auto_resolvable": conflicts that can be resolved by version/date alone
+
+  - name: priority_merge
+    type: custom
+    target: merged_content
+    uses_history: true
+    enabled: true
+    prompt: >
+      Merge content from all sources using the following priority hierarchy:
+        1. Official documentation (highest authority)
+        2. Source code and inline comments (ground truth for behavior)
+        3. Community content — tutorials, blog posts, Stack Overflow (practical usage)
+
+      Merging rules:
+      - When sources agree, combine the best explanation with the best examples
+      - When sources conflict, prefer the higher-priority source but note the alternative
+      - When only a lower-priority source covers a topic, include it but flag the authority level
+      - Preserve code examples from any source, annotating their origin
+      - Deduplicate content — do not repeat the same information from multiple sources
+      - Normalize terminology using the canonical terms from cross-referencing
+
+      For each merged topic, produce:
+      1. Authoritative explanation (from highest-priority source)
+      2. Practical examples (best available from any source)
+      3. Source attribution (which sources contributed)
+      4. Confidence level (high if official docs confirm, medium if code-only, low if community-only)
+
+      Output JSON with:
+      - "merged_topics": array of {topic, explanation, examples, sources_used, confidence, notes}
+      - "merge_decisions": array of {topic, decision, rationale} for non-trivial merges
+      - "source_contribution_stats": how much each source contributed to the final output
+
+  - name: gap_analysis
+    type: custom
+    target: gaps
+    uses_history: true
+    enabled: true
+    prompt: >
+      Analyze the merged content to identify gaps — topics or areas that are
+      underrepresented or missing entirely.
+
+      Identify:
+      1. Single-source topics — covered by only one source, making them fragile
+      2. Missing fundamentals — core concepts that should be documented but are not
+      3. Missing examples — topics explained in prose but lacking code samples
+      4. Missing edge cases — common error scenarios or limitations not documented
+      5. Broken references — topics that reference other topics not present in any source
+      6. Audience gaps — content assumes knowledge that is never introduced
+
+      For each gap, assess:
+      - Severity (critical, important, nice-to-have)
+      - Whether the gap can be inferred from existing content
+      - Suggested source type that would best fill this gap
+
+      Output JSON with:
+      - "single_source_topics": array of {topic, sole_source, risk_level}
+      - "missing_fundamentals": topics that should exist but do not
+      - "example_gaps": topics needing code examples
+      - "edge_case_gaps": undocumented error scenarios
+      - "broken_references": internal references with no target
+      - "gap_severity_summary": counts by severity level
+
+  - name: synthesis
+    type: custom
+    target: skill_md
+    uses_history: true
+    enabled: true
+    prompt: >
+      Create a unified, coherent narrative from the merged content. The output
+      should read as if written by a single knowledgeable author, not as a
+      patchwork of multiple sources.
+
+      Synthesis guidelines:
+      1. Structure content logically — concepts build on each other
+      2. Lead with the most important information for each topic
+      3. Integrate code examples naturally within explanations
+      4. Use consistent voice, terminology, and formatting throughout
+      5. Add transition text between topics for narrative flow
+      6. Include a "Sources and Confidence" appendix noting where information came from
+      7. Mark any low-confidence or single-source claims with a caveat
+      8. Fill minor gaps by inference where safe to do so, clearly marking inferred content
+
+      Output JSON with:
+      - "synthesized_sections": array of {title, content, sources_used, confidence}
+      - "section_order": recommended reading order
+      - "inferred_content": content that was inferred rather than directly sourced
+      - "caveats": any warnings about content reliability
+
+  - name: quality_check
+    type: custom
+    target: quality
+    uses_history: true
+    enabled: true
+    prompt: >
+      Perform a final quality review of the synthesized output. Evaluate the
+      merge result against multiple quality dimensions.
+
+      Check for:
+      1. Completeness — does the output cover all topics from all sources?
+      2. Accuracy — are merged claims consistent and non-contradictory?
+      3. Coherence — does the document flow logically as a unified piece?
+      4. Attribution — are source contributions properly tracked?
+      5. Confidence calibration — are confidence levels appropriate?
+      6. Example quality — are code examples correct, runnable, and well-annotated?
+      7. Terminology consistency — is the canonical terminology used throughout?
+      8. Gap acknowledgment — are known gaps clearly communicated?
+
+      Scoring:
+      - Rate each dimension 1-10
+      - Provide specific issues found for any dimension scoring below 7
+      - Suggest concrete fixes for each issue
+
+      Output JSON with:
+      - "quality_scores": {completeness, accuracy, coherence, attribution, confidence_calibration, example_quality, terminology_consistency, gap_acknowledgment}
+      - "overall_score": weighted average (accuracy and completeness weighted 2x)
+      - "issues_found": array of {dimension, description, severity, suggested_fix}
+      - "merge_health": "excellent" | "good" | "needs_review" | "poor" based on overall score
+      - "recommendations": top 3 actions to improve merge quality
+
+post_process:
+  reorder_sections:
+    - overview
+    - core_concepts
+    - api_reference
+    - examples
+    - advanced_topics
+    - troubleshooting
+    - sources_and_confidence
+  add_metadata:
+    enhanced: true
+    workflow: complex-merge
+    multi_source: true
+    conflict_resolution: priority
+    quality_checked: true
diff --git a/tests/test_cli_parsers.py b/tests/test_cli_parsers.py
index a9878a3..36d86a3 100644
--- a/tests/test_cli_parsers.py
+++ b/tests/test_cli_parsers.py
@@ -24,12 +24,12 @@ class TestParserRegistry:
 
     def test_all_parsers_registered(self):
         """Test that all parsers are registered."""
-        assert len(PARSERS) == 25, f"Expected 25 parsers, got {len(PARSERS)}"
+        assert len(PARSERS) == 35, f"Expected 35 parsers, got {len(PARSERS)}"
 
     def test_get_parser_names(self):
         """Test getting list of parser names."""
         names = get_parser_names()
-        assert len(names) == 25
+        assert len(names) == 35
         assert "scrape" in names
         assert "github" in names
         assert "package" in names
@@ -243,9 +243,9 @@ class TestBackwardCompatibility:
             assert cmd in names, f"Command '{cmd}' not found in parser registry!"
 
     def test_command_count_matches(self):
-        """Test that we have exactly 25 commands (includes create, workflows, word, epub, video, and sync-config)."""
-        assert len(PARSERS) == 25
-        assert len(get_parser_names()) == 25
+        """Test that we have exactly 35 commands (25 original + 10 new source types)."""
+        assert len(PARSERS) == 35
+        assert len(get_parser_names()) == 35
 
 
 if __name__ == "__main__":
diff --git a/tests/test_new_source_types.py b/tests/test_new_source_types.py
new file mode 100644
index 0000000..40ddb0e
--- /dev/null
+++ b/tests/test_new_source_types.py
@@ -0,0 +1,824 @@
+#!/usr/bin/env python3
+"""
+Tests for v3.2.0 new source type integration points.
+
+Covers source detection, config validation, generic merge, CLI wiring,
+and source validation for the 10 new source types: jupyter, html, openapi,
+asciidoc, pptx, rss, manpage, confluence, notion, chat.
+"""
+
+import os
+import textwrap
+
+import pytest
+
+from skill_seekers.cli.config_validator import ConfigValidator
+from skill_seekers.cli.main import COMMAND_MODULES
+from skill_seekers.cli.parsers import PARSERS, get_parser_names
+from skill_seekers.cli.source_detector import SourceDetector, SourceInfo
+from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder
+
+
+# ---------------------------------------------------------------------------
+# 1. SourceDetector — new type detection
+# ---------------------------------------------------------------------------
+
+
+class TestSourceDetectorNewTypes:
+    """Checks that SourceDetector.detect() classifies the new file extensions."""
+
+    # -- Jupyter notebooks --
+    def test_detect_ipynb(self):
+        """A .ipynb path is classified as ``jupyter``."""
+        detected = SourceDetector.detect("analysis.ipynb")
+        assert detected.type == "jupyter"
+        assert detected.parsed["file_path"] == "analysis.ipynb"
+        assert detected.suggested_name == "analysis"
+
+    # -- HTML files --
+    def test_detect_html_extension(self):
+        """A .html path is classified as ``html``."""
+        detected = SourceDetector.detect("page.html")
+        assert detected.type == "html"
+        assert detected.parsed["file_path"] == "page.html"
+
+    def test_detect_htm_extension(self):
+        """An upper-case .HTM path is classified as ``html``; the path is kept as given."""
+        detected = SourceDetector.detect("index.HTM")
+        assert detected.type == "html"
+        assert detected.parsed["file_path"] == "index.HTM"
+
+    # -- PowerPoint decks --
+    def test_detect_pptx(self):
+        """A .pptx path is classified as ``pptx``."""
+        detected = SourceDetector.detect("slides.pptx")
+        assert detected.type == "pptx"
+        assert detected.parsed["file_path"] == "slides.pptx"
+        assert detected.suggested_name == "slides"
+
+    # -- AsciiDoc documents --
+    def test_detect_adoc(self):
+        """A .adoc path is classified as ``asciidoc``."""
+        detected = SourceDetector.detect("manual.adoc")
+        assert detected.type == "asciidoc"
+        assert detected.parsed["file_path"] == "manual.adoc"
+
+    def test_detect_asciidoc_extension(self):
+        """An upper-case .ASCIIDOC path is classified as ``asciidoc``."""
+        detected = SourceDetector.detect("guide.ASCIIDOC")
+        assert detected.type == "asciidoc"
+        assert detected.parsed["file_path"] == "guide.ASCIIDOC"
+
+    # -- Unix man pages --
+    def test_detect_man_extension(self):
+        """A .man path is classified as ``manpage``."""
+        detected = SourceDetector.detect("curl.man")
+        assert detected.type == "manpage"
+        assert detected.parsed["file_path"] == "curl.man"
+
+    @pytest.mark.parametrize("section", range(1, 9))
+    def test_detect_man_sections(self, section):
+        """Numeric suffixes .1 to .8 on a dot-free basename are classified as ``manpage``."""
+        filename = f"git.{section}"
+        detected = SourceDetector.detect(filename)
+        assert detected.type == "manpage", f"{filename} should detect as manpage"
+        assert detected.suggested_name == "git"
+
+    def test_man_section_with_dotted_basename_not_detected(self):
+        """'access.log.1' must not be classified as a man page.
+
+        Its stem ("access.log") still contains a dot, unlike a name such as "git.1".
+        """
+        # Only the negative outcome is pinned; the fallback type is left open.
+        detected = SourceDetector.detect("access.log.1")
+        # NOTE(review): this input is expected to reach the detector's
+        # domain-inference branch (it contains a dot and does not start with
+        # '/').  That branch is not visible from this test, so the assertion
+        # below only rules out "manpage".
+        assert detected.type != "manpage"
+
+    # -- RSS / Atom feeds --
+    def test_detect_rss_extension(self):
+        """A .rss path is classified as ``rss``."""
+        detected = SourceDetector.detect("feed.rss")
+        assert detected.type == "rss"
+        assert detected.parsed["file_path"] == "feed.rss"
+
+    def test_detect_atom_extension(self):
+        """A .atom path is classified as ``rss`` as well."""
+        detected = SourceDetector.detect("updates.atom")
+        assert detected.type == "rss"
+        assert detected.parsed["file_path"] == "updates.atom"
+
+    def test_xml_not_detected_as_rss(self):
+        """A generic .xml path must not be classified as ``rss``.
+
+        Only .rss and .atom are exercised as feed extensions by the tests in this class.
+        """
+        # NOTE(review): "data.xml" is expected to fall through to domain
+        # inference or to a ValueError.  A ValueError is not caught here, so
+        # this test passes only when detect() returns a non-"rss" result.
+        detected = SourceDetector.detect("data.xml")
+        assert detected.type != "rss"
+
+    # -- OpenAPI / Swagger specs --
+    def test_yaml_with_openapi_content_detected(self, tmp_path):
+        """A .yaml file whose content has an ``openapi:`` key is classified as ``openapi``."""
+        spec_file = tmp_path / "petstore.yaml"
+        spec_file.write_text(
+            textwrap.dedent("""\
+                openapi: "3.0.0"
+                info:
+                  title: Petstore
+                  version: "1.0.0"
+                paths: {}
+            """)
+        )
+        detected = SourceDetector.detect(str(spec_file))
+        assert detected.type == "openapi"
+        assert detected.parsed["file_path"] == str(spec_file)
+        assert detected.suggested_name == "petstore"
+
+    def test_yaml_with_swagger_content_detected(self, tmp_path):
+        """A .yml file whose content has a ``swagger:`` key is classified as ``openapi``."""
+        spec_file = tmp_path / "legacy.yml"
+        spec_file.write_text(
+            textwrap.dedent("""\
+                swagger: "2.0"
+                info:
+                  title: Legacy API
+                basePath: /v1
+            """)
+        )
+        detected = SourceDetector.detect(str(spec_file))
+        assert detected.type == "openapi"
+
+    def test_yaml_without_openapi_not_detected(self, tmp_path):
+        """A .yaml file without OpenAPI/Swagger keys is not classified as ``openapi``.
+
+        Two outcomes are accepted: detect() returns some other type, or it
+        raises ValueError.  Both show that the plain YAML file was not
+        classified as an OpenAPI spec.  Any other exception, or an "openapi"
+        result, fails the test.
+        """
+        plain_yaml = tmp_path / "config.yaml"
+        plain_yaml.write_text("name: my-project\nversion: 1.0\n")
+        # A ValueError means the detector could not classify the file at all,
+        # which also proves it was not classified as "openapi".
+        try:
+            detected = SourceDetector.detect(str(plain_yaml))
+        except ValueError:
+            return
+        # Detection succeeded, so the resulting type must be something other
+        # than "openapi".
+        assert detected.type != "openapi"
+
+    def test_looks_like_openapi_returns_false_for_missing_file(self):
+        """_looks_like_openapi() reports False when the path does not exist."""
+        assert SourceDetector._looks_like_openapi("/nonexistent/spec.yaml") is False
+
+    def test_looks_like_openapi_json_key_format(self, tmp_path):
+        """_looks_like_openapi() accepts a quoted, JSON-style ``"openapi"`` key."""
+        spec_file = tmp_path / "api.yaml"
+        spec_file.write_text('"openapi": "3.0.0"\n')
+        assert SourceDetector._looks_like_openapi(str(spec_file)) is True
+
+
+# ---------------------------------------------------------------------------
+# 2. ConfigValidator — new source type validation
+# ---------------------------------------------------------------------------
+
+
+class TestConfigValidatorNewTypes:
+    """Checks of ConfigValidator.VALID_SOURCE_TYPES and required fields per source type."""
+
+    # The full set of 17 source types the validator is expected to accept
+    EXPECTED_TYPES = {
+        "asciidoc",
+        "chat",
+        "confluence",
+        "documentation",
+        "epub",
+        "github",
+        "html",
+        "jupyter",
+        "local",
+        "manpage",
+        "notion",
+        "openapi",
+        "pdf",
+        "pptx",
+        "rss",
+        "video",
+        "word",
+    }
+
+    def test_all_17_types_present(self):
+        """VALID_SOURCE_TYPES equals the expected set of 17 type names exactly."""
+        assert ConfigValidator.VALID_SOURCE_TYPES == self.EXPECTED_TYPES
+
+    def test_unknown_type_rejected(self):
+        """validate() raises ValueError for a source whose type is not recognised."""
+        cfg = {
+            "name": "test",
+            "description": "test",
+            "sources": [{"type": "foobar"}],
+        }
+        checker = ConfigValidator(cfg)
+        with pytest.raises(ValueError, match="Invalid type 'foobar'"):
+            checker.validate()
+
+    # --- Required fields, one source type at a time ---
+
+    def _make_config(self, source: dict) -> dict:
+        """Build a minimal config dict whose only source is ``source``."""
+        return dict(
+            name="test",
+            description="test",
+            sources=[source],
+        )
+
+    def test_epub_requires_path(self):
+        """An epub source without 'path' fails validation."""
+        cfg = self._make_config({"type": "epub"})
+        checker = ConfigValidator(cfg)
+        with pytest.raises(ValueError, match="Missing required field 'path'"):
+            checker.validate()
+
+    def test_jupyter_requires_path(self):
+        """A jupyter source without 'path' fails validation."""
+        cfg = self._make_config({"type": "jupyter"})
+        checker = ConfigValidator(cfg)
+        with pytest.raises(ValueError, match="Missing required field 'path'"):
+            checker.validate()
+
+    def test_html_requires_path(self):
+        """An html source without 'path' fails validation."""
+        cfg = self._make_config({"type": "html"})
+        checker = ConfigValidator(cfg)
+        with pytest.raises(ValueError, match="Missing required field 'path'"):
+            checker.validate()
+
+    def test_openapi_requires_path_or_url(self):
+        """An openapi source with neither 'path' nor 'url' fails validation."""
+        cfg = self._make_config({"type": "openapi"})
+        checker = ConfigValidator(cfg)
+        with pytest.raises(ValueError, match="Missing required field 'path' or 'url'"):
+            checker.validate()
+
+    def test_openapi_accepts_url(self):
+        """An openapi source that only provides 'url' validates successfully."""
+        cfg = self._make_config({"type": "openapi", "url": "https://example.com/spec.yaml"})
+        checker = ConfigValidator(cfg)
+        assert checker.validate() is True
+
+    def test_pptx_requires_path(self):
+        """A pptx source without 'path' fails validation."""
+        cfg = self._make_config({"type": "pptx"})
+        checker = ConfigValidator(cfg)
+        with pytest.raises(ValueError, match="Missing required field 'path'"):
+            checker.validate()
+
+    def test_asciidoc_requires_path(self):
+        """An asciidoc source without 'path' fails validation."""
+        cfg = self._make_config({"type": "asciidoc"})
+        checker = ConfigValidator(cfg)
+        with pytest.raises(ValueError, match="Missing required field 'path'"):
+            checker.validate()
+
+    def test_confluence_requires_url_or_path(self):
+        """A confluence source with no locating field fails with a missing-field error."""
+        cfg = self._make_config({"type": "confluence"})
+        checker = ConfigValidator(cfg)
+        with pytest.raises(ValueError, match="Missing required field"):
+            checker.validate()
+
+    def test_confluence_accepts_base_url(self):
+        """A confluence source with 'base_url' and 'space_key' validates successfully."""
+        cfg = self._make_config(
+            {
+                "type": "confluence",
+                "base_url": "https://wiki.example.com",
+                "space_key": "DEV",
+            }
+        )
+        checker = ConfigValidator(cfg)
+        assert checker.validate() is True
+
+    def test_confluence_accepts_path(self):
+        """A confluence source that only provides 'path' validates successfully."""
+        cfg = self._make_config({"type": "confluence", "path": "/exports/wiki"})
+        checker = ConfigValidator(cfg)
+        assert checker.validate() is True
+
+    def test_notion_requires_url_or_path(self):
+        """A notion source with no locating field fails with a missing-field error."""
+        cfg = self._make_config({"type": "notion"})
+        checker = ConfigValidator(cfg)
+        with pytest.raises(ValueError, match="Missing required field"):
+            checker.validate()
+
+    def test_notion_accepts_page_id(self):
+        """A notion source that only provides 'page_id' validates successfully."""
+        cfg = self._make_config({"type": "notion", "page_id": "abc123"})
+        checker = ConfigValidator(cfg)
+        assert checker.validate() is True
+
+    def test_notion_accepts_database_id(self):
+        """A notion source that only provides 'database_id' validates successfully."""
+        cfg = self._make_config({"type": "notion", "database_id": "db-456"})
+        checker = ConfigValidator(cfg)
+        assert checker.validate() is True
+
+    def test_rss_requires_url_or_path(self):
+        """An rss source with neither 'url' nor 'path' fails validation."""
+        cfg = self._make_config({"type": "rss"})
+        checker = ConfigValidator(cfg)
+        with pytest.raises(ValueError, match="Missing required field 'url' or 'path'"):
+            checker.validate()
+
+    def test_rss_accepts_url(self):
+        """An rss source that only provides 'url' validates successfully."""
+        cfg = self._make_config({"type": "rss", "url": "https://blog.example.com/feed.xml"})
+        checker = ConfigValidator(cfg)
+        assert checker.validate() is True
+
+    def test_manpage_requires_path_or_names(self):
+        """A manpage source with neither 'path' nor 'names' fails validation."""
+        cfg = self._make_config({"type": "manpage"})
+        checker = ConfigValidator(cfg)
+        with pytest.raises(ValueError, match="Missing required field 'path' or 'names'"):
+            checker.validate()
+
+    def test_manpage_accepts_names(self):
+        """A manpage source that only provides a 'names' list validates successfully."""
+        cfg = self._make_config({"type": "manpage", "names": ["git", "curl"]})
+        checker = ConfigValidator(cfg)
+        assert checker.validate() is True
+
+    def test_chat_requires_path_or_token(self):
+        """A chat source with neither 'path' nor 'token' fails validation."""
+        cfg = self._make_config({"type": "chat"})
+        checker = ConfigValidator(cfg)
+        with pytest.raises(ValueError, match="Missing required field 'path'.*or 'token'"):
+            checker.validate()
+
+    def test_chat_accepts_path(self):
+        """A chat source that only provides 'path' validates successfully."""
+        cfg = self._make_config({"type": "chat", "path": "/exports/slack"})
+        checker = ConfigValidator(cfg)
+        assert checker.validate() is True
+
+    def test_chat_accepts_token_with_channel(self):
+        """A chat source with 'token' and 'channel' validates successfully."""
+        cfg = self._make_config(
+            {
+                "type": "chat",
+                "token": "xoxb-fake",
+                "channel": "#general",
+            }
+        )
+        checker = ConfigValidator(cfg)
+        assert checker.validate() is True
+
+
+# ---------------------------------------------------------------------------
+# 3. UnifiedSkillBuilder — generic merge system
+# ---------------------------------------------------------------------------
+
+
+class TestUnifiedSkillBuilderGenericMerge:
+    """Checks of _generic_merge, _append_extra_sources and _SOURCE_LABELS."""
+
+    def _make_builder(self, tmp_path) -> UnifiedSkillBuilder:
+        """Return a builder with two configured sources whose skill_dir lives in tmp_path."""
+        builder_config = {
+            "name": "test_project",
+            "description": "A test project for merge testing",
+            "sources": [
+                {"type": "jupyter", "path": "nb.ipynb"},
+                {"type": "rss", "url": "https://example.com/feed.rss"},
+            ],
+        }
+        no_scraped_data: dict = {}
+        skill_builder = UnifiedSkillBuilder(
+            config=builder_config,
+            scraped_data=no_scraped_data,
+            cache_dir=str(tmp_path / "cache"),
+        )
+        # Point the output directory at the pytest temp dir and pre-create it
+        skill_builder.skill_dir = str(tmp_path / "output" / "test_project")
+        os.makedirs(skill_builder.skill_dir, exist_ok=True)
+        os.makedirs(os.path.join(skill_builder.skill_dir, "references"), exist_ok=True)
+        return skill_builder
+
+    def test_generic_merge_produces_valid_markdown(self, tmp_path):
+        """Merging two source types yields a sizeable document that names the project."""
+        skill_builder = self._make_builder(tmp_path)
+        per_source_md = {
+            "jupyter": "## When to Use\n\nFor data analysis.\n\n## Quick Reference\n\nImport pandas.",
+            "rss": "## When to Use\n\nFor feed monitoring.\n\n## Feed Items\n\nLatest entries.",
+        }
+        merged = skill_builder._generic_merge(per_source_md)
+
+        # More than 100 characters of output are expected
+        assert len(merged) > 100
+        # The project name, rendered as "Test Project", has to show up
+        assert "Test Project" in merged
+
+    def test_generic_merge_includes_yaml_frontmatter(self, tmp_path):
+        """The merged text opens with a ``---`` line and carries name/description entries."""
+        skill_builder = self._make_builder(tmp_path)
+        per_source_md = {
+            "html": "## Overview\n\nHTML content here.",
+        }
+        merged = skill_builder._generic_merge(per_source_md)
+
+        assert merged.startswith("---\n")
+        assert "name: test-project" in merged
+        assert "description: A test project" in merged
+
+    def test_generic_merge_attributes_content_to_sources(self, tmp_path):
+        """The human-readable label of every merged source appears in the output."""
+        skill_builder = self._make_builder(tmp_path)
+        per_source_md = {
+            "jupyter": "## Overview\n\nNotebook content.",
+            "pptx": "## Overview\n\nSlide content.",
+        }
+        merged = skill_builder._generic_merge(per_source_md)
+
+        # One label per source type that was passed in
+        assert "Jupyter Notebook" in merged
+        assert "PowerPoint Presentation" in merged
+
+    def test_generic_merge_single_source_section(self, tmp_path):
+        """A section supplied by a single source carries a '*From <Label>*' marker."""
+        skill_builder = self._make_builder(tmp_path)
+        per_source_md = {
+            "manpage": "## Synopsis\n\ngit [options]",
+        }
+        merged = skill_builder._generic_merge(per_source_md)
+
+        assert "*From Man Page*" in merged
+        assert "## Synopsis" in merged
+
+    def test_generic_merge_multi_source_section(self, tmp_path):
+        """A section supplied by two sources gets one '### From <Label>' heading per source."""
+        skill_builder = self._make_builder(tmp_path)
+        per_source_md = {
+            "asciidoc": "## Quick Reference\n\nAsciiDoc quick ref.",
+            "html": "## Quick Reference\n\nHTML quick ref.",
+        }
+        merged = skill_builder._generic_merge(per_source_md)
+
+        # A sub-heading is expected for each contributing source
+        assert "### From AsciiDoc Document" in merged
+        assert "### From HTML Document" in merged
+
+    def test_generic_merge_footer(self, tmp_path):
+        """The merged text contains the 'Generated by Skill Seeker' footer wording."""
+        skill_builder = self._make_builder(tmp_path)
+        per_source_md = {
+            "rss": "## Feeds\n\nSome feeds.",
+        }
+        merged = skill_builder._generic_merge(per_source_md)
+        assert "Generated by Skill Seeker" in merged
+
+    def test_generic_merge_merged_from_line(self, tmp_path):
+        """The merged text has a '*Merged from: ...*' line listing both source labels."""
+        skill_builder = self._make_builder(tmp_path)
+        per_source_md = {
+            "confluence": "## Pages\n\nWiki pages.",
+            "notion": "## Databases\n\nNotion DBs.",
+        }
+        merged = skill_builder._generic_merge(per_source_md)
+
+        assert "*Merged from: Confluence Wiki, Notion Page*" in merged
+
+    def test_append_extra_sources_adds_sections(self, tmp_path):
+        """_append_extra_sources adds the extra source's sections and keeps the base text."""
+        skill_builder = self._make_builder(tmp_path)
+        base_md = "# Test\n\nIntro.\n\n## Main Section\n\nContent.\n\n---\n\n*Footer*\n"
+        per_source_md = {
+            "epub": "## Chapters\n\nChapter list.\n\n## Key Concepts\n\nConcept A.",
+        }
+        combined = skill_builder._append_extra_sources(base_md, per_source_md, {"epub"})
+
+        # Content from the extra source is present (its position is not checked here)
+        assert "EPUB E-book Content" in combined
+        assert "Chapters" in combined
+        assert "Key Concepts" in combined
+        # The headings of the base document survive
+        assert "# Test" in combined
+        assert "## Main Section" in combined
+
+    def test_append_extra_sources_preserves_footer(self, tmp_path):
+        """The footer text of the base document is still present after appending."""
+        skill_builder = self._make_builder(tmp_path)
+        base_md = "# Test\n\n---\n\n*Footer*\n"
+        per_source_md = {
+            "chat": "## Messages\n\nChat history.",
+        }
+        combined = skill_builder._append_extra_sources(base_md, per_source_md, {"chat"})
+
+        assert "*Footer*" in combined
+
+    def test_source_labels_has_all_17_types(self):
+        """The keys of _SOURCE_LABELS are exactly the 17 known source types."""
+        known_types = {
+            "asciidoc",
+            "chat",
+            "confluence",
+            "documentation",
+            "epub",
+            "github",
+            "html",
+            "jupyter",
+            "local",
+            "manpage",
+            "notion",
+            "openapi",
+            "pdf",
+            "pptx",
+            "rss",
+            "video",
+            "word",
+        }
+        assert set(UnifiedSkillBuilder._SOURCE_LABELS.keys()) == known_types
+
+    def test_source_labels_values_are_nonempty_strings(self):
+        """Every value in _SOURCE_LABELS is a str with at least one character."""
+        for key, text in UnifiedSkillBuilder._SOURCE_LABELS.items():
+            assert isinstance(text, str), f"Label for '{key}' is not a string"
+            assert len(text) > 0, f"Label for '{key}' is empty"
+
+
+# ---------------------------------------------------------------------------
+# 4. COMMAND_MODULES and parser wiring
+# ---------------------------------------------------------------------------
+
+
+class TestCommandModules:
+    """Checks that the 10 new source types are registered as CLI commands and parsers."""
+
+    NEW_COMMAND_NAMES = [
+        "jupyter",
+        "html",
+        "openapi",
+        "asciidoc",
+        "pptx",
+        "rss",
+        "manpage",
+        "confluence",
+        "notion",
+        "chat",
+    ]
+
+    def test_new_types_in_command_modules(self):
+        """Every new command name is a key of COMMAND_MODULES."""
+        for cmd in self.NEW_COMMAND_NAMES:
+            assert cmd in COMMAND_MODULES, f"'{cmd}' not in COMMAND_MODULES"
+
+    def test_command_modules_values_are_module_paths(self):
+        """Each new command maps to a dotted path under ``skill_seekers.cli.``."""
+        for cmd in self.NEW_COMMAND_NAMES:
+            dotted_path = COMMAND_MODULES[cmd]
+            assert dotted_path.startswith("skill_seekers.cli."), (
+                f"Module path for '{cmd}' doesn't start with 'skill_seekers.cli.'"
+            )
+
+    def test_new_parser_names_include_all_10(self):
+        """Every new command name is returned by get_parser_names()."""
+        registered = get_parser_names()
+        for cmd in self.NEW_COMMAND_NAMES:
+            assert cmd in registered, f"Parser '{cmd}' not registered"
+
+    def test_total_parser_count(self):
+        """PARSERS holds 35 entries: the 25 earlier ones plus the 10 new ones."""
+        assert len(PARSERS) == 35
+
+    def test_no_duplicate_parser_names(self):
+        """get_parser_names() returns each name only once."""
+        registered = get_parser_names()
+        assert len(set(registered)) == len(registered), "Duplicate parser names found!"
+
+    def test_command_module_count(self):
+        """COMMAND_MODULES holds 35 entries in total."""
+        # 10 new commands on top of the 25 that existed before
+        assert len(COMMAND_MODULES) == 35
+
+
+# ---------------------------------------------------------------------------
+# 5. SourceDetector.validate_source — new types
+# ---------------------------------------------------------------------------
+
+
+class TestSourceDetectorValidation:
+    """Checks of SourceDetector.validate_source() for the new file-based source types."""
+
+    def test_validation_passes_for_existing_jupyter(self, tmp_path):
+        """No exception is raised when the .ipynb file exists."""
+        notebook = tmp_path / "test.ipynb"
+        notebook.write_text('{"cells": []}')
+
+        source_info = SourceInfo(
+            type="jupyter",
+            raw_input=str(notebook),
+            suggested_name="test",
+            parsed={"file_path": str(notebook)},
+        )
+        # Passing means validate_source() returns without raising
+        SourceDetector.validate_source(source_info)
+
+    def test_validation_raises_for_nonexistent_jupyter(self):
+        """A missing .ipynb file is reported with a 'does not exist' ValueError."""
+        source_info = SourceInfo(
+            type="jupyter",
+            raw_input="/nonexistent/notebook.ipynb",
+            suggested_name="notebook",
+            parsed={"file_path": "/nonexistent/notebook.ipynb"},
+        )
+        with pytest.raises(ValueError, match="does not exist"):
+            SourceDetector.validate_source(source_info)
+
+    def test_validation_passes_for_existing_html(self, tmp_path):
+        """No exception is raised when the .html file exists."""
+        html_file = tmp_path / "page.html"
+        html_file.write_text("<html></html>")
+
+        source_info = SourceInfo(
+            type="html",
+            raw_input=str(html_file),
+            suggested_name="page",
+            parsed={"file_path": str(html_file)},
+        )
+        SourceDetector.validate_source(source_info)
+
+    def test_validation_raises_for_nonexistent_pptx(self):
+        """A missing .pptx file is reported with a 'does not exist' ValueError."""
+        source_info = SourceInfo(
+            type="pptx",
+            raw_input="/nonexistent/slides.pptx",
+            suggested_name="slides",
+            parsed={"file_path": "/nonexistent/slides.pptx"},
+        )
+        with pytest.raises(ValueError, match="does not exist"):
+            SourceDetector.validate_source(source_info)
+
+    def test_validation_passes_for_existing_openapi(self, tmp_path):
+        """No exception is raised when the OpenAPI spec file exists."""
+        spec_file = tmp_path / "api.yaml"
+        spec_file.write_text("openapi: '3.0.0'\n")
+
+        source_info = SourceInfo(
+            type="openapi",
+            raw_input=str(spec_file),
+            suggested_name="api",
+            parsed={"file_path": str(spec_file)},
+        )
+        SourceDetector.validate_source(source_info)
+
+    def test_validation_raises_for_nonexistent_asciidoc(self):
+        """A missing .adoc file is reported with a 'does not exist' ValueError."""
+        source_info = SourceInfo(
+            type="asciidoc",
+            raw_input="/nonexistent/doc.adoc",
+            suggested_name="doc",
+            parsed={"file_path": "/nonexistent/doc.adoc"},
+        )
+        with pytest.raises(ValueError, match="does not exist"):
+            SourceDetector.validate_source(source_info)
+
+    def test_validation_raises_for_nonexistent_manpage(self):
+        """A missing man page file is reported with a 'does not exist' ValueError."""
+        source_info = SourceInfo(
+            type="manpage",
+            raw_input="/nonexistent/git.1",
+            suggested_name="git",
+            parsed={"file_path": "/nonexistent/git.1"},
+        )
+        with pytest.raises(ValueError, match="does not exist"):
+            SourceDetector.validate_source(source_info)
+
+    def test_validation_passes_for_existing_manpage(self, tmp_path):
+        """No exception is raised when the man page file exists."""
+        man_file = tmp_path / "curl.1"
+        man_file.write_text(".TH CURL 1\n")
+
+        source_info = SourceInfo(
+            type="manpage",
+            raw_input=str(man_file),
+            suggested_name="curl",
+            parsed={"file_path": str(man_file)},
+        )
+        SourceDetector.validate_source(source_info)
+
+    def test_rss_url_validation_no_file_check(self):
+        """An rss source described only by a URL validates without raising."""
+        source_info = SourceInfo(
+            type="rss",
+            raw_input="https://example.com/feed.rss",
+            suggested_name="feed",
+            parsed={"url": "https://example.com/feed.rss"},
+        )
+        # No "file_path" key is supplied, so no exception is expected here
+        SourceDetector.validate_source(source_info)
+
+    def test_rss_validation_raises_for_nonexistent_file(self):
+        """A missing local feed file is reported with a 'does not exist' ValueError."""
+        source_info = SourceInfo(
+            type="rss",
+            raw_input="/nonexistent/feed.rss",
+            suggested_name="feed",
+            parsed={"file_path": "/nonexistent/feed.rss"},
+        )
+        with pytest.raises(ValueError, match="does not exist"):
+            SourceDetector.validate_source(source_info)
+
+    def test_rss_validation_passes_for_existing_file(self, tmp_path):
+        """No exception is raised when the local .rss file exists."""
+        feed_file = tmp_path / "feed.rss"
+        feed_file.write_text("<rss></rss>")
+
+        source_info = SourceInfo(
+            type="rss",
+            raw_input=str(feed_file),
+            suggested_name="feed",
+            parsed={"file_path": str(feed_file)},
+        )
+        SourceDetector.validate_source(source_info)
+
+    def test_validation_passes_for_directory_types(self, tmp_path):
+        """No exception is raised when an html source points at an existing directory."""
+        pages_dir = tmp_path / "pages"
+        pages_dir.mkdir()
+
+        source_info = SourceInfo(
+            type="html",
+            raw_input=str(pages_dir),
+            suggested_name="pages",
+            parsed={"file_path": str(pages_dir)},
+        )
+        # An existing directory is expected to be accepted just like a file
+        SourceDetector.validate_source(source_info)
+
+
+# ---------------------------------------------------------------------------
+# 6. CreateCommand._route_generic coverage
+# ---------------------------------------------------------------------------
+
+
+class TestCreateCommandRouting:
+    """Check that CreateCommand._route_to_scraper covers the 10 new source types.
+
+    Calling _route_to_scraper would import the real scrapers, so the tests
+    inspect the method's source text for the expected literals instead.
+    GENERIC_ROUTES maps each source type to the (scraper module name, CLI
+    flag) pair that must occur in that source text.
+    """
+
+    # source type -> (scraper module name, CLI flag)
+    GENERIC_ROUTES = {
+        "jupyter": ("jupyter_scraper", "--notebook"),
+        "html": ("html_scraper", "--html-path"),
+        "openapi": ("openapi_scraper", "--spec"),
+        "asciidoc": ("asciidoc_scraper", "--asciidoc-path"),
+        "pptx": ("pptx_scraper", "--pptx"),
+        "rss": ("rss_scraper", "--feed-path"),
+        "manpage": ("man_scraper", "--man-path"),
+        "confluence": ("confluence_scraper", "--export-path"),
+        "notion": ("notion_scraper", "--export-path"),
+        "chat": ("chat_scraper", "--export-path"),
+    }
+
+    @staticmethod
+    def _route_to_scraper_source() -> str:
+        """Return the source text of CreateCommand._route_to_scraper."""
+        # Function-local imports: create_command is only loaded when a test runs.
+        import importlib
+        import inspect
+
+        create_command = importlib.import_module("skill_seekers.cli.create_command")
+        return inspect.getsource(create_command.CreateCommand._route_to_scraper)
+
+    def test_route_to_scraper_source_coverage(self):
+        """Every new source type appears as a quoted literal in _route_to_scraper.
+
+        This is a textual check only: it does not prove the branch is reachable.
+        """
+        source = self._route_to_scraper_source()
+        for source_type in self.GENERIC_ROUTES:
+            assert f'"{source_type}"' in source, (
+                f"_route_to_scraper missing branch for '{source_type}'"
+            )
+
+    def test_generic_route_module_names(self):
+        """Each route's scraper module name and CLI flag appear as quoted literals."""
+        source = self._route_to_scraper_source()
+        for source_type, (module, flag) in self.GENERIC_ROUTES.items():
+            assert f'"{module}"' in source, f"Module name '{module}' not found for '{source_type}'"
+            assert f'"{flag}"' in source, f"Flag '{flag}' not found for '{source_type}'"
+
+
+if __name__ == "__main__":  # direct execution: run pytest on this file in verbose mode
+    pytest.main([__file__, "-v"])
diff --git a/uv.lock b/uv.lock
index d1cd30d..ff87560 100644
--- a/uv.lock
+++ b/uv.lock
@@ -220,6 +220,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/7f/9c/36c5c37947ebfb8c7f22e0eb6e4d188ee2d53aa3880f3f2744fb894f0cb1/anyio-4.12.0-py3-none-any.whl", hash = "sha256:dad2376a628f98eeca4881fc56cd06affd18f659b17a747d3ff0307ced94b1bb", size = 113362, upload-time = "2025-11-28T23:36:57.897Z" },
 ]
 
+[[package]]
+name = "asciidoc"
+version = "10.2.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/1d/e7/315a82f2d256e9270977aa3c15e8fe281fd7c40b8e2a0b97e0cb61ca8fa0/asciidoc-10.2.1.tar.gz", hash = "sha256:d9f13c285981b3c7eb660d02ca0a2779981e88d48105de81bb40445e60dddb83", size = 230179, upload-time = "2024-07-17T03:12:52.681Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/75/1f/87941eaa96e86aa22086064f67e4187e2710fb76c147312979ea29278dac/asciidoc-10.2.1-py2.py3-none-any.whl", hash = "sha256:3f277a636b617c9ce7e0b87bcaea51f144500e9a5c8a6488421ee24594850d40", size = 272433, upload-time = "2024-07-17T03:12:49.012Z" },
+]
+
 [[package]]
 name = "async-timeout"
 version = "5.0.1"
@@ -229,6 +238,24 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/fe/ba/e2081de779ca30d473f21f5b30e0e737c438205440784c7dfc81efc2b029/async_timeout-5.0.1-py3-none-any.whl", hash = "sha256:39e3809566ff85354557ec2398b55e096c8364bacac9405a7a1fa429e77fe76c", size = 6233, upload-time = "2024-11-06T16:41:37.9Z" },
 ]
 
+[[package]]
+name = "atlassian-python-api"
+version = "4.0.7"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "beautifulsoup4" },
+    { name = "deprecated" },
+    { name = "jmespath" },
+    { name = "oauthlib" },
+    { name = "requests" },
+    { name = "requests-oauthlib" },
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/40/e8/f23b7273e410c6fe9f98f9db25268c6736572f22a9566d1dc9ed3614bb68/atlassian_python_api-4.0.7.tar.gz", hash = "sha256:8d9cc6068b1d2a48eb434e22e57f6bbd918a47fac9e46b95b7a3cefb00fceacb", size = 271149, upload-time = "2025-08-21T13:19:40.746Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/1d/83/e4f9976ce3c933a079b8931325e7a9c0a8bba7030a2cb85764c0048f3479/atlassian_python_api-4.0.7-py3-none-any.whl", hash = "sha256:46a70cb29eaab87c0a1697fccd3e25df1aa477e6aa4fb9ba936a9d46b425933c", size = 197746, upload-time = "2025-08-21T13:19:39.044Z" },
+]
+
 [[package]]
 name = "attrs"
 version = "25.4.0"
@@ -1135,6 +1162,27 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/05/99/49ee85903dee060d9f08297b4a342e5e0bcfca2f027a07b4ee0a38ab13f9/faster_whisper-1.2.1-py3-none-any.whl", hash = "sha256:79a66ad50688c0b794dd501dc340a736992a6342f7f95e5811be60b5224a26a7", size = 1118909, upload-time = "2025-10-31T11:35:47.794Z" },
 ]
 
+[[package]]
+name = "fastjsonschema"
+version = "2.21.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/20/b5/23b216d9d985a956623b6bd12d4086b60f0059b27799f23016af04a74ea1/fastjsonschema-2.21.2.tar.gz", hash = "sha256:b1eb43748041c880796cd077f1a07c3d94e93ae84bba5ed36800a33554ae05de", size = 374130, upload-time = "2025-08-14T18:49:36.666Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/cb/a8/20d0723294217e47de6d9e2e40fd4a9d2f7c4b6ef974babd482a59743694/fastjsonschema-2.21.2-py3-none-any.whl", hash = "sha256:1c797122d0a86c5cace2e54bf4e819c36223b552017172f32c5c024a6b77e463", size = 24024, upload-time = "2025-08-14T18:49:34.776Z" },
+]
+
+[[package]]
+name = "feedparser"
+version = "6.0.12"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "sgmllib3k" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/dc/79/db7edb5e77d6dfbc54d7d9df72828be4318275b2e580549ff45a962f6461/feedparser-6.0.12.tar.gz", hash = "sha256:64f76ce90ae3e8ef5d1ede0f8d3b50ce26bcce71dd8ae5e82b1cd2d4a5f94228", size = 286579, upload-time = "2025-09-10T13:33:59.486Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/4e/eb/c96d64137e29ae17d83ad2552470bafe3a7a915e85434d9942077d7fd011/feedparser-6.0.12-py3-none-any.whl", hash = "sha256:6bbff10f5a52662c00a2e3f86a38928c37c48f77b3c511aedcd51de933549324", size = 81480, upload-time = "2025-09-10T13:33:58.022Z" },
+]
+
 [[package]]
 name = "ffmpeg-python"
 version = "0.2.0"
@@ -2100,6 +2148,19 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/41/45/1a4ed80516f02155c51f51e8cedb3c1902296743db0bbc66608a0db2814f/jsonschema_specifications-2025.9.1-py3-none-any.whl", hash = "sha256:98802fee3a11ee76ecaca44429fda8a41bff98b00a0f2838151b113f210cc6fe", size = 18437, upload-time = "2025-09-08T01:34:57.871Z" },
 ]
 
+[[package]]
+name = "jupyter-core"
+version = "5.9.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "platformdirs" },
+    { name = "traitlets" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/02/49/9d1284d0dc65e2c757b74c6687b6d319b02f822ad039e5c512df9194d9dd/jupyter_core-5.9.1.tar.gz", hash = "sha256:4d09aaff303b9566c3ce657f580bd089ff5c91f5f89cf7d8846c3cdf465b5508", size = 89814, upload-time = "2025-10-16T19:19:18.444Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/e7/e7/80988e32bf6f73919a113473a604f5a8f09094de312b9d52b79c2df7612b/jupyter_core-5.9.1-py3-none-any.whl", hash = "sha256:ebf87fdc6073d142e114c72c9e29a9d7ca03fad818c5d300ce2adc1fb0743407", size = 29032, upload-time = "2025-10-16T19:19:16.783Z" },
+]
+
 [[package]]
 name = "kubernetes"
 version = "35.0.0"
@@ -3122,6 +3183,21 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/79/7b/2c79738432f5c924bef5071f933bcc9efd0473bac3b4aa584a6f7c1c8df8/mypy_extensions-1.1.0-py3-none-any.whl", hash = "sha256:1be4cccdb0f2482337c4743e60421de3a356cd97508abadd57d47403e94f5505", size = 4963, upload-time = "2025-04-22T14:54:22.983Z" },
 ]
 
+[[package]]
+name = "nbformat"
+version = "5.10.4"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "fastjsonschema" },
+    { name = "jsonschema" },
+    { name = "jupyter-core" },
+    { name = "traitlets" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/6d/fd/91545e604bc3dad7dca9ed03284086039b294c6b3d75c0d2fa45f9e9caf3/nbformat-5.10.4.tar.gz", hash = "sha256:322168b14f937a5d11362988ecac2a4952d3d8e3a2cbeb2319584631226d5b3a", size = 142749, upload-time = "2024-04-04T11:20:37.371Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/a9/82/0340caa499416c78e5d8f5f05947ae4bc3cba53c9f038ab6e9ed964e22f1/nbformat-5.10.4-py3-none-any.whl", hash = "sha256:3b48d6c8fbca4b299bf3982ea7db1af21580e4fec269ad087b9e81588891200b", size = 78454, upload-time = "2024-04-04T11:20:34.895Z" },
+]
+
 [[package]]
 name = "nest-asyncio"
 version = "1.6.0"
@@ -3173,6 +3249,18 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/60/90/81ac364ef94209c100e12579629dc92bf7a709a84af32f8c551b02c07e94/nltk-3.9.2-py3-none-any.whl", hash = "sha256:1e209d2b3009110635ed9709a67a1a3e33a10f799490fa71cf4bec218c11c88a", size = 1513404, upload-time = "2025-10-01T07:19:21.648Z" },
 ]
 
+[[package]]
+name = "notion-client"
+version = "3.0.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "httpx" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/a5/39/60afcbc0148c3dafaaefe851ae3f058077db49d66288dfb218a11a57b997/notion_client-3.0.0.tar.gz", hash = "sha256:05c4d2b4fa3491dc0de21c9c826277202ea8b8714077ee7f51a6e1a09ab23d0f", size = 31357, upload-time = "2026-02-16T11:15:48.024Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/aa/ce/6b03f9aedd2edfcc28e23ced5c2582d543f6ddbb2be5c570533f02890b27/notion_client-3.0.0-py2.py3-none-any.whl", hash = "sha256:177fc3d2ace7e8ef69cf96f46269e8a66071c2c7c526194bf06ce7925853e759", size = 18746, upload-time = "2026-02-16T11:15:46.602Z" },
+]
+
 [[package]]
 name = "numpy"
 version = "2.2.6"
@@ -4789,6 +4877,21 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/aa/76/03af049af4dcee5d27442f71b6924f01f3efb5d2bd34f23fcd563f2cc5f5/python_multipart-0.0.21-py3-none-any.whl", hash = "sha256:cf7a6713e01c87aa35387f4774e812c4361150938d20d232800f75ffcf266090", size = 24541, upload-time = "2025-12-17T09:24:21.153Z" },
 ]
 
+[[package]]
+name = "python-pptx"
+version = "1.0.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "lxml" },
+    { name = "pillow" },
+    { name = "typing-extensions" },
+    { name = "xlsxwriter" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/52/a9/0c0db8d37b2b8a645666f7fd8accea4c6224e013c42b1d5c17c93590cd06/python_pptx-1.0.2.tar.gz", hash = "sha256:479a8af0eaf0f0d76b6f00b0887732874ad2e3188230315290cd1f9dd9cc7095", size = 10109297, upload-time = "2024-08-07T17:33:37.772Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/d9/4f/00be2196329ebbff56ce564aa94efb0fbc828d00de250b1980de1a34ab49/python_pptx-1.0.2-py3-none-any.whl", hash = "sha256:160838e0b8565a8b1f67947675886e9fea18aa5e795db7ae531606d68e785cba", size = 472788, upload-time = "2024-08-07T17:33:28.192Z" },
+]
+
 [[package]]
 name = "pytz"
 version = "2025.2"
@@ -5570,6 +5673,12 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/e1/e3/c164c88b2e5ce7b24d667b9bd83589cf4f3520d97cad01534cd3c4f55fdb/setuptools-81.0.0-py3-none-any.whl", hash = "sha256:fdd925d5c5d9f62e4b74b30d6dd7828ce236fd6ed998a08d81de62ce5a6310d6", size = 1062021, upload-time = "2026-02-06T21:10:37.175Z" },
 ]
 
+[[package]]
+name = "sgmllib3k"
+version = "1.0.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/9e/bd/3704a8c3e0942d711c1299ebf7b9091930adae6675d7c8f476a7ce48653c/sgmllib3k-1.0.0.tar.gz", hash = "sha256:7868fb1c8bfa764c1ac563d3cf369c381d1325d36124933a726f29fcdaa812e9", size = 5750, upload-time = "2010-08-24T14:33:52.445Z" }
+
 [[package]]
 name = "shellingham"
 version = "1.5.4"
@@ -5619,23 +5728,30 @@ dependencies = [
 
 [package.optional-dependencies]
 all = [
+    { name = "asciidoc" },
+    { name = "atlassian-python-api" },
     { name = "azure-storage-blob" },
     { name = "boto3" },
     { name = "chromadb" },
     { name = "ebooklib" },
     { name = "fastapi" },
+    { name = "feedparser" },
     { name = "google-cloud-storage" },
     { name = "google-generativeai" },
     { name = "httpx" },
     { name = "httpx-sse" },
     { name = "mammoth" },
     { name = "mcp" },
+    { name = "nbformat" },
+    { name = "notion-client" },
     { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
     { name = "numpy", version = "2.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
     { name = "openai" },
     { name = "pinecone" },
     { name = "python-docx" },
+    { name = "python-pptx" },
     { name = "sentence-transformers" },
+    { name = "slack-sdk" },
     { name = "sse-starlette" },
     { name = "starlette" },
     { name = "uvicorn" },
@@ -5653,12 +5769,21 @@ all-llms = [
     { name = "google-generativeai" },
     { name = "openai" },
 ]
+asciidoc = [
+    { name = "asciidoc" },
+]
 azure = [
     { name = "azure-storage-blob" },
 ]
+chat = [
+    { name = "slack-sdk" },
+]
 chroma = [
     { name = "chromadb" },
 ]
+confluence = [
+    { name = "atlassian-python-api" },
+]
 docx = [
     { name = "mammoth" },
     { name = "python-docx" },
@@ -5680,6 +5805,9 @@ gcs = [
 gemini = [
     { name = "google-generativeai" },
 ]
+jupyter = [
+    { name = "nbformat" },
+]
 mcp = [
     { name = "httpx" },
     { name = "httpx-sse" },
@@ -5688,18 +5816,27 @@ mcp = [
     { name = "starlette" },
     { name = "uvicorn" },
 ]
+notion = [
+    { name = "notion-client" },
+]
 openai = [
     { name = "openai" },
 ]
 pinecone = [
     { name = "pinecone" },
 ]
+pptx = [
+    { name = "python-pptx" },
+]
 rag-upload = [
     { name = "chromadb" },
     { name = "pinecone" },
     { name = "sentence-transformers" },
     { name = "weaviate-client" },
 ]
+rss = [
+    { name = "feedparser" },
+]
 s3 = [
     { name = "boto3" },
 ]
@@ -5743,6 +5880,10 @@ dev = [
 [package.metadata]
 requires-dist = [
     { name = "anthropic", specifier = ">=0.76.0" },
+    { name = "asciidoc", marker = "extra == 'all'", specifier = ">=10.0.0" },
+    { name = "asciidoc", marker = "extra == 'asciidoc'", specifier = ">=10.0.0" },
+    { name = "atlassian-python-api", marker = "extra == 'all'", specifier = ">=3.41.0" },
+    { name = "atlassian-python-api", marker = "extra == 'confluence'", specifier = ">=3.41.0" },
     { name = "azure-storage-blob", marker = "extra == 'all'", specifier = ">=12.19.0" },
     { name = "azure-storage-blob", marker = "extra == 'all-cloud'", specifier = ">=12.19.0" },
     { name = "azure-storage-blob", marker = "extra == 'azure'", specifier = ">=12.19.0" },
@@ -5759,6 +5900,8 @@ requires-dist = [
     { name = "fastapi", marker = "extra == 'all'", specifier = ">=0.109.0" },
     { name = "fastapi", marker = "extra == 'embedding'", specifier = ">=0.109.0" },
     { name = "faster-whisper", marker = "extra == 'video-full'", specifier = ">=1.0.0" },
+    { name = "feedparser", marker = "extra == 'all'", specifier = ">=6.0.0" },
+    { name = "feedparser", marker = "extra == 'rss'", specifier = ">=6.0.0" },
     { name = "gitpython", specifier = ">=3.1.40" },
     { name = "google-cloud-storage", marker = "extra == 'all'", specifier = ">=2.10.0" },
     { name = "google-cloud-storage", marker = "extra == 'all-cloud'", specifier = ">=2.10.0" },
@@ -5778,7 +5921,11 @@ requires-dist = [
     { name = "mammoth", marker = "extra == 'docx'", specifier = ">=1.6.0" },
     { name = "mcp", marker = "extra == 'all'", specifier = ">=1.25,<2" },
     { name = "mcp", marker = "extra == 'mcp'", specifier = ">=1.25,<2" },
+    { name = "nbformat", marker = "extra == 'all'", specifier = ">=5.9.0" },
+    { name = "nbformat", marker = "extra == 'jupyter'", specifier = ">=5.9.0" },
     { name = "networkx", specifier = ">=3.0" },
+    { name = "notion-client", marker = "extra == 'all'", specifier = ">=2.0.0" },
+    { name = "notion-client", marker = "extra == 'notion'", specifier = ">=2.0.0" },
     { name = "numpy", marker = "extra == 'all'", specifier = ">=1.24.0" },
     { name = "numpy", marker = "extra == 'embedding'", specifier = ">=1.24.0" },
     { name = "openai", marker = "extra == 'all'", specifier = ">=1.0.0" },
@@ -5799,6 +5946,8 @@ requires-dist = [
     { name = "python-docx", marker = "extra == 'all'", specifier = ">=1.1.0" },
     { name = "python-docx", marker = "extra == 'docx'", specifier = ">=1.1.0" },
     { name = "python-dotenv", specifier = ">=1.1.1" },
+    { name = "python-pptx", marker = "extra == 'all'", specifier = ">=0.6.21" },
+    { name = "python-pptx", marker = "extra == 'pptx'", specifier = ">=0.6.21" },
     { name = "pyyaml", specifier = ">=6.0" },
     { name = "requests", specifier = ">=2.32.5" },
     { name = "scenedetect", extras = ["opencv"], marker = "extra == 'video-full'", specifier = ">=0.6.4" },
@@ -5807,6 +5956,8 @@ requires-dist = [
     { name = "sentence-transformers", marker = "extra == 'embedding'", specifier = ">=2.3.0" },
     { name = "sentence-transformers", marker = "extra == 'rag-upload'", specifier = ">=2.2.0" },
     { name = "sentence-transformers", marker = "extra == 'sentence-transformers'", specifier = ">=2.2.0" },
+    { name = "slack-sdk", marker = "extra == 'all'", specifier = ">=3.27.0" },
+    { name = "slack-sdk", marker = "extra == 'chat'", specifier = ">=3.27.0" },
     { name = "sse-starlette", marker = "extra == 'all'", specifier = ">=3.0.2" },
     { name = "sse-starlette", marker = "extra == 'mcp'", specifier = ">=3.0.2" },
     { name = "starlette", marker = "extra == 'all'", specifier = ">=0.48.0" },
@@ -5827,7 +5978,7 @@ requires-dist = [
     { name = "yt-dlp", marker = "extra == 'video'", specifier = ">=2024.12.0" },
     { name = "yt-dlp", marker = "extra == 'video-full'", specifier = ">=2024.12.0" },
 ]
-provides-extras = ["mcp", "gemini", "openai", "all-llms", "s3", "gcs", "azure", "docx", "epub", "video", "video-full", "chroma", "weaviate", "sentence-transformers", "pinecone", "rag-upload", "all-cloud", "embedding", "all"]
+provides-extras = ["mcp", "gemini", "openai", "all-llms", "s3", "gcs", "azure", "docx", "epub", "video", "video-full", "chroma", "weaviate", "sentence-transformers", "pinecone", "rag-upload", "all-cloud", "jupyter", "asciidoc", "pptx", "confluence", "notion", "rss", "chat", "embedding", "all"]
 
 [package.metadata.requires-dev]
 dev = [
@@ -5846,6 +5997,15 @@ dev = [
     { name = "starlette", specifier = ">=0.31.0" },
 ]
 
+[[package]]
+name = "slack-sdk"
+version = "3.41.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/22/35/fc009118a13187dd9731657c60138e5a7c2dea88681a7f04dc406af5da7d/slack_sdk-3.41.0.tar.gz", hash = "sha256:eb61eb12a65bebeca9cb5d36b3f799e836ed2be21b456d15df2627cfe34076ca", size = 250568, upload-time = "2026-03-12T16:10:11.381Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/a1/df/2e4be347ff98281b505cc0ccf141408cdd25eb5ca9f3830deb361b2472d3/slack_sdk-3.41.0-py2.py3-none-any.whl", hash = "sha256:bb18dcdfff1413ec448e759cf807ec3324090993d8ab9111c74081623b692a89", size = 313885, upload-time = "2026-03-12T16:10:09.811Z" },
+]
+
 [[package]]
 name = "smmap"
 version = "5.0.2"
@@ -6233,6 +6393,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/d0/30/dc54f88dd4a2b5dc8a0279bdd7270e735851848b762aeb1c1184ed1f6b14/tqdm-4.67.1-py3-none-any.whl", hash = "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2", size = 78540, upload-time = "2024-11-24T20:12:19.698Z" },
 ]
 
+[[package]]
+name = "traitlets"
+version = "5.14.3"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/eb/79/72064e6a701c2183016abbbfedaba506d81e30e232a68c9f0d6f6fcd1574/traitlets-5.14.3.tar.gz", hash = "sha256:9ed0579d3502c94b4b3732ac120375cda96f923114522847de4b3bb98b96b6b7", size = 161621, upload-time = "2024-04-19T11:11:49.746Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl", hash = "sha256:b74e89e397b1ed28cc831db7aea759ba6640cb3de13090ca145426688ff1ac4f", size = 85359, upload-time = "2024-04-19T11:11:46.763Z" },
+]
+
 [[package]]
 name = "transformers"
 version = "5.1.0"
@@ -6753,6 +6922,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/1f/f6/a933bd70f98e9cf3e08167fc5cd7aaaca49147e48411c0bd5ae701bb2194/wrapt-1.17.3-py3-none-any.whl", hash = "sha256:7171ae35d2c33d326ac19dd8facb1e82e5fd04ef8c6c0e394d7af55a55051c22", size = 23591, upload-time = "2025-08-12T05:53:20.674Z" },
 ]
 
+[[package]]
+name = "xlsxwriter"
+version = "3.2.9"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/46/2c/c06ef49dc36e7954e55b802a8b231770d286a9758b3d936bd1e04ce5ba88/xlsxwriter-3.2.9.tar.gz", hash = "sha256:254b1c37a368c444eac6e2f867405cc9e461b0ed97a3233b2ac1e574efb4140c", size = 215940, upload-time = "2025-09-16T00:16:21.63Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/3a/0c/3662f4a66880196a590b202f0db82d919dd2f89e99a27fadef91c4a33d41/xlsxwriter-3.2.9-py3-none-any.whl", hash = "sha256:9a5db42bc5dff014806c58a20b9eae7322a134abb6fce3c92c181bfb275ec5b3", size = 175315, upload-time = "2025-09-16T00:16:20.108Z" },
+]
+
 [[package]]
 name = "xxhash"
 version = "3.6.0"