diff --git a/AGENTS.md b/AGENTS.md index d0ae247..d26c952 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -1,866 +1,171 @@ # AGENTS.md - Skill Seekers -Essential guidance for AI coding agents working with the Skill Seekers codebase. +Concise reference for AI coding agents. Skill Seekers is a Python CLI tool (v3.2.0) that converts documentation sites, GitHub repos, PDFs, videos, notebooks, wikis, and more into AI-ready skills for 16+ LLM platforms and RAG pipelines. ---- - -## Project Overview - -**Skill Seekers** is a Python CLI tool that converts documentation websites, GitHub repositories, PDF files, and videos into AI-ready skills for LLM platforms and RAG (Retrieval-Augmented Generation) pipelines. It serves as the universal preprocessing layer for AI systems. - -### Key Facts - -| Attribute | Value | -|-----------|-------| -| **Current Version** | 3.1.3 | -| **Python Version** | 3.10+ (tested on 3.10, 3.11, 3.12, 3.13) | -| **License** | MIT | -| **Package Name** | `skill-seekers` (PyPI) | -| **Source Files** | 182 Python files | -| **Test Files** | 105+ test files | -| **Website** | https://skillseekersweb.com/ | -| **Repository** | https://github.com/yusufkaraaslan/Skill_Seekers | - -### Supported Target Platforms - -| Platform | Format | Use Case | -|----------|--------|----------| -| **Claude AI** | ZIP + YAML | Claude Code skills | -| **Google Gemini** | tar.gz | Gemini skills | -| **OpenAI ChatGPT** | ZIP + Vector Store | Custom GPTs | -| **LangChain** | Documents | QA chains, agents, retrievers | -| **LlamaIndex** | TextNodes | Query engines, chat engines | -| **Haystack** | Documents | Enterprise RAG pipelines | -| **Pinecone** | Ready for upsert | Production vector search | -| **Weaviate** | Vector objects | Vector database | -| **Qdrant** | Points | Vector database | -| **Chroma** | Documents | Local vector database | -| **FAISS** | Index files | Local similarity search | -| **Cursor IDE** | .cursorrules | AI coding assistant rules | -| **Windsurf** | 
.windsurfrules | AI coding rules | -| **Cline** | .clinerules + MCP | VS Code extension | -| **Continue.dev** | HTTP context | Universal IDE support | -| **Generic Markdown** | ZIP | Universal export | - -### Core Workflow - -1. **Scrape Phase** - Crawl documentation/GitHub/PDF/video sources -2. **Build Phase** - Organize content into categorized references -3. **Enhancement Phase** - AI-powered quality improvements (optional) -4. **Package Phase** - Create platform-specific packages -5. **Upload Phase** - Auto-upload to target platform (optional) - ---- - -## Project Structure - -``` -/mnt/1ece809a-2821-4f10-aecb-fcdf34760c0b/Git/Skill_Seekers/ -├── src/skill_seekers/ # Main source code (src/ layout) -│ ├── cli/ # CLI tools and commands (~70 modules) -│ │ ├── adaptors/ # Platform adaptors (Strategy pattern) -│ │ │ ├── base.py # Abstract base class (SkillAdaptor) -│ │ │ ├── claude.py # Claude AI adaptor -│ │ │ ├── gemini.py # Google Gemini adaptor -│ │ │ ├── openai.py # OpenAI ChatGPT adaptor -│ │ │ ├── markdown.py # Generic Markdown adaptor -│ │ │ ├── chroma.py # Chroma vector DB adaptor -│ │ │ ├── faiss_helpers.py # FAISS index adaptor -│ │ │ ├── haystack.py # Haystack RAG adaptor -│ │ │ ├── langchain.py # LangChain adaptor -│ │ │ ├── llama_index.py # LlamaIndex adaptor -│ │ │ ├── qdrant.py # Qdrant vector DB adaptor -│ │ │ ├── weaviate.py # Weaviate vector DB adaptor -│ │ │ └── streaming_adaptor.py # Streaming output adaptor -│ │ ├── arguments/ # CLI argument definitions -│ │ ├── parsers/ # Argument parsers -│ │ │ └── extractors/ # Content extractors -│ │ ├── presets/ # Preset configuration management -│ │ ├── storage/ # Cloud storage adaptors -│ │ ├── main.py # Unified CLI entry point -│ │ ├── create_command.py # Unified create command -│ │ ├── doc_scraper.py # Documentation scraper -│ │ ├── github_scraper.py # GitHub repository scraper -│ │ ├── pdf_scraper.py # PDF extraction -│ │ ├── word_scraper.py # Word document scraper -│ │ ├── video_scraper.py # Video 
extraction -│ │ ├── video_setup.py # GPU detection & dependency installation -│ │ ├── unified_scraper.py # Multi-source scraping -│ │ ├── codebase_scraper.py # Local codebase analysis -│ │ ├── enhance_command.py # AI enhancement command -│ │ ├── enhance_skill_local.py # AI enhancement (local mode) -│ │ ├── package_skill.py # Skill packager -│ │ ├── upload_skill.py # Upload to platforms -│ │ ├── cloud_storage_cli.py # Cloud storage CLI -│ │ ├── benchmark_cli.py # Benchmarking CLI -│ │ ├── sync_cli.py # Sync monitoring CLI -│ │ └── workflows_command.py # Workflow management CLI -│ ├── mcp/ # MCP server integration -│ │ ├── server_fastmcp.py # FastMCP server (~708 lines) -│ │ ├── server_legacy.py # Legacy server implementation -│ │ ├── server.py # Server entry point -│ │ ├── agent_detector.py # AI agent detection -│ │ ├── git_repo.py # Git repository operations -│ │ ├── source_manager.py # Config source management -│ │ └── tools/ # MCP tool implementations -│ │ ├── config_tools.py # Configuration tools -│ │ ├── packaging_tools.py # Packaging tools -│ │ ├── scraping_tools.py # Scraping tools -│ │ ├── source_tools.py # Source management tools -│ │ ├── splitting_tools.py # Config splitting tools -│ │ ├── vector_db_tools.py # Vector database tools -│ │ └── workflow_tools.py # Workflow management tools -│ ├── sync/ # Sync monitoring module -│ │ ├── detector.py # Change detection -│ │ ├── models.py # Data models (Pydantic) -│ │ ├── monitor.py # Monitoring logic -│ │ └── notifier.py # Notification system -│ ├── benchmark/ # Benchmarking framework -│ │ ├── framework.py # Benchmark framework -│ │ ├── models.py # Benchmark models -│ │ └── runner.py # Benchmark runner -│ ├── embedding/ # Embedding server -│ │ ├── server.py # FastAPI embedding server -│ │ ├── generator.py # Embedding generation -│ │ ├── cache.py # Embedding cache -│ │ └── models.py # Embedding models -│ ├── workflows/ # YAML workflow presets (66 presets) -│ ├── _version.py # Version information (reads from 
pyproject.toml) -│ └── __init__.py # Package init -├── tests/ # Test suite (105+ test files) -├── configs/ # Preset configuration files -├── docs/ # Documentation (80+ markdown files) -│ ├── integrations/ # Platform integration guides -│ ├── guides/ # User guides -│ ├── reference/ # API reference -│ ├── features/ # Feature documentation -│ ├── blog/ # Blog posts -│ └── roadmap/ # Roadmap documents -├── examples/ # Usage examples -├── .github/workflows/ # CI/CD workflows -├── pyproject.toml # Main project configuration -├── requirements.txt # Pinned dependencies -├── mypy.ini # MyPy type checker configuration -├── Dockerfile # Main Docker image (multi-stage) -├── Dockerfile.mcp # MCP server Docker image -└── docker-compose.yml # Full stack deployment -``` - ---- - -## Build and Development Commands - -### Prerequisites - -- Python 3.10 or higher -- pip or uv package manager -- Git (for GitHub scraping features) - -### Setup (REQUIRED before any development) +## Setup ```bash -# Install in editable mode (REQUIRED for tests due to src/ layout) +# REQUIRED before running tests (src/ layout — tests fail without this) pip install -e . - -# Install with all platform dependencies -pip install -e ".[all-llms]" - -# Install with all optional dependencies -pip install -e ".[all]" - -# Install specific platforms only -pip install -e ".[gemini]" # Google Gemini support -pip install -e ".[openai]" # OpenAI ChatGPT support -pip install -e ".[mcp]" # MCP server dependencies -pip install -e ".[s3]" # AWS S3 support -pip install -e ".[gcs]" # Google Cloud Storage -pip install -e ".[azure]" # Azure Blob Storage -pip install -e ".[embedding]" # Embedding server support -pip install -e ".[rag-upload]" # Vector DB upload support - -# Install dev dependencies (using dependency-groups) +# With dev tools pip install -e ".[dev]" +# With all optional deps +pip install -e ".[all]" ``` -**CRITICAL:** The project uses a `src/` layout. 
Tests WILL FAIL unless you install with `pip install -e .` first. - -### Building +## Build / Test / Lint Commands ```bash -# Build package using uv (recommended) -uv build - -# Or using standard build -python -m build - -# Publish to PyPI -uv publish -``` - -### Docker - -```bash -# Build Docker image -docker build -t skill-seekers . - -# Run with docker-compose (includes vector databases) -docker-compose up -d - -# Run MCP server only -docker-compose up -d mcp-server - -# View logs -docker-compose logs -f mcp-server -``` - ---- - -## Testing Instructions - -### Running Tests - -**CRITICAL:** Never skip tests - all tests must pass before commits. - -```bash -# All tests (must run pip install -e . first!) +# Run ALL tests (never skip tests — all must pass before commits) pytest tests/ -v -# Specific test file +# Run a single test file pytest tests/test_scraper_features.py -v -pytest tests/test_mcp_fastmcp.py -v -pytest tests/test_cloud_storage.py -v -# With coverage -pytest tests/ --cov=src/skill_seekers --cov-report=term --cov-report=html - -# Single test +# Run a single test function pytest tests/test_scraper_features.py::test_detect_language -v -# E2E tests -pytest tests/test_e2e_three_stream_pipeline.py -v +# Run a single test class method +pytest tests/test_adaptors/test_claude_adaptor.py::TestClaudeAdaptor::test_package -v -# Skip slow tests -pytest tests/ -v -m "not slow" - -# Run only integration tests -pytest tests/ -v -m integration - -# Run only specific marker +# Skip slow/integration tests pytest tests/ -v -m "not slow and not integration" -``` -### Test Architecture +# With coverage +pytest tests/ --cov=src/skill_seekers --cov-report=term -- **105+ test files** covering all features -- **CI Matrix:** Ubuntu + macOS, Python 3.10-3.12 -- Test markers defined in `pyproject.toml`: - -| Marker | Description | -|--------|-------------| -| `slow` | Tests taking >5 seconds | -| `integration` | Requires external services (APIs) | -| `e2e` | End-to-end tests 
(resource-intensive) | -| `venv` | Requires virtual environment setup | -| `bootstrap` | Bootstrap skill specific | -| `benchmark` | Performance benchmark tests | - -### Test Configuration - -From `pyproject.toml`: -```toml -[tool.pytest.ini_options] -testpaths = ["tests"] -python_files = ["test_*.py"] -addopts = "-v --tb=short --strict-markers" -asyncio_mode = "auto" -asyncio_default_fixture_loop_scope = "function" -``` - -The `conftest.py` file checks that the package is installed before running tests. - ---- - -## Code Style Guidelines - -### Linting and Formatting - -```bash -# Run ruff linter +# Lint (ruff) ruff check src/ tests/ - -# Run ruff formatter check -ruff format --check src/ tests/ - -# Auto-fix issues ruff check src/ tests/ --fix + +# Format (ruff) +ruff format --check src/ tests/ ruff format src/ tests/ -# Run mypy type checker +# Type check (mypy) mypy src/skill_seekers --show-error-codes --pretty ``` -### Style Rules (from pyproject.toml) +**Test markers:** `slow`, `integration`, `e2e`, `venv`, `bootstrap`, `benchmark` +**Async tests:** use `@pytest.mark.asyncio`; asyncio_mode is `auto`. 
+## Code Style + +### Formatting Rules (ruff — from pyproject.toml) - **Line length:** 100 characters - **Target Python:** 3.10+ -- **Enabled rules:** E, W, F, I, B, C4, UP, ARG, SIM -- **Ignored rules:** E501, F541, ARG002, B007, I001, SIM114 -- **Import sorting:** isort style with `skill_seekers` as first-party +- **Enabled lint rules:** E, W, F, I, B, C4, UP, ARG, SIM +- **Ignored rules:** E501 (line length handled by formatter), F541 (f-string style), ARG002 (unused method args for interface compliance), B007 (intentional unused loop vars), I001 (formatter handles imports), SIM114 (readability preference) -### MyPy Configuration (from pyproject.toml) +### Imports +- Sort with isort (via ruff); `skill_seekers` is first-party +- Standard library → third-party → first-party, separated by blank lines +- Use `from __future__ import annotations` only if needed for forward refs +- Guard optional imports with try/except ImportError (see `adaptors/__init__.py` pattern) -```toml -[tool.mypy] -python_version = "3.10" -warn_return_any = true -warn_unused_configs = true -disallow_untyped_defs = false -disallow_incomplete_defs = false -check_untyped_defs = true -ignore_missing_imports = true -show_error_codes = true -pretty = true +### Naming Conventions +- **Files:** `snake_case.py` +- **Classes:** `PascalCase` (e.g., `SkillAdaptor`, `ClaudeAdaptor`) +- **Functions/methods:** `snake_case` +- **Constants:** `UPPER_CASE` (e.g., `ADAPTORS`, `DEFAULT_CHUNK_TOKENS`) +- **Private:** prefix with `_` + +### Type Hints +- Gradual typing — add hints where practical, not enforced everywhere +- Use modern syntax: `str | None` not `Optional[str]`, `list[str]` not `List[str]` +- MyPy config: `disallow_untyped_defs = false`, `check_untyped_defs = true`, `ignore_missing_imports = true` + +### Docstrings +- Module-level docstring on every file (triple-quoted, describes purpose) +- Google-style or standard docstrings for public functions/classes +- Include `Args:`, `Returns:`, `Raises:` 
sections where useful + +### Error Handling +- Use specific exceptions, never bare `except:` +- Provide helpful error messages with context (see `get_adaptor()` in `adaptors/__init__.py`) +- Use `raise ValueError(...)` for invalid arguments, `raise RuntimeError(...)` for state errors +- Guard optional dependency imports with try/except and give clear install instructions on failure + +### Suppressing Lint Warnings +- Use inline `# noqa: XXXX` comments (e.g., `# noqa: F401` for re-exports, `# noqa: ARG001` for required but unused params) + +## Supported Source Types (17) + +| Type | CLI Command | Config Type | Detection | +|------|------------|-------------|-----------| +| Documentation (web) | `scrape` / `create <url>` | `documentation` | HTTP/HTTPS URLs | +| GitHub repo | `github` / `create owner/repo` | `github` | `owner/repo` or github.com URLs | +| PDF | `pdf` / `create file.pdf` | `pdf` | `.pdf` extension | +| Word (.docx) | `word` / `create file.docx` | `word` | `.docx` extension | +| EPUB | `epub` / `create file.epub` | `epub` | `.epub` extension | +| Video | `video` / `create <url/file>` | `video` | YouTube/Vimeo URLs, video extensions | +| Local codebase | `analyze` / `create ./path` | `local` | Directory paths | +| Jupyter Notebook | `jupyter` / `create file.ipynb` | `jupyter` | `.ipynb` extension | +| Local HTML | `html` / `create file.html` | `html` | `.html`/`.htm` extensions | +| OpenAPI/Swagger | `openapi` / `create spec.yaml` | `openapi` | `.yaml`/`.yml` with OpenAPI content | +| AsciiDoc | `asciidoc` / `create file.adoc` | `asciidoc` | `.adoc`/`.asciidoc` extensions | +| PowerPoint | `pptx` / `create file.pptx` | `pptx` | `.pptx` extension | +| RSS/Atom | `rss` / `create feed.rss` | `rss` | `.rss`/`.atom` extensions | +| Man pages | `manpage` / `create cmd.1` | `manpage` | `.1`-`.8`/`.man` extensions | +| Confluence | `confluence` | `confluence` | API or export directory | +| Notion | `notion` | `notion` | API or export directory | +| Slack/Discord | `chat` | 
`chat` | Export directory or API | + +## Project Layout + +``` +src/skill_seekers/ # Main package (src/ layout) + cli/ # CLI commands and entry points + adaptors/ # Platform adaptors (Strategy pattern, inherit SkillAdaptor) + arguments/ # CLI argument definitions (one per source type) + parsers/ # Subcommand parsers (one per source type) + storage/ # Cloud storage (inherit BaseStorageAdaptor) + main.py # Unified CLI entry point (COMMAND_MODULES dict) + source_detector.py # Auto-detects source type from user input + create_command.py # Unified `create` command routing + config_validator.py # VALID_SOURCE_TYPES set + per-type validation + unified_scraper.py # Multi-source orchestrator (scraped_data + dispatch) + unified_skill_builder.py # Pairwise synthesis + generic merge + mcp/ # MCP server (FastMCP + legacy) + tools/ # MCP tool implementations by category + sync/ # Sync monitoring (Pydantic models) + benchmark/ # Benchmarking framework + embedding/ # FastAPI embedding server + workflows/ # 67 YAML workflow presets (includes complex-merge.yaml) + _version.py # Reads version from pyproject.toml +tests/ # 115+ test files (pytest) +configs/ # Preset JSON scraping configs +docs/ # 80+ markdown doc files ``` -### Code Conventions +## Key Patterns -1. **Use type hints** where practical (gradual typing approach) -2. **Docstrings:** Use Google-style or standard docstrings -3. **Error handling:** Use specific exceptions, provide helpful messages -4. **Async code:** Use `asyncio`, mark tests with `@pytest.mark.asyncio` -5. **File naming:** Use snake_case for all Python files -6. **Class naming:** Use PascalCase for classes -7. **Function naming:** Use snake_case for functions and methods -8. **Constants:** Use UPPER_CASE for module-level constants +**Adaptor (Strategy) pattern** — all platform logic in `cli/adaptors/`. Inherit `SkillAdaptor`, implement `format_skill_md()`, `package()`, `upload()`. Register in `adaptors/__init__.py` ADAPTORS dict. 
---- +**Scraper pattern** — each source type has: `cli/<type>_scraper.py` (with `<Type>ToSkillConverter` class + `main()`), `arguments/<type>.py`, `parsers/<type>_parser.py`. Register in `parsers/__init__.py` PARSERS list, `main.py` COMMAND_MODULES dict, `config_validator.py` VALID_SOURCE_TYPES set. -## Architecture Patterns +**Unified pipeline** — `unified_scraper.py` dispatches to per-type `_scrape_<type>()` methods. `unified_skill_builder.py` uses pairwise synthesis for docs+github+pdf combos and `_generic_merge()` for all other combinations. -### Platform Adaptor Pattern (Strategy Pattern) +**MCP tools** — grouped in `mcp/tools/` by category. `scrape_generic_tool` handles all new source types. -All platform-specific logic is encapsulated in adaptors: - -```python -from skill_seekers.cli.adaptors import get_adaptor - -# Get platform-specific adaptor -adaptor = get_adaptor('gemini') # or 'claude', 'openai', 'langchain', etc. - -# Package skill -adaptor.package(skill_dir='output/react/', output_path='output/') - -# Upload to platform -adaptor.upload( - package_path='output/react-gemini.tar.gz', - api_key=os.getenv('GOOGLE_API_KEY') -) -``` - -Each adaptor inherits from `SkillAdaptor` base class and implements: -- `format_skill_md()` - Format SKILL.md content -- `package()` - Create platform-specific package -- `upload()` - Upload to platform API -- `validate_api_key()` - Validate API key format -- `supports_enhancement()` - Whether AI enhancement is supported - -### CLI Architecture (Git-style) - -Entry point: `src/skill_seekers/cli/main.py` - -The CLI uses subcommands that delegate to existing modules: - -```bash -# skill-seekers scrape --config react.json -# Transforms to: doc_scraper.main() with modified sys.argv -``` - -**Available subcommands:** -- `create` - Unified create command -- `config` - Configuration wizard -- `scrape` - Documentation scraping -- `github` - GitHub repository scraping -- `pdf` - PDF extraction -- `word` - Word document extraction -- `video` - Video extraction 
(YouTube or local). Use `--setup` to auto-detect GPU and install visual deps. -- `unified` - Multi-source scraping -- `analyze` / `codebase` - Local codebase analysis -- `enhance` - AI enhancement -- `package` - Package skill for target platform -- `upload` - Upload to platform -- `cloud` - Cloud storage operations -- `sync` - Sync monitoring -- `benchmark` - Performance benchmarking -- `embed` - Embedding server -- `install` / `install-agent` - Complete workflow -- `stream` - Streaming ingestion -- `update` - Incremental updates -- `multilang` - Multi-language support -- `quality` - Quality metrics -- `resume` - Resume interrupted jobs -- `estimate` - Estimate page counts -- `workflows` - Workflow management - -### MCP Server Architecture - -Two implementations: -- `server_fastmcp.py` - Modern, decorator-based (recommended, ~708 lines) -- `server_legacy.py` - Legacy implementation - -Tools are organized by category: -- Config tools (3 tools): generate_config, list_configs, validate_config -- Scraping tools (10 tools): estimate_pages, scrape_docs, scrape_github, scrape_pdf, scrape_video (supports `setup` parameter for GPU detection and visual dep installation), scrape_codebase, detect_patterns, extract_test_examples, build_how_to_guides, extract_config_patterns -- Packaging tools (4 tools): package_skill, upload_skill, enhance_skill, install_skill -- Source tools (5 tools): fetch_config, submit_config, add_config_source, list_config_sources, remove_config_source -- Splitting tools (2 tools): split_config, generate_router -- Vector Database tools (4 tools): export_to_weaviate, export_to_chroma, export_to_faiss, export_to_qdrant -- Workflow tools (5 tools): list_workflows, get_workflow, create_workflow, update_workflow, delete_workflow - -**Running MCP Server:** -```bash -# Stdio transport (default) -python -m skill_seekers.mcp.server_fastmcp - -# HTTP transport -python -m skill_seekers.mcp.server_fastmcp --http --port 8765 -``` - -### Cloud Storage Architecture - 
-Abstract base class pattern for cloud providers: -- `base_storage.py` - Defines `BaseStorageAdaptor` interface -- `s3_storage.py` - AWS S3 implementation -- `gcs_storage.py` - Google Cloud Storage implementation -- `azure_storage.py` - Azure Blob Storage implementation - -### Sync Monitoring Architecture - -Pydantic-based models in `src/skill_seekers/sync/`: -- `models.py` - Data models (SyncConfig, ChangeReport, SyncState) -- `detector.py` - Change detection logic -- `monitor.py` - Monitoring daemon -- `notifier.py` - Notification system (webhook, email, slack) - ---- +**CLI subcommands** — git-style in `cli/main.py`. Each delegates to a module's `main()` function. ## Git Workflow -### Branch Structure +- **`main`** — production, protected +- **`development`** — default PR target, active dev +- Feature branches created from `development` -``` -main (production) - ↑ - │ (only maintainer merges) - │ -development (integration) ← default branch for PRs - ↑ - │ (all contributor PRs go here) - │ -feature branches -``` - -- **`main`** - Production, always stable, protected -- **`development`** - Active development, default for PRs -- **Feature branches** - Your work, created from `development` - -### Creating a Feature Branch +## Pre-commit Checklist ```bash -# 1. Checkout development -git checkout development -git pull upstream development - -# 2. Create feature branch -git checkout -b my-feature - -# 3. Make changes, commit, push -git add . -git commit -m "Add my feature" -git push origin my-feature - -# 4. 
Create PR targeting 'development' branch -``` - ---- - -## CI/CD Configuration - -### GitHub Actions Workflows - -All workflows are in `.github/workflows/`: - -**`tests.yml`:** -- Runs on: push/PR to `main` and `development` -- Lint job: Ruff + MyPy -- Test matrix: Ubuntu + macOS, Python 3.10-3.12 -- Coverage: Uploads to Codecov - -**`release.yml`:** -- Triggered on version tags (`v*`) -- Builds and publishes to PyPI using `uv` -- Creates GitHub release with changelog - -**`docker-publish.yml`:** -- Builds and publishes Docker images -- Multi-architecture support (linux/amd64, linux/arm64) - -**`vector-db-export.yml`:** -- Tests vector database exports - -**`scheduled-updates.yml`:** -- Scheduled sync monitoring - -**`quality-metrics.yml`:** -- Quality metrics tracking - -**`test-vector-dbs.yml`:** -- Vector database integration tests - -### Pre-commit Checks (Manual) - -```bash -# Before committing, run: ruff check src/ tests/ ruff format --check src/ tests/ -pytest tests/ -v -x # Stop on first failure +pytest tests/ -v -x # stop on first failure ``` ---- +Never commit API keys. Use env vars: `ANTHROPIC_API_KEY`, `GOOGLE_API_KEY`, `OPENAI_API_KEY`, `GITHUB_TOKEN`. -## Security Considerations +## CI -### API Keys and Secrets - -1. **Never commit API keys** to the repository -2. **Use environment variables:** - - `ANTHROPIC_API_KEY` - Claude AI - - `GOOGLE_API_KEY` - Google Gemini - - `OPENAI_API_KEY` - OpenAI - - `GITHUB_TOKEN` - GitHub API - - `AWS_ACCESS_KEY_ID` / `AWS_SECRET_ACCESS_KEY` - AWS S3 - - `GOOGLE_APPLICATION_CREDENTIALS` - GCS - - `AZURE_STORAGE_CONNECTION_STRING` - Azure -3. 
**Configuration storage:** - - Stored at `~/.config/skill-seekers/config.json` - - Permissions: 600 (owner read/write only) - -### Rate Limit Handling - -- GitHub API has rate limits (5000 requests/hour for authenticated) -- The tool has built-in rate limit handling with retry logic -- Use `--non-interactive` flag for CI/CD environments - -### Custom API Endpoints - -Support for Claude-compatible APIs: - -```bash -export ANTHROPIC_API_KEY=your-custom-api-key -export ANTHROPIC_BASE_URL=https://custom-endpoint.com/v1 -``` - ---- - -## Common Development Tasks - -### Adding a New CLI Command - -1. Create module in `src/skill_seekers/cli/my_command.py` -2. Implement `main()` function with argument parsing -3. Add entry point in `pyproject.toml`: - ```toml - [project.scripts] - skill-seekers-my-command = "skill_seekers.cli.my_command:main" - ``` -4. Add subcommand handler in `src/skill_seekers/cli/main.py` -5. Add argument parser in `src/skill_seekers/cli/parsers/` -6. Add tests in `tests/test_my_command.py` - -### Adding a New Platform Adaptor - -1. Create `src/skill_seekers/cli/adaptors/my_platform.py` -2. Inherit from `SkillAdaptor` base class -3. Implement required methods: `package()`, `upload()`, `format_skill_md()` -4. Register in `src/skill_seekers/cli/adaptors/__init__.py` -5. Add optional dependencies in `pyproject.toml` -6. Add tests in `tests/test_adaptors/` - -### Adding an MCP Tool - -1. Implement tool logic in `src/skill_seekers/mcp/tools/category_tools.py` -2. Register in `src/skill_seekers/mcp/server_fastmcp.py` -3. Add test in `tests/test_mcp_fastmcp.py` - -### Adding Cloud Storage Provider - -1. Create module in `src/skill_seekers/cli/storage/my_storage.py` -2. Inherit from `BaseStorageAdaptor` base class -3. Implement required methods: `upload_file()`, `download_file()`, `list_files()`, `delete_file()` -4. Register in `src/skill_seekers/cli/storage/__init__.py` -5. 
Add optional dependencies in `pyproject.toml` - ---- - -## Documentation - -### Project Documentation (New Structure - v3.1.0+) - -**Entry Points:** -- **README.md** - Main project documentation with navigation -- **docs/README.md** - Documentation hub -- **AGENTS.md** - This file, for AI coding agents - -**Getting Started (for new users):** -- `docs/getting-started/01-installation.md` - Installation guide -- `docs/getting-started/02-quick-start.md` - 3 commands to first skill -- `docs/getting-started/03-your-first-skill.md` - Complete walkthrough -- `docs/getting-started/04-next-steps.md` - Where to go from here - -**User Guides (common tasks):** -- `docs/user-guide/01-core-concepts.md` - How Skill Seekers works -- `docs/user-guide/02-scraping.md` - All scraping options -- `docs/user-guide/03-enhancement.md` - AI enhancement explained -- `docs/user-guide/04-packaging.md` - Export to platforms -- `docs/user-guide/05-workflows.md` - Enhancement workflows -- `docs/user-guide/06-troubleshooting.md` - Common issues - -**Reference (technical details):** -- `docs/reference/CLI_REFERENCE.md` - Complete command reference (20 commands) -- `docs/reference/MCP_REFERENCE.md` - MCP tools reference (33 tools) -- `docs/reference/CONFIG_FORMAT.md` - JSON configuration specification -- `docs/reference/ENVIRONMENT_VARIABLES.md` - All environment variables - -**Advanced (power user topics):** -- `docs/advanced/mcp-server.md` - MCP server setup -- `docs/advanced/mcp-tools.md` - Advanced MCP usage -- `docs/advanced/custom-workflows.md` - Creating custom workflows -- `docs/advanced/multi-source.md` - Multi-source scraping - -### Configuration Documentation - -Preset configs are in `configs/` directory: -- `godot.json` / `godot_unified.json` - Godot Engine -- `blender.json` / `blender-unified.json` - Blender Engine -- `claude-code.json` - Claude Code -- `httpx_comprehensive.json` - HTTPX library -- `medusa-mercurjs.json` - Medusa/MercurJS -- `astrovalley_unified.json` - Astrovalley -- 
`react.json` - React documentation -- `configs/integrations/` - Integration-specific configs - ---- - -## Key Dependencies - -### Core Dependencies (Required) - -| Package | Version | Purpose | -|---------|---------|---------| -| `requests` | >=2.32.5 | HTTP requests | -| `beautifulsoup4` | >=4.14.2 | HTML parsing | -| `PyGithub` | >=2.5.0 | GitHub API | -| `GitPython` | >=3.1.40 | Git operations | -| `httpx` | >=0.28.1 | Async HTTP | -| `anthropic` | >=0.76.0 | Claude AI API | -| `PyMuPDF` | >=1.24.14 | PDF processing | -| `Pillow` | >=11.0.0 | Image processing | -| `pytesseract` | >=0.3.13 | OCR | -| `pydantic` | >=2.12.3 | Data validation | -| `pydantic-settings` | >=2.11.0 | Settings management | -| `click` | >=8.3.0 | CLI framework | -| `Pygments` | >=2.19.2 | Syntax highlighting | -| `pathspec` | >=0.12.1 | Path matching | -| `networkx` | >=3.0 | Graph operations | -| `schedule` | >=1.2.0 | Scheduled tasks | -| `python-dotenv` | >=1.1.1 | Environment variables | -| `jsonschema` | >=4.25.1 | JSON validation | -| `PyYAML` | >=6.0 | YAML parsing | -| `langchain` | >=1.2.10 | LangChain integration | -| `llama-index` | >=0.14.15 | LlamaIndex integration | - -### Optional Dependencies - -| Feature | Package | Install Command | -|---------|---------|-----------------| -| MCP Server | `mcp>=1.25,<2` | `pip install -e ".[mcp]"` | -| Google Gemini | `google-generativeai>=0.8.0` | `pip install -e ".[gemini]"` | -| OpenAI | `openai>=1.0.0` | `pip install -e ".[openai]"` | -| AWS S3 | `boto3>=1.34.0` | `pip install -e ".[s3]"` | -| Google Cloud Storage | `google-cloud-storage>=2.10.0` | `pip install -e ".[gcs]"` | -| Azure Blob Storage | `azure-storage-blob>=12.19.0` | `pip install -e ".[azure]"` | -| Word Documents | `mammoth>=1.6.0`, `python-docx>=1.1.0` | `pip install -e ".[docx]"` | -| Video (lightweight) | `yt-dlp>=2024.12.0`, `youtube-transcript-api>=1.2.0` | `pip install -e ".[video]"` | -| Video (full) | +`faster-whisper`, `scenedetect`, `opencv-python-headless` 
(`easyocr` now installed via `--setup`) | `pip install -e ".[video-full]"` | -| Video (GPU setup) | Auto-detects GPU, installs PyTorch + easyocr + all visual deps | `skill-seekers video --setup` | -| Chroma DB | `chromadb>=0.4.0` | `pip install -e ".[chroma]"` | -| Weaviate | `weaviate-client>=3.25.0` | `pip install -e ".[weaviate]"` | -| Pinecone | `pinecone>=5.0.0` | `pip install -e ".[pinecone]"` | -| Embedding Server | `fastapi>=0.109.0`, `uvicorn>=0.27.0`, `sentence-transformers>=2.3.0` | `pip install -e ".[embedding]"` | - -### Dev Dependencies (in dependency-groups) - -| Package | Version | Purpose | -|---------|---------|---------| -| `pytest` | >=8.4.2 | Testing framework | -| `pytest-asyncio` | >=0.24.0 | Async test support | -| `pytest-cov` | >=7.0.0 | Coverage | -| `coverage` | >=7.11.0 | Coverage reporting | -| `ruff` | >=0.14.13 | Linting/formatting | -| `mypy` | >=1.19.1 | Type checking | -| `psutil` | >=5.9.0 | Process utilities for testing | -| `numpy` | >=1.24.0 | Numerical operations | -| `starlette` | >=0.31.0 | HTTP transport testing | -| `httpx` | >=0.24.0 | HTTP client for testing | -| `boto3` | >=1.26.0 | AWS S3 testing | -| `google-cloud-storage` | >=2.10.0 | GCS testing | -| `azure-storage-blob` | >=12.17.0 | Azure testing | - ---- - -## Troubleshooting - -### Common Issues - -**ImportError: No module named 'skill_seekers'** -- Solution: Run `pip install -e .` - -**Tests failing with "package not installed"** -- Solution: Ensure you ran `pip install -e .` in the correct virtual environment - -**MCP server import errors** -- Solution: Install with `pip install -e ".[mcp]"` - -**Type checking failures** -- MyPy is configured to be lenient (gradual typing) -- Focus on critical paths, not full coverage - -**Docker build failures** -- Ensure you have BuildKit enabled: `DOCKER_BUILDKIT=1` -- Check that all submodules are initialized: `git submodule update --init` - -**Rate limit errors from GitHub** -- Set `GITHUB_TOKEN` environment variable for 
authenticated requests -- Improves rate limit from 60 to 5000 requests/hour - -### Getting Help - -- Check **TROUBLESHOOTING.md** for detailed solutions -- Review **docs/FAQ.md** for common questions -- Visit https://skillseekersweb.com/ for documentation -- Open an issue on GitHub with: - - Clear title and description - - Steps to reproduce - - Expected vs actual behavior - - Environment details (OS, Python version) - - Error messages and stack traces - ---- - -## Environment Variables Reference - -| Variable | Purpose | Required For | -|----------|---------|--------------| -| `ANTHROPIC_API_KEY` | Claude AI API access | Claude enhancement/upload | -| `GOOGLE_API_KEY` | Google Gemini API access | Gemini enhancement/upload | -| `OPENAI_API_KEY` | OpenAI API access | OpenAI enhancement/upload | -| `GITHUB_TOKEN` | GitHub API authentication | GitHub scraping (recommended) | -| `AWS_ACCESS_KEY_ID` | AWS S3 authentication | S3 cloud storage | -| `AWS_SECRET_ACCESS_KEY` | AWS S3 authentication | S3 cloud storage | -| `GOOGLE_APPLICATION_CREDENTIALS` | GCS authentication path | GCS cloud storage | -| `AZURE_STORAGE_CONNECTION_STRING` | Azure Blob authentication | Azure cloud storage | -| `ANTHROPIC_BASE_URL` | Custom Claude endpoint | Custom API endpoints | -| `SKILL_SEEKERS_HOME` | Data directory path | Docker/runtime | -| `SKILL_SEEKERS_OUTPUT` | Output directory path | Docker/runtime | - ---- - -## Version Management - -The version is defined in `pyproject.toml` and dynamically read by `src/skill_seekers/_version.py`: - -```python -# _version.py reads from pyproject.toml -__version__ = get_version() # Returns version from pyproject.toml -``` - -**To update version:** -1. Edit `version` in `pyproject.toml` -2. The `_version.py` file will automatically pick up the new version - ---- - -## Configuration File Format - -Skill Seekers uses JSON configuration files to define scraping targets. 
Example structure: - -```json -{ - "name": "godot", - "description": "Godot Engine documentation", - "merge_mode": "claude-enhanced", - "sources": [ - { - "type": "documentation", - "base_url": "https://docs.godotengine.org/en/stable/", - "extract_api": true, - "selectors": { - "main_content": "div[role='main']", - "title": "title", - "code_blocks": "pre" - }, - "url_patterns": { - "include": [], - "exclude": ["/search.html", "/_static/"] - }, - "categories": { - "getting_started": ["introduction", "getting_started"], - "scripting": ["scripting", "gdscript"] - }, - "rate_limit": 0.5, - "max_pages": 500 - }, - { - "type": "github", - "repo": "godotengine/godot", - "enable_codebase_analysis": true, - "code_analysis_depth": "deep", - "fetch_issues": true, - "max_issues": 100 - } - ] -} -``` - ---- - -## Workflow Presets - -Skill Seekers includes 66 YAML workflow presets for AI enhancement in `src/skill_seekers/workflows/`: - -**Built-in presets:** -- `default.yaml` - Standard enhancement workflow -- `minimal.yaml` - Fast, minimal enhancement -- `security-focus.yaml` - Security-focused review -- `architecture-comprehensive.yaml` - Deep architecture analysis -- `api-documentation.yaml` - API documentation focus -- And 61 more specialized presets... - -**Usage:** -```bash -# Apply a preset -skill-seekers create ./my-project --enhance-workflow security-focus - -# Chain multiple presets -skill-seekers create ./my-project --enhance-workflow security-focus --enhance-workflow minimal - -# Manage presets -skill-seekers workflows list -skill-seekers workflows show security-focus -skill-seekers workflows copy security-focus -``` - ---- - -*This document is maintained for AI coding agents. For human contributors, see README.md and CONTRIBUTING.md.* - -*Last updated: 2026-03-01* +GitHub Actions (`.github/workflows/tests.yml`): ruff + mypy lint job, then pytest matrix (Ubuntu + macOS, Python 3.10-3.12) with Codecov upload. 
diff --git a/CHANGELOG.md b/CHANGELOG.md index 220d09e..407d485 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,77 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added + +#### 10 New Skill Source Types (17 total) + +Skill Seekers now supports 17 source types — up from 7. Every new type is fully integrated into the CLI (`skill-seekers `), `create` command auto-detection, unified multi-source configs, config validation, the MCP server, and the skill builder. + +- **Jupyter Notebook** — `skill-seekers jupyter --notebook file.ipynb` or `skill-seekers create file.ipynb` + - Extracts markdown cells, code cells with outputs, kernel metadata, imports, and language detection + - Handles single files and directories of notebooks; filters `.ipynb_checkpoints` + - Optional dependency: `pip install "skill-seekers[jupyter]"` (nbformat) + - Entry point: `skill-seekers-jupyter` + +- **Local HTML** — `skill-seekers html --html-path file.html` or `skill-seekers create file.html` + - Parses HTML using BeautifulSoup with smart main content detection (`
<article>`, `<main>
`, `.content`, largest div) + - Extracts headings, code blocks, tables (to markdown), images, links; converts inline HTML to markdown + - Handles single files and directories; supports `.html`, `.htm`, `.xhtml` extensions + - No extra dependencies (BeautifulSoup is a core dep) + +- **OpenAPI/Swagger** — `skill-seekers openapi --spec spec.yaml` or `skill-seekers create spec.yaml` + - Parses OpenAPI 3.0/3.1 and Swagger 2.0 specs from YAML or JSON (local files or URLs via `--spec-url`) + - Extracts endpoints, parameters, request/response schemas, security schemes, tags + - Resolves `$ref` references with circular reference protection; handles `allOf`/`oneOf`/`anyOf` + - Groups endpoints by tags; generates comprehensive API reference markdown + - Source detection sniffs YAML file content for `openapi:` or `swagger:` keys (avoids false positives on non-API YAML files) + - Optional dependency: `pip install "skill-seekers[openapi]"` (pyyaml — already a core dep, guard added for safety) + +- **AsciiDoc** — `skill-seekers asciidoc --asciidoc-path file.adoc` or `skill-seekers create file.adoc` + - Regex-based parser (no external library required) with optional `asciidoc` library support + - Extracts headings (= through =====), `[source,lang]` code blocks, `|===` tables, admonitions (NOTE/TIP/WARNING/IMPORTANT/CAUTION), and `include::` directives + - Converts AsciiDoc formatting to markdown; handles single files and directories + - Optional dependency: `pip install "skill-seekers[asciidoc]"` (asciidoc library for advanced rendering) + +- **PowerPoint (.pptx)** — `skill-seekers pptx --pptx file.pptx` or `skill-seekers create file.pptx` + - Extracts slide text, speaker notes, tables, images (with alt text), and grouped shapes + - Detects code blocks by monospace font analysis (30+ font families) + - Groups slides into sections by layout type; handles single files and directories + - Optional dependency: `pip install "skill-seekers[pptx]"` (python-pptx) + +- **RSS/Atom Feeds** — 
`skill-seekers rss --feed-url ` / `--feed-path file.rss` or `skill-seekers create feed.rss` + - Parses RSS 2.0, RSS 1.0, and Atom feeds via feedparser + - Optionally follows article links (`--follow-links`, default on) to scrape full page content using BeautifulSoup + - Extracts article titles, summaries, authors, dates, categories; configurable `--max-articles` (default 50) + - Source detection matches `.rss` and `.atom` extensions (`.xml` excluded to avoid false positives) + - Optional dependency: `pip install "skill-seekers[rss]"` (feedparser) + +- **Man Pages** — `skill-seekers manpage --man-names git,curl` / `--man-path dir/` or `skill-seekers create git.1` + - Extracts man pages by running `man` command via subprocess or reading `.1`–`.8`/`.man` files directly + - Handles gzip/bzip2/xz compressed man files; strips troff/groff formatting (backspace overstriking, macros, font escapes) + - Parses structured sections (NAME, SYNOPSIS, DESCRIPTION, OPTIONS, EXAMPLES, SEE ALSO) + - Source detection uses basename heuristic to avoid false positives on log rotation files (e.g., `access.log.1`) + - No external dependencies (stdlib only) + +- **Confluence** — `skill-seekers confluence --base-url --space-key ` or `--export-path dir/` + - API mode: fetches pages from Confluence REST API with pagination (`atlassian-python-api`) + - Export mode: parses Confluence HTML/XML export directories + - Extracts page content, code/panel/info/warning macros, page hierarchy, tables + - Optional dependency: `pip install "skill-seekers[confluence]"` (atlassian-python-api) + +- **Notion** — `skill-seekers notion --database-id ` / `--page-id ` or `--export-path dir/` + - API mode: fetches pages via Notion API with support for 20+ block types (paragraph, heading, code, callout, toggle, table, etc.) 
+ - Export mode: parses Notion Markdown/CSV export directories + - Extracts rich text with annotations (bold, italic, code, links), 16+ property types for database entries + - Optional dependency: `pip install "skill-seekers[notion]"` (notion-client) + +- **Slack/Discord Chat** — `skill-seekers chat --export-path dir/` or `--token --channel ` + - Slack: parses workspace JSON exports or fetches via Slack Web API (`slack_sdk`) + - Discord: parses DiscordChatExporter JSON or fetches via Discord HTTP API + - Extracts messages, code snippets (fenced blocks), shared URLs, threads, reactions, attachments + - Generates per-channel summaries and topic categorization + - Optional dependency: `pip install "skill-seekers[chat]"` (slack-sdk) + +#### EPUB Unified Pipeline Integration - **EPUB (.epub) input support** via `skill-seekers create book.epub` or `skill-seekers epub --epub book.epub` - Extracts chapters, metadata (Dublin Core), code blocks, images, and tables from EPUB 2 and EPUB 3 files - DRM detection with clear error messages (Adobe ADEPT, Apple FairPlay, Readium LCP) @@ -16,6 +87,61 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `--help-epub` flag for EPUB-specific help - Optional dependency: `pip install "skill-seekers[epub]"` (ebooklib) - 107 tests across 14 test classes +- **EPUB added to unified scraper** — `_scrape_epub()` method, `scraped_data["epub"]`, config validation (`_validate_epub_source`), and dry-run display. Previously EPUB worked standalone but was missing from multi-source configs. + +#### Unified Skill Builder — Generic Merge System +- **`_generic_merge()`** — Priority-based section merge for any combination of source types not covered by existing pairwise synthesis (docs+github, docs+pdf, etc.). Produces YAML frontmatter + source-attributed sections. +- **`_append_extra_sources()`** — Appends additional source type content (e.g., Jupyter + PPTX) to pairwise-synthesized SKILL.md. 
+- **`_generate_generic_references()`** — Generates `references//index.md` for any source type, with ID resolution fallback chain. +- **`_SOURCE_LABELS`** dict — Human-readable labels for all 17 source types used in merge attribution. + +#### Config Validator Expansion +- **17 source types in `VALID_SOURCE_TYPES`** — All new types plus `word` and `video` now have per-type validation methods. +- **`_validate_word_source()`** — Validates `path` field for Word documents (was previously missing). +- **`_validate_video_source()`** — Validates `url`, `path`, or `playlist` field for video sources (was previously missing). +- **11 new `_validate_*_source()` methods** — One for each new type with appropriate required-field checks. + +#### Source Detection Improvements +- **7 new file extension detections** in `SourceDetector.detect()` — `.ipynb`, `.html`/`.htm`, `.pptx`, `.adoc`/`.asciidoc`, `.rss`/`.atom`, `.1`–`.8`/`.man`, `.yaml`/`.yml` (with content sniffing) +- **`_looks_like_openapi()`** — Content sniffing for YAML files: only classifies as OpenAPI if the file contains `openapi:` or `swagger:` key in first 20 lines (prevents false positives on docker-compose, Ansible, Kubernetes manifests, etc.) 
+- **Man page basename heuristic** — `.1`–`.8` extensions only detected as man pages if the basename has no dots (e.g., `git.1` matches but `access.log.1` does not) +- **`.xml` excluded from RSS detection** — Too generic; only `.rss` and `.atom` trigger RSS detection + +#### MCP Server Integration +- **`scrape_generic` tool** — New MCP tool handles all 10 new source types via subprocess with per-type flag mapping +- **`_PATH_FLAGS` / `_URL_FLAGS` dicts** — Correct flag routing for each source type (e.g., jupyter→`--notebook`, html→`--html-path`, rss→`--feed-url`) +- **`GENERIC_SOURCE_TYPES` tuple** — Lists all 10 new types for validation +- **Config validation display** — `validate_config` tool now shows source details for all new types +- **Tool count updated** — 33 → 34 tools (scraping tools 10 → 11) + +#### CLI Wiring +- **10 new CLI subcommands** — `jupyter`, `html`, `openapi`, `asciidoc`, `pptx`, `rss`, `manpage`, `confluence`, `notion`, `chat` in `COMMAND_MODULES` +- **10 new argument modules** — `arguments/{jupyter,html,openapi,asciidoc,pptx,rss,manpage,confluence,notion,chat}.py` with per-type `*_ARGUMENTS` dicts +- **10 new parser modules** — `parsers/{jupyter,html,openapi,asciidoc,pptx,rss,manpage,confluence,notion,chat}_parser.py` with `SubcommandParser` implementations +- **`create` command routing** — `_route_generic()` method for all new types with correct module names and CLI flags +- **10 new entry points** in pyproject.toml — `skill-seekers-{jupyter,html,openapi,asciidoc,pptx,rss,manpage,confluence,notion,chat}` +- **7 new optional dependency groups** in pyproject.toml — `[jupyter]`, `[asciidoc]`, `[pptx]`, `[confluence]`, `[notion]`, `[rss]`, `[chat]` +- **`[all]` group updated** — Includes all 7 new optional dependencies + +#### Workflow & Documentation +- **`complex-merge.yaml`** — New 7-stage AI-powered workflow for complex multi-source merging (source inventory → cross-reference → conflict detection → priority merge → gap analysis → synthesis 
→ quality check) +- **AGENTS.md rewritten** — Updated with all 17 source types, scraper pattern docs, project layout, and key pattern documentation +- **77 new integration tests** in `test_new_source_types.py` — Source detection, config validation, generic merge, CLI wiring, validation, and create command routing + +### Fixed +- **Config validator missing `word` and `video` dispatch** — `_validate_source()` had no `elif` branches for `word` or `video` types, silently skipping validation. Added dispatch entries and `_validate_word_source()` / `_validate_video_source()` methods. +- **`openapi_scraper.py` unconditional `import yaml`** — Would crash at import time if pyyaml not installed. Added `try/except ImportError` guard with `YAML_AVAILABLE` flag and `_check_yaml_deps()` helper. +- **`asciidoc_scraper.py` missing standard arguments** — `main()` manually defined args instead of using `add_asciidoc_arguments()`. Refactored to use shared argument definitions + added enhancement workflow integration. +- **`pptx_scraper.py` missing standard arguments** — Same issue. Refactored to use `add_pptx_arguments()`. +- **`chat_scraper.py` missing standard arguments** — Same issue. Refactored to use `add_chat_arguments()`. +- **`notion_scraper.py` missing `run_workflows` call** — `--enhance-workflow` flags were silently ignored. Added workflow runner integration. +- **`openapi_scraper.py` return type `None`** — `main()` returned `None` instead of `int`. Fixed to `return 0` on success, matching all other scrapers. +- **MCP `scrape_generic_tool` flag mismatch** — Was passing `--path`/`--url` as generic flags, but every scraper expects its own flag name (e.g., `--notebook`, `--html-path`, `--spec`). All 10 source types would have failed at runtime. Fixed with per-type `_PATH_FLAGS` and `_URL_FLAGS` mappings. +- **Word scraper `docx_id` key mismatch** — Unified scraper data dict used `docx_id` but generic reference generation looked for `word_id`. Added `word_id` alias. 
+- **`main.py` docstring stale** — Missing all 10 new commands. Updated to list all 27 commands. +- **`source_detector.py` module docstring stale** — Described only 5 source types. Updated to describe 14+ detected types. +- **`manpage_parser.py` docstring referenced wrong file** — Said `manpage_scraper.py` but actual file is `man_scraper.py`. Fixed. +- **Parser registry test count** — Updated expected count from 25 to 35 for 10 new parsers. ## [3.2.0] - 2026-03-01 diff --git a/pyproject.toml b/pyproject.toml index 5b10fed..962392b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -168,6 +168,35 @@ all-cloud = [ "azure-storage-blob>=12.19.0", ] +# New source type dependencies (v3.2.0+) +jupyter = [ + "nbformat>=5.9.0", +] + +asciidoc = [ + "asciidoc>=10.0.0", +] + +pptx = [ + "python-pptx>=0.6.21", +] + +confluence = [ + "atlassian-python-api>=3.41.0", +] + +notion = [ + "notion-client>=2.0.0", +] + +rss = [ + "feedparser>=6.0.0", +] + +chat = [ + "slack-sdk>=3.27.0", +] + # Embedding server support embedding = [ "fastapi>=0.109.0", @@ -204,6 +233,14 @@ all = [ "sentence-transformers>=2.3.0", "numpy>=1.24.0", "voyageai>=0.2.0", + # New source types (v3.2.0+) + "nbformat>=5.9.0", + "asciidoc>=10.0.0", + "python-pptx>=0.6.21", + "atlassian-python-api>=3.41.0", + "notion-client>=2.0.0", + "feedparser>=6.0.0", + "slack-sdk>=3.27.0", ] [project.urls] @@ -253,6 +290,18 @@ skill-seekers-quality = "skill_seekers.cli.quality_metrics:main" skill-seekers-workflows = "skill_seekers.cli.workflows_command:main" skill-seekers-sync-config = "skill_seekers.cli.sync_config:main" +# New source type entry points (v3.2.0+) +skill-seekers-jupyter = "skill_seekers.cli.jupyter_scraper:main" +skill-seekers-html = "skill_seekers.cli.html_scraper:main" +skill-seekers-openapi = "skill_seekers.cli.openapi_scraper:main" +skill-seekers-asciidoc = "skill_seekers.cli.asciidoc_scraper:main" +skill-seekers-pptx = "skill_seekers.cli.pptx_scraper:main" +skill-seekers-rss = 
"skill_seekers.cli.rss_scraper:main" +skill-seekers-manpage = "skill_seekers.cli.man_scraper:main" +skill-seekers-confluence = "skill_seekers.cli.confluence_scraper:main" +skill-seekers-notion = "skill_seekers.cli.notion_scraper:main" +skill-seekers-chat = "skill_seekers.cli.chat_scraper:main" + [tool.setuptools] package-dir = {"" = "src"} diff --git a/src/skill_seekers/cli/arguments/asciidoc.py b/src/skill_seekers/cli/arguments/asciidoc.py new file mode 100644 index 0000000..2ea6e30 --- /dev/null +++ b/src/skill_seekers/cli/arguments/asciidoc.py @@ -0,0 +1,68 @@ +"""AsciiDoc command argument definitions. + +This module defines ALL arguments for the asciidoc command in ONE place. +Both asciidoc_scraper.py (standalone) and parsers/asciidoc_parser.py (unified CLI) +import and use these definitions. + +Shared arguments (name, description, output, enhance-level, api-key, +dry-run, verbose, quiet, workflow args) come from common.py / workflow.py +via ``add_all_standard_arguments()``. +""" + +import argparse +from typing import Any + +from .common import add_all_standard_arguments + +# AsciiDoc-specific argument definitions as data structure +# NOTE: Shared args (name, description, output, enhance_level, api_key, dry_run, +# verbose, quiet, workflow args) are registered by add_all_standard_arguments(). +ASCIIDOC_ARGUMENTS: dict[str, dict[str, Any]] = { + "asciidoc_path": { + "flags": ("--asciidoc-path",), + "kwargs": { + "type": str, + "help": "Path to AsciiDoc file or directory containing .adoc files", + "metavar": "PATH", + }, + }, + "from_json": { + "flags": ("--from-json",), + "kwargs": { + "type": str, + "help": "Build skill from extracted JSON", + "metavar": "FILE", + }, + }, +} + + +def add_asciidoc_arguments(parser: argparse.ArgumentParser) -> None: + """Add all asciidoc command arguments to a parser. 
+ + Registers shared args (name, description, output, enhance-level, api-key, + dry-run, verbose, quiet, workflow args) via add_all_standard_arguments(), + then adds AsciiDoc-specific args on top. + + The default for --enhance-level is overridden to 0 (disabled) for AsciiDoc. + """ + # Shared universal args first + add_all_standard_arguments(parser) + + # Override enhance-level default to 0 for AsciiDoc + for action in parser._actions: + if hasattr(action, "dest") and action.dest == "enhance_level": + action.default = 0 + action.help = ( + "AI enhancement level (auto-detects API vs LOCAL mode): " + "0=disabled (default for AsciiDoc), 1=SKILL.md only, " + "2=+architecture/config, 3=full enhancement. " + "Mode selection: uses API if ANTHROPIC_API_KEY is set, " + "otherwise LOCAL (Claude Code)" + ) + + # AsciiDoc-specific args + for arg_name, arg_def in ASCIIDOC_ARGUMENTS.items(): + flags = arg_def["flags"] + kwargs = arg_def["kwargs"] + parser.add_argument(*flags, **kwargs) diff --git a/src/skill_seekers/cli/arguments/chat.py b/src/skill_seekers/cli/arguments/chat.py new file mode 100644 index 0000000..563f162 --- /dev/null +++ b/src/skill_seekers/cli/arguments/chat.py @@ -0,0 +1,102 @@ +"""Chat command argument definitions. + +This module defines ALL arguments for the chat command in ONE place. +Both chat_scraper.py (standalone) and parsers/chat_parser.py (unified CLI) +import and use these definitions. + +Shared arguments (name, description, output, enhance-level, api-key, +dry-run, verbose, quiet, workflow args) come from common.py / workflow.py +via ``add_all_standard_arguments()``. +""" + +import argparse +from typing import Any + +from .common import add_all_standard_arguments + +# Chat-specific argument definitions as data structure +# NOTE: Shared args (name, description, output, enhance_level, api_key, dry_run, +# verbose, quiet, workflow args) are registered by add_all_standard_arguments(). 
# Chat-specific argument definitions as a data structure.
# Each entry holds the ``flags`` tuple and ``kwargs`` dict that are unpacked
# into ``parser.add_argument``; the outer key is only a label here.
CHAT_ARGUMENTS: dict[str, dict[str, Any]] = {
    "export_path": {
        "flags": ("--export-path",),
        "kwargs": {
            "type": str,
            "help": "Path to chat export directory or file",
            "metavar": "PATH",
        },
    },
    "platform": {
        "flags": ("--platform",),
        "kwargs": {
            "type": str,
            "choices": ["slack", "discord"],
            "default": "slack",
            "help": "Chat platform type (default: slack)",
        },
    },
    # NOTE(review): a secret passed as a CLI flag can end up in shell history
    # and process listings -- confirm whether the scraper also accepts the
    # token from an environment variable.
    "token": {
        "flags": ("--token",),
        "kwargs": {
            "type": str,
            "help": "API token for chat platform authentication",
            "metavar": "TOKEN",
        },
    },
    "channel": {
        "flags": ("--channel",),
        "kwargs": {
            "type": str,
            "help": "Channel name or ID to extract from",
            "metavar": "CHANNEL",
        },
    },
    "max_messages": {
        "flags": ("--max-messages",),
        "kwargs": {
            "type": int,
            "default": 10000,
            "help": "Maximum number of messages to extract (default: 10000)",
            "metavar": "N",
        },
    },
    "from_json": {
        "flags": ("--from-json",),
        "kwargs": {
            "type": str,
            "help": "Build skill from extracted JSON",
            "metavar": "FILE",
        },
    },
}


def add_chat_arguments(parser: argparse.ArgumentParser) -> None:
    """Add all chat command arguments to a parser.

    Registers shared args (name, description, output, enhance-level, api-key,
    dry-run, verbose, quiet, workflow args) via add_all_standard_arguments(),
    then adds Chat-specific args on top.

    The default for --enhance-level is overridden to 0 (disabled) for Chat.

    Args:
        parser: Parser to populate. It is modified in place.
    """
    # Shared universal args first
    add_all_standard_arguments(parser)

    # Override enhance-level default to 0 for Chat.
    # argparse exposes no public accessor for already-registered actions, so the
    # private ``_actions`` list is walked. Every argparse Action carries ``dest``,
    # so no ``hasattr`` guard is needed.
    for action in parser._actions:
        if action.dest == "enhance_level":
            action.default = 0
            action.help = (
                "AI enhancement level (auto-detects API vs LOCAL mode): "
                "0=disabled (default for Chat), 1=SKILL.md only, "
                "2=+architecture/config, 3=full enhancement. "
                "Mode selection: uses API if ANTHROPIC_API_KEY is set, "
                "otherwise LOCAL (Claude Code)"
            )

    # Chat-specific args (only the definitions are needed, not the keys)
    for arg_def in CHAT_ARGUMENTS.values():
        parser.add_argument(*arg_def["flags"], **arg_def["kwargs"])
# Confluence-specific argument definitions as a data structure.
# Each entry holds the ``flags`` tuple and ``kwargs`` dict that are unpacked
# into ``parser.add_argument``; the outer key is only a label here.
CONFLUENCE_ARGUMENTS: dict[str, dict[str, Any]] = {
    "base_url": {
        "flags": ("--base-url",),
        "kwargs": {
            "type": str,
            "help": "Confluence instance base URL",
            "metavar": "URL",
        },
    },
    "space_key": {
        "flags": ("--space-key",),
        "kwargs": {
            "type": str,
            "help": "Confluence space key to extract from",
            "metavar": "KEY",
        },
    },
    "export_path": {
        "flags": ("--export-path",),
        "kwargs": {
            "type": str,
            "help": "Path to Confluence HTML/XML export directory",
            "metavar": "PATH",
        },
    },
    "username": {
        "flags": ("--username",),
        "kwargs": {
            "type": str,
            "help": "Confluence username for API authentication",
            "metavar": "USER",
        },
    },
    # NOTE(review): a secret passed as a CLI flag can end up in shell history
    # and process listings -- confirm whether the scraper also accepts the
    # token from an environment variable.
    "token": {
        "flags": ("--token",),
        "kwargs": {
            "type": str,
            "help": "Confluence API token for authentication",
            "metavar": "TOKEN",
        },
    },
    "max_pages": {
        "flags": ("--max-pages",),
        "kwargs": {
            "type": int,
            "default": 500,
            "help": "Maximum number of pages to extract (default: 500)",
            "metavar": "N",
        },
    },
    "from_json": {
        "flags": ("--from-json",),
        "kwargs": {
            "type": str,
            "help": "Build skill from extracted JSON",
            "metavar": "FILE",
        },
    },
}


def add_confluence_arguments(parser: argparse.ArgumentParser) -> None:
    """Add all confluence command arguments to a parser.

    Registers shared args (name, description, output, enhance-level, api-key,
    dry-run, verbose, quiet, workflow args) via add_all_standard_arguments(),
    then adds Confluence-specific args on top.

    The default for --enhance-level is overridden to 0 (disabled) for Confluence.

    Args:
        parser: Parser to populate. It is modified in place.
    """
    # Shared universal args first
    add_all_standard_arguments(parser)

    # Override enhance-level default to 0 for Confluence.
    # argparse exposes no public accessor for already-registered actions, so the
    # private ``_actions`` list is walked. Every argparse Action carries ``dest``,
    # so no ``hasattr`` guard is needed.
    for action in parser._actions:
        if action.dest == "enhance_level":
            action.default = 0
            action.help = (
                "AI enhancement level (auto-detects API vs LOCAL mode): "
                "0=disabled (default for Confluence), 1=SKILL.md only, "
                "2=+architecture/config, 3=full enhancement. "
                "Mode selection: uses API if ANTHROPIC_API_KEY is set, "
                "otherwise LOCAL (Claude Code)"
            )

    # Confluence-specific args (only the definitions are needed, not the keys)
    for arg_def in CONFLUENCE_ARGUMENTS.values():
        parser.add_argument(*arg_def["flags"], **arg_def["kwargs"])
# Primary input flags for the source types added in v3.2.0+, as seen by the
# `create` command. Every entry is a plain string-valued option, so the
# definitions are assembled by one small helper instead of repeated literals.
#
# Confluence, Notion and chat use prefixed flag names here (--conf-base-url,
# --conf-export-path, --notion-export-path, --chat-export-path), presumably so
# they can coexist when several source types are registered on one parser.


def _str_flag(flag: str, help_text: str, metavar: str) -> dict[str, Any]:
    """Build the definition of a single-flag, string-valued argument."""
    return {
        "flags": (flag,),
        "kwargs": {"type": str, "help": help_text, "metavar": metavar},
    }


JUPYTER_ARGUMENTS: dict[str, dict[str, Any]] = {
    "notebook": _str_flag("--notebook", "Jupyter Notebook file path (.ipynb)", "PATH"),
}

HTML_ARGUMENTS: dict[str, dict[str, Any]] = {
    "html_path": _str_flag("--html-path", "Local HTML file or directory path", "PATH"),
}

OPENAPI_ARGUMENTS: dict[str, dict[str, Any]] = {
    "spec": _str_flag("--spec", "OpenAPI/Swagger spec file path", "PATH"),
    "spec_url": _str_flag("--spec-url", "OpenAPI/Swagger spec URL", "URL"),
}

ASCIIDOC_ARGUMENTS: dict[str, dict[str, Any]] = {
    "asciidoc_path": _str_flag(
        "--asciidoc-path", "AsciiDoc file or directory path", "PATH"
    ),
}

PPTX_ARGUMENTS: dict[str, dict[str, Any]] = {
    "pptx": _str_flag("--pptx", "PowerPoint file path (.pptx)", "PATH"),
}

RSS_ARGUMENTS: dict[str, dict[str, Any]] = {
    "feed_url": _str_flag("--feed-url", "RSS/Atom feed URL", "URL"),
    "feed_path": _str_flag("--feed-path", "RSS/Atom feed file path", "PATH"),
}

MANPAGE_ARGUMENTS: dict[str, dict[str, Any]] = {
    "man_names": _str_flag(
        "--man-names",
        "Comma-separated man page names (e.g., 'git,curl')",
        "NAMES",
    ),
    "man_path": _str_flag("--man-path", "Directory of man page files", "PATH"),
}

CONFLUENCE_ARGUMENTS: dict[str, dict[str, Any]] = {
    "conf_base_url": _str_flag("--conf-base-url", "Confluence base URL", "URL"),
    "space_key": _str_flag("--space-key", "Confluence space key", "KEY"),
    "conf_export_path": _str_flag(
        "--conf-export-path", "Confluence export directory", "PATH"
    ),
}

NOTION_ARGUMENTS: dict[str, dict[str, Any]] = {
    "database_id": _str_flag("--database-id", "Notion database ID", "ID"),
    "page_id": _str_flag("--page-id", "Notion page ID", "ID"),
    "notion_export_path": _str_flag(
        "--notion-export-path", "Notion export directory", "PATH"
    ),
}

CHAT_ARGUMENTS: dict[str, dict[str, Any]] = {
    "chat_export_path": _str_flag(
        "--chat-export-path", "Slack/Discord export directory", "PATH"
    ),
    # The only entry with choices and a default, so it is spelled out in full.
    "platform": {
        "flags": ("--platform",),
        "kwargs": {
            "type": str,
            "choices": ["slack", "discord"],
            "default": "slack",
            "help": "Chat platform (default: slack)",
        },
    },
}
parser.add_argument(*arg_def["flags"], **arg_def["kwargs"]) + # New source types (v3.2.0+) + _NEW_SOURCE_ARGS = { + "jupyter": JUPYTER_ARGUMENTS, + "html": HTML_ARGUMENTS, + "openapi": OPENAPI_ARGUMENTS, + "asciidoc": ASCIIDOC_ARGUMENTS, + "pptx": PPTX_ARGUMENTS, + "rss": RSS_ARGUMENTS, + "manpage": MANPAGE_ARGUMENTS, + "confluence": CONFLUENCE_ARGUMENTS, + "notion": NOTION_ARGUMENTS, + "chat": CHAT_ARGUMENTS, + } + for stype, sargs in _NEW_SOURCE_ARGS.items(): + if mode in [stype, "all"]: + for arg_name, arg_def in sargs.items(): + parser.add_argument(*arg_def["flags"], **arg_def["kwargs"]) + # Add advanced arguments if requested if mode in ["advanced", "all"]: for arg_name, arg_def in ADVANCED_ARGUMENTS.items(): diff --git a/src/skill_seekers/cli/arguments/html.py b/src/skill_seekers/cli/arguments/html.py new file mode 100644 index 0000000..56ee554 --- /dev/null +++ b/src/skill_seekers/cli/arguments/html.py @@ -0,0 +1,68 @@ +"""HTML command argument definitions. + +This module defines ALL arguments for the html command in ONE place. +Both html_scraper.py (standalone) and parsers/html_parser.py (unified CLI) +import and use these definitions. + +Shared arguments (name, description, output, enhance-level, api-key, +dry-run, verbose, quiet, workflow args) come from common.py / workflow.py +via ``add_all_standard_arguments()``. +""" + +import argparse +from typing import Any + +from .common import add_all_standard_arguments + +# HTML-specific argument definitions as data structure +# NOTE: Shared args (name, description, output, enhance_level, api_key, dry_run, +# verbose, quiet, workflow args) are registered by add_all_standard_arguments(). 
def add_html_arguments(parser: argparse.ArgumentParser) -> None:
    """Register every argument of the html command on *parser*.

    Works in three steps: the shared arguments are added through
    ``add_all_standard_arguments()``; every registered action whose ``dest``
    is ``enhance_level`` gets its default set to 0 and its help text replaced
    with the HTML wording; finally each entry of ``HTML_ARGUMENTS`` is added.

    Args:
        parser: Parser that receives the arguments (modified in place).
    """
    add_all_standard_arguments(parser)

    enhance_help = (
        "AI enhancement level (auto-detects API vs LOCAL mode): "
        "0=disabled (default for HTML), 1=SKILL.md only, "
        "2=+architecture/config, 3=full enhancement. "
        "Mode selection: uses API if ANTHROPIC_API_KEY is set, "
        "otherwise LOCAL (Claude Code)"
    )
    # The shared enhance-level action is located by its dest on the parser's
    # action list, then switched to "disabled by default".
    enhance_actions = [
        candidate
        for candidate in parser._actions
        if getattr(candidate, "dest", None) == "enhance_level"
    ]
    for enhance_action in enhance_actions:
        enhance_action.default = 0
        enhance_action.help = enhance_help

    # Source-specific flags come last, straight from the declarative table.
    for spec in HTML_ARGUMENTS.values():
        parser.add_argument(*spec["flags"], **spec["kwargs"])
def add_jupyter_arguments(parser: argparse.ArgumentParser) -> None:
    """Register every argument of the jupyter command on *parser*.

    Works in three steps: the shared arguments are added through
    ``add_all_standard_arguments()``; every registered action whose ``dest``
    is ``enhance_level`` gets its default set to 0 and its help text replaced
    with the Jupyter wording; finally each entry of ``JUPYTER_ARGUMENTS`` is
    added.

    Args:
        parser: Parser that receives the arguments (modified in place).
    """
    add_all_standard_arguments(parser)

    enhance_help = (
        "AI enhancement level (auto-detects API vs LOCAL mode): "
        "0=disabled (default for Jupyter), 1=SKILL.md only, "
        "2=+architecture/config, 3=full enhancement. "
        "Mode selection: uses API if ANTHROPIC_API_KEY is set, "
        "otherwise LOCAL (Claude Code)"
    )
    # The shared enhance-level action is located by its dest on the parser's
    # action list, then switched to "disabled by default".
    enhance_actions = [
        candidate
        for candidate in parser._actions
        if getattr(candidate, "dest", None) == "enhance_level"
    ]
    for enhance_action in enhance_actions:
        enhance_action.default = 0
        enhance_action.help = enhance_help

    # Source-specific flags come last, straight from the declarative table.
    for spec in JUPYTER_ARGUMENTS.values():
        parser.add_argument(*spec["flags"], **spec["kwargs"])
def add_manpage_arguments(parser: argparse.ArgumentParser) -> None:
    """Register every argument of the manpage command on *parser*.

    Works in three steps: the shared arguments are added through
    ``add_all_standard_arguments()``; every registered action whose ``dest``
    is ``enhance_level`` gets its default set to 0 and its help text replaced
    with the ManPage wording; finally each entry of ``MANPAGE_ARGUMENTS`` is
    added.

    Args:
        parser: Parser that receives the arguments (modified in place).
    """
    add_all_standard_arguments(parser)

    enhance_help = (
        "AI enhancement level (auto-detects API vs LOCAL mode): "
        "0=disabled (default for ManPage), 1=SKILL.md only, "
        "2=+architecture/config, 3=full enhancement. "
        "Mode selection: uses API if ANTHROPIC_API_KEY is set, "
        "otherwise LOCAL (Claude Code)"
    )
    # The shared enhance-level action is located by its dest on the parser's
    # action list, then switched to "disabled by default".
    enhance_actions = [
        candidate
        for candidate in parser._actions
        if getattr(candidate, "dest", None) == "enhance_level"
    ]
    for enhance_action in enhance_actions:
        enhance_action.default = 0
        enhance_action.help = enhance_help

    # Source-specific flags come last, straight from the declarative table.
    for spec in MANPAGE_ARGUMENTS.values():
        parser.add_argument(*spec["flags"], **spec["kwargs"])
def add_notion_arguments(parser: argparse.ArgumentParser) -> None:
    """Register every argument of the notion command on *parser*.

    Works in three steps: the shared arguments are added through
    ``add_all_standard_arguments()``; every registered action whose ``dest``
    is ``enhance_level`` gets its default set to 0 and its help text replaced
    with the Notion wording; finally each entry of ``NOTION_ARGUMENTS`` is
    added.

    Args:
        parser: Parser that receives the arguments (modified in place).
    """
    add_all_standard_arguments(parser)

    enhance_help = (
        "AI enhancement level (auto-detects API vs LOCAL mode): "
        "0=disabled (default for Notion), 1=SKILL.md only, "
        "2=+architecture/config, 3=full enhancement. "
        "Mode selection: uses API if ANTHROPIC_API_KEY is set, "
        "otherwise LOCAL (Claude Code)"
    )
    # The shared enhance-level action is located by its dest on the parser's
    # action list, then switched to "disabled by default".
    enhance_actions = [
        candidate
        for candidate in parser._actions
        if getattr(candidate, "dest", None) == "enhance_level"
    ]
    for enhance_action in enhance_actions:
        enhance_action.default = 0
        enhance_action.help = enhance_help

    # Source-specific flags come last, straight from the declarative table.
    for spec in NOTION_ARGUMENTS.values():
        parser.add_argument(*spec["flags"], **spec["kwargs"])
+ + Registers shared args (name, description, output, enhance-level, api-key, + dry-run, verbose, quiet, workflow args) via add_all_standard_arguments(), + then adds OpenAPI-specific args on top. + + The default for --enhance-level is overridden to 0 (disabled) for OpenAPI. + """ + # Shared universal args first + add_all_standard_arguments(parser) + + # Override enhance-level default to 0 for OpenAPI + for action in parser._actions: + if hasattr(action, "dest") and action.dest == "enhance_level": + action.default = 0 + action.help = ( + "AI enhancement level (auto-detects API vs LOCAL mode): " + "0=disabled (default for OpenAPI), 1=SKILL.md only, " + "2=+architecture/config, 3=full enhancement. " + "Mode selection: uses API if ANTHROPIC_API_KEY is set, " + "otherwise LOCAL (Claude Code)" + ) + + # OpenAPI-specific args + for arg_name, arg_def in OPENAPI_ARGUMENTS.items(): + flags = arg_def["flags"] + kwargs = arg_def["kwargs"] + parser.add_argument(*flags, **kwargs) diff --git a/src/skill_seekers/cli/arguments/pptx.py b/src/skill_seekers/cli/arguments/pptx.py new file mode 100644 index 0000000..ce0b114 --- /dev/null +++ b/src/skill_seekers/cli/arguments/pptx.py @@ -0,0 +1,68 @@ +"""PPTX command argument definitions. + +This module defines ALL arguments for the pptx command in ONE place. +Both pptx_scraper.py (standalone) and parsers/pptx_parser.py (unified CLI) +import and use these definitions. + +Shared arguments (name, description, output, enhance-level, api-key, +dry-run, verbose, quiet, workflow args) come from common.py / workflow.py +via ``add_all_standard_arguments()``. +""" + +import argparse +from typing import Any + +from .common import add_all_standard_arguments + +# PPTX-specific argument definitions as data structure +# NOTE: Shared args (name, description, output, enhance_level, api_key, dry_run, +# verbose, quiet, workflow args) are registered by add_all_standard_arguments(). 
def add_pptx_arguments(parser: argparse.ArgumentParser) -> None:
    """Register every argument of the pptx command on *parser*.

    Works in three steps: the shared arguments are added through
    ``add_all_standard_arguments()``; every registered action whose ``dest``
    is ``enhance_level`` gets its default set to 0 and its help text replaced
    with the PPTX wording; finally each entry of ``PPTX_ARGUMENTS`` is added.

    Args:
        parser: Parser that receives the arguments (modified in place).
    """
    add_all_standard_arguments(parser)

    enhance_help = (
        "AI enhancement level (auto-detects API vs LOCAL mode): "
        "0=disabled (default for PPTX), 1=SKILL.md only, "
        "2=+architecture/config, 3=full enhancement. "
        "Mode selection: uses API if ANTHROPIC_API_KEY is set, "
        "otherwise LOCAL (Claude Code)"
    )
    # The shared enhance-level action is located by its dest on the parser's
    # action list, then switched to "disabled by default".
    enhance_actions = [
        candidate
        for candidate in parser._actions
        if getattr(candidate, "dest", None) == "enhance_level"
    ]
    for enhance_action in enhance_actions:
        enhance_action.default = 0
        enhance_action.help = enhance_help

    # Source-specific flags come last, straight from the declarative table.
    for spec in PPTX_ARGUMENTS.values():
        parser.add_argument(*spec["flags"], **spec["kwargs"])
def add_rss_arguments(parser: argparse.ArgumentParser) -> None:
    """Register every argument of the rss command on *parser*.

    Works in three steps: the shared arguments are added through
    ``add_all_standard_arguments()``; every registered action whose ``dest``
    is ``enhance_level`` gets its default set to 0 and its help text replaced
    with the RSS wording; finally each entry of ``RSS_ARGUMENTS`` is added.

    Args:
        parser: Parser that receives the arguments (modified in place).
    """
    add_all_standard_arguments(parser)

    enhance_help = (
        "AI enhancement level (auto-detects API vs LOCAL mode): "
        "0=disabled (default for RSS), 1=SKILL.md only, "
        "2=+architecture/config, 3=full enhancement. "
        "Mode selection: uses API if ANTHROPIC_API_KEY is set, "
        "otherwise LOCAL (Claude Code)"
    )
    # The shared enhance-level action is located by its dest on the parser's
    # action list, then switched to "disabled by default".
    enhance_actions = [
        candidate
        for candidate in parser._actions
        if getattr(candidate, "dest", None) == "enhance_level"
    ]
    for enhance_action in enhance_actions:
        enhance_action.default = 0
        enhance_action.help = enhance_help

    # Source-specific flags come last, straight from the declarative table.
    for spec in RSS_ARGUMENTS.values():
        parser.add_argument(*spec["flags"], **spec["kwargs"])
# File suffixes recognised as AsciiDoc sources.
ASCIIDOC_EXTENSIONS = {".adoc", ".asciidoc", ".asc", ".ad"}
# Admonition labels matched by the two RE_ADMONITION_* patterns below.
ADMONITION_TYPES = ("NOTE", "TIP", "WARNING", "IMPORTANT", "CAUTION")

# Regex patterns for AsciiDoc structure
#
# Heading: 1-5 "=" markers, then the title ON THE SAME LINE. The separator is
# [ \t]+ rather than \s+ because \s also matches "\n": with \s+ a bare "===="
# block delimiter followed by a line of text was read as a level-4 heading.
RE_HEADING = re.compile(r"^(={1,5})[ \t]+(.+)$", re.MULTILINE)
RE_SOURCE_ATTR = re.compile(r"^\[source(?:,\s*(\w[\w+#.-]*))?(?:,.*?)?\]$", re.MULTILINE)
RE_LISTING_DELIM = re.compile(r"^(-{4,})$", re.MULTILINE)
RE_LITERAL_DELIM = re.compile(r"^(\.{4,})$", re.MULTILINE)
RE_TABLE_DELIM = re.compile(r"^\|={3,}$", re.MULTILINE)
RE_TABLE_CELL = re.compile(r"^\|(.+)$", re.MULTILINE)
RE_ADMONITION_PARA = re.compile(
    r"^(NOTE|TIP|WARNING|IMPORTANT|CAUTION):\s+(.+?)(?:\n\n|\Z)",
    re.MULTILINE | re.DOTALL,
)
RE_ADMONITION_BLOCK = re.compile(
    r"^\[(NOTE|TIP|WARNING|IMPORTANT|CAUTION)\]\n={4,}\n(.*?)\n={4,}",
    re.MULTILINE | re.DOTALL,
)
RE_INCLUDE = re.compile(r"^include::(.+?)\[([^\]]*)\]$", re.MULTILINE)
# Attribute entry ":name: value". [ \t]* keeps the value on the entry's own
# line; with \s* a value-less entry such as ":toc:" consumed the newline and
# captured the whole following line as its value.
RE_ATTRIBUTE = re.compile(r"^:([a-zA-Z0-9_-]+):[ \t]*(.*)$", re.MULTILINE)
RE_ATTR_REF = re.compile(r"\{([a-zA-Z0-9_-]+)\}")
# Constrained bold/italic: the marks must not touch a word character on the
# outside, so "2*3*4" and "my_var_name" are left alone.
RE_BOLD = re.compile(r"(?<!\w)\*([^\s*](?:.*?[^\s*])?)\*(?!\w)")
RE_ITALIC = re.compile(r"(?<!\w)_([^\s_](?:.*?[^\s_])?)_(?!\w)")
RE_MONO = re.compile(r"`([^`]+)`")
RE_LINK = re.compile(r"(https?://\S+)\[([^\]]*)\]")
RE_XREF = re.compile(r"<<([^,>]+)(?:,\s*([^>]+))?>>")
def _score_code_quality(code: str) -> float:
    """Rate a code block with a simple additive heuristic.

    Starting from 5.0, points are added for length (5+ or 10+ lines),
    definition keywords, import-like statements, a line starting with a
    space, and structural punctuation; 2.0 is subtracted when the block is
    shorter than 30 characters.

    Args:
        code: Raw text of the code block.

    Returns:
        A score clamped to the range 0.0-10.0; 0.0 for empty input.
    """
    if not code:
        return 0.0
    score = 5.0
    line_count = len(code.strip().split("\n"))
    if line_count >= 10:
        score += 2.0
    elif line_count >= 5:
        score += 1.0
    if re.search(r"\b(def |class |function |func |fn )", code):
        score += 1.5
    # "#include" is kept outside the \b-anchored group: "#" is not a word
    # character, so \b in front of it can only match right after a word
    # character and never at the start of a line, where includes live.
    if re.search(r"\b(?:import |from .+ import|require\(|using )|#include", code):
        score += 0.5
    if re.search(r"^ ", code, re.MULTILINE):
        score += 0.5
    if re.search(r"[=:{}()\[\]]", code):
        score += 0.3
    if len(code) < 30:
        score -= 2.0
    return min(10.0, max(0.0, score))
def __init__(self, config: dict) -> None:
    """Store the configuration and derive names and output paths from it.

    Args:
        config: Converter settings. ``name`` is required; ``asciidoc_path``,
            ``description`` and ``categories`` are optional.

    Raises:
        KeyError: If ``config`` has no ``name`` entry.
    """
    self.config = config
    skill_name: str = config["name"]
    self.name: str = skill_name
    self.asciidoc_path: str = config.get("asciidoc_path", "")

    # A missing or empty description falls back to a generic sentence.
    custom_description = config.get("description")
    self.description: str = (
        custom_description
        if custom_description
        else f"Use when referencing {skill_name} documentation"
    )

    # Output locations are derived from the skill name.
    self.skill_dir: str = f"output/{skill_name}"
    self.data_file: str = f"output/{skill_name}_extracted.json"

    self.categories: dict = config.get("categories", {})
    # Filled in later by extract_asciidoc() or load_extracted_data().
    self.extracted_data: dict | None = None
self._parse_asciidoc_sections(resolved_text): + section_counter += 1 + section["section_number"] = section_counter + section["source_file"] = str(file_path) + body = section.pop("body", "") + section["code_samples"] = self._extract_code_blocks(body) + section["tables"] = self._extract_tables(body) + section["admonitions"] = self._extract_admonitions(body) + section["includes"] = self._extract_includes(body) + section["text"] = self._convert_to_markdown(body) + all_sections.append(section) + + # Language detection + detector = LanguageDetector(min_confidence=0.15) + languages_detected: dict[str, int] = {} + total_code_blocks = 0 + for section in all_sections: + for cs in section.get("code_samples", []): + if cs.get("language"): + languages_detected[cs["language"]] = ( + languages_detected.get(cs["language"], 0) + 1 + ) + total_code_blocks += 1 + for section in all_sections: + for cs in section.get("code_samples", []): + if not cs.get("language") and cs.get("code"): + lang, conf = detector.detect_from_code(cs["code"]) + if lang and conf >= 0.3: + cs["language"] = lang + languages_detected[lang] = languages_detected.get(lang, 0) + 1 + + if not self.config.get("description"): + self.description = infer_description_from_asciidoc(metadata, self.name) + + result_data = { + "source_path": self.asciidoc_path, + "metadata": metadata, + "total_sections": len(all_sections), + "total_files": len(files), + "total_code_blocks": total_code_blocks, + "total_tables": sum(len(s.get("tables", [])) for s in all_sections), + "total_admonitions": sum(len(s.get("admonitions", [])) for s in all_sections), + "languages_detected": languages_detected, + "pages": all_sections, + } + os.makedirs(os.path.dirname(self.data_file) or ".", exist_ok=True) + with open(self.data_file, "w", encoding="utf-8") as f: + json.dump(result_data, f, indent=2, ensure_ascii=False, default=str) + + print(f"\n💾 Saved extracted data to: {self.data_file}") + self.extracted_data = result_data + print( + f"✅ Extracted 
def _resolve_includes(self, text: str, base_dir: Path) -> str:
    """Inline the files referenced by ``include::`` directives.

    Each directive whose target is an existing file is replaced by that
    file's content, which is itself scanned for further directives. A target
    is looked up relative to the directory of the file containing the
    directive: ``base_dir`` for *text*, and the included file's own directory
    for directives found inside an included file (previously every level was
    resolved against ``base_dir``, so nested includes in sub-directories were
    not found).

    Args:
        text: AsciiDoc source to process.
        base_dir: Directory of the document that *text* was read from.

    Returns:
        The text with resolvable includes inlined. A directive that cannot be
        resolved becomes a ``// include::<path>[] (not resolved)`` line;
        directives found at the nesting limit (5 levels) are left untouched.
    """
    max_depth = 5

    def _resolve_once(src: str, current_dir: Path, depth: int) -> str:
        # The depth cap also stops files that include each other in a cycle.
        if depth >= max_depth:
            return src

        def _replacer(match: re.Match) -> str:
            inc_path = match.group(1).strip()
            # NOTE(review): the target is not confined to the document tree;
            # "../" or absolute paths can pull in any readable file. Confirm
            # this is acceptable for untrusted documentation sources.
            inc_file = current_dir / inc_path
            if inc_file.is_file():
                try:
                    content = inc_file.read_text(encoding="utf-8", errors="replace")
                except OSError:
                    logger.debug("Could not read include file: %s", inc_file)
                else:
                    # Nested directives are relative to the included file.
                    return _resolve_once(content, inc_file.parent, depth + 1)
            return f"// include::{inc_path}[] (not resolved)"

        return RE_INCLUDE.sub(_replacer, src)

    return _resolve_once(text, base_dir, 0)
def _parse_asciidoc_sections(self, text: str) -> list[dict]:
    """Split AsciiDoc text into one section per heading (= through =====).

    Text before the first heading becomes a section with an empty heading.
    When the text has no heading at all, a single such section holding the
    whole text is returned.

    Args:
        text: AsciiDoc source, normally after attribute/include resolution.

    Returns:
        Dicts with ``heading``, ``heading_level`` (``"h<n>"``, n being the
        number of ``=`` markers), ``body`` (text up to the next heading) and
        ``headings`` (the deeper-level headings nested under this one, each
        as ``{"level": "h<n>", "text": ...}``).
    """
    heading_matches = [
        (m.start(), len(m.group(1)), m.group(2).strip(), m.group(0))
        for m in RE_HEADING.finditer(text)
    ]
    if not heading_matches:
        return [{"heading": "", "heading_level": "h1", "body": text.strip(), "headings": []}]

    sections: list[dict] = []
    preamble = text[: heading_matches[0][0]].strip()
    if preamble:
        sections.append(
            {"heading": "", "heading_level": "h1", "body": preamble, "headings": []}
        )

    for idx, (start, level, heading_text, raw) in enumerate(heading_matches):
        body_start = start + len(raw)
        body_end = heading_matches[idx + 1][0] if idx + 1 < len(heading_matches) else len(text)
        body = text[body_start:body_end].strip()

        # A body stops at the next heading, so scanning the body itself can
        # never find a sub-heading (the old scan always produced []). Nested
        # headings are instead the following headings of deeper level, up to
        # the next heading at this level or above.
        sub_headings: list[dict] = []
        for _, later_level, later_text, _ in heading_matches[idx + 1 :]:
            if later_level <= level:
                break
            sub_headings.append({"level": f"h{later_level}", "text": later_text})

        sections.append(
            {
                "heading": heading_text,
                "heading_level": f"h{level}",
                "body": body,
                "headings": sub_headings,
            }
        )
    return sections
blocks from AsciiDoc text. + + Handles [source,lang] + ---- blocks, bare ---- blocks, and .... blocks. + """ + blocks: list[dict] = [] + consumed: list[tuple[int, int]] = [] + + # Pattern 1: [source,lang] + ---- block + for attr_m in RE_SOURCE_ATTR.finditer(text): + lang = (attr_m.group(1) or "").strip() + open_m = RE_LISTING_DELIM.search(text, attr_m.end()) + if not open_m: + continue + between = text[attr_m.end() : open_m.start()].strip() + if between and not between.startswith(".") and "\n" in between: + continue + delim = open_m.group(1) + close_m = re.search( + r"^" + re.escape(delim) + r"$", text[open_m.end() + 1 :], re.MULTILINE + ) + if not close_m: + continue + abs_close = open_m.end() + 1 + close_m.start() + code = text[open_m.end() : abs_close].strip("\n") + if code: + blocks.append( + {"code": code, "language": lang, "quality_score": _score_code_quality(code)} + ) + consumed.append((attr_m.start(), abs_close + len(close_m.group(0)))) + + # Pattern 2: bare ---- listing blocks + for m in RE_LISTING_DELIM.finditer(text): + if self._in_range(m.start(), consumed): + continue + delim = m.group(1) + close_m = re.search(r"^" + re.escape(delim) + r"$", text[m.end() + 1 :], re.MULTILINE) + if not close_m: + continue + abs_close = m.end() + 1 + close_m.start() + code = text[m.end() : abs_close].strip("\n") + if code: + blocks.append( + {"code": code, "language": "", "quality_score": _score_code_quality(code)} + ) + consumed.append((m.start(), abs_close + len(close_m.group(0)))) + + # Pattern 3: .... 
literal blocks + for m in RE_LITERAL_DELIM.finditer(text): + if self._in_range(m.start(), consumed): + continue + delim = m.group(1) + close_m = re.search(r"^" + re.escape(delim) + r"$", text[m.end() + 1 :], re.MULTILINE) + if not close_m: + continue + abs_close = m.end() + 1 + close_m.start() + code = text[m.end() : abs_close].strip("\n") + if code: + blocks.append( + {"code": code, "language": "", "quality_score": _score_code_quality(code)} + ) + consumed.append((m.start(), abs_close + len(close_m.group(0)))) + + return blocks + + # ------------------------------------------------------------------ + # Table extraction + # ------------------------------------------------------------------ + + def _extract_tables(self, text: str) -> list[dict]: + """Parse AsciiDoc tables delimited by ``|===``.""" + tables: list[dict] = [] + delimiters = list(RE_TABLE_DELIM.finditer(text)) + idx = 0 + while idx + 1 < len(delimiters): + body = text[delimiters[idx].end() : delimiters[idx + 1].start()].strip() + if body: + table = self._parse_table_body(body) + if table: + tables.append(table) + idx += 2 + return tables + + @staticmethod + def _parse_table_body(table_body: str) -> dict | None: + """Parse body of an AsciiDoc table into headers and rows.""" + groups = re.split(r"\n\s*\n", table_body.strip()) + if not groups: + return None + + def _parse_row(row_text: str) -> list[str]: + return [p.strip() for p in row_text.split("|") if p.strip()] + + # First group → headers + headers: list[str] = [] + for line in groups[0].strip().splitlines(): + if line.strip().startswith("|"): + parsed = _parse_row(line) + if parsed and not headers: + headers = parsed + elif parsed: + for i, cell in enumerate(parsed): + if i < len(headers): + headers[i] = f"{headers[i]} {cell}".strip() + else: + headers.append(cell) + + # Remaining groups → rows + rows: list[list[str]] = [] + for group in groups[1:]: + for line in group.strip().splitlines(): + if line.strip().startswith("|"): + parsed = 
_parse_row(line) + if parsed: + rows.append(parsed) + + # Single group fallback: first parsed line = header, rest = rows + if len(groups) == 1 and not rows: + all_parsed = [ + _parse_row(line) + for line in groups[0].strip().splitlines() + if line.strip().startswith("|") + ] + all_parsed = [r for r in all_parsed if r] + if len(all_parsed) > 1: + headers, rows = all_parsed[0], all_parsed[1:] + elif all_parsed: + headers = all_parsed[0] + + return {"headers": headers, "rows": rows} if headers or rows else None + + # ------------------------------------------------------------------ + # Admonition extraction + # ------------------------------------------------------------------ + + def _extract_admonitions(self, text: str) -> list[dict]: + """Extract NOTE/TIP/WARNING/IMPORTANT/CAUTION admonitions.""" + admonitions: list[dict] = [] + seen: set[str] = set() + for pattern in (RE_ADMONITION_BLOCK, RE_ADMONITION_PARA): + for m in pattern.finditer(text): + adm_type, adm_text = m.group(1), m.group(2).strip() + if adm_text and adm_text not in seen: + admonitions.append({"type": adm_type, "text": adm_text}) + seen.add(adm_text) + return admonitions + + # ------------------------------------------------------------------ + # Include directive extraction + # ------------------------------------------------------------------ + + @staticmethod + def _extract_includes(text: str) -> list[dict]: + """Detect remaining ``include::`` directives in text.""" + return [ + {"path": m.group(1).strip(), "options": m.group(2).strip()} + for m in RE_INCLUDE.finditer(text) + ] + + # ------------------------------------------------------------------ + # AsciiDoc → Markdown conversion + # ------------------------------------------------------------------ + + def _convert_to_markdown(self, text: str) -> str: + """Convert AsciiDoc inline formatting to Markdown equivalents.""" + result = text + # Remove processed block delimiters and attribute lines + for pat in ( + RE_LISTING_DELIM, + 
def load_extracted_data(self, json_path: str) -> bool:
    """Read a previously extracted JSON file into ``self.extracted_data``.

    Args:
        json_path: Path to the JSON file produced by an earlier extraction.

    Returns:
        Always ``True``; file and JSON errors propagate to the caller.
    """
    print(f"\n📂 Loading extracted data from: {json_path}")
    with open(json_path, encoding="utf-8") as handle:
        payload = json.load(handle)
    self.extracted_data = payload
    # Prefer the stored counter; otherwise count the "pages" entries.
    page_count = len(payload.get("pages", []))
    section_count = payload.get("total_sections", page_count)
    print(f"✅ Loaded {section_count} sections")
    return True
categorize_content(self) -> dict: + """Categorize sections by source file, headings, or keywords.""" + print("\n📋 Categorizing content...") + categorized: dict[str, dict] = {} + sections = self.extracted_data.get("pages", []) + path = Path(self.asciidoc_path) if self.asciidoc_path else None + + if path and path.is_file(): + key = self._sanitize_filename(path.stem) + categorized[key] = {"title": path.stem, "pages": sections} + print(f"✅ Created 1 category (single file): {path.stem}: {len(sections)} sections") + return categorized + + if path and path.is_dir(): + for s in sections: + src_stem = Path(s.get("source_file", "unknown")).stem + key = self._sanitize_filename(src_stem) + categorized.setdefault(key, {"title": src_stem, "pages": []})["pages"].append(s) + if categorized: + print(f"✅ Created {len(categorized)} categories (by source file)") + for cat in categorized.values(): + print(f" - {cat['title']}: {len(cat['pages'])} sections") + return categorized + + if self.categories: + first_val = next(iter(self.categories.values()), None) + if isinstance(first_val, list) and first_val and isinstance(first_val[0], dict): + for k, pages in self.categories.items(): + categorized[k] = {"title": k.replace("_", " ").title(), "pages": pages} + else: + for k in self.categories: + categorized[k] = {"title": k.replace("_", " ").title(), "pages": []} + for s in sections: + txt = s.get("text", "").lower() + htxt = s.get("heading", "").lower() + scores = { + k: sum( + 1 + for kw in kws + if isinstance(kw, str) and (kw.lower() in txt or kw.lower() in htxt) + ) + for k, kws in self.categories.items() + if isinstance(kws, list) + } + scores = {k: v for k, v in scores.items() if v > 0} + if scores: + categorized[max(scores, key=scores.get)]["pages"].append(s) + else: + categorized.setdefault("other", {"title": "Other", "pages": []})[ + "pages" + ].append(s) + else: + categorized["content"] = {"title": "Content", "pages": sections} + + print(f"✅ Created {len(categorized)} categories") 
def _ref_filename(self, cat_data: dict, section_num: int, total: int) -> str:
    """Return the reference-file path for one category.

    Naming rules, in order:
      * category without pages -> ``section_<NN>.md`` (``section_num``, zero padded)
      * only one category      -> ``<source stem or 'main'>.md``
      * otherwise              -> ``<source stem or 'section'>_s<min>-s<max>.md``

    The source stem is used only when ``self.asciidoc_path`` points at an
    existing regular file.
    """
    pages = cat_data["pages"]

    stem = ""
    if self.asciidoc_path:
        source = Path(self.asciidoc_path)
        if source.is_file():
            stem = source.stem

    if not pages:
        return f"{self.skill_dir}/references/section_{section_num:02d}.md"

    # Sections lacking an explicit number fall back to their 1-based position.
    numbers = [page.get("section_number", pos) for pos, page in enumerate(pages, 1)]
    if total == 1:
        return f"{self.skill_dir}/references/{stem or 'main'}.md"

    prefix = stem or "section"
    return f"{self.skill_dir}/references/{prefix}_s{min(numbers)}-s{max(numbers)}.md"
+ sec_num = section.get("section_number", "?") + heading = section.get("heading", "") + hl = section.get("heading_level", "h1") + f.write(f"---\n\n**📄 Source: Section {sec_num}**\n\n") + if heading: + f.write(f"{'#' * (int(hl[1]) + 1)} {heading}\n\n") + for sub in section.get("headings", []): + sl = sub.get("level", "h3") + if sub.get("text"): + f.write(f"{'#' * (int(sl[1]) + 1)} {sub['text']}\n\n") + if section.get("text"): + f.write(f"{section['text']}\n\n") + if section.get("code_samples"): + f.write("### Code Examples\n\n") + for c in section["code_samples"]: + f.write(f"```{c.get('language', '')}\n{c['code']}\n```\n\n") + if section.get("tables"): + f.write("### Tables\n\n") + for t in section["tables"]: + hdrs = t.get("headers", []) + if hdrs: + f.write("| " + " | ".join(str(h) for h in hdrs) + " |\n") + f.write("| " + " | ".join("---" for _ in hdrs) + " |\n") + for row in t.get("rows", []): + f.write("| " + " | ".join(str(c) for c in row) + " |\n") + f.write("\n") + if section.get("admonitions"): + f.write("### Notes & Warnings\n\n") + for a in section["admonitions"]: + f.write(f"> **{a.get('type', 'NOTE')}:** {a.get('text', '')}\n\n") + f.write("---\n\n") + print(f" Generated: {filename}") + + def _generate_index(self, categorized: dict) -> None: + """Generate references/index.md.""" + filename = f"{self.skill_dir}/references/index.md" + adoc_base = "" + if self.asciidoc_path: + p = Path(self.asciidoc_path) + adoc_base = p.stem if p.is_file() else "" + total = len(categorized) + + with open(filename, "w", encoding="utf-8") as f: + f.write(f"# {self.name.title()} Documentation Reference\n\n## Categories\n\n") + for i, (_k, cd) in enumerate(categorized.items(), 1): + pages = cd["pages"] + cnt = len(pages) + if pages: + nums = [s.get("section_number", j + 1) for j, s in enumerate(pages)] + rng = f"Sections {min(nums)}-{max(nums)}" + if total == 1: + lf = f"{adoc_base or 'main'}.md" + else: + lf = f"{adoc_base or 'section'}_s{min(nums)}-s{max(nums)}.md" + else: 
def _generate_skill_md(self, categorized: dict) -> None:
    """Generate the main SKILL.md file with rich summary content.

    Writes ``{self.skill_dir}/SKILL.md`` containing, in order: YAML front
    matter, document metadata, usage hints, a section overview, key concepts,
    quick-reference patterns, the 15 highest-scored code examples grouped by
    language, up to five tables, admonitions grouped by type, statistics and
    a navigation list of the reference files.

    Args:
        categorized: Mapping of category key to ``{"title": str,
            "pages": list}``, as returned by ``categorize_content``.
    """
    filename = f"{self.skill_dir}/SKILL.md"
    skill_name = self.name.lower().replace("_", "-").replace(" ", "-")[:64]
    desc = self.description[:1024]
    ed = self.extracted_data  # shorthand
    pages = ed.get("pages", [])

    with open(filename, "w", encoding="utf-8") as f:
        f.write(f"---\nname: {skill_name}\ndescription: {desc}\n---\n\n")
        f.write(f"# {self.name.title()} Documentation Skill\n\n{self.description}\n\n")

        # Document metadata
        meta = ed.get("metadata", {})
        if any(v for v in meta.values() if v):
            f.write("## 📋 Document Information\n\n")
            for key, label in [
                ("title", "Title"),
                ("author", "Author"),
                ("revision", "Revision"),
                ("date", "Date"),
                ("description", "Description"),
            ]:
                if meta.get(key):
                    f.write(f"**{label}:** {meta[key]}\n\n")

        f.write("## 💡 When to Use This Skill\n\nUse this skill when you need to:\n")
        f.write(f"- Understand {self.name} concepts and fundamentals\n")
        f.write("- Look up API references and technical specifications\n")
        f.write("- Find code examples and implementation patterns\n")
        f.write("- Review tutorials, guides, and best practices\n")
        f.write("- Explore the complete documentation structure\n\n")

        # Section Overview
        f.write(
            f"## 📖 Section Overview\n\n**Total Sections:** {ed.get('total_sections', 0)}\n\n"
        )
        f.write("**Content Breakdown:**\n\n")
        for cd in categorized.values():
            f.write(f"- **{cd['title']}**: {len(cd['pages'])} sections\n")
        f.write("\n")

        f.write(self._format_key_concepts())
        f.write("## ⚡ Quick Reference\n\n")
        f.write(self._format_patterns_from_content())

        # Code examples (top 15 grouped by language)
        all_code = [c for s in pages for c in s.get("code_samples", [])]
        all_code.sort(key=lambda x: x.get("quality_score", 0), reverse=True)
        top_code = all_code[:15]
        if top_code:
            f.write("## 📝 Code Examples\n\n*High-quality examples from documentation*\n\n")
            by_lang: dict[str, list] = {}
            for c in top_code:
                # An empty language string is grouped with missing languages
                # under "unknown"; otherwise the heading below would be blank.
                by_lang.setdefault(c.get("language") or "unknown", []).append(c)
            for lang in sorted(by_lang):
                exs = by_lang[lang]
                f.write(f"### {lang.title()} Examples ({len(exs)})\n\n")
                for i, c in enumerate(exs[:5], 1):
                    ct = c.get("code", "")
                    f.write(
                        f"**Example {i}** (Quality: {c.get('quality_score', 0):.1f}/10):\n\n"
                    )
                    f.write(f"```{lang}\n{ct[:500]}{'...' if len(ct) > 500 else ''}\n```\n\n")

        # Table summary
        all_tables = [(s.get("heading", ""), t) for s in pages for t in s.get("tables", [])]
        if all_tables:
            f.write(f"## 📊 Table Summary\n\n*{len(all_tables)} table(s) found*\n\n")
            for sh, t in all_tables[:5]:
                if sh:
                    f.write(f"**From section: {sh}**\n\n")
                hdrs = t.get("headers", [])
                if hdrs:
                    f.write("| " + " | ".join(str(h) for h in hdrs) + " |\n")
                    f.write("| " + " | ".join("---" for _ in hdrs) + " |\n")
                for row in t.get("rows", [])[:5]:
                    f.write("| " + " | ".join(str(c) for c in row) + " |\n")
                f.write("\n")

        # Admonition summary
        all_adm = [a for s in pages for a in s.get("admonitions", [])]
        if all_adm:
            f.write("## ⚠️ Admonition Summary\n\n")
            by_type: dict[str, list[str]] = {}
            for a in all_adm:
                by_type.setdefault(a.get("type", "NOTE"), []).append(a.get("text", ""))
            for at in sorted(by_type):
                items = by_type[at]
                f.write(f"**{at}** ({len(items)}):\n\n")
                for txt in items[:5]:
                    f.write(f"> {txt[:120]}{'...' if len(txt) > 120 else ''}\n\n")

        # Statistics
        f.write("## 📊 Documentation Statistics\n\n")
        for key, label in [
            ("total_sections", "Total Sections"),
            ("total_code_blocks", "Code Blocks"),
            ("total_tables", "Tables"),
            ("total_admonitions", "Admonitions"),
            ("total_files", "Source Files"),
        ]:
            f.write(f"- **{label}**: {ed.get(key, 0)}\n")
        langs = ed.get("languages_detected", {})
        if langs:
            f.write(f"- **Programming Languages**: {len(langs)}\n\n**Language Breakdown:**\n\n")
            for lang, count in sorted(langs.items(), key=lambda x: x[1], reverse=True):
                f.write(f"- {lang}: {count} examples\n")
        f.write("\n")

        # Navigation. The file names must be the ones the reference files are
        # actually written under, so ask _ref_filename with the same 1-based
        # position and category count that build_skill passes when writing
        # them, instead of deriving a slug from the title.
        f.write("## 🗺️ Navigation\n\n**Reference Files:**\n\n")
        total_cats = len(categorized)
        for idx, cd in enumerate(categorized.values(), 1):
            ref_name = Path(self._ref_filename(cd, idx, total_cats)).name
            f.write(f"- `references/{ref_name}` - {cd['title']}\n")
        f.write("\nSee `references/index.md` for complete documentation structure.\n\n")
        f.write("---\n\n**Generated by Skill Seeker** | AsciiDoc Scraper\n")

    # Re-read the finished file only to report its line count.
    with open(filename, encoding="utf-8") as f:
        print(f" Generated: {filename} ({len(f.read().splitlines())} lines)")
for h in h1s[:10]) + "\n" + if h2s: + content += "**Subtopics:**\n\n" + "".join(f"- {h}\n" for h in h2s[:15]) + "\n" + return content + + def _format_patterns_from_content(self) -> str: + """Extract common documentation patterns from section headings.""" + keywords = [ + "getting started", + "installation", + "configuration", + "usage", + "api", + "examples", + "tutorial", + "guide", + "best practices", + "troubleshooting", + "faq", + ] + patterns: list[dict] = [] + for s in self.extracted_data.get("pages", []): + ht = s.get("heading", "").lower() + for kw in keywords: + if kw in ht: + patterns.append( + { + "type": kw.title(), + "heading": s.get("heading", ""), + "section": s.get("section_number", 0), + } + ) + break + if not patterns: + return "*See reference files for detailed content*\n\n" + by_type: dict[str, list] = {} + for p in patterns: + by_type.setdefault(p["type"], []).append(p) + content = "*Common documentation patterns found:*\n\n" + for pt in sorted(by_type): + items = by_type[pt] + content += f"**{pt}** ({len(items)} sections):\n" + content += "".join(f"- {it['heading']} (section {it['section']})\n" for it in items[:3]) + content += "\n" + return content + + # ------------------------------------------------------------------ + # Utilities + # ------------------------------------------------------------------ + + @staticmethod + def _sanitize_filename(name: str) -> str: + """Convert name to a safe filename slug.""" + safe = re.sub(r"[^\w\s-]", "", name.lower()) + return re.sub(r"[-\s]+", "_", safe) + + @staticmethod + def _in_range(pos: int, ranges: list[tuple[int, int]]) -> bool: + """Check whether pos falls within any consumed range.""" + return any(s <= pos < e for s, e in ranges) + + +# ============================================================================ +# CLI entry point +# ============================================================================ + + +def main() -> int: + """CLI entry point for AsciiDoc scraper.""" + from 
skill_seekers.cli.arguments.asciidoc import add_asciidoc_arguments + + parser = argparse.ArgumentParser( + description="Convert AsciiDoc documentation to skill", + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + + add_asciidoc_arguments(parser) + + args = parser.parse_args() + + # Set logging level + if getattr(args, "quiet", False): + logging.getLogger().setLevel(logging.WARNING) + elif getattr(args, "verbose", False): + logging.getLogger().setLevel(logging.DEBUG) + + # Handle --dry-run + if getattr(args, "dry_run", False): + source = ( + getattr(args, "asciidoc_path", None) or getattr(args, "from_json", None) or "(none)" + ) + print(f"\n{'=' * 60}") + print("DRY RUN: AsciiDoc Extraction") + print(f"{'=' * 60}") + print(f"Source: {source}") + print(f"Name: {getattr(args, 'name', None) or '(auto-detect)'}") + print(f"Enhance level: {getattr(args, 'enhance_level', 0)}") + print(f"\n✅ Dry run complete") + return 0 + + # Validate inputs + if not (getattr(args, "asciidoc_path", None) or getattr(args, "from_json", None)): + parser.error("Must specify --asciidoc-path or --from-json") + + # Build from JSON workflow + if getattr(args, "from_json", None): + name = Path(args.from_json).stem.replace("_extracted", "") + config = { + "name": getattr(args, "name", None) or name, + "description": getattr(args, "description", None) + or f"Use when referencing {name} documentation", + } + try: + converter = AsciiDocToSkillConverter(config) + converter.load_extracted_data(args.from_json) + converter.build_skill() + except Exception as e: + print(f"\n❌ Error: {e}", file=sys.stderr) + sys.exit(1) + return 0 + + # Direct AsciiDoc mode + if not getattr(args, "name", None): + p = Path(args.asciidoc_path) + args.name = p.stem if p.is_file() else p.name + + config = { + "name": args.name, + "asciidoc_path": args.asciidoc_path, + "description": getattr(args, "description", None), + } + + try: + converter = AsciiDocToSkillConverter(config) + + # Extract + if not 
converter.extract_asciidoc(): + print("\n❌ AsciiDoc extraction failed - see error above", file=sys.stderr) + sys.exit(1) + + # Build skill + converter.build_skill() + + # Enhancement Workflow Integration + from skill_seekers.cli.workflow_runner import run_workflows + + workflow_executed, workflow_names = run_workflows(args) + workflow_name = ", ".join(workflow_names) if workflow_names else None + + # Traditional enhancement (complements workflow system) + if getattr(args, "enhance_level", 0) > 0: + api_key = getattr(args, "api_key", None) or os.environ.get("ANTHROPIC_API_KEY") + mode = "API" if api_key else "LOCAL" + + print("\n" + "=" * 80) + print(f"🤖 Traditional AI Enhancement ({mode} mode, level {args.enhance_level})") + print("=" * 80) + if workflow_executed: + print(f" Running after workflow: {workflow_name}") + print( + " (Workflow provides specialized analysis," + " enhancement provides general improvements)" + ) + print("") + + skill_dir = converter.skill_dir + if api_key: + try: + from skill_seekers.cli.enhance_skill import enhance_skill_md + + enhance_skill_md(skill_dir, api_key) + print("✅ API enhancement complete!") + except ImportError: + print("❌ API enhancement not available. 
Falling back to LOCAL mode...") + from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer + + enhancer = LocalSkillEnhancer(Path(skill_dir)) + enhancer.run(headless=True) + print("✅ Local enhancement complete!") + else: + from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer + + enhancer = LocalSkillEnhancer(Path(skill_dir)) + enhancer.run(headless=True) + print("✅ Local enhancement complete!") + + except (FileNotFoundError, ValueError, RuntimeError) as e: + print(f"\n❌ Error: {e}", file=sys.stderr) + sys.exit(1) + except Exception as e: + print(f"\n❌ Unexpected error during AsciiDoc processing: {e}", file=sys.stderr) + import traceback + + traceback.print_exc() + sys.exit(1) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/src/skill_seekers/cli/chat_scraper.py b/src/skill_seekers/cli/chat_scraper.py new file mode 100644 index 0000000..2d60c7c --- /dev/null +++ b/src/skill_seekers/cli/chat_scraper.py @@ -0,0 +1,1920 @@ +#!/usr/bin/env python3 +""" +Slack/Discord Chat Export to Skill Converter + +Converts chat history from Slack and Discord into AI-ready skills. +Supports two modes of operation per platform: + +**Export mode** (offline, no API key required): + - Slack: Parse workspace export ZIP/directory (JSON files per channel per day) + - Discord: Parse DiscordChatExporter JSON output + +**API mode** (live, requires authentication token): + - Slack: Fetch messages via Slack Web API (slack_sdk) + - Discord: Fetch messages via Discord HTTP API (discord.py or aiohttp) + +Extracted content includes messages, threads, reactions, code snippets, +shared links, attachments, and user references. Messages are categorized +by channel, date, and detected topic for structured skill output. + +Usage: + # Slack workspace export (directory or ZIP) + skill-seekers chat --export-path ./slack-export/ --platform slack --name myteam + + # Slack API (live fetch) + skill-seekers chat --platform slack --token xoxb-... 
--channel C01234 --name myteam + + # Discord export (DiscordChatExporter JSON) + skill-seekers chat --export-path ./discord-export.json --platform discord --name myserver + + # Discord API (live fetch) + skill-seekers chat --platform discord --token Bot ... --channel 12345 --name myserver + + # Build from previously extracted JSON + skill-seekers chat --from-json myteam_extracted.json --name myteam +""" + +import argparse +import json +import logging +import os +import re +import sys +from collections import defaultdict +from datetime import datetime, timezone +from pathlib import Path + +# Optional dependency guard — Slack SDK +try: + from slack_sdk import WebClient + from slack_sdk.errors import SlackApiError + + SLACK_AVAILABLE = True +except ImportError: + SLACK_AVAILABLE = False + +# Optional dependency guard — Discord +try: + import discord # noqa: F401 + + DISCORD_AVAILABLE = True +except ImportError: + DISCORD_AVAILABLE = False + +logger = logging.getLogger(__name__) + +# --------------------------------------------------------------------------- +# Constants +# --------------------------------------------------------------------------- + +# Maximum messages to fetch per channel when using API mode +DEFAULT_MAX_MESSAGES = 5000 + +# Topic keywords for automatic content categorization +_TOPIC_KEYWORDS: dict[str, list[str]] = { + "troubleshooting": [ + "error", + "bug", + "fix", + "issue", + "broken", + "crash", + "exception", + "traceback", + "debug", + "failing", + "stacktrace", + "segfault", + ], + "setup": [ + "install", + "setup", + "configure", + "config", + "environment", + "docker", + "deploy", + "ci/cd", + "pipeline", + "build", + "dependency", + ], + "architecture": [ + "design", + "architecture", + "pattern", + "refactor", + "abstraction", + "interface", + "module", + "service", + "microservice", + "api", + ], + "code_review": [ + "review", + "pr", + "pull request", + "merge", + "approve", + "lgtm", + "nit", + "suggestion", + "feedback", + "diff", + 
def _score_code_quality(code: str) -> float:
    """Score code quality on a 0-10 scale using heuristics.

    Starts from 5.0 and adjusts:
      * +2.0 for 10 or more lines, else +1.0 for 5 or more lines
      * +1.5 if a function/class keyword is present
      * +0.5 if an import-like statement is present
      * +0.5 if any line starts with a space (indented code)
      * +0.3 if common code punctuation is present
      * -2.0 if the snippet is shorter than 30 characters

    Args:
        code: Source code text to score.

    Returns:
        Float quality score between 0.0 and 10.0 (0.0 for empty input).
    """
    if not code:
        return 0.0

    score = 5.0
    line_count = len(code.strip().split("\n"))

    if line_count >= 10:
        score += 2.0
    elif line_count >= 5:
        score += 1.0

    if re.search(r"\b(def |class |function |func |fn )", code):
        score += 1.5
    # "#include" is matched without a leading \b: "#" is not a word character,
    # so \b before it fails at the start of a line or after whitespace, which
    # is where includes normally appear.
    if re.search(r"\b(import |from .+ import|require\(|using )|#include", code):
        score += 0.5
    # NOTE(review): whitespace in this chunk looks collapsed; confirm the
    # intended indent width of this pattern against the original file.
    if re.search(r"^ ", code, re.MULTILINE):
        score += 0.5
    if re.search(r"[=:{}()\[\]]", code):
        score += 0.3
    if len(code) < 30:
        score -= 2.0

    return min(10.0, max(0.0, score))
+ - max_messages (int): Max messages to fetch per channel + (default 5000). + - description (str): Skill description (optional, inferred + if absent). + """ + self.config = config + self.name: str = config["name"] + self.export_path: str = config.get("export_path", "") + self.platform: str = config.get("platform", "slack").lower() + self.token: str = config.get("token", "") + self.channel: str = config.get("channel", "") + self.max_messages: int = config.get("max_messages", DEFAULT_MAX_MESSAGES) + self.description: str = ( + config.get("description") or f"Use when referencing {self.name} chat knowledge base" + ) + + # Output paths + self.skill_dir: str = f"output/{self.name}" + self.data_file: str = f"output/{self.name}_extracted.json" + + # Extracted data (populated by extract_chat or load_extracted_data) + self.extracted_data: dict | None = None + + # ------------------------------------------------------------------ + # Extraction — public entry point + # ------------------------------------------------------------------ + + def extract_chat(self) -> bool: + """Extract chat content based on platform and input mode. + + Dispatches to the appropriate extraction method: + - Export mode (export_path set): parse local export files. + - API mode (token set): fetch messages via platform API. + + Returns: + True on successful extraction. + + Raises: + ValueError: If neither export_path nor token is provided, or + if the platform is not recognized. + """ + if self.platform not in ("slack", "discord"): + raise ValueError( + f"Unsupported platform: '{self.platform}'. 
Supported platforms: 'slack', 'discord'" + ) + + # Determine mode + if self.export_path: + print(f"\n🔍 Extracting {self.platform} chat from export: {self.export_path}") + if self.platform == "slack": + messages = self._extract_slack_export() + else: + messages = self._extract_discord_export() + elif self.token: + print(f"\n🔍 Fetching {self.platform} chat via API...") + if self.platform == "slack": + _check_slack_deps() + messages = self._extract_slack_api() + else: + _check_discord_deps() + messages = self._extract_discord_api() + else: + raise ValueError( + "Must provide either --export-path (export mode) " + "or --token (API mode) for chat extraction." + ) + + if not messages: + logger.warning("No messages extracted from %s source", self.platform) + print(" ⚠️ No messages were extracted.") + + # Identify threads and extract enrichment + threads = self._identify_threads(messages) + code_snippets = self._extract_code_snippets(messages) + links = self._extract_links(messages) + channel_summaries = self._summarize_channels(messages) + + # Group messages into sections by channel + sections = self._build_sections(messages, threads) + + # Compute statistics + total_messages = len(messages) + total_threads = len(threads) + total_code_snippets = len(code_snippets) + total_links = len(links) + unique_users = len({m.get("user", "unknown") for m in messages}) + channels_found = list(channel_summaries.keys()) + + result_data = { + "source": self.export_path or f"{self.platform}-api", + "platform": self.platform, + "metadata": { + "total_messages": total_messages, + "total_threads": total_threads, + "total_code_snippets": total_code_snippets, + "total_links": total_links, + "unique_users": unique_users, + "channels": channels_found, + }, + "total_sections": len(sections), + "total_code_blocks": total_code_snippets, + "channel_summaries": channel_summaries, + "code_snippets": code_snippets[:100], # Keep top 100 for JSON size + "links": links[:200], + "pages": sections, + } + + 
# Save extracted data + os.makedirs(os.path.dirname(self.data_file) or ".", exist_ok=True) + with open(self.data_file, "w", encoding="utf-8") as f: + json.dump(result_data, f, indent=2, ensure_ascii=False, default=str) + + print(f"\n💾 Saved extracted data to: {self.data_file}") + self.extracted_data = result_data + print( + f"✅ Extracted {total_messages} messages across " + f"{len(channels_found)} channel(s), " + f"{total_threads} threads, " + f"{total_code_snippets} code snippets" + ) + return True + + # ------------------------------------------------------------------ + # Load previously extracted data + # ------------------------------------------------------------------ + + def load_extracted_data(self, json_path: str) -> bool: + """Load previously extracted data from JSON file. + + Args: + json_path: Path to the extracted JSON file. + + Returns: + True on success. + """ + print(f"\n📂 Loading extracted data from: {json_path}") + with open(json_path, encoding="utf-8") as f: + self.extracted_data = json.load(f) + total = self.extracted_data.get("total_sections", len(self.extracted_data.get("pages", []))) + print(f"✅ Loaded {total} sections") + return True + + # ------------------------------------------------------------------ + # Categorization + # ------------------------------------------------------------------ + + def categorize_content(self) -> dict[str, dict]: + """Categorize sections by channel, date range, and detected topic. + + Groups the extracted sections into categories suitable for + generating reference files. Each category contains a title + and a list of page/section dicts. + + Returns: + Dict mapping category keys to dicts with 'title' and 'pages'. 
+ """ + print("\n📋 Categorizing content...") + + categorized: dict[str, dict] = {} + sections = self.extracted_data.get("pages", []) + + if not sections: + categorized["content"] = {"title": "Chat Content", "pages": []} + print("✅ Created 0 categories (no content)") + return categorized + + # Group sections by channel name + by_channel: dict[str, list[dict]] = defaultdict(list) + for section in sections: + channel = section.get("channel", "general") + by_channel[channel].append(section) + + if len(by_channel) <= 1: + # Single channel — categorize by topic instead + all_sections = sections + topic_buckets: dict[str, list[dict]] = defaultdict(list) + uncategorized: list[dict] = [] + + for section in all_sections: + combined = self._section_text(section) + matched_topic = "" + best_score = 0 + for topic, keywords in _TOPIC_KEYWORDS.items(): + score = sum(1 for kw in keywords if kw.lower() in combined) + if score > best_score: + best_score = score + matched_topic = topic + if matched_topic and best_score >= 2: + topic_buckets[matched_topic].append(section) + else: + uncategorized.append(section) + + for topic, pages in sorted(topic_buckets.items()): + categorized[topic] = { + "title": topic.replace("_", " ").title(), + "pages": pages, + } + if uncategorized: + categorized["general"] = { + "title": "General Discussion", + "pages": uncategorized, + } + else: + # Multiple channels — use channel names as categories + for channel, channel_sections in sorted(by_channel.items()): + cat_key = self._sanitize_filename(channel) + categorized[cat_key] = { + "title": f"#{channel}", + "pages": channel_sections, + } + + if not categorized: + categorized["content"] = {"title": "Chat Content", "pages": sections} + + print(f"✅ Created {len(categorized)} categories") + for cat_data in categorized.values(): + print(f" - {cat_data['title']}: {len(cat_data['pages'])} sections") + + return categorized + + # ------------------------------------------------------------------ + # Build skill + 
# ------------------------------------------------------------------ + + def build_skill(self) -> None: + """Build complete skill directory structure from extracted data. + + Creates the output directory tree with: + - references/ — one markdown file per category + - references/index.md — category index with statistics + - SKILL.md — main skill file with frontmatter and overview + - scripts/ — reserved for future use + - assets/ — reserved for future use + """ + print(f"\n🏗️ Building skill: {self.name}") + + os.makedirs(f"{self.skill_dir}/references", exist_ok=True) + os.makedirs(f"{self.skill_dir}/scripts", exist_ok=True) + os.makedirs(f"{self.skill_dir}/assets", exist_ok=True) + + categorized = self.categorize_content() + + print("\n📝 Generating reference files...") + total_categories = len(categorized) + for section_num, (cat_key, cat_data) in enumerate(categorized.items(), 1): + self._generate_reference_file(cat_key, cat_data, section_num, total_categories) + + self._generate_index(categorized) + self._generate_skill_md(categorized) + + print(f"\n✅ Skill built successfully: {self.skill_dir}/") + print(f"\n📦 Next step: Package with: skill-seekers package {self.skill_dir}/") + + # ------------------------------------------------------------------ + # Slack export extraction + # ------------------------------------------------------------------ + + def _extract_slack_export(self) -> list[dict]: + """Parse a Slack workspace export directory. + + Slack exports contain one directory per channel, each with JSON + files named by date (e.g., ``2024-01-15.json``). Each JSON file + is a list of message objects. + + Returns: + List of normalized message dicts. + + Raises: + FileNotFoundError: If export_path does not exist. + ValueError: If the path structure is not a valid Slack export. 
+ """ + export_path = Path(self.export_path) + if not export_path.exists(): + raise FileNotFoundError(f"Slack export path not found: {self.export_path}") + + # Handle ZIP archives + if export_path.is_file() and export_path.suffix == ".zip": + export_path = self._unzip_export(export_path) + + if not export_path.is_dir(): + raise ValueError( + f"Expected a directory for Slack export, got: {self.export_path}\n" + "Slack workspace exports are directories containing channel " + "subdirectories with daily JSON files." + ) + + messages: list[dict] = [] + channel_dirs = sorted( + d for d in export_path.iterdir() if d.is_dir() and not d.name.startswith(".") + ) + + if not channel_dirs: + raise ValueError( + f"No channel directories found in Slack export: {self.export_path}\n" + "Expected subdirectories named after channels (e.g., general/, random/)." + ) + + # Load users.json if available (for display name resolution) + users_map = self._load_slack_users(export_path) + + for channel_dir in channel_dirs: + channel_name = channel_dir.name + json_files = sorted(channel_dir.glob("*.json")) + + for json_file in json_files: + try: + with open(json_file, encoding="utf-8") as f: + day_messages = json.load(f) + except (json.JSONDecodeError, OSError) as e: + logger.warning("Failed to parse %s: %s", json_file, e) + continue + + if not isinstance(day_messages, list): + continue + + for raw_msg in day_messages: + parsed = self._parse_slack_message(raw_msg, channel_name, users_map) + if parsed: + messages.append(parsed) + + print(f" 📁 #{channel_name}: {len(json_files)} day file(s)") + + print(f" Total messages parsed: {len(messages)}") + return messages + + def _load_slack_users(self, export_dir: Path) -> dict[str, str]: + """Load user ID -> display name mapping from users.json. + + Args: + export_dir: Root directory of the Slack export. + + Returns: + Dict mapping user IDs to display names. 
+ """ + users_file = export_dir / "users.json" + if not users_file.exists(): + return {} + + try: + with open(users_file, encoding="utf-8") as f: + users_list = json.load(f) + except (json.JSONDecodeError, OSError): + return {} + + users_map: dict[str, str] = {} + if isinstance(users_list, list): + for user in users_list: + uid = user.get("id", "") + display = ( + user.get("profile", {}).get("display_name") + or user.get("profile", {}).get("real_name") + or user.get("real_name") + or user.get("name", uid) + ) + if uid: + users_map[uid] = display + + return users_map + + def _unzip_export(self, zip_path: Path) -> Path: + """Extract a ZIP export to a temporary directory. + + Args: + zip_path: Path to the ZIP archive. + + Returns: + Path to the extracted directory. + """ + import zipfile + + extract_dir = zip_path.parent / zip_path.stem + if extract_dir.exists(): + print(f" Using existing extracted directory: {extract_dir}") + return extract_dir + + print(f" Extracting ZIP: {zip_path} -> {extract_dir}") + with zipfile.ZipFile(zip_path, "r") as zf: + zf.extractall(extract_dir) + + return extract_dir + + # ------------------------------------------------------------------ + # Slack API extraction + # ------------------------------------------------------------------ + + def _extract_slack_api(self) -> list[dict]: + """Fetch messages from Slack via the Web API using slack_sdk. + + Requires ``self.token`` to be set to a valid Slack Bot or User + token. If ``self.channel`` is set, only that channel is fetched; + otherwise all accessible channels are iterated. + + Returns: + List of normalized message dicts. + + Raises: + RuntimeError: If the API call fails. 
+ """ + client = WebClient(token=self.token) + messages: list[dict] = [] + + try: + # Determine channels to fetch + if self.channel: + channel_ids = [self.channel] + channel_names = {self.channel: self.channel} + else: + # List all accessible channels + result = client.conversations_list( + types="public_channel,private_channel", + limit=200, + ) + channels = result.get("channels", []) + channel_ids = [ch["id"] for ch in channels] + channel_names = {ch["id"]: ch.get("name", ch["id"]) for ch in channels} + print(f" Found {len(channel_ids)} channel(s)") + + for ch_id in channel_ids: + ch_name = channel_names.get(ch_id, ch_id) + ch_messages = self._fetch_slack_channel_messages(client, ch_id, ch_name) + messages.extend(ch_messages) + print(f" 📡 #{ch_name}: {len(ch_messages)} messages") + + except SlackApiError as e: + raise RuntimeError( + f"Slack API error: {e.response['error']}\n" + "Check your token permissions (channels:history, channels:read)." + ) from e + + print(f" Total messages fetched: {len(messages)}") + return messages + + def _fetch_slack_channel_messages( + self, client: "WebClient", channel_id: str, channel_name: str + ) -> list[dict]: + """Fetch all messages from a single Slack channel with pagination. + + Args: + client: Authenticated slack_sdk WebClient. + channel_id: Slack channel ID. + channel_name: Human-readable channel name. + + Returns: + List of normalized message dicts. 
+ """ + messages: list[dict] = [] + cursor = None + fetched = 0 + + while fetched < self.max_messages: + kwargs: dict = { + "channel": channel_id, + "limit": min(200, self.max_messages - fetched), + } + if cursor: + kwargs["cursor"] = cursor + + result = client.conversations_history(**kwargs) + batch = result.get("messages", []) + if not batch: + break + + for raw_msg in batch: + parsed = self._parse_slack_message(raw_msg, channel_name, {}) + if parsed: + messages.append(parsed) + + fetched += len(batch) + + # Pagination + response_meta = result.get("response_metadata", {}) + cursor = response_meta.get("next_cursor") + if not cursor: + break + + return messages + + # ------------------------------------------------------------------ + # Discord export extraction + # ------------------------------------------------------------------ + + def _extract_discord_export(self) -> list[dict]: + """Parse a Discord chat export in DiscordChatExporter JSON format. + + DiscordChatExporter produces a single JSON file per channel with + a ``messages`` array. Each message object has ``id``, ``content``, + ``author``, ``timestamp``, ``attachments``, ``reactions``, etc. + + Returns: + List of normalized message dicts. + + Raises: + FileNotFoundError: If export_path does not exist. + ValueError: If the file is not valid JSON or has unexpected structure. 
+ """ + export_path = Path(self.export_path) + if not export_path.exists(): + raise FileNotFoundError(f"Discord export path not found: {self.export_path}") + + # Support single file or directory of JSON files + json_files: list[Path] = [] + if export_path.is_file(): + json_files = [export_path] + elif export_path.is_dir(): + json_files = sorted(export_path.glob("*.json")) + else: + raise ValueError(f"Invalid export path: {self.export_path}") + + if not json_files: + raise ValueError(f"No JSON files found in Discord export: {self.export_path}") + + messages: list[dict] = [] + + for json_file in json_files: + try: + with open(json_file, encoding="utf-8") as f: + export_data = json.load(f) + except (json.JSONDecodeError, OSError) as e: + logger.warning("Failed to parse %s: %s", json_file, e) + continue + + # DiscordChatExporter format: top-level object with "messages" key + if isinstance(export_data, dict): + channel_info = export_data.get("channel", {}) + channel_name = ( + channel_info.get("name", json_file.stem) + if isinstance(channel_info, dict) + else json_file.stem + ) + raw_messages = export_data.get("messages", []) + elif isinstance(export_data, list): + # Some exporters produce a bare list of messages + channel_name = json_file.stem + raw_messages = export_data + else: + logger.warning("Unexpected JSON structure in %s", json_file) + continue + + for raw_msg in raw_messages: + parsed = self._parse_discord_message(raw_msg, channel_name) + if parsed: + messages.append(parsed) + + print(f" 📁 #{channel_name}: {len(raw_messages)} messages") + + print(f" Total messages parsed: {len(messages)}") + return messages + + # ------------------------------------------------------------------ + # Discord API extraction + # ------------------------------------------------------------------ + + def _extract_discord_api(self) -> list[dict]: + """Fetch messages from Discord via the HTTP API. 
+ + Uses aiohttp directly (not the discord.py gateway client) to + fetch channel history. Requires a Bot token and channel ID. + + Returns: + List of normalized message dicts. + + Raises: + RuntimeError: If the API call fails. + ValueError: If no channel ID is provided. + """ + if not self.channel: + raise ValueError( + "Discord API mode requires --channel (channel ID). " + "Find channel IDs in Discord Developer Mode." + ) + + import asyncio + + try: + import aiohttp + except ImportError: + raise RuntimeError( + "aiohttp is required for Discord API mode.\nInstall with: pip install aiohttp" + ) from None + + async def _fetch() -> list[dict]: + messages: list[dict] = [] + base_url = "https://discord.com/api/v10" + headers = {"Authorization": f"Bot {self.token}"} + + # Get channel info + async with aiohttp.ClientSession() as session: + async with session.get( + f"{base_url}/channels/{self.channel}", headers=headers + ) as resp: + if resp.status != 200: + body = await resp.text() + raise RuntimeError( + f"Discord API error (HTTP {resp.status}): {body}\n" + "Check your Bot token and channel ID." 
+ ) + channel_info = await resp.json() + channel_name = channel_info.get("name", self.channel) + + # Fetch messages with pagination (before= cursor) + before: str | None = None + fetched = 0 + + while fetched < self.max_messages: + params: dict[str, str | int] = {"limit": min(100, self.max_messages - fetched)} + if before: + params["before"] = before + + async with session.get( + f"{base_url}/channels/{self.channel}/messages", + headers=headers, + params=params, + ) as resp: + if resp.status != 200: + body = await resp.text() + logger.warning("Discord API error fetching messages: %s", body) + break + batch = await resp.json() + + if not batch: + break + + for raw_msg in batch: + parsed = self._parse_discord_message(raw_msg, channel_name) + if parsed: + messages.append(parsed) + + fetched += len(batch) + before = batch[-1]["id"] + + print(f" 📡 #{channel_name}: {len(messages)} messages") + return messages + + loop = asyncio.new_event_loop() + try: + return loop.run_until_complete(_fetch()) + finally: + loop.close() + + # ------------------------------------------------------------------ + # Message parsing + # ------------------------------------------------------------------ + + def _parse_slack_message( + self, raw: dict, channel: str, users_map: dict[str, str] + ) -> dict | None: + """Parse a single Slack message into normalized format. + + Handles regular messages, bot messages, and subtypes like + ``channel_join``, ``channel_leave``, ``file_share``, etc. + System subtypes (join/leave/topic) are skipped. + + Args: + raw: Raw Slack message dict from export or API. + channel: Channel name this message belongs to. + users_map: User ID -> display name mapping. + + Returns: + Normalized message dict, or None if the message should be skipped. 
+ """ + # Skip system messages + subtype = raw.get("subtype", "") + skip_subtypes = { + "channel_join", + "channel_leave", + "channel_topic", + "channel_purpose", + "channel_name", + "channel_archive", + "channel_unarchive", + "group_join", + "group_leave", + } + if subtype in skip_subtypes: + return None + + text = raw.get("text", "").strip() + if not text and not raw.get("files") and not raw.get("attachments"): + return None + + # Resolve user + user_id = raw.get("user", raw.get("bot_id", "unknown")) + user_name = users_map.get(user_id, user_id) + if raw.get("username"): + user_name = raw["username"] + + # Parse timestamp + ts = raw.get("ts", "0") + try: + timestamp = datetime.fromtimestamp(float(ts), tz=timezone.utc).isoformat() + except (ValueError, TypeError, OSError): + timestamp = ts + + # Resolve user mentions in text: <@U12345> -> @username + def _resolve_mention(match: re.Match) -> str: + uid = match.group(1) + return f"@{users_map.get(uid, uid)}" + + text = re.sub(r"<@(U[A-Z0-9]+)>", _resolve_mention, text) + + # Decode Slack link format: -> label (url) + text = re.sub(r"<(https?://[^|>]+)\|([^>]+)>", r"\2 (\1)", text) + text = re.sub(r"<(https?://[^>]+)>", r"\1", text) + + # Reactions + reactions = [] + for reaction in raw.get("reactions", []): + reactions.append( + { + "emoji": reaction.get("name", ""), + "count": reaction.get("count", 0), + } + ) + + # Attachments / files + attachments = [] + for f in raw.get("files", []): + attachments.append( + { + "name": f.get("name", f.get("title", "unnamed")), + "type": f.get("mimetype", f.get("filetype", "")), + "url": f.get("url_private", f.get("permalink", "")), + } + ) + for att in raw.get("attachments", []): + attachments.append( + { + "name": att.get("title", att.get("fallback", "attachment")), + "type": "link", + "url": att.get("from_url", att.get("title_link", "")), + "text": att.get("text", ""), + } + ) + + # Thread info + thread_ts = raw.get("thread_ts") + is_thread_parent = thread_ts == ts and 
raw.get("reply_count", 0) > 0 + reply_count = raw.get("reply_count", 0) if is_thread_parent else 0 + + return { + "platform": "slack", + "channel": channel, + "user": user_name, + "user_id": user_id, + "text": text, + "timestamp": timestamp, + "ts": ts, + "thread_ts": thread_ts, + "is_thread_parent": is_thread_parent, + "reply_count": reply_count, + "reactions": reactions, + "attachments": attachments, + "subtype": subtype, + } + + def _parse_discord_message(self, raw: dict, channel: str) -> dict | None: + """Parse a single Discord message into normalized format. + + Handles regular messages, embeds, and attachments. System messages + (type != 0 and type != 19) are skipped. + + Args: + raw: Raw Discord message dict from export or API. + channel: Channel name this message belongs to. + + Returns: + Normalized message dict, or None if the message should be skipped. + """ + # Skip system messages (type 0 = DEFAULT, 19 = REPLY) + msg_type = raw.get("type", 0) + if isinstance(msg_type, int) and msg_type not in (0, 19): + return None + # DiscordChatExporter uses string type names + if isinstance(msg_type, str) and msg_type not in ("Default", "Reply"): + return None + + content = raw.get("content", "").strip() + + # Extract author info + author = raw.get("author", {}) + if isinstance(author, dict): + user_name = ( + author.get("nickname") or author.get("name") or author.get("username", "unknown") + ) + user_id = str(author.get("id", "unknown")) + else: + user_name = str(author) + user_id = str(author) + + # Parse timestamp + raw_ts = raw.get("timestamp", "") + try: + if isinstance(raw_ts, str) and raw_ts: + # ISO 8601 format from Discord API + dt = datetime.fromisoformat(raw_ts.replace("Z", "+00:00")) + timestamp = dt.isoformat() + else: + timestamp = str(raw_ts) + except (ValueError, TypeError): + timestamp = str(raw_ts) + + # Skip empty messages with no content and no attachments + embeds = raw.get("embeds", []) + attachments_raw = raw.get("attachments", []) + if not 
content and not embeds and not attachments_raw: + return None + + # Reactions + reactions = [] + for reaction in raw.get("reactions", []): + emoji_data = reaction.get("emoji", {}) + if isinstance(emoji_data, dict): + emoji_name = emoji_data.get("name", "") + else: + emoji_name = str(emoji_data) + reactions.append( + { + "emoji": emoji_name, + "count": reaction.get("count", 0), + } + ) + + # Attachments + attachments = [] + for att in attachments_raw: + attachments.append( + { + "name": att.get("fileName", att.get("filename", "unnamed")), + "type": att.get("contentType", att.get("content_type", "")), + "url": att.get("url", ""), + } + ) + + # Embeds as additional content + embed_texts: list[str] = [] + for embed in embeds: + title = embed.get("title", "") + desc = embed.get("description", "") + if title or desc: + embed_texts.append(f"[Embed: {title}] {desc}".strip()) + + if embed_texts: + content = content + "\n" + "\n".join(embed_texts) if content else "\n".join(embed_texts) + + # Thread / reply info + reference = raw.get("reference", raw.get("messageReference")) + thread_ts = None + if isinstance(reference, dict): + thread_ts = str(reference.get("messageId", "")) + + msg_id = str(raw.get("id", "")) + + return { + "platform": "discord", + "channel": channel, + "user": user_name, + "user_id": user_id, + "text": content, + "timestamp": timestamp, + "ts": msg_id, + "thread_ts": thread_ts, + "is_thread_parent": False, # Determined later in _identify_threads + "reply_count": 0, + "reactions": reactions, + "attachments": attachments, + "subtype": "", + } + + # ------------------------------------------------------------------ + # Content enrichment + # ------------------------------------------------------------------ + + def _extract_code_snippets(self, messages: list[dict]) -> list[dict]: + """Extract fenced code blocks from all messages. + + Detects triple-backtick fenced code blocks (````` ```lang ... ``` `````) + and inline code that spans multiple lines. 
+ + Args: + messages: List of normalized message dicts. + + Returns: + List of code snippet dicts with 'code', 'language', + 'quality_score', 'channel', 'user', and 'timestamp'. + """ + snippets: list[dict] = [] + code_block_pattern = re.compile(r"```(\w*)\n(.*?)```", re.DOTALL) + + for msg in messages: + text = msg.get("text", "") + for match in code_block_pattern.finditer(text): + lang = match.group(1) or "" + code = match.group(2).strip() + if code: + snippets.append( + { + "code": code, + "language": lang, + "quality_score": _score_code_quality(code), + "channel": msg.get("channel", ""), + "user": msg.get("user", ""), + "timestamp": msg.get("timestamp", ""), + } + ) + + # Sort by quality descending + snippets.sort(key=lambda x: x.get("quality_score", 0), reverse=True) + return snippets + + def _extract_links(self, messages: list[dict]) -> list[dict]: + """Extract shared URLs from all messages. + + Finds HTTP/HTTPS URLs in message text and deduplicates by URL. + + Args: + messages: List of normalized message dicts. + + Returns: + List of link dicts with 'url', 'channel', 'user', 'timestamp', + and 'context' (surrounding text snippet). + """ + links: list[dict] = [] + seen_urls: set[str] = set() + url_pattern = re.compile(r"https?://[^\s<>\"')\]]+") + + for msg in messages: + text = msg.get("text", "") + for match in url_pattern.finditer(text): + url = match.group(0).rstrip(".,;:!?)") + if url in seen_urls: + continue + seen_urls.add(url) + + # Extract context: up to 80 chars around the URL + start = max(0, match.start() - 40) + end = min(len(text), match.end() + 40) + context = text[start:end].strip() + + links.append( + { + "url": url, + "channel": msg.get("channel", ""), + "user": msg.get("user", ""), + "timestamp": msg.get("timestamp", ""), + "context": context, + } + ) + + return links + + def _identify_threads(self, messages: list[dict]) -> list[dict]: + """Group messages into conversation threads. 
+ + Threads are identified by shared ``thread_ts`` values (Slack) + or ``thread_ts`` references (Discord). Each thread contains the + parent message and its replies in chronological order. + + Args: + messages: List of normalized message dicts. + + Returns: + List of thread dicts with 'parent', 'replies', 'channel', + 'reply_count', and 'participants'. + """ + # Group by thread_ts + thread_map: dict[str, list[dict]] = defaultdict(list) + msg_by_ts: dict[str, dict] = {} + + for msg in messages: + ts = msg.get("ts", "") + if ts: + msg_by_ts[ts] = msg + + thread_ts = msg.get("thread_ts") + if thread_ts: + thread_map[thread_ts].append(msg) + + threads: list[dict] = [] + for thread_ts, thread_msgs in thread_map.items(): + if len(thread_msgs) < 2: + continue + + # Sort by timestamp + thread_msgs.sort(key=lambda m: m.get("timestamp", "")) + + parent = msg_by_ts.get(thread_ts, thread_msgs[0]) + replies = [m for m in thread_msgs if m.get("ts") != thread_ts] + participants = list({m.get("user", "unknown") for m in thread_msgs}) + + threads.append( + { + "parent": parent, + "replies": replies, + "channel": parent.get("channel", ""), + "reply_count": len(replies), + "participants": participants, + } + ) + + return threads + + def _summarize_channels(self, messages: list[dict]) -> dict[str, dict]: + """Generate summary statistics for each channel. + + Args: + messages: List of normalized message dicts. + + Returns: + Dict mapping channel names to summary dicts with message_count, + unique_users, date_range, top_users, and has_code. 
+ """ + channel_data: dict[str, list[dict]] = defaultdict(list) + for msg in messages: + channel_data[msg.get("channel", "unknown")].append(msg) + + summaries: dict[str, dict] = {} + for channel, ch_messages in channel_data.items(): + users = [m.get("user", "unknown") for m in ch_messages] + user_counts: dict[str, int] = defaultdict(int) + for u in users: + user_counts[u] += 1 + + top_users = sorted(user_counts.items(), key=lambda x: x[1], reverse=True)[:5] + timestamps = [m.get("timestamp", "") for m in ch_messages if m.get("timestamp")] + + has_code = any("```" in m.get("text", "") for m in ch_messages) + + summaries[channel] = { + "message_count": len(ch_messages), + "unique_users": len(set(users)), + "date_range": { + "earliest": min(timestamps) if timestamps else "", + "latest": max(timestamps) if timestamps else "", + }, + "top_users": [{"user": u, "count": c} for u, c in top_users], + "has_code": has_code, + } + + return summaries + + # Alias for single-channel usage in _build_sections + _summarize_channel = _summarize_channels + + # ------------------------------------------------------------------ + # Section building + # ------------------------------------------------------------------ + + def _build_sections(self, messages: list[dict], threads: list[dict]) -> list[dict]: + """Build sections from messages, grouping by channel and date. + + Each section represents a chunk of conversation from a single + channel on a single date. Sections are compatible with the + pipeline's intermediate JSON 'pages' format. + + Args: + messages: List of normalized message dicts. + threads: List of thread dicts (for enrichment). + + Returns: + List of section dicts with heading, text, code_samples, etc. 
+ """ + # Group by (channel, date) + groups: dict[tuple[str, str], list[dict]] = defaultdict(list) + for msg in messages: + channel = msg.get("channel", "general") + ts = msg.get("timestamp", "") + try: + date_str = ts[:10] if ts else "unknown" + except (TypeError, IndexError): + date_str = "unknown" + groups[(channel, date_str)].append(msg) + + sections: list[dict] = [] + + for section_number, ((channel, date_str), group_msgs) in enumerate( + sorted(groups.items()), 1 + ): + # Sort messages chronologically + group_msgs.sort(key=lambda m: m.get("timestamp", "")) + + # Build text from messages + text_parts: list[str] = [] + code_samples: list[dict] = [] + + for msg in group_msgs: + user = msg.get("user", "unknown") + text = msg.get("text", "") + ts_display = msg.get("timestamp", "")[:19] + + # Format message + msg_line = f"**{user}** ({ts_display}): {text}" + text_parts.append(msg_line) + + # Add reactions + reactions = msg.get("reactions", []) + if reactions: + reaction_str = " ".join(f":{r['emoji']}: ({r['count']})" for r in reactions) + text_parts.append(f" Reactions: {reaction_str}") + + # Extract inline code blocks + code_block_pattern = re.compile(r"```(\w*)\n(.*?)```", re.DOTALL) + for match in code_block_pattern.finditer(text): + lang = match.group(1) or "" + code = match.group(2).strip() + if code: + code_samples.append( + { + "code": code, + "language": lang, + "quality_score": _score_code_quality(code), + } + ) + + sections.append( + { + "section_number": section_number, + "heading": f"#{channel} - {date_str}", + "heading_level": "h2", + "text": "\n\n".join(text_parts), + "headings": [], + "code_samples": code_samples, + "tables": [], + "images": [], + "channel": channel, + "date": date_str, + "message_count": len(group_msgs), + } + ) + + return sections + + # ------------------------------------------------------------------ + # Output generation (private) + # ------------------------------------------------------------------ + + def 
_generate_reference_file( + self, + _cat_key: str, + cat_data: dict, + section_num: int, + total_sections: int, + ) -> None: + """Generate a reference markdown file for a category. + + Args: + _cat_key: Category key (unused, for interface consistency). + cat_data: Category dict with 'title' and 'pages'. + section_num: 1-based index among all categories. + total_sections: Total number of categories being generated. + """ + sections = cat_data["pages"] + + if sections: + section_nums = [s.get("section_number", i + 1) for i, s in enumerate(sections)] + if total_sections == 1: + filename = f"{self.skill_dir}/references/main.md" + else: + sec_range = f"s{min(section_nums)}-s{max(section_nums)}" + filename = f"{self.skill_dir}/references/{_cat_key}_{sec_range}.md" + else: + filename = f"{self.skill_dir}/references/section_{section_num:02d}.md" + + with open(filename, "w", encoding="utf-8") as f: + f.write(f"# {cat_data['title']}\n\n") + + for section in sections: + sec_num = section.get("section_number", "?") + heading = section.get("heading", "") + msg_count = section.get("message_count", 0) + + f.write(f"---\n\n**📄 Section {sec_num}**") + f.write(f" ({msg_count} messages)\n\n") + + if heading: + f.write(f"## {heading}\n\n") + + # Message text + text = section.get("text", "").strip() + if text: + f.write(f"{text}\n\n") + + # Code samples + code_list = section.get("code_samples", []) + if code_list: + f.write("### Code Snippets\n\n") + for code in code_list: + lang = code.get("language", "") + f.write(f"```{lang}\n{code['code']}\n```\n\n") + + f.write("---\n\n") + + print(f" Generated: {filename}") + + def _generate_index(self, categorized: dict[str, dict]) -> None: + """Generate reference index file listing all categories. + + Args: + categorized: Dict mapping category keys to category dicts. 
+ """ + filename = f"{self.skill_dir}/references/index.md" + total_cats = len(categorized) + + with open(filename, "w", encoding="utf-8") as f: + f.write(f"# {self.name.title()} Chat Reference\n\n") + f.write("## Categories\n\n") + + for section_num, (_ck, cd) in enumerate(categorized.items(), 1): + pages = cd["pages"] + count = len(pages) + total_msgs = sum(p.get("message_count", 0) for p in pages) + + if pages: + snums = [s.get("section_number", i + 1) for i, s in enumerate(pages)] + rng = f"Sections {min(snums)}-{max(snums)}" + link = "main.md" if total_cats == 1 else f"{_ck}_s{min(snums)}-s{max(snums)}.md" + else: + link = f"section_{section_num:02d}.md" + rng = "N/A" + + f.write( + f"- [{cd['title']}]({link}) ({count} sections, {total_msgs} messages, {rng})\n" + ) + + # Statistics + f.write("\n## Statistics\n\n") + meta = self.extracted_data.get("metadata", {}) + f.write(f"- Platform: {self.extracted_data.get('platform', 'unknown')}\n") + f.write(f"- Total messages: {meta.get('total_messages', 0)}\n") + f.write(f"- Total threads: {meta.get('total_threads', 0)}\n") + f.write(f"- Code snippets: {meta.get('total_code_snippets', 0)}\n") + f.write(f"- Shared links: {meta.get('total_links', 0)}\n") + f.write(f"- Unique users: {meta.get('unique_users', 0)}\n") + f.write(f"- Channels: {len(meta.get('channels', []))}\n") + + # Channel summaries + channel_summaries = self.extracted_data.get("channel_summaries", {}) + if channel_summaries: + f.write("\n## Channel Summary\n\n") + for ch_name, summary in sorted(channel_summaries.items()): + f.write(f"### #{ch_name}\n\n") + f.write(f"- Messages: {summary.get('message_count', 0)}\n") + f.write(f"- Users: {summary.get('unique_users', 0)}\n") + dr = summary.get("date_range", {}) + if dr.get("earliest") and dr.get("latest"): + f.write(f"- Date range: {dr['earliest'][:10]} to {dr['latest'][:10]}\n") + if summary.get("has_code"): + f.write("- Contains code snippets\n") + top_users = summary.get("top_users", []) + if top_users: + 
top_str = ", ".join(f"{u['user']} ({u['count']})" for u in top_users[:3]) + f.write(f"- Top contributors: {top_str}\n") + f.write("\n") + + print(f" Generated: {filename}") + + def _generate_skill_md(self, categorized: dict[str, dict]) -> None: + """Generate main SKILL.md file with YAML frontmatter and overview. + + Args: + categorized: Dict mapping category keys to category dicts. + """ + filename = f"{self.skill_dir}/SKILL.md" + skill_name = self.name.lower().replace("_", "-").replace(" ", "-")[:64] + desc = self.description[:1024] + meta = self.extracted_data.get("metadata", {}) + + with open(filename, "w", encoding="utf-8") as f: + # YAML frontmatter + f.write("---\n") + f.write(f"name: {skill_name}\n") + f.write(f"description: {desc}\n") + f.write("---\n\n") + + platform_label = self.platform.title() + f.write(f"# {self.name.title()} {platform_label} Chat Skill\n\n") + f.write(f"{self.description}\n\n") + + # Chat metadata + f.write(f"## 📋 {platform_label} Chat Information\n\n") + f.write(f"**Platform:** {platform_label}\n\n") + f.write(f"**Source:** {self.extracted_data.get('source', 'N/A')}\n\n") + f.write(f"**Total Messages:** {meta.get('total_messages', 0)}\n\n") + f.write(f"**Unique Users:** {meta.get('unique_users', 0)}\n\n") + channels = meta.get("channels", []) + if channels: + f.write(f"**Channels:** {', '.join(f'#{c}' for c in channels)}\n\n") + + # When to Use + f.write("## 💡 When to Use This Skill\n\n") + f.write("Use this skill when you need to:\n") + f.write(f"- Find solutions discussed in {self.name} chat history\n") + f.write("- Reference code snippets shared by team members\n") + f.write("- Understand team decisions and architectural discussions\n") + f.write("- Look up troubleshooting steps from past conversations\n") + f.write("- Find shared links and resources from the team\n\n") + + # Section overview + total_sections = self.extracted_data.get("total_sections", 0) + f.write(f"## 📖 Content Overview\n\n") + f.write(f"**Total Sections:** 
{total_sections}\n\n") + f.write("**Content Breakdown:**\n\n") + for cd in categorized.values(): + f.write(f"- **{cd['title']}**: {len(cd['pages'])} sections\n") + f.write("\n") + + # Key topics + f.write(self._format_key_topics()) + + # Top code examples + code_snippets = self.extracted_data.get("code_snippets", []) + if code_snippets: + f.write("## 📝 Top Code Snippets\n\n") + f.write("*High-quality code shared in chat*\n\n") + + by_lang: dict[str, list] = {} + for cs in code_snippets[:15]: + lang = cs.get("language", "unknown") or "unknown" + by_lang.setdefault(lang, []).append(cs) + + for lang in sorted(by_lang.keys()): + examples = by_lang[lang] + f.write(f"### {lang.title()} ({len(examples)} snippets)\n\n") + for i, cs in enumerate(examples[:3], 1): + quality = cs.get("quality_score", 0) + user = cs.get("user", "") + code_text = cs.get("code", "") + f.write(f"**Snippet {i}**") + if user: + f.write(f" (by {user})") + f.write(f" (Quality: {quality:.1f}/10):\n\n") + f.write(f"```{lang}\n") + if len(code_text) <= 500: + f.write(code_text) + else: + f.write(code_text[:500] + "\n...") + f.write("\n```\n\n") + + # Shared links + links = self.extracted_data.get("links", []) + if links: + f.write(f"## 🔗 Shared Links ({len(links)})\n\n") + f.write("*Key resources shared in chat*\n\n") + for link in links[:20]: + url = link.get("url", "") + user = link.get("user", "") + channel = link.get("channel", "") + f.write(f"- {url}") + if user or channel: + parts = [] + if user: + parts.append(f"by {user}") + if channel: + parts.append(f"in #{channel}") + f.write(f" ({', '.join(parts)})") + f.write("\n") + if len(links) > 20: + f.write(f"\n*... 
and {len(links) - 20} more links*\n") + f.write("\n") + + # Statistics + f.write(f"## 📊 Chat Statistics\n\n") + f.write(f"- **Total Messages**: {meta.get('total_messages', 0)}\n") + f.write(f"- **Total Threads**: {meta.get('total_threads', 0)}\n") + f.write(f"- **Code Snippets**: {meta.get('total_code_snippets', 0)}\n") + f.write(f"- **Shared Links**: {meta.get('total_links', 0)}\n") + f.write(f"- **Unique Users**: {meta.get('unique_users', 0)}\n") + f.write(f"- **Channels**: {len(meta.get('channels', []))}\n\n") + + # Channel breakdown + channel_summaries = self.extracted_data.get("channel_summaries", {}) + if channel_summaries: + f.write("**Channel Activity:**\n\n") + for ch_name, summary in sorted( + channel_summaries.items(), + key=lambda x: x[1].get("message_count", 0), + reverse=True, + ): + msg_count = summary.get("message_count", 0) + user_count = summary.get("unique_users", 0) + f.write(f"- #{ch_name}: {msg_count} messages, {user_count} users\n") + f.write("\n") + + # Navigation + f.write("## 🗺️ Navigation\n\n") + f.write("**Reference Files:**\n\n") + for cd in categorized.values(): + cat_file = self._sanitize_filename(cd["title"]) + f.write(f"- `references/{cat_file}.md` - {cd['title']}\n") + f.write("\nSee `references/index.md` for complete chat structure.\n\n") + + # Footer + f.write("---\n\n") + f.write(f"**Generated by Skill Seeker** | {platform_label} Chat Scraper\n") + + with open(filename, encoding="utf-8") as f: + line_count = len(f.read().split("\n")) + print(f" Generated: {filename} ({line_count} lines)") + + # ------------------------------------------------------------------ + # Content analysis helpers + # ------------------------------------------------------------------ + + def _format_key_topics(self) -> str: + """Extract key discussion topics from section headings and content. + + Returns: + Markdown string with key topics section. 
+ """ + sections = self.extracted_data.get("pages", []) + if not sections: + return "" + + # Count topic matches across all sections + topic_counts: dict[str, int] = defaultdict(int) + for section in sections: + combined = self._section_text(section) + for topic, keywords in _TOPIC_KEYWORDS.items(): + score = sum(1 for kw in keywords if kw.lower() in combined) + if score >= 2: + topic_counts[topic] += 1 + + if not topic_counts: + return "" + + content = "## 🔑 Key Discussion Topics\n\n" + content += "*Topics frequently discussed in chat*\n\n" + + for topic, count in sorted(topic_counts.items(), key=lambda x: x[1], reverse=True): + label = topic.replace("_", " ").title() + content += f"- **{label}**: {count} conversations\n" + content += "\n" + + return content + + def _section_text(self, section: dict) -> str: + """Combine section text, heading, and code into a lowercase string. + + Args: + section: Section dict. + + Returns: + Combined lowercase text for keyword matching. + """ + text = section.get("text", "").lower() + heading = section.get("heading", "").lower() + code = " ".join(cs.get("code", "").lower() for cs in section.get("code_samples", [])) + return f"{text} {heading} {code}" + + def _sanitize_filename(self, name: str) -> str: + """Convert a string to a filesystem-safe filename. + + Args: + name: Input string to sanitize. + + Returns: + Safe lowercase filename with underscores. + """ + safe = re.sub(r"[^\w\s-]", "", name.lower()) + return re.sub(r"[-\s]+", "_", safe) + + +# --------------------------------------------------------------------------- +# CLI entry point +# --------------------------------------------------------------------------- + + +def main() -> int: + """CLI entry point for the Slack/Discord chat scraper. + + Parses command-line arguments and runs the extraction and + skill-building pipeline. Supports export import, API fetch, + and loading from previously extracted JSON. + + Returns: + Exit code (0 for success, non-zero for errors). 
+ """ + from .arguments.chat import add_chat_arguments + + parser = argparse.ArgumentParser( + description="Convert Slack/Discord chat history to AI-ready skill", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Slack workspace export + %(prog)s --export-path ./slack-export/ --platform slack --name myteam + + # Slack API + %(prog)s --platform slack --token xoxb-... --channel C01234 --name myteam + + # Discord export (DiscordChatExporter) + %(prog)s --export-path ./discord-export.json --platform discord --name myserver + + # Discord API + %(prog)s --platform discord --token Bot-token --channel 12345 --name myserver + + # From previously extracted JSON + %(prog)s --from-json myteam_extracted.json --name myteam + """, + ) + + add_chat_arguments(parser) + + args = parser.parse_args() + + # Set logging level + if getattr(args, "quiet", False): + logging.getLogger().setLevel(logging.WARNING) + elif getattr(args, "verbose", False): + logging.getLogger().setLevel(logging.DEBUG) + + # Handle --dry-run + if args.dry_run: + source = args.export_path or args.from_json or f"{args.platform}-api" + print(f"\n{'=' * 60}") + print("DRY RUN: Chat Extraction") + print(f"{'=' * 60}") + print(f"Platform: {args.platform}") + print(f"Source: {source}") + print(f"Name: {args.name or '(auto-detect)'}") + print(f"Channel: {args.channel or '(all)'}") + print(f"Max messages: {args.max_messages}") + print(f"Enhance level: {args.enhance_level}") + print(f"\n✅ Dry run complete") + return 0 + + # Validate inputs + if args.from_json: + # Build from previously extracted JSON + name = args.name or Path(args.from_json).stem.replace("_extracted", "") + config = { + "name": name, + "description": (args.description or f"Use when referencing {name} chat knowledge base"), + } + try: + converter = ChatToSkillConverter(config) + converter.load_extracted_data(args.from_json) + converter.build_skill() + except Exception as e: + print(f"\n❌ Error: {e}", file=sys.stderr) + 
sys.exit(1) + return 0 + + # Require either --export-path or --token for extraction + if not args.export_path and not args.token: + parser.error( + "Must specify --export-path (export mode), --token (API mode), " + "or --from-json (build from extracted data)" + ) + + if not args.name: + if args.export_path: + args.name = Path(args.export_path).stem + else: + args.name = f"{args.platform}_chat" + + config = { + "name": args.name, + "export_path": args.export_path or "", + "platform": args.platform, + "token": args.token or "", + "channel": args.channel or "", + "max_messages": args.max_messages, + "description": args.description, + } + + try: + converter = ChatToSkillConverter(config) + + # Extract + if not converter.extract_chat(): + print( + "\n❌ Chat extraction failed - see error above", + file=sys.stderr, + ) + sys.exit(1) + + # Build skill + converter.build_skill() + + # Enhancement Workflow Integration + from skill_seekers.cli.workflow_runner import run_workflows + + workflow_executed, workflow_names = run_workflows(args) + workflow_name = ", ".join(workflow_names) if workflow_names else None + + # Traditional enhancement (complements workflow system) + if getattr(args, "enhance_level", 0) > 0: + api_key = getattr(args, "api_key", None) or os.environ.get("ANTHROPIC_API_KEY") + mode = "API" if api_key else "LOCAL" + + print("\n" + "=" * 80) + print(f"🤖 Traditional AI Enhancement ({mode} mode, level {args.enhance_level})") + print("=" * 80) + if workflow_executed: + print(f" Running after workflow: {workflow_name}") + print( + " (Workflow provides specialized analysis, " + "enhancement provides general improvements)" + ) + print("") + + skill_dir = converter.skill_dir + if api_key: + try: + from skill_seekers.cli.enhance_skill import enhance_skill_md + + enhance_skill_md(skill_dir, api_key) + print("✅ API enhancement complete!") + except ImportError: + print("❌ API enhancement not available. 
Falling back to LOCAL mode...") + from skill_seekers.cli.enhance_skill_local import ( + LocalSkillEnhancer, + ) + + enhancer = LocalSkillEnhancer(Path(skill_dir)) + enhancer.run(headless=True) + print("✅ Local enhancement complete!") + else: + from skill_seekers.cli.enhance_skill_local import ( + LocalSkillEnhancer, + ) + + enhancer = LocalSkillEnhancer(Path(skill_dir)) + enhancer.run(headless=True) + print("✅ Local enhancement complete!") + + except (FileNotFoundError, ValueError) as e: + print(f"\n❌ Input error: {e}", file=sys.stderr) + sys.exit(1) + except RuntimeError as e: + print(f"\n❌ Error: {e}", file=sys.stderr) + sys.exit(1) + except Exception as e: + print( + f"\n❌ Unexpected error during chat processing: {e}", + file=sys.stderr, + ) + import traceback + + traceback.print_exc() + sys.exit(1) + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/src/skill_seekers/cli/config_validator.py b/src/skill_seekers/cli/config_validator.py index c55e73d..086d2ef 100644 --- a/src/skill_seekers/cli/config_validator.py +++ b/src/skill_seekers/cli/config_validator.py @@ -7,6 +7,19 @@ Validates unified config format that supports multiple sources: - github (repository scraping) - pdf (PDF document scraping) - local (local codebase analysis) +- word (Word .docx document scraping) +- video (video transcript/visual extraction) +- epub (EPUB e-book extraction) +- jupyter (Jupyter Notebook extraction) +- html (local HTML file extraction) +- openapi (OpenAPI/Swagger spec extraction) +- asciidoc (AsciiDoc document extraction) +- pptx (PowerPoint presentation extraction) +- confluence (Confluence wiki extraction) +- notion (Notion page extraction) +- rss (RSS/Atom feed extraction) +- manpage (man page extraction) +- chat (Slack/Discord chat export extraction) Legacy config format support removed in v2.11.0. All configs must use unified format with 'sources' array. 
@@ -27,7 +40,25 @@ class ConfigValidator: """ # Valid source types - VALID_SOURCE_TYPES = {"documentation", "github", "pdf", "local", "word", "video"} + VALID_SOURCE_TYPES = { + "documentation", + "github", + "pdf", + "local", + "word", + "video", + "epub", + "jupyter", + "html", + "openapi", + "asciidoc", + "pptx", + "confluence", + "notion", + "rss", + "manpage", + "chat", + } # Valid merge modes VALID_MERGE_MODES = {"rule-based", "claude-enhanced"} @@ -159,6 +190,32 @@ class ConfigValidator: self._validate_pdf_source(source, index) elif source_type == "local": self._validate_local_source(source, index) + elif source_type == "word": + self._validate_word_source(source, index) + elif source_type == "video": + self._validate_video_source(source, index) + elif source_type == "epub": + self._validate_epub_source(source, index) + elif source_type == "jupyter": + self._validate_jupyter_source(source, index) + elif source_type == "html": + self._validate_html_source(source, index) + elif source_type == "openapi": + self._validate_openapi_source(source, index) + elif source_type == "asciidoc": + self._validate_asciidoc_source(source, index) + elif source_type == "pptx": + self._validate_pptx_source(source, index) + elif source_type == "confluence": + self._validate_confluence_source(source, index) + elif source_type == "notion": + self._validate_notion_source(source, index) + elif source_type == "rss": + self._validate_rss_source(source, index) + elif source_type == "manpage": + self._validate_manpage_source(source, index) + elif source_type == "chat": + self._validate_chat_source(source, index) def _validate_documentation_source(self, source: dict[str, Any], index: int): """Validate documentation source configuration.""" @@ -253,12 +310,126 @@ class ConfigValidator: f"Source {index} (local): Invalid ai_mode '{ai_mode}'. 
Must be one of {self.VALID_AI_MODES}" ) + def _validate_word_source(self, source: dict[str, Any], index: int): + """Validate Word document (.docx) source configuration.""" + if "path" not in source: + raise ValueError(f"Source {index} (word): Missing required field 'path'") + word_path = source["path"] + if not Path(word_path).exists(): + logger.warning(f"Source {index} (word): File not found: {word_path}") + + def _validate_video_source(self, source: dict[str, Any], index: int): + """Validate video source configuration.""" + has_url = "url" in source + has_path = "path" in source + has_playlist = "playlist" in source + if not has_url and not has_path and not has_playlist: + raise ValueError( + f"Source {index} (video): Missing required field 'url', 'path', or 'playlist'" + ) + + def _validate_epub_source(self, source: dict[str, Any], index: int): + """Validate EPUB source configuration.""" + if "path" not in source: + raise ValueError(f"Source {index} (epub): Missing required field 'path'") + epub_path = source["path"] + if not Path(epub_path).exists(): + logger.warning(f"Source {index} (epub): File not found: {epub_path}") + + def _validate_jupyter_source(self, source: dict[str, Any], index: int): + """Validate Jupyter Notebook source configuration.""" + if "path" not in source: + raise ValueError(f"Source {index} (jupyter): Missing required field 'path'") + nb_path = source["path"] + if not Path(nb_path).exists(): + logger.warning(f"Source {index} (jupyter): Path not found: {nb_path}") + + def _validate_html_source(self, source: dict[str, Any], index: int): + """Validate local HTML source configuration.""" + if "path" not in source: + raise ValueError(f"Source {index} (html): Missing required field 'path'") + html_path = source["path"] + if not Path(html_path).exists(): + logger.warning(f"Source {index} (html): Path not found: {html_path}") + + def _validate_openapi_source(self, source: dict[str, Any], index: int): + """Validate OpenAPI/Swagger source 
configuration.""" + if "path" not in source and "url" not in source: + raise ValueError(f"Source {index} (openapi): Missing required field 'path' or 'url'") + if "path" in source and not Path(source["path"]).exists(): + logger.warning(f"Source {index} (openapi): File not found: {source['path']}") + + def _validate_asciidoc_source(self, source: dict[str, Any], index: int): + """Validate AsciiDoc source configuration.""" + if "path" not in source: + raise ValueError(f"Source {index} (asciidoc): Missing required field 'path'") + adoc_path = source["path"] + if not Path(adoc_path).exists(): + logger.warning(f"Source {index} (asciidoc): Path not found: {adoc_path}") + + def _validate_pptx_source(self, source: dict[str, Any], index: int): + """Validate PowerPoint source configuration.""" + if "path" not in source: + raise ValueError(f"Source {index} (pptx): Missing required field 'path'") + pptx_path = source["path"] + if not Path(pptx_path).exists(): + logger.warning(f"Source {index} (pptx): File not found: {pptx_path}") + + def _validate_confluence_source(self, source: dict[str, Any], index: int): + """Validate Confluence source configuration.""" + has_url = "url" in source or "base_url" in source + has_path = "path" in source + if not has_url and not has_path: + raise ValueError( + f"Source {index} (confluence): Missing required field 'url'/'base_url' " + f"(for API) or 'path' (for export)" + ) + if has_url and "space_key" not in source and "path" not in source: + logger.warning(f"Source {index} (confluence): No 'space_key' specified for API mode") + + def _validate_notion_source(self, source: dict[str, Any], index: int): + """Validate Notion source configuration.""" + has_url = "url" in source or "database_id" in source or "page_id" in source + has_path = "path" in source + if not has_url and not has_path: + raise ValueError( + f"Source {index} (notion): Missing required field 'url'/'database_id'/'page_id' " + f"(for API) or 'path' (for export)" + ) + + def 
_validate_rss_source(self, source: dict[str, Any], index: int): + """Validate RSS/Atom feed source configuration.""" + if "url" not in source and "path" not in source: + raise ValueError(f"Source {index} (rss): Missing required field 'url' or 'path'") + + def _validate_manpage_source(self, source: dict[str, Any], index: int): + """Validate man page source configuration.""" + if "path" not in source and "names" not in source: + raise ValueError(f"Source {index} (manpage): Missing required field 'path' or 'names'") + if "path" in source and not Path(source["path"]).exists(): + logger.warning(f"Source {index} (manpage): Path not found: {source['path']}") + + def _validate_chat_source(self, source: dict[str, Any], index: int): + """Validate Slack/Discord chat source configuration.""" + has_path = "path" in source + has_api = "token" in source or "webhook_url" in source + has_channel = "channel" in source or "channel_id" in source + if not has_path and not has_api: + raise ValueError( + f"Source {index} (chat): Missing required field 'path' (for export) " + f"or 'token' (for API)" + ) + if has_api and not has_channel: + logger.warning( + f"Source {index} (chat): No 'channel' or 'channel_id' specified for API mode" + ) + def get_sources_by_type(self, source_type: str) -> list[dict[str, Any]]: """ Get all sources of a specific type. Args: - source_type: 'documentation', 'github', 'pdf', or 'local' + source_type: Any valid source type string Returns: List of sources matching the type diff --git a/src/skill_seekers/cli/confluence_scraper.py b/src/skill_seekers/cli/confluence_scraper.py new file mode 100644 index 0000000..6204606 --- /dev/null +++ b/src/skill_seekers/cli/confluence_scraper.py @@ -0,0 +1,2166 @@ +#!/usr/bin/env python3 +""" +Confluence Documentation to Skill Converter + +Converts Confluence spaces into AI-ready skills by extracting page content, +hierarchy, code blocks, tables, and attachments. Supports two extraction modes: + +1. 
**API mode**: Connects to a Confluence instance via the Atlassian REST API + (requires ``atlassian-python-api``). Fetches pages from a specified space, + preserving the parent-child hierarchy. Requires ``--base-url``, ``--space-key``, + and authentication via ``--username`` / ``--token`` (or env vars). + +2. **Export mode**: Parses a Confluence HTML/XML export directory previously + downloaded from the Confluence admin UI. Requires ``--export-path`` pointing + to the extracted export directory containing ``entities.xml`` or HTML files. + +Usage: + # API mode + skill-seekers confluence --base-url https://wiki.example.com \\ + --space-key PROJ --username user@example.com --token $CONFLUENCE_TOKEN \\ + --name my-project-wiki + + # Export mode + skill-seekers confluence --export-path ./confluence-export/ --name my-wiki + + # Build from previously extracted JSON + skill-seekers confluence --from-json my-wiki_extracted.json + + # Standalone execution + python3 -m skill_seekers.cli.confluence_scraper --base-url https://wiki.example.com \\ + --space-key DEV --name dev-wiki --max-pages 200 +""" + +import argparse +import json +import logging +import os +import re +import sys +from pathlib import Path +from typing import Any + +# Optional dependency guard for atlassian-python-api +try: + from atlassian import Confluence + + ATLASSIAN_AVAILABLE = True +except ImportError: + ATLASSIAN_AVAILABLE = False + +# BeautifulSoup is a core dependency (always available) +from bs4 import BeautifulSoup, Comment, Tag + +logger = logging.getLogger(__name__) + +# Confluence-specific HTML macro class patterns to strip during cleaning +_CONFLUENCE_MACRO_CLASSES = { + "confluence-information-macro", + "confluence-information-macro-body", + "confluence-information-macro-icon", + "expand-container", + "expand-content", + "expand-control", + "plugin-tabmeta", + "plugin_pagetree", + "page-metadata", + "aui-message", +} + +# Confluence macro element tag names (structured-macro in storage format) 
+_STORAGE_MACRO_TAGS = { + "ac:structured-macro", + "ac:rich-text-body", + "ac:parameter", + "ac:plain-text-body", + "ac:image", + "ac:link", + "ac:emoticon", + "ac:task-list", + "ac:task", + "ac:task-body", + "ac:task-status", + "ri:attachment", + "ri:page", + "ri:space", + "ri:url", + "ri:user", +} + +# Known Confluence code macro language mappings +_CODE_MACRO_LANGS = { + "py": "python", + "python": "python", + "python3": "python", + "js": "javascript", + "javascript": "javascript", + "ts": "typescript", + "typescript": "typescript", + "java": "java", + "bash": "bash", + "sh": "bash", + "shell": "bash", + "sql": "sql", + "xml": "xml", + "html": "html", + "css": "css", + "json": "json", + "yaml": "yaml", + "yml": "yaml", + "ruby": "ruby", + "go": "go", + "golang": "go", + "rust": "rust", + "c": "c", + "cpp": "cpp", + "csharp": "csharp", + "cs": "csharp", + "kotlin": "kotlin", + "swift": "swift", + "scala": "scala", + "groovy": "groovy", + "perl": "perl", + "php": "php", + "r": "r", + "powershell": "powershell", + "dockerfile": "dockerfile", + "terraform": "hcl", + "hcl": "hcl", + "markdown": "markdown", + "text": "", + "none": "", +} + + +def _check_atlassian_deps() -> None: + """Raise RuntimeError if atlassian-python-api is not installed.""" + if not ATLASSIAN_AVAILABLE: + raise RuntimeError( + "atlassian-python-api is required for Confluence API mode.\n" + "Install with: pip install atlassian-python-api\n" + 'Or: pip install "skill-seekers[confluence]"' + ) + + +def infer_description_from_confluence( + space_info: dict | None = None, + name: str = "", +) -> str: + """Infer skill description from Confluence space metadata. + + Args: + space_info: Confluence space metadata dict (name, description, key). + name: Skill name for fallback. + + Returns: + Description string suitable for "Use when..." format. 
+ """ + if space_info: + desc_text = space_info.get("description", "") + if isinstance(desc_text, dict): + # Confluence API returns description as {"plain": {"value": "..."}} + desc_text = desc_text.get("plain", {}).get("value", "") or desc_text.get( + "view", {} + ).get("value", "") + if desc_text and len(desc_text) > 20: + clean = re.sub(r"<[^>]+>", "", desc_text).strip() + if len(clean) > 150: + clean = clean[:147] + "..." + return f"Use when {clean.lower()}" + space_name = space_info.get("name", "") + if space_name and len(space_name) > 5: + return f"Use when working with {space_name.lower()} documentation" + return ( + f"Use when referencing {name} documentation" + if name + else "Use when referencing this Confluence documentation" + ) + + +class ConfluenceToSkillConverter: + """Convert Confluence space documentation to an AI-ready skill. + + Supports two extraction modes: + + - **API mode**: Uses the Atlassian Confluence REST API to fetch pages from + a space, including page hierarchy, labels, and storage-format content. + Requires ``base_url``, ``space_key``, and authentication credentials. + + - **Export mode**: Parses a Confluence HTML/XML export directory that has + been downloaded and extracted from the Confluence admin interface. + Requires ``export_path`` pointing to the extracted directory. + + After extraction, the converter categorises pages by their parent-child + hierarchy, generates reference markdown files, an index, and the main + SKILL.md manifest. + + Attributes: + config: Configuration dictionary. + name: Skill name used for output directory and filenames. + base_url: Confluence instance base URL (API mode). + space_key: Confluence space key (API mode). + export_path: Path to exported Confluence directory (export mode). + username: Confluence username / email for API authentication. + token: Confluence API token or password. + description: Skill description for SKILL.md frontmatter. + max_pages: Maximum number of pages to fetch in API mode. 
+ skill_dir: Output directory for the generated skill. + data_file: Path to the intermediate extracted JSON file. + extracted_data: Structured extraction results dict. + """ + + def __init__(self, config: dict) -> None: + """Initialize the Confluence to skill converter. + + Args: + config: Configuration dictionary containing: + - name (str): Skill name (required). + - base_url (str): Confluence instance URL (API mode). + - space_key (str): Confluence space key (API mode). + - export_path (str): Path to export directory (export mode). + - username (str): API username / email (optional, falls back to env). + - token (str): API token (optional, falls back to env). + - description (str): Skill description (optional). + - max_pages (int): Maximum pages to fetch, default 500. + """ + self.config = config + self.name: str = config["name"] + self.base_url: str = config.get("base_url", "") + self.space_key: str = config.get("space_key", "") + self.export_path: str = config.get("export_path", "") + self.username: str = config.get("username", "") + self.token: str = config.get("token", "") + self.description: str = ( + config.get("description") or f"Use when referencing {self.name} documentation" + ) + self.max_pages: int = int(config.get("max_pages", 500)) + + # Output paths + self.skill_dir = f"output/{self.name}" + self.data_file = f"output/{self.name}_extracted.json" + + # Extracted data storage + self.extracted_data: dict[str, Any] | None = None + + # ────────────────────────────────────────────────────────────────────── + # Extraction dispatcher + # ────────────────────────────────────────────────────────────────────── + + def extract_confluence(self) -> bool: + """Extract content from Confluence, dispatching to API or export mode. + + Determines the extraction mode based on the provided configuration: + - If ``base_url`` and ``space_key`` are set, uses API mode. + - If ``export_path`` is set, uses export mode. + - Raises ValueError if neither mode is configured. 
+ + After extraction, saves intermediate JSON to ``{name}_extracted.json`` + and updates the description from space metadata if not explicitly set. + + Returns: + True on successful extraction. + + Raises: + ValueError: If neither API nor export configuration is provided. + RuntimeError: If API dependencies are missing or connection fails. + """ + if self.base_url and self.space_key: + print(f"\n Extracting from Confluence API: {self.base_url}") + print(f" Space: {self.space_key}") + raw_pages = self._extract_via_api() + elif self.export_path: + print(f"\n Extracting from Confluence export: {self.export_path}") + raw_pages = self._extract_from_export() + else: + raise ValueError( + "No Confluence source configured. Provide either:\n" + " - --base-url and --space-key (API mode), or\n" + " - --export-path (export mode)" + ) + + if not raw_pages: + logger.warning("No pages extracted from Confluence") + + # Build page hierarchy tree + page_tree = self._extract_page_tree(raw_pages) + + # Parse each page's HTML content to structured sections + sections: list[dict[str, Any]] = [] + total_code_blocks = 0 + total_images = 0 + section_number = 0 + + for page in raw_pages: + page_id = page.get("id", "") + page_title = page.get("title", "Untitled") + body_html = page.get("body", "") + labels = page.get("labels", []) + parent_id = page.get("parent_id", "") + + if not body_html: + logger.debug("Skipping page with no body: %s", page_title) + continue + + # Parse the Confluence HTML content + parsed = self._parse_confluence_html(body_html, page_title) + + section_number += 1 + section_data: dict[str, Any] = { + "section_number": section_number, + "page_id": page_id, + "heading": page_title, + "heading_level": "h1", + "parent_id": parent_id, + "labels": labels, + "text": parsed.get("text", ""), + "headings": parsed.get("headings", []), + "code_samples": parsed.get("code_samples", []), + "tables": parsed.get("tables", []), + "images": parsed.get("images", []), + "links": 
def _extract_via_api(self) -> list[dict[str, Any]]:
    """Fetch pages from a Confluence space using the REST API.

    Builds a ``Confluence`` client, reads the space metadata, then pages
    through the space ``limit`` results at a time until ``self.max_pages``
    pages have been collected or the space is exhausted.

    Credentials are resolved in priority order:
        1. ``self.username`` / ``self.token``
        2. ``CONFLUENCE_USERNAME`` / ``CONFLUENCE_TOKEN``
        3. ``ATLASSIAN_USERNAME`` / ``ATLASSIAN_TOKEN``

    A failure while fetching a batch is logged and ends pagination; the
    pages collected so far are still returned (best effort).

    Returns:
        List of page dicts with keys: id, title, body, parent_id, labels,
        url, space_info, version, created, modified.

    Raises:
        RuntimeError: If no credentials can be resolved or the client
            cannot be constructed. ``_check_atlassian_deps()`` runs first
            and is expected to raise when the dependency is missing.
    """
    _check_atlassian_deps()

    # Resolve authentication credentials
    username = (
        self.username
        or os.environ.get("CONFLUENCE_USERNAME", "")
        or os.environ.get("ATLASSIAN_USERNAME", "")
    )
    token = (
        self.token
        or os.environ.get("CONFLUENCE_TOKEN", "")
        or os.environ.get("ATLASSIAN_TOKEN", "")
    )

    if not username or not token:
        raise RuntimeError(
            "Confluence API authentication required.\n"
            "Provide --username and --token, or set CONFLUENCE_USERNAME "
            "and CONFLUENCE_TOKEN environment variables."
        )

    # Connect to Confluence
    try:
        is_cloud = self._is_cloud_instance()
        confluence = Confluence(
            url=self.base_url,
            username=username,
            password=token,
            cloud=is_cloud,
        )
    except Exception as e:
        raise RuntimeError(f"Failed to connect to Confluence at {self.base_url}: {e}") from e

    # Fetch space information (non-fatal: fall back to the bare space key)
    space_info: dict[str, Any] = {}
    try:
        space_data = confluence.get_space(self.space_key, expand="description.plain,homepage")
        homepage = space_data.get("homepage") or {}
        space_info = {
            "key": space_data.get("key", self.space_key),
            "name": space_data.get("name", self.space_key),
            "description": space_data.get("description", {}),
            "type": space_data.get("type", "global"),
            "homepage_id": homepage.get("id", ""),
        }
        print(f"   Space: {space_info.get('name', self.space_key)}")
    except Exception as e:
        logger.warning("Could not fetch space info: %s", e)
        space_info = {"key": self.space_key, "name": self.space_key}

    # Root used for browser links. ``_links.webui`` is relative to the
    # Confluence context path: Cloud serves under ``/wiki`` while
    # Server / Data Center serves from the base URL itself, so ``/wiki``
    # is only appended for Cloud and only when it is not already there.
    site_root = self.base_url.rstrip("/")
    if is_cloud and not site_root.endswith("/wiki"):
        site_root = f"{site_root}/wiki"

    # Fetch all pages in the space, paginated
    pages: list[dict[str, Any]] = []
    start = 0
    limit = 50  # Confluence API page size
    expand_fields = "body.storage,version,ancestors,metadata.labels"

    print(f"   Fetching pages (max {self.max_pages})...")

    while len(pages) < self.max_pages:
        remaining = self.max_pages - len(pages)
        request_size = min(limit, remaining)
        try:
            batch = confluence.get_all_pages_from_space(
                self.space_key,
                start=start,
                limit=request_size,
                expand=expand_fields,
                content_type="page",
            )
        except Exception as e:
            logger.error("Failed to fetch pages at offset %d: %s", start, e)
            break

        if not batch:
            break

        # Never keep more than the caller asked for, even if the server
        # returns more results than requested.
        for page_data in batch[:remaining]:
            page_id = str(page_data.get("id", ""))
            title = page_data.get("title", "Untitled")

            # Extract body (storage format HTML)
            body = ((page_data.get("body") or {}).get("storage") or {}).get("value", "")

            # The last ancestor is the direct parent
            ancestors = page_data.get("ancestors") or []
            parent_id = str(ancestors[-1].get("id", "")) if ancestors else ""

            # Extract labels
            labels_data = (
                ((page_data.get("metadata") or {}).get("labels") or {}).get("results") or []
            )
            labels = [lbl.get("name", "") for lbl in labels_data if lbl.get("name")]

            # Only version 1 tells us the creation date; later versions
            # only carry the timestamp of the latest modification.
            version_info = page_data.get("version") or {}
            version_number = version_info.get("number", 1)
            modified = version_info.get("when", "")
            created = modified if version_number == 1 else ""

            # Build page URL, preferring the server-provided web UI link
            webui = (page_data.get("_links") or {}).get("webui")
            if webui:
                page_url = f"{site_root}{webui}"
            elif is_cloud:
                page_url = f"{site_root}/spaces/{self.space_key}/pages/{page_id}"
            else:
                page_url = f"{site_root}/pages/viewpage.action?pageId={page_id}"

            pages.append(
                {
                    "id": page_id,
                    "title": title,
                    "body": body,
                    "parent_id": parent_id,
                    "labels": labels,
                    "url": page_url,
                    "space_info": space_info,
                    "version": version_number,
                    "created": created,
                    "modified": modified,
                }
            )

        print(f"   Fetched {len(pages)} pages...")
        start += len(batch)

        # A short batch means the space has no further pages
        if len(batch) < request_size:
            break

    print(f"   Total pages fetched: {len(pages)}")
    return pages


def _is_cloud_instance(self) -> bool:
    """Detect whether the base URL points to an Atlassian Cloud instance.

    Cloud instances live on ``atlassian.net`` or one of its subdomains.
    Only the hostname is inspected, so a path or an unrelated domain that
    merely contains the text ``atlassian.net`` does not count.

    Returns:
        True if the hostname is ``atlassian.net`` or ends with
        ``.atlassian.net``; False otherwise, including when no base URL
        is configured.
    """
    from urllib.parse import urlparse

    base_url = (self.base_url or "").strip()
    if not base_url:
        return False

    # urlparse only fills in the hostname when a "//" authority marker is
    # present, so tolerate URLs given without a scheme.
    parsed = urlparse(base_url if "://" in base_url else f"//{base_url}")
    host = (parsed.hostname or "").lower()
    return host == "atlassian.net" or host.endswith(".atlassian.net")


# ──────────────────────────────────────────────────────────────────────
# Export extraction
# ──────────────────────────────────────────────────────────────────────


def _extract_from_export(self) -> list[dict[str, Any]]:
    """Parse a Confluence HTML/XML export directory into page dicts.

    Confluence exports can contain either:
        - An ``entities.xml`` file (full XML export from admin)
        - A directory of HTML files (HTML export)

    ``entities.xml`` is tried first; if it is absent or yields no pages the
    method falls back to the HTML files (``*.html``, or ``*.htm`` when no
    ``*.html`` file exists), skipping the export index page. At most
    ``self.max_pages`` pages are returned.

    Returns:
        List of normalised page dicts (same structure as API mode).

    Raises:
        FileNotFoundError: If the export path does not exist.
        ValueError: If the path is not a directory or contains no HTML
            files to fall back to.
    """
    export_dir = Path(self.export_path)
    if not export_dir.exists():
        raise FileNotFoundError(f"Confluence export path not found: {self.export_path}")
    if not export_dir.is_dir():
        raise ValueError(f"Export path is not a directory: {self.export_path}")

    pages: list[dict[str, Any]] = []
    space_info: dict[str, Any] = {"key": self.space_key or "EXPORT", "name": self.name}

    # Check for entities.xml (full XML export)
    entities_xml = export_dir / "entities.xml"
    if entities_xml.exists():
        pages = self._parse_entities_xml(entities_xml, space_info)
        if pages:
            print(f"   Parsed entities.xml: {len(pages)} pages")
            return pages

    # Fall back to HTML file export
    html_files = sorted(
        f for f in export_dir.rglob("*.html") if f.is_file() and f.name != "index.html"
    )

    if not html_files:
        # Also try .htm files
        html_files = sorted(
            f for f in export_dir.rglob("*.htm") if f.is_file() and f.name != "index.htm"
        )

    if not html_files:
        raise ValueError(
            f"No HTML files found in export directory: {self.export_path}\n"
            "Expected either entities.xml or HTML files from Confluence export."
        )

    print(f"   Found {len(html_files)} HTML files in export")

    # Parse the export index for page hierarchy if available. The index
    # may be index.html or, for .htm exports, index.htm.
    hierarchy_map: dict[str, str] = {}  # filename -> parent filename
    for index_name in ("index.html", "index.htm"):
        index_file = export_dir / index_name
        if index_file.exists():
            hierarchy_map = self._parse_export_index(index_file)
            break

    for html_file in html_files:
        # Count collected pages, not files visited, so unreadable files
        # do not consume the max_pages budget.
        if len(pages) >= self.max_pages:
            logger.info("Reached max_pages limit (%d)", self.max_pages)
            break

        try:
            raw_html = html_file.read_text(encoding="utf-8", errors="ignore")
        except OSError as e:
            logger.warning("Could not read %s: %s", html_file, e)
            continue

        soup = BeautifulSoup(raw_html, "html.parser")

        # Extract title, falling back to the file name
        title_tag = soup.find("title")
        title = title_tag.get_text(strip=True) if title_tag else html_file.stem

        # Find main content area (Confluence exports use specific div IDs)
        main_content = (
            soup.find("div", id="main-content")
            or soup.find("div", class_="wiki-content")
            or soup.find("div", id="content")
            or soup.find("body")
        )

        file_key = html_file.stem
        pages.append(
            {
                "id": file_key,
                "title": title,
                "body": str(main_content) if main_content else "",
                "parent_id": hierarchy_map.get(file_key, ""),
                "labels": [],
                "url": str(html_file),
                "space_info": space_info,
                "version": 1,
                "created": "",
                "modified": "",
            }
        )

    print(f"   Parsed {len(pages)} pages from HTML export")
    return pages


def _parse_entities_xml(
    self,
    xml_path: Path,
    space_info: dict[str, Any],
) -> list[dict[str, Any]]:
    """Parse a Confluence ``entities.xml`` export file.

    Collects every ``<object class="Page">`` that has both a title and an
    id, together with its parent id when present. The body is taken from
    an ``<object>`` nested under the page's ``bodyContents`` when there is
    one; otherwise it is looked up among the standalone
    ``<object class="BodyContent">`` entries, matched through their
    ``content`` reference to the page id (the layout typically found in
    exports, where ``bodyContents`` only lists references).

    Args:
        xml_path: Path to the entities.xml file.
        space_info: Space metadata dict to attach to each page.

    Returns:
        List of normalised page dicts; empty if the file cannot be parsed.
    """
    import xml.etree.ElementTree as ET

    try:
        # The whole document is loaded into memory (not streamed).
        # NOTE(review): the stdlib parser is not hardened against
        # malicious XML; only feed it exports from a trusted source.
        root = ET.parse(xml_path).getroot()  # noqa: S314
    except Exception as e:
        # Deliberately broad: the caller falls back to the HTML export
        # whenever this returns no pages.
        logger.warning("Failed to parse entities.xml: %s", e)
        return []

    # Pass 1: index standalone BodyContent objects by the id of the
    # content (page) they belong to.
    bodies_by_content_id: dict[str, str] = {}
    for obj_elem in root.iter("object"):
        if obj_elem.get("class", "") != "BodyContent":
            continue
        body_text = ""
        owner_id = ""
        for prop_elem in obj_elem:
            prop_name = prop_elem.get("name", "")
            if prop_name == "body":
                body_text = prop_elem.text or ""
            elif prop_name == "content":
                owner_ref = prop_elem.find("id")
                if owner_ref is not None and owner_ref.text:
                    owner_id = owner_ref.text.strip()
        if owner_id and body_text:
            bodies_by_content_id.setdefault(owner_id, body_text)

    # Pass 2: collect the page objects themselves.
    pages: list[dict[str, Any]] = []
    for obj_elem in root.iter("object"):
        if obj_elem.get("class", "") != "Page":
            continue

        page_data: dict[str, str] = {}
        for prop_elem in obj_elem:
            prop_name = prop_elem.get("name", "")
            if prop_name == "title":
                page_data["title"] = prop_elem.text or ""
            elif prop_name == "id":
                page_data["id"] = prop_elem.text or ""
            elif prop_name == "bodyContents":
                # Body content nested directly inside the collection
                for body_obj in prop_elem.iter("object"):
                    for body_prop in body_obj:
                        if body_prop.get("name") == "body":
                            page_data["body"] = body_prop.text or ""
            elif prop_name == "parent":
                # Parent reference
                parent_ref = prop_elem.find("id")
                if parent_ref is not None and parent_ref.text:
                    page_data["parent_id"] = parent_ref.text

        if not (page_data.get("title") and page_data.get("id")):
            continue

        body = page_data.get("body") or bodies_by_content_id.get(page_data["id"].strip(), "")
        pages.append(
            {
                "id": page_data["id"],
                "title": page_data["title"],
                "body": body,
                "parent_id": page_data.get("parent_id", ""),
                "labels": [],
                "url": "",
                "space_info": space_info,
                "version": 1,
                "created": "",
                "modified": "",
            }
        )

    return pages
+ """ + hierarchy: dict[str, str] = {} + + try: + raw_html = index_path.read_text(encoding="utf-8", errors="ignore") + soup = BeautifulSoup(raw_html, "html.parser") + + # Confluence export index uses nested