#!/usr/bin/env python3 """ Confluence Documentation to Skill Converter Converts Confluence spaces into AI-ready skills by extracting page content, hierarchy, code blocks, tables, and attachments. Supports two extraction modes: 1. **API mode**: Connects to a Confluence instance via the Atlassian REST API (requires ``atlassian-python-api``). Fetches pages from a specified space, preserving the parent-child hierarchy. Requires ``--base-url``, ``--space-key``, and authentication via ``--username`` / ``--token`` (or env vars). 2. **Export mode**: Parses a Confluence HTML/XML export directory previously downloaded from the Confluence admin UI. Requires ``--export-path`` pointing to the extracted export directory containing ``entities.xml`` or HTML files. Usage: # API mode skill-seekers confluence --base-url https://wiki.example.com \\ --space-key PROJ --username user@example.com --token $CONFLUENCE_TOKEN \\ --name my-project-wiki # Export mode skill-seekers confluence --export-path ./confluence-export/ --name my-wiki # Build from previously extracted JSON skill-seekers confluence --from-json my-wiki_extracted.json # Standalone execution python3 -m skill_seekers.cli.confluence_scraper --base-url https://wiki.example.com \\ --space-key DEV --name dev-wiki --max-pages 200 """ import argparse import json import logging import os import re import sys from pathlib import Path from typing import Any # Optional dependency guard for atlassian-python-api try: from atlassian import Confluence ATLASSIAN_AVAILABLE = True except ImportError: ATLASSIAN_AVAILABLE = False # BeautifulSoup is a core dependency (always available) from bs4 import BeautifulSoup, Comment, Tag logger = logging.getLogger(__name__) # Confluence-specific HTML macro class patterns to strip during cleaning _CONFLUENCE_MACRO_CLASSES = { "confluence-information-macro", "confluence-information-macro-body", "confluence-information-macro-icon", "expand-container", "expand-content", "expand-control", "plugin-tabmeta", "plugin_pagetree", "page-metadata", "aui-message", } # Confluence macro element tag names (structured-macro in storage format) _STORAGE_MACRO_TAGS = { "ac:structured-macro", "ac:rich-text-body", "ac:parameter", "ac:plain-text-body", "ac:image", "ac:link", "ac:emoticon", "ac:task-list", "ac:task", "ac:task-body", "ac:task-status", "ri:attachment", "ri:page", "ri:space", "ri:url", "ri:user", } # Known Confluence code macro language mappings _CODE_MACRO_LANGS = { "py": "python", "python": "python", "python3": "python", "js": "javascript", "javascript": "javascript", "ts": "typescript", "typescript": "typescript", "java": "java", "bash": "bash", "sh": "bash", "shell": "bash", "sql": "sql", "xml": "xml", "html": "html", "css": "css", "json": "json", "yaml": "yaml", "yml": "yaml", "ruby": "ruby", "go": "go", "golang": "go", "rust": "rust", "c": "c", "cpp": "cpp", "csharp": "csharp", "cs": "csharp", "kotlin": "kotlin", "swift": "swift", "scala": "scala", "groovy": "groovy", "perl": "perl", "php": "php", "r": "r", "powershell": "powershell", "dockerfile": "dockerfile", "terraform": "hcl", "hcl": "hcl", "markdown": "markdown", "text": "", "none": "", } def _check_atlassian_deps() -> None: """Raise RuntimeError if atlassian-python-api is not installed.""" if not ATLASSIAN_AVAILABLE: raise RuntimeError( "atlassian-python-api is required for Confluence API mode.\n" "Install with: pip install atlassian-python-api\n" 'Or: pip install "skill-seekers[confluence]"' ) def infer_description_from_confluence( space_info: dict | None = None, name: str = "", ) -> str: """Infer skill description from Confluence space metadata. Args: space_info: Confluence space metadata dict (name, description, key). name: Skill name for fallback. Returns: Description string suitable for "Use when..." format. """ if space_info: desc_text = space_info.get("description", "") if isinstance(desc_text, dict): # Confluence API returns description as {"plain": {"value": "..."}} desc_text = desc_text.get("plain", {}).get("value", "") or desc_text.get( "view", {} ).get("value", "") if desc_text and len(desc_text) > 20: clean = re.sub(r"<[^>]+>", "", desc_text).strip() if len(clean) > 150: clean = clean[:147] + "..." return f"Use when {clean.lower()}" space_name = space_info.get("name", "") if space_name and len(space_name) > 5: return f"Use when working with {space_name.lower()} documentation" return ( f"Use when referencing {name} documentation" if name else "Use when referencing this Confluence documentation" ) class ConfluenceToSkillConverter: """Convert Confluence space documentation to an AI-ready skill. Supports two extraction modes: - **API mode**: Uses the Atlassian Confluence REST API to fetch pages from a space, including page hierarchy, labels, and storage-format content. Requires ``base_url``, ``space_key``, and authentication credentials. - **Export mode**: Parses a Confluence HTML/XML export directory that has been downloaded and extracted from the Confluence admin interface. Requires ``export_path`` pointing to the extracted directory. After extraction, the converter categorises pages by their parent-child hierarchy, generates reference markdown files, an index, and the main SKILL.md manifest. Attributes: config: Configuration dictionary. name: Skill name used for output directory and filenames. base_url: Confluence instance base URL (API mode). space_key: Confluence space key (API mode). export_path: Path to exported Confluence directory (export mode). username: Confluence username / email for API authentication. token: Confluence API token or password. description: Skill description for SKILL.md frontmatter. max_pages: Maximum number of pages to fetch in API mode. skill_dir: Output directory for the generated skill. data_file: Path to the intermediate extracted JSON file. extracted_data: Structured extraction results dict. """ def __init__(self, config: dict) -> None: """Initialize the Confluence to skill converter. Args: config: Configuration dictionary containing: - name (str): Skill name (required). - base_url (str): Confluence instance URL (API mode). - space_key (str): Confluence space key (API mode). - export_path (str): Path to export directory (export mode). - username (str): API username / email (optional, falls back to env). - token (str): API token (optional, falls back to env). - description (str): Skill description (optional). - max_pages (int): Maximum pages to fetch, default 500. """ self.config = config self.name: str = config["name"] self.base_url: str = config.get("base_url", "") self.space_key: str = config.get("space_key", "") self.export_path: str = config.get("export_path", "") self.username: str = config.get("username", "") self.token: str = config.get("token", "") self.description: str = ( config.get("description") or f"Use when referencing {self.name} documentation" ) self.max_pages: int = int(config.get("max_pages", 500)) # Output paths self.skill_dir = f"output/{self.name}" self.data_file = f"output/{self.name}_extracted.json" # Extracted data storage self.extracted_data: dict[str, Any] | None = None # ────────────────────────────────────────────────────────────────────── # Extraction dispatcher # ────────────────────────────────────────────────────────────────────── def extract_confluence(self) -> bool: """Extract content from Confluence, dispatching to API or export mode. Determines the extraction mode based on the provided configuration: - If ``base_url`` and ``space_key`` are set, uses API mode. - If ``export_path`` is set, uses export mode. - Raises ValueError if neither mode is configured. After extraction, saves intermediate JSON to ``{name}_extracted.json`` and updates the description from space metadata if not explicitly set. Returns: True on successful extraction. Raises: ValueError: If neither API nor export configuration is provided. RuntimeError: If API dependencies are missing or connection fails. """ if self.base_url and self.space_key: print(f"\n Extracting from Confluence API: {self.base_url}") print(f" Space: {self.space_key}") raw_pages = self._extract_via_api() elif self.export_path: print(f"\n Extracting from Confluence export: {self.export_path}") raw_pages = self._extract_from_export() else: raise ValueError( "No Confluence source configured. Provide either:\n" " - --base-url and --space-key (API mode), or\n" " - --export-path (export mode)" ) if not raw_pages: logger.warning("No pages extracted from Confluence") # Build page hierarchy tree page_tree = self._extract_page_tree(raw_pages) # Parse each page's HTML content to structured sections sections: list[dict[str, Any]] = [] total_code_blocks = 0 total_images = 0 section_number = 0 for page in raw_pages: page_id = page.get("id", "") page_title = page.get("title", "Untitled") body_html = page.get("body", "") labels = page.get("labels", []) parent_id = page.get("parent_id", "") if not body_html: logger.debug("Skipping page with no body: %s", page_title) continue # Parse the Confluence HTML content parsed = self._parse_confluence_html(body_html, page_title) section_number += 1 section_data: dict[str, Any] = { "section_number": section_number, "page_id": page_id, "heading": page_title, "heading_level": "h1", "parent_id": parent_id, "labels": labels, "text": parsed.get("text", ""), "headings": parsed.get("headings", []), "code_samples": parsed.get("code_samples", []), "tables": parsed.get("tables", []), "images": parsed.get("images", []), "links": parsed.get("links", []), "macros": parsed.get("macros", []), } sections.append(section_data) total_code_blocks += len(parsed.get("code_samples", [])) total_images += len(parsed.get("images", [])) # Collect space metadata space_info = raw_pages[0].get("space_info", {}) if raw_pages else {} # Update description from space metadata if not explicitly set if not self.config.get("description"): self.description = infer_description_from_confluence(space_info, self.name) # Detect programming languages in code samples languages_detected: dict[str, int] = {} for section in sections: for code_sample in section.get("code_samples", []): lang = code_sample.get("language", "") if lang: languages_detected[lang] = languages_detected.get(lang, 0) + 1 result_data: dict[str, Any] = { "source": self.base_url or self.export_path, "space_key": self.space_key, "space_info": space_info, "page_tree": page_tree, "total_sections": len(sections), "total_pages": len(raw_pages), "total_code_blocks": total_code_blocks, "total_images": total_images, "languages_detected": languages_detected, "pages": sections, } # Save extracted data os.makedirs(os.path.dirname(self.data_file) or ".", exist_ok=True) with open(self.data_file, "w", encoding="utf-8") as f: json.dump(result_data, f, indent=2, ensure_ascii=False, default=str) print(f"\n Saved extracted data to: {self.data_file}") self.extracted_data = result_data print( f" Extracted {len(sections)} pages, " f"{total_code_blocks} code blocks, " f"{total_images} images" ) return True # ────────────────────────────────────────────────────────────────────── # API extraction # ────────────────────────────────────────────────────────────────────── def _extract_via_api(self) -> list[dict[str, Any]]: """Fetch pages from a Confluence space using the REST API. Connects to the Confluence instance using ``atlassian-python-api``, retrieves all pages in the configured space (up to ``max_pages``), and returns them as a list of normalised page dicts. Authentication is resolved in priority order: 1. Constructor arguments (username/token) 2. Environment variables (CONFLUENCE_USERNAME / CONFLUENCE_TOKEN) Returns: List of page dicts with keys: id, title, body, parent_id, labels, url, space_info, version, created, modified. Raises: RuntimeError: If atlassian-python-api is not installed or the connection / fetch fails. """ _check_atlassian_deps() # Resolve authentication credentials username = ( self.username or os.environ.get("CONFLUENCE_USERNAME", "") or os.environ.get("ATLASSIAN_USERNAME", "") ) token = ( self.token or os.environ.get("CONFLUENCE_TOKEN", "") or os.environ.get("ATLASSIAN_TOKEN", "") ) if not username or not token: raise RuntimeError( "Confluence API authentication required.\n" "Provide --username and --token, or set CONFLUENCE_USERNAME " "and CONFLUENCE_TOKEN environment variables." ) # Connect to Confluence try: confluence = Confluence( url=self.base_url, username=username, password=token, cloud=self._is_cloud_instance(), ) except Exception as e: raise RuntimeError(f"Failed to connect to Confluence at {self.base_url}: {e}") from e # Fetch space information space_info: dict[str, Any] = {} try: space_data = confluence.get_space(self.space_key, expand="description.plain,homepage") space_info = { "key": space_data.get("key", self.space_key), "name": space_data.get("name", self.space_key), "description": space_data.get("description", {}), "type": space_data.get("type", "global"), "homepage_id": ( space_data.get("homepage", {}).get("id", "") if space_data.get("homepage") else "" ), } print(f" Space: {space_info.get('name', self.space_key)}") except Exception as e: logger.warning("Could not fetch space info: %s", e) space_info = {"key": self.space_key, "name": self.space_key} # Fetch all pages in the space, paginated pages: list[dict[str, Any]] = [] start = 0 limit = 50 # Confluence API page size expand_fields = "body.storage,version,ancestors,metadata.labels" print(f" Fetching pages (max {self.max_pages})...") while len(pages) < self.max_pages: try: batch = confluence.get_all_pages_from_space( self.space_key, start=start, limit=min(limit, self.max_pages - len(pages)), expand=expand_fields, content_type="page", ) except Exception as e: logger.error("Failed to fetch pages at offset %d: %s", start, e) break if not batch: break for page_data in batch: page_id = str(page_data.get("id", "")) title = page_data.get("title", "Untitled") # Extract body (storage format HTML) body = page_data.get("body", {}).get("storage", {}).get("value", "") # Extract parent ID from ancestors ancestors = page_data.get("ancestors", []) parent_id = str(ancestors[-1]["id"]) if ancestors else "" # Extract labels labels_data = page_data.get("metadata", {}).get("labels", {}).get("results", []) labels = [lbl.get("name", "") for lbl in labels_data if lbl.get("name")] # Version and dates version_info = page_data.get("version", {}) version_number = version_info.get("number", 1) created = version_info.get("when", "") if version_number == 1 else "" modified = version_info.get("when", "") # Build page URL page_url = f"{self.base_url}/wiki/spaces/{self.space_key}/pages/{page_id}" links = page_data.get("_links", {}) if links.get("webui"): page_url = f"{self.base_url}/wiki{links['webui']}" page_dict: dict[str, Any] = { "id": page_id, "title": title, "body": body, "parent_id": parent_id, "labels": labels, "url": page_url, "space_info": space_info, "version": version_number, "created": created, "modified": modified, } pages.append(page_dict) print(f" Fetched {len(pages)} pages...") start += len(batch) # If we got fewer results than the limit, we've reached the end if len(batch) < limit: break print(f" Total pages fetched: {len(pages)}") return pages def _is_cloud_instance(self) -> bool: """Detect whether the base URL points to an Atlassian Cloud instance. Cloud instances use ``*.atlassian.net`` domain names. Returns: True if the URL looks like an Atlassian Cloud instance. """ return "atlassian.net" in self.base_url.lower() # ────────────────────────────────────────────────────────────────────── # Export extraction # ────────────────────────────────────────────────────────────────────── def _extract_from_export(self) -> list[dict[str, Any]]: """Parse a Confluence HTML/XML export directory into page dicts. Confluence exports can contain either: - An ``entities.xml`` file (full XML export from admin) - A directory of HTML files (HTML export) This method auto-detects the export format and delegates accordingly. HTML files are parsed with BeautifulSoup to extract content and metadata. Returns: List of normalised page dicts (same structure as API mode). Raises: FileNotFoundError: If the export path does not exist. ValueError: If no parseable content is found in the export. """ export_dir = Path(self.export_path) if not export_dir.exists(): raise FileNotFoundError(f"Confluence export path not found: {self.export_path}") if not export_dir.is_dir(): raise ValueError(f"Export path is not a directory: {self.export_path}") pages: list[dict[str, Any]] = [] space_info: dict[str, Any] = {"key": self.space_key or "EXPORT", "name": self.name} # Check for entities.xml (full XML export) entities_xml = export_dir / "entities.xml" if entities_xml.exists(): pages = self._parse_entities_xml(entities_xml, space_info) if pages: print(f" Parsed entities.xml: {len(pages)} pages") return pages # Fall back to HTML file export html_files = sorted( f for f in export_dir.rglob("*.html") if f.is_file() and f.name != "index.html" ) if not html_files: # Also try .htm files html_files = sorted( f for f in export_dir.rglob("*.htm") if f.is_file() and f.name != "index.htm" ) if not html_files: raise ValueError( f"No HTML files found in export directory: {self.export_path}\n" "Expected either entities.xml or HTML files from Confluence export." ) print(f" Found {len(html_files)} HTML files in export") # Parse index.html for page hierarchy if available index_file = export_dir / "index.html" hierarchy_map: dict[str, str] = {} # filename -> parent filename if index_file.exists(): hierarchy_map = self._parse_export_index(index_file) for idx, html_file in enumerate(html_files): if idx >= self.max_pages: logger.info("Reached max_pages limit (%d)", self.max_pages) break try: raw_html = html_file.read_text(encoding="utf-8", errors="ignore") except Exception as e: logger.warning("Could not read %s: %s", html_file, e) continue soup = BeautifulSoup(raw_html, "html.parser") # Extract title title_tag = soup.find("title") title = title_tag.get_text(strip=True) if title_tag else html_file.stem # Find main content area (Confluence exports use specific div IDs) main_content = ( soup.find("div", id="main-content") or soup.find("div", class_="wiki-content") or soup.find("div", id="content") or soup.find("body") ) body_html = str(main_content) if main_content else "" file_key = html_file.stem parent_key = hierarchy_map.get(file_key, "") page_dict: dict[str, Any] = { "id": file_key, "title": title, "body": body_html, "parent_id": parent_key, "labels": [], "url": str(html_file), "space_info": space_info, "version": 1, "created": "", "modified": "", } pages.append(page_dict) print(f" Parsed {len(pages)} pages from HTML export") return pages def _parse_entities_xml( self, xml_path: Path, space_info: dict[str, Any], ) -> list[dict[str, Any]]: """Parse Confluence entities.xml export file. The entities.xml file contains all page data including body content in Confluence storage format. This method extracts page objects and their parent-child relationships. Args: xml_path: Path to the entities.xml file. space_info: Space metadata dict to attach to each page. Returns: List of normalised page dicts. """ pages: list[dict[str, Any]] = [] try: # Use iterparse for memory efficiency on large exports import xml.etree.ElementTree as ET tree = ET.parse(xml_path) # noqa: S314 root = tree.getroot() except Exception as e: logger.warning("Failed to parse entities.xml: %s", e) return [] # Find all page objects in the XML for obj_elem in root.iter("object"): obj_class = obj_elem.get("class", "") if obj_class != "Page": continue page_data: dict[str, str] = {} for prop_elem in obj_elem: prop_name = prop_elem.get("name", "") if prop_name == "title": page_data["title"] = prop_elem.text or "" elif prop_name == "id": page_data["id"] = prop_elem.text or "" elif prop_name == "bodyContents": # Body content is nested inside a collection for body_obj in prop_elem.iter("object"): for body_prop in body_obj: if body_prop.get("name") == "body": page_data["body"] = body_prop.text or "" elif prop_name == "parent": # Parent reference parent_ref = prop_elem.find("id") if parent_ref is not None and parent_ref.text: page_data["parent_id"] = parent_ref.text if page_data.get("title") and page_data.get("id"): page_dict: dict[str, Any] = { "id": page_data.get("id", ""), "title": page_data.get("title", ""), "body": page_data.get("body", ""), "parent_id": page_data.get("parent_id", ""), "labels": [], "url": "", "space_info": space_info, "version": 1, "created": "", "modified": "", } pages.append(page_dict) return pages def _parse_export_index(self, index_path: Path) -> dict[str, str]: """Parse the index.html from a Confluence HTML export for hierarchy. The export index page contains a nested list structure representing the page tree. This method parses it to build a child-to-parent mapping. Args: index_path: Path to the index.html file. Returns: Dict mapping page filename stem to parent filename stem. """ hierarchy: dict[str, str] = {} try: raw_html = index_path.read_text(encoding="utf-8", errors="ignore") soup = BeautifulSoup(raw_html, "html.parser") # Confluence export index uses nested