#!/usr/bin/env python3
"""
AsciiDoc Documentation to Skill Converter

Converts AsciiDoc (.adoc, .asciidoc, .asc, .ad) documentation files into
AI-ready skills. Supports both single files and directories of AsciiDoc
documents.

Parsing is done by the regex-based parser in this module, which handles
headings, code blocks, tables, admonitions, include directives, and inline
formatting. The optional ``asciidoc`` library is only probed for at import
time; this module does not call into it.

Usage:
    skill-seekers asciidoc --asciidoc-path doc.adoc --name myskill
    skill-seekers asciidoc --asciidoc-path docs/ --name myskill
    skill-seekers asciidoc --from-json doc_extracted.json
"""

import json
import logging
import os
import re
from pathlib import Path

# Optional dependency guard — asciidoc library for HTML conversion
try:
    import asciidoc as asciidoc_lib  # noqa: F401

    ASCIIDOC_AVAILABLE = True
except ImportError:
    ASCIIDOC_AVAILABLE = False

from skill_seekers.cli.skill_converter import SkillConverter

logger = logging.getLogger(__name__)

ASCIIDOC_EXTENSIONS = {".adoc", ".asciidoc", ".asc", ".ad"}
ADMONITION_TYPES = ("NOTE", "TIP", "WARNING", "IMPORTANT", "CAUTION")

# Regex patterns for AsciiDoc structure.
#
# NOTE: patterns that must stay on one line use ``[ \t]`` rather than ``\s``.
# ``\s`` also matches newlines, which made a bare ``====`` delimiter followed
# by text look like a level-4 heading, and made a valueless attribute such as
# ``:toc:`` swallow the following attribute line as its value.
RE_HEADING = re.compile(r"^(={1,5})[ \t]+(.+)$", re.MULTILINE)
RE_SOURCE_ATTR = re.compile(r"^\[source(?:,\s*(\w[\w+#.-]*))?(?:,.*?)?\]$", re.MULTILINE)
RE_LISTING_DELIM = re.compile(r"^(-{4,})$", re.MULTILINE)
RE_LITERAL_DELIM = re.compile(r"^(\.{4,})$", re.MULTILINE)
RE_TABLE_DELIM = re.compile(r"^\|={3,}$", re.MULTILINE)
RE_TABLE_CELL = re.compile(r"^\|(.+)$", re.MULTILINE)
RE_ADMONITION_PARA = re.compile(
    r"^(NOTE|TIP|WARNING|IMPORTANT|CAUTION):\s+(.+?)(?:\n\n|\Z)",
    re.MULTILINE | re.DOTALL,
)
RE_ADMONITION_BLOCK = re.compile(
    r"^\[(NOTE|TIP|WARNING|IMPORTANT|CAUTION)\]\n={4,}\n(.*?)\n={4,}",
    re.MULTILINE | re.DOTALL,
)
RE_INCLUDE = re.compile(r"^include::(.+?)\[([^\]]*)\]$", re.MULTILINE)
RE_ATTRIBUTE = re.compile(r"^:([a-zA-Z0-9_-]+):[ \t]*(.*)$", re.MULTILINE)
RE_ATTR_REF = re.compile(r"\{([a-zA-Z0-9_-]+)\}")
# Constrained bold/italic: the markers must not touch a word character on the
# outside (so ``snake_case_name`` and ``2*3*4`` are left alone), and the
# content may not contain the marker itself (so ``*a* and *b*`` is two spans,
# not one span running from the first to the last asterisk).
RE_BOLD = re.compile(r"(?<![\w*\\])\*([^\s*](?:[^*\n]*[^\s*])?)\*(?![\w*])")
RE_ITALIC = re.compile(r"(?<![\w\\])_([^\s_](?:[^_\n]*[^\s_])?)_(?!\w)")
RE_MONO = re.compile(r"`([^`]+)`")
RE_LINK = re.compile(r"(https?://\S+)\[([^\]]*)\]")
RE_XREF = re.compile(r"<<([^,>]+)(?:,\s*([^>]+))?>>")


def _check_asciidoc_deps() -> None:
    """Log debug message when asciidoc library is not installed (regex fallback used)."""
    if not ASCIIDOC_AVAILABLE:
        logger.debug(
            "asciidoc library not installed; using regex-based parser.\n"
            'Install with: pip install "skill-seekers[asciidoc]" or: pip install asciidoc'
        )


def infer_description_from_asciidoc(metadata: dict | None = None, name: str = "") -> str:
    """Infer skill description from AsciiDoc document metadata.

    Preference order: the ``description`` entry (when longer than 20
    characters, truncated to 147 characters plus ``...`` if it exceeds 150),
    then the ``title`` entry (when longer than 10 characters), then a generic
    sentence built from *name*.

    Args:
        metadata: Document metadata dict; only ``description`` and ``title``
            are read.
        name: Skill name used in the generic fallback sentence.

    Returns:
        A sentence starting with ``"Use when"``.
    """
    if metadata:
        if metadata.get("description") and len(str(metadata["description"])) > 20:
            desc = str(metadata["description"]).strip()
            return (
                f"Use when {desc[:147].lower()}..."
                if len(desc) > 150
                else f"Use when {desc.lower()}"
            )
        if metadata.get("title") and len(str(metadata["title"])) > 10:
            return f"Use when working with {str(metadata['title']).lower()}"
    return (
        f"Use when referencing {name} documentation"
        if name
        else "Use when referencing this documentation"
    )


def _score_code_quality(code: str) -> float:
    """Simple quality heuristic for code blocks (0-10 scale).

    Starts from 5.0 and adds small bonuses for length, definitions, imports,
    indented lines and punctuation typical of code; very short snippets are
    penalised. The result is clamped to ``[0.0, 10.0]``.
    """
    if not code:
        return 0.0
    score = 5.0
    line_count = len(code.strip().split("\n"))
    if line_count >= 10:
        score += 2.0
    elif line_count >= 5:
        score += 1.0
    if re.search(r"\b(def |class |function |func |fn )", code):
        score += 1.5
    if re.search(r"\b(import |from .+ import|require\(|#include|using )", code):
        score += 0.5
    if re.search(r"^ ", code, re.MULTILINE):
        score += 0.5
    if re.search(r"[=:{}()\[\]]", code):
        score += 0.3
    if len(code) < 30:
        score -= 2.0
    return min(10.0, max(0.0, score))


class AsciiDocToSkillConverter(SkillConverter):
    """Convert AsciiDoc documentation to an AI-ready skill.

    Handles single ``.adoc`` files and directories. Content is parsed into
    intermediate JSON, categorised, then rendered into the standard skill
    directory layout (SKILL.md, references/, etc.).
    """

    SOURCE_TYPE = "asciidoc"

    def __init__(self, config: dict) -> None:
        """Initialise the converter from a config dict.

        Args:
            config: Must contain ``name``. Optional keys read here:
                ``asciidoc_path``, ``description`` and ``categories``.
        """
        super().__init__(config)
        self.config = config
        self.name: str = config["name"]
        self.asciidoc_path: str = config.get("asciidoc_path", "")
        self.description: str = (
            config.get("description") or f"Use when referencing {self.name} documentation"
        )
        self.skill_dir: str = f"output/{self.name}"
        self.data_file: str = f"output/{self.name}_extracted.json"
        self.categories: dict = config.get("categories", {})
        self.extracted_data: dict | None = None

    def extract(self):
        """Extract content from AsciiDoc files (SkillConverter interface)."""
        self.extract_asciidoc()

    # ------------------------------------------------------------------
    # Extraction
    # ------------------------------------------------------------------

    def extract_asciidoc(self) -> bool:
        """Extract content from AsciiDoc file(s).

        Discovers files, resolves attributes/includes, parses sections,
        detects languages, and saves intermediate JSON to ``self.data_file``.

        Returns:
            True on success.

        Raises:
            FileNotFoundError: If path does not exist.
            ValueError: If no AsciiDoc files found.
        """
        _check_asciidoc_deps()
        from skill_seekers.cli.language_detector import LanguageDetector

        print(f"\n🔍 Extracting from AsciiDoc: {self.asciidoc_path}")
        path = Path(self.asciidoc_path)
        if not path.exists():
            raise FileNotFoundError(f"AsciiDoc path not found: {self.asciidoc_path}")

        files = self._discover_files(path)
        if not files:
            raise ValueError(
                f"No AsciiDoc files found at: {self.asciidoc_path}\n"
                f"Expected extensions: {', '.join(sorted(ASCIIDOC_EXTENSIONS))}"
            )
        print(f" Found {len(files)} AsciiDoc file(s)")

        all_sections: list[dict] = []
        metadata: dict = {}
        section_counter = 0

        for file_path in sorted(files):
            raw_text = file_path.read_text(encoding="utf-8", errors="replace")
            attributes = self._extract_attributes(raw_text)
            # Attributes are substituted first so include paths may use them.
            resolved_text = self._resolve_attributes(raw_text, attributes)
            resolved_text = self._resolve_includes(resolved_text, file_path.parent)
            # Document-level metadata comes from the first file that yields any.
            if not metadata:
                metadata = self._build_metadata(attributes, file_path)

            for section in self._parse_asciidoc_sections(resolved_text):
                section_counter += 1
                section["section_number"] = section_counter
                section["source_file"] = str(file_path)
                body = section.pop("body", "")
                section["code_samples"] = self._extract_code_blocks(body)
                section["tables"] = self._extract_tables(body)
                section["admonitions"] = self._extract_admonitions(body)
                section["includes"] = self._extract_includes(body)
                section["text"] = self._convert_to_markdown(body)
                all_sections.append(section)

        # Language detection: count declared languages, and try to detect one
        # for blocks that did not declare any.
        detector = LanguageDetector(min_confidence=0.15)
        languages_detected: dict[str, int] = {}
        total_code_blocks = 0
        for section in all_sections:
            for sample in section.get("code_samples", []):
                total_code_blocks += 1
                declared = sample.get("language")
                if declared:
                    languages_detected[declared] = languages_detected.get(declared, 0) + 1
                elif sample.get("code"):
                    lang, conf = detector.detect_from_code(sample["code"])
                    if lang and conf >= 0.3:
                        sample["language"] = lang
                        languages_detected[lang] = languages_detected.get(lang, 0) + 1

        if not self.config.get("description"):
            self.description = infer_description_from_asciidoc(metadata, self.name)

        result_data = {
            "source_path": self.asciidoc_path,
            "metadata": metadata,
            "total_sections": len(all_sections),
            "total_files": len(files),
            "total_code_blocks": total_code_blocks,
            "total_tables": sum(len(s.get("tables", [])) for s in all_sections),
            "total_admonitions": sum(len(s.get("admonitions", [])) for s in all_sections),
            "languages_detected": languages_detected,
            "pages": all_sections,
        }

        os.makedirs(os.path.dirname(self.data_file) or ".", exist_ok=True)
        with open(self.data_file, "w", encoding="utf-8") as f:
            json.dump(result_data, f, indent=2, ensure_ascii=False, default=str)

        print(f"\n💾 Saved extracted data to: {self.data_file}")
        self.extracted_data = result_data
        print(
            f"✅ Extracted {len(all_sections)} sections, {total_code_blocks} code blocks, "
            f"{result_data['total_tables']} tables, {result_data['total_admonitions']} admonitions"
        )
        return True

    def _discover_files(self, path: Path) -> list[Path]:
        """Return sorted list of AsciiDoc files from *path* (file or directory).

        The extension check is case-insensitive for both the single-file and
        the directory case, and only regular files are returned (a directory
        that happens to be named ``something.adoc`` is skipped).
        """
        if path.is_file():
            return [path] if path.suffix.lower() in ASCIIDOC_EXTENSIONS else []
        return sorted(
            {
                candidate
                for candidate in path.rglob("*")
                if candidate.suffix.lower() in ASCIIDOC_EXTENSIONS and candidate.is_file()
            }
        )

    # ------------------------------------------------------------------
    # Attribute / include resolution
    # ------------------------------------------------------------------

    @staticmethod
    def _extract_attributes(text: str) -> dict[str, str]:
        """Extract ``:attr-name: value`` definitions from text.

        When a name is defined more than once, the last definition wins.
        """
        return {m.group(1): m.group(2).strip() for m in RE_ATTRIBUTE.finditer(text)}

    @staticmethod
    def _resolve_attributes(text: str, attributes: dict[str, str]) -> str:
        """Replace ``{attr-name}`` references with their values.

        References to unknown attributes are left untouched.
        """
        return RE_ATTR_REF.sub(lambda m: attributes.get(m.group(1), m.group(0)), text)

    def _resolve_includes(self, text: str, base_dir: Path) -> str:
        """Resolve ``include::`` directives by inlining referenced files.

        Includes are followed recursively up to five levels deep. A nested
        include is looked up relative to the directory of the file that
        contains the directive first, then relative to *base_dir* (the
        previous behaviour) as a fallback. Directives that cannot be resolved
        are replaced by a ``// include::path[] (not resolved)`` comment line.

        Args:
            text: AsciiDoc source of the top-level document.
            base_dir: Directory of the top-level document.

        Returns:
            The text with resolvable includes inlined.
        """
        max_depth = 5

        def _locate(inc_path: str, current_dir: Path) -> Path | None:
            # Try the including file's directory, then the top-level directory.
            for root in (current_dir, base_dir):
                candidate = root / inc_path
                if candidate.is_file():
                    return candidate
            return None

        def _resolve_once(src: str, current_dir: Path, depth: int) -> str:
            if depth >= max_depth:
                return src

            def _replacer(match: re.Match) -> str:
                inc_path = match.group(1).strip()
                inc_file = _locate(inc_path, current_dir)
                if inc_file is not None:
                    try:
                        content = inc_file.read_text(encoding="utf-8", errors="replace")
                    except OSError:
                        logger.debug("Could not read include file: %s", inc_file)
                    else:
                        return _resolve_once(content, inc_file.parent, depth + 1)
                return f"// include::{inc_path}[] (not resolved)"

            return RE_INCLUDE.sub(_replacer, src)

        return _resolve_once(text, base_dir, 0)

    @staticmethod
    def _build_metadata(attributes: dict[str, str], file_path: Path) -> dict:
        """Build metadata dict from document attributes.

        Missing attributes become empty strings; the title falls back to the
        file stem.
        """
        return {
            "title": attributes.get("doctitle", attributes.get("title", file_path.stem)),
            "author": attributes.get("author", ""),
            "email": attributes.get("email", ""),
            "revision": attributes.get("revnumber", attributes.get("version", "")),
            "date": attributes.get("revdate", attributes.get("date", "")),
            "description": attributes.get("description", ""),
            "keywords": attributes.get("keywords", ""),
            "source_file": str(file_path),
        }

    # ------------------------------------------------------------------
    # Section parsing
    # ------------------------------------------------------------------

    def _parse_asciidoc_sections(self, text: str) -> list[dict]:
        """Parse AsciiDoc text into sections split by headings (= through =====).

        Returns:
            A list of dicts with keys ``heading``, ``heading_level``
            (``"h1"``..``"h5"``), ``body`` and ``headings``. Text before the
            first heading becomes a section with an empty heading. A document
            without headings yields exactly one such section.
        """
        heading_matches = [
            (m.start(), len(m.group(1)), m.group(2).strip(), m.group(0))
            for m in RE_HEADING.finditer(text)
        ]
        if not heading_matches:
            return [{"heading": "", "heading_level": "h1", "body": text.strip(), "headings": []}]

        sections: list[dict] = []
        preamble = text[: heading_matches[0][0]].strip()
        if preamble:
            sections.append(
                {"heading": "", "heading_level": "h1", "body": preamble, "headings": []}
            )

        for idx, (start, level, heading_text, raw) in enumerate(heading_matches):
            body_start = start + len(raw)
            body_end = (
                heading_matches[idx + 1][0] if idx + 1 < len(heading_matches) else len(text)
            )
            body = text[body_start:body_end].strip()
            sub_headings = [
                {"level": f"h{len(m.group(1))}", "text": m.group(2).strip()}
                for m in RE_HEADING.finditer(body)
                if len(m.group(1)) > level
            ]
            sections.append(
                {
                    "heading": heading_text,
                    "heading_level": f"h{level}",
                    "body": body,
                    "headings": sub_headings,
                }
            )
        return sections

    # ------------------------------------------------------------------
    # Code block extraction
    # ------------------------------------------------------------------

    @staticmethod
    def _read_delimited_block(text: str, open_m: re.Match) -> tuple[str, int] | None:
        """Return ``(content, end_offset)`` for the block opened at *open_m*.

        The closing delimiter must be a line identical to the opening one.
        ``end_offset`` is the offset just past the closing delimiter. Returns
        None when no closing delimiter exists.
        """
        closing = re.compile(r"^" + re.escape(open_m.group(1)) + r"$", re.MULTILINE)
        # Start after the newline that ends the opening delimiter line.
        close_m = closing.search(text, open_m.end() + 1)
        if not close_m:
            return None
        return text[open_m.end() : close_m.start()].strip("\n"), close_m.end()

    def _extract_code_blocks(self, text: str) -> list[dict]:
        """Extract source/listing/literal code blocks from AsciiDoc text.

        Handles [source,lang] + ---- blocks, bare ---- blocks, and .... blocks.

        Every matched open/close delimiter pair is recorded as consumed, even
        when the block is empty. Otherwise the closing delimiter of an empty
        block would be taken for an opener and the prose up to the next block
        would be extracted as code.

        Returns:
            A list of dicts with keys ``code``, ``language`` (empty string
            when not declared) and ``quality_score``.
        """
        blocks: list[dict] = []
        consumed: list[tuple[int, int]] = []

        def _record(code: str, language: str, start: int, end: int) -> None:
            if code:
                blocks.append(
                    {
                        "code": code,
                        "language": language,
                        "quality_score": _score_code_quality(code),
                    }
                )
            consumed.append((start, end))

        # Pattern 1: [source,lang] + ---- block
        for attr_m in RE_SOURCE_ATTR.finditer(text):
            lang = (attr_m.group(1) or "").strip()
            open_m = RE_LISTING_DELIM.search(text, attr_m.end())
            if not open_m:
                continue
            # Only a block title (".Title") or a single line may sit between
            # the attribute line and the delimiter.
            between = text[attr_m.end() : open_m.start()].strip()
            if between and not between.startswith(".") and "\n" in between:
                continue
            found = self._read_delimited_block(text, open_m)
            if found is None:
                continue
            code, end = found
            _record(code, lang, attr_m.start(), end)

        # Pattern 2: bare ---- listing blocks, then Pattern 3: .... literal blocks
        for delim_re in (RE_LISTING_DELIM, RE_LITERAL_DELIM):
            for m in delim_re.finditer(text):
                if self._in_range(m.start(), consumed):
                    continue
                found = self._read_delimited_block(text, m)
                if found is None:
                    continue
                code, end = found
                _record(code, "", m.start(), end)

        return blocks

    # ------------------------------------------------------------------
    # Table extraction
    # ------------------------------------------------------------------

    def _extract_tables(self, text: str) -> list[dict]:
        """Parse AsciiDoc tables delimited by ``|===``.

        Delimiters are paired in order of appearance (1st with 2nd, 3rd with
        4th, ...); an unpaired trailing delimiter is ignored.
        """
        tables: list[dict] = []
        delimiters = list(RE_TABLE_DELIM.finditer(text))
        for opener, closer in zip(delimiters[0::2], delimiters[1::2]):
            body = text[opener.end() : closer.start()].strip()
            if not body:
                continue
            table = self._parse_table_body(body)
            if table:
                tables.append(table)
        return tables

    @staticmethod
    def _parse_table_body(table_body: str) -> dict | None:
        """Parse body of an AsciiDoc table into headers and rows.

        The first blank-line-separated group is the header, the remaining
        groups are rows. When there is only one group, its first line is the
        header and the other lines are rows.

        Returns:
            ``{"headers": [...], "rows": [[...], ...]}`` or None when nothing
            could be parsed.
        """
        groups = re.split(r"\n\s*\n", table_body.strip())
        if not groups:
            return None

        def _parse_row(row_text: str) -> list[str]:
            # Text before the first pipe is not a cell. Empty interior cells
            # are kept so later cells stay in their columns; trailing empty
            # cells are dropped (a short row renders the same in Markdown).
            cells = [cell.strip() for cell in row_text.split("|")[1:]]
            while cells and not cells[-1]:
                cells.pop()
            return cells

        def _cell_lines(group: str) -> list[list[str]]:
            parsed_lines = (
                _parse_row(line)
                for line in group.strip().splitlines()
                if line.strip().startswith("|")
            )
            return [cells for cells in parsed_lines if cells]

        # First group → headers (continuation lines are merged column-wise)
        headers: list[str] = []
        for parsed in _cell_lines(groups[0]):
            if not headers:
                headers = parsed
                continue
            for i, cell in enumerate(parsed):
                if i < len(headers):
                    headers[i] = f"{headers[i]} {cell}".strip()
                else:
                    headers.append(cell)

        # Remaining groups → rows
        rows: list[list[str]] = [cells for group in groups[1:] for cells in _cell_lines(group)]

        # Single group fallback: first parsed line = header, rest = rows
        if len(groups) == 1 and not rows:
            all_parsed = _cell_lines(groups[0])
            if len(all_parsed) > 1:
                headers, rows = all_parsed[0], all_parsed[1:]
            elif all_parsed:
                headers = all_parsed[0]

        return {"headers": headers, "rows": rows} if headers or rows else None

    @staticmethod
    def _format_table(table: dict, max_rows: int | None = None) -> str:
        """Render a parsed table dict as Markdown table lines.

        Args:
            table: Dict with optional ``headers`` and ``rows`` entries.
            max_rows: When given, only the first *max_rows* rows are rendered.

        Returns:
            The table lines, each terminated by a newline (no trailing blank
            line).
        """
        lines: list[str] = []
        headers = table.get("headers", [])
        if headers:
            lines.append("| " + " | ".join(str(h) for h in headers) + " |\n")
            lines.append("| " + " | ".join("---" for _ in headers) + " |\n")
        rows = table.get("rows", [])
        if max_rows is not None:
            rows = rows[:max_rows]
        for row in rows:
            lines.append("| " + " | ".join(str(c) for c in row) + " |\n")
        return "".join(lines)

    # ------------------------------------------------------------------
    # Admonition extraction
    # ------------------------------------------------------------------

    def _extract_admonitions(self, text: str) -> list[dict]:
        """Extract NOTE/TIP/WARNING/IMPORTANT/CAUTION admonitions.

        Block-style admonitions are collected before paragraph-style ones;
        admonitions whose text was already seen are skipped.
        """
        admonitions: list[dict] = []
        seen: set[str] = set()
        for pattern in (RE_ADMONITION_BLOCK, RE_ADMONITION_PARA):
            for m in pattern.finditer(text):
                adm_type, adm_text = m.group(1), m.group(2).strip()
                if adm_text and adm_text not in seen:
                    admonitions.append({"type": adm_type, "text": adm_text})
                    seen.add(adm_text)
        return admonitions

    # ------------------------------------------------------------------
    # Include directive extraction
    # ------------------------------------------------------------------

    @staticmethod
    def _extract_includes(text: str) -> list[dict]:
        """Detect remaining ``include::`` directives in text."""
        return [
            {"path": m.group(1).strip(), "options": m.group(2).strip()}
            for m in RE_INCLUDE.finditer(text)
        ]

    # ------------------------------------------------------------------
    # AsciiDoc → Markdown conversion
    # ------------------------------------------------------------------

    def _convert_to_markdown(self, text: str) -> str:
        """Convert AsciiDoc inline formatting to Markdown equivalents.

        Block delimiters, attribute lines and admonition markers are removed;
        headings, bold/italic, links, cross references, list markers and block
        titles are rewritten. The content of code blocks stays in the text.
        """
        result = text

        # Remove processed block delimiters and attribute lines
        for pat in (
            RE_LISTING_DELIM,
            RE_LITERAL_DELIM,
            RE_TABLE_DELIM,
            RE_SOURCE_ATTR,
            RE_ATTRIBUTE,
        ):
            result = pat.sub("", result)

        # Remove admonition block markers and delimiters
        result = re.sub(
            r"^\[(NOTE|TIP|WARNING|IMPORTANT|CAUTION)\]\s*$", "", result, flags=re.MULTILINE
        )
        result = re.sub(r"^={4,}$", "", result, flags=re.MULTILINE)

        # Headings: = Title → # Title
        result = RE_HEADING.sub(lambda m: f"{'#' * len(m.group(1))} {m.group(2).strip()}", result)

        # Inline formatting
        result = RE_BOLD.sub(r"**\1**", result)
        result = RE_ITALIC.sub(r"*\1*", result)
        result = RE_LINK.sub(r"[\2](\1)", result)
        result = RE_XREF.sub(lambda m: f"*{m.group(2) or m.group(1)}*", result)

        # Lists: * item → - item, . item → 1. item
        # Nested items are indented by the width of the parent marker ("- " is
        # two columns, "1. " is three) so Markdown treats them as children.
        # ``[ \t]+`` keeps a marker-only line from swallowing the next line.
        result = re.sub(
            r"^(\*{1,5})[ \t]+",
            lambda m: "  " * (len(m.group(1)) - 1) + "- ",
            result,
            flags=re.MULTILINE,
        )
        result = re.sub(
            r"^(\.{1,5})[ \t]+",
            lambda m: "   " * (len(m.group(1)) - 1) + "1. ",
            result,
            flags=re.MULTILINE,
        )

        # Block titles: .Title → **Title**
        # The title is restricted to a single line; ``\s`` in the character
        # class would let it run across following lines.
        result = re.sub(r"^\.([A-Z][\w \t]+)$", r"**\1**", result, flags=re.MULTILINE)

        # Include comments
        result = re.sub(
            r"^//\s*include::(.+?)\[\].*$", r"*(included: \1)*", result, flags=re.MULTILINE
        )

        # Remove leftover table cell markers
        result = re.sub(r"^\|[ \t]*", "", result, flags=re.MULTILINE)

        # Collapse blank lines
        result = re.sub(r"\n{3,}", "\n\n", result)
        return result.strip()

    # ------------------------------------------------------------------
    # Load / categorize / build
    # ------------------------------------------------------------------

    def load_extracted_data(self, json_path: str) -> bool:
        """Load previously extracted data from JSON file.

        Returns:
            True once the data has been loaded into ``self.extracted_data``.
        """
        print(f"\n📂 Loading extracted data from: {json_path}")
        with open(json_path, encoding="utf-8") as f:
            self.extracted_data = json.load(f)
        total = self.extracted_data.get("total_sections", len(self.extracted_data.get("pages", [])))
        print(f"✅ Loaded {total} sections")
        return True

    def categorize_content(self) -> dict:
        """Categorize sections by source file, headings, or keywords.

        Strategy, in order:

        1. ``asciidoc_path`` is a file: one category holding every section.
        2. ``asciidoc_path`` is a directory: one category per source file.
        3. ``self.categories`` is set: either pre-assigned pages (values are
           lists of dicts) or keyword scoring against section text/heading,
           with unmatched sections going to ``other``.
        4. Otherwise a single ``content`` category.

        Returns:
            Mapping of category key to ``{"title": str, "pages": list}``.
        """
        print("\n📋 Categorizing content...")
        categorized: dict[str, dict] = {}
        sections = self.extracted_data.get("pages", [])
        path = Path(self.asciidoc_path) if self.asciidoc_path else None

        if path and path.is_file():
            key = self._sanitize_filename(path.stem)
            categorized[key] = {"title": path.stem, "pages": sections}
            print(f"✅ Created 1 category (single file): {path.stem}: {len(sections)} sections")
            return categorized

        if path and path.is_dir():
            for s in sections:
                src_stem = Path(s.get("source_file", "unknown")).stem
                key = self._sanitize_filename(src_stem)
                categorized.setdefault(key, {"title": src_stem, "pages": []})["pages"].append(s)
            if categorized:
                print(f"✅ Created {len(categorized)} categories (by source file)")
                for cat in categorized.values():
                    print(f" - {cat['title']}: {len(cat['pages'])} sections")
                return categorized

        if self.categories:
            first_val = next(iter(self.categories.values()), None)
            if isinstance(first_val, list) and first_val and isinstance(first_val[0], dict):
                # Categories already carry their pages.
                for k, pages in self.categories.items():
                    categorized[k] = {"title": k.replace("_", " ").title(), "pages": pages}
            else:
                # Categories map to keyword lists: score each section.
                for k in self.categories:
                    categorized[k] = {"title": k.replace("_", " ").title(), "pages": []}
                for s in sections:
                    txt = s.get("text", "").lower()
                    htxt = s.get("heading", "").lower()
                    scores = {
                        k: sum(
                            1
                            for kw in kws
                            if isinstance(kw, str) and (kw.lower() in txt or kw.lower() in htxt)
                        )
                        for k, kws in self.categories.items()
                        if isinstance(kws, list)
                    }
                    scores = {k: v for k, v in scores.items() if v > 0}
                    if scores:
                        categorized[max(scores, key=scores.get)]["pages"].append(s)
                    else:
                        categorized.setdefault("other", {"title": "Other", "pages": []})[
                            "pages"
                        ].append(s)
        else:
            categorized["content"] = {"title": "Content", "pages": sections}

        print(f"✅ Created {len(categorized)} categories")
        for cat in categorized.values():
            print(f" - {cat['title']}: {len(cat['pages'])} sections")
        return categorized

    def build_skill(self) -> None:
        """Build complete skill directory structure.

        Creates ``references/``, ``scripts/`` and ``assets/`` under
        ``self.skill_dir``, then writes one reference file per category, the
        reference index and SKILL.md.
        """
        print(f"\n🏗️ Building skill: {self.name}")
        for subdir in ("references", "scripts", "assets"):
            os.makedirs(f"{self.skill_dir}/{subdir}", exist_ok=True)

        categorized = self.categorize_content()
        print("\n📝 Generating reference files...")
        total_cats = len(categorized)
        for i, (cat_key, cat_data) in enumerate(categorized.items(), 1):
            self._generate_reference_file(cat_key, cat_data, i, total_cats)
        self._generate_index(categorized)
        self._generate_skill_md(categorized)
        print(f"\n✅ Skill built successfully: {self.skill_dir}/")
        print(f"\n📦 Next step: Package with: skill-seekers package {self.skill_dir}/")

    # ------------------------------------------------------------------
    # Private generation methods
    # ------------------------------------------------------------------

    def _ref_filename(self, cat_data: dict, section_num: int, total: int) -> str:
        """Compute reference file path for a category.

        Args:
            cat_data: Category dict; only ``pages`` is read.
            section_num: 1-based position of the category, used when the
                category has no pages.
            total: Total number of categories.

        Returns:
            Path of the reference file below ``{skill_dir}/references/``.
        """
        sections = cat_data["pages"]
        adoc_base = ""
        if self.asciidoc_path:
            p = Path(self.asciidoc_path)
            adoc_base = p.stem if p.is_file() else ""

        if sections:
            nums = [s.get("section_number", i + 1) for i, s in enumerate(sections)]
            if total == 1:
                return f"{self.skill_dir}/references/{adoc_base or 'main'}.md"
            base = adoc_base or "section"
            return f"{self.skill_dir}/references/{base}_s{min(nums)}-s{max(nums)}.md"
        return f"{self.skill_dir}/references/section_{section_num:02d}.md"

    def _generate_reference_file(
        self, _cat_key: str, cat_data: dict, section_num: int, total: int
    ) -> None:
        """Generate a reference Markdown file for one category."""
        filename = self._ref_filename(cat_data, section_num, total)
        with open(filename, "w", encoding="utf-8") as f:
            f.write(f"# {cat_data['title']}\n\n")
            for section in cat_data["pages"]:
                sec_num = section.get("section_number", "?")
                heading = section.get("heading", "")
                hl = section.get("heading_level", "h1")

                f.write(f"---\n\n**📄 Source: Section {sec_num}**\n\n")
                # Headings are demoted one level because "#" is the file title.
                if heading:
                    f.write(f"{'#' * (int(hl[1]) + 1)} {heading}\n\n")
                for sub in section.get("headings", []):
                    sl = sub.get("level", "h3")
                    if sub.get("text"):
                        f.write(f"{'#' * (int(sl[1]) + 1)} {sub['text']}\n\n")

                if section.get("text"):
                    f.write(f"{section['text']}\n\n")

                if section.get("code_samples"):
                    f.write("### Code Examples\n\n")
                    for c in section["code_samples"]:
                        f.write(f"```{c.get('language', '')}\n{c['code']}\n```\n\n")

                if section.get("tables"):
                    f.write("### Tables\n\n")
                    for t in section["tables"]:
                        f.write(self._format_table(t))
                        f.write("\n")

                if section.get("admonitions"):
                    f.write("### Notes & Warnings\n\n")
                    for a in section["admonitions"]:
                        f.write(f"> **{a.get('type', 'NOTE')}:** {a.get('text', '')}\n\n")

                f.write("---\n\n")
        print(f" Generated: {filename}")

    def _generate_index(self, categorized: dict) -> None:
        """Generate references/index.md.

        Link targets are derived from ``_ref_filename`` so they always match
        the files written by ``_generate_reference_file``.
        """
        filename = f"{self.skill_dir}/references/index.md"
        total = len(categorized)
        with open(filename, "w", encoding="utf-8") as f:
            f.write(f"# {self.name.title()} Documentation Reference\n\n## Categories\n\n")
            for i, cd in enumerate(categorized.values(), 1):
                pages = cd["pages"]
                if pages:
                    nums = [s.get("section_number", j + 1) for j, s in enumerate(pages)]
                    rng = f"Sections {min(nums)}-{max(nums)}"
                else:
                    rng = "N/A"
                lf = Path(self._ref_filename(cd, i, total)).name
                f.write(f"- [{cd['title']}]({lf}) ({len(pages)} sections, {rng})\n")

            f.write("\n## Statistics\n\n")
            for key, label in [
                ("total_sections", "Total sections"),
                ("total_code_blocks", "Code blocks"),
                ("total_tables", "Tables"),
                ("total_admonitions", "Admonitions"),
                ("total_files", "Source files"),
            ]:
                f.write(f"- {label}: {self.extracted_data.get(key, 0)}\n")

            meta = self.extracted_data.get("metadata", {})
            if meta.get("author"):
                f.write(f"- Author: {meta['author']}\n")
            if meta.get("date"):
                f.write(f"- Date: {meta['date']}\n")
        print(f" Generated: {filename}")

    def _generate_skill_md(self, categorized: dict) -> None:
        """Generate main SKILL.md file with rich summary content.

        Writes front matter, document information, a section overview, key
        concepts, a quick reference, the top code examples, table and
        admonition summaries, statistics and navigation links.
        """
        filename = f"{self.skill_dir}/SKILL.md"
        skill_name = self.name.lower().replace("_", "-").replace(" ", "-")[:64]
        desc = self.description[:1024]
        ed = self.extracted_data  # shorthand

        with open(filename, "w", encoding="utf-8") as f:
            f.write(f"---\nname: {skill_name}\ndescription: {desc}\n---\n\n")
            f.write(f"# {self.name.title()} Documentation Skill\n\n{self.description}\n\n")

            # Document metadata
            meta = ed.get("metadata", {})
            if any(v for v in meta.values() if v):
                f.write("## 📋 Document Information\n\n")
                for key, label in [
                    ("title", "Title"),
                    ("author", "Author"),
                    ("revision", "Revision"),
                    ("date", "Date"),
                    ("description", "Description"),
                ]:
                    if meta.get(key):
                        f.write(f"**{label}:** {meta[key]}\n\n")

            f.write("## 💡 When to Use This Skill\n\nUse this skill when you need to:\n")
            f.write(f"- Understand {self.name} concepts and fundamentals\n")
            f.write("- Look up API references and technical specifications\n")
            f.write("- Find code examples and implementation patterns\n")
            f.write("- Review tutorials, guides, and best practices\n")
            f.write("- Explore the complete documentation structure\n\n")

            # Section Overview
            f.write(
                f"## 📖 Section Overview\n\n**Total Sections:** {ed.get('total_sections', 0)}\n\n"
            )
            f.write("**Content Breakdown:**\n\n")
            for cd in categorized.values():
                f.write(f"- **{cd['title']}**: {len(cd['pages'])} sections\n")
            f.write("\n")

            f.write(self._format_key_concepts())
            f.write("## ⚡ Quick Reference\n\n")
            f.write(self._format_patterns_from_content())

            # Code examples (top 15 grouped by language)
            all_code = [c for s in ed.get("pages", []) for c in s.get("code_samples", [])]
            all_code.sort(key=lambda x: x.get("quality_score", 0), reverse=True)
            top_code = all_code[:15]
            if top_code:
                f.write("## 📝 Code Examples\n\n*High-quality examples from documentation*\n\n")
                by_lang: dict[str, list] = {}
                for c in top_code:
                    # "language" is stored as "" when undetected, so a .get()
                    # default alone would never produce the "unknown" group.
                    by_lang.setdefault(c.get("language") or "unknown", []).append(c)
                for lang in sorted(by_lang):
                    exs = by_lang[lang]
                    fence_lang = "" if lang == "unknown" else lang
                    f.write(f"### {lang.title()} Examples ({len(exs)})\n\n")
                    for i, c in enumerate(exs[:5], 1):
                        ct = c.get("code", "")
                        f.write(
                            f"**Example {i}** (Quality: {c.get('quality_score', 0):.1f}/10):\n\n"
                        )
                        f.write(
                            f"```{fence_lang}\n{ct[:500]}{'...' if len(ct) > 500 else ''}\n```\n\n"
                        )

            # Table summary
            all_tables = [
                (s.get("heading", ""), t) for s in ed.get("pages", []) for t in s.get("tables", [])
            ]
            if all_tables:
                f.write(f"## 📊 Table Summary\n\n*{len(all_tables)} table(s) found*\n\n")
                for sh, t in all_tables[:5]:
                    if sh:
                        f.write(f"**From section: {sh}**\n\n")
                    f.write(self._format_table(t, max_rows=5))
                    f.write("\n")

            # Admonition summary
            all_adm = [a for s in ed.get("pages", []) for a in s.get("admonitions", [])]
            if all_adm:
                f.write("## ⚠️ Admonition Summary\n\n")
                by_type: dict[str, list[str]] = {}
                for a in all_adm:
                    by_type.setdefault(a.get("type", "NOTE"), []).append(a.get("text", ""))
                for at in sorted(by_type):
                    items = by_type[at]
                    f.write(f"**{at}** ({len(items)}):\n\n")
                    for txt in items[:5]:
                        f.write(f"> {txt[:120]}{'...' if len(txt) > 120 else ''}\n\n")

            # Statistics
            f.write("## 📊 Documentation Statistics\n\n")
            for key, label in [
                ("total_sections", "Total Sections"),
                ("total_code_blocks", "Code Blocks"),
                ("total_tables", "Tables"),
                ("total_admonitions", "Admonitions"),
                ("total_files", "Source Files"),
            ]:
                f.write(f"- **{label}**: {ed.get(key, 0)}\n")
            langs = ed.get("languages_detected", {})
            if langs:
                f.write(f"- **Programming Languages**: {len(langs)}\n\n**Language Breakdown:**\n\n")
                for lang, count in sorted(langs.items(), key=lambda x: x[1], reverse=True):
                    f.write(f"- {lang}: {count} examples\n")
            f.write("\n")

            # Navigation: list the files that were actually written, using the
            # same naming function as the reference-file generator.
            f.write("## 🗺️ Navigation\n\n**Reference Files:**\n\n")
            total = len(categorized)
            for i, cd in enumerate(categorized.values(), 1):
                ref_name = Path(self._ref_filename(cd, i, total)).name
                f.write(f"- `references/{ref_name}` - {cd['title']}\n")
            f.write("\nSee `references/index.md` for complete documentation structure.\n\n")
            f.write("---\n\n**Generated by Skill Seeker** | AsciiDoc Scraper\n")

        with open(filename, encoding="utf-8") as f:
            print(f" Generated: {filename} ({len(f.read().splitlines())} lines)")

    # ------------------------------------------------------------------
    # Content analysis helpers
    # ------------------------------------------------------------------

    def _format_key_concepts(self) -> str:
        """Extract key concepts from headings across all sections.

        Returns:
            A Markdown block listing up to 10 ``h1`` headings and up to 15
            ``h2`` headings, or an empty string when no heading is longer than
            three characters.
        """
        all_h: list[tuple[str, str]] = []
        for s in self.extracted_data.get("pages", []):
            h = s.get("heading", "").strip()
            if h and len(h) > 3:
                all_h.append((s.get("heading_level", "h1"), h))
            for sub in s.get("headings", []):
                t = sub.get("text", "").strip()
                if t and len(t) > 3:
                    all_h.append((sub.get("level", "h3"), t))
        if not all_h:
            return ""

        content = "## 🔑 Key Concepts\n\n*Main topics covered in this documentation*\n\n"
        h1s = [t for lv, t in all_h if lv == "h1"]
        h2s = [t for lv, t in all_h if lv == "h2"]
        if h1s:
            content += "**Major Topics:**\n\n" + "".join(f"- {h}\n" for h in h1s[:10]) + "\n"
        if h2s:
            content += "**Subtopics:**\n\n" + "".join(f"- {h}\n" for h in h2s[:15]) + "\n"
        return content

    def _format_patterns_from_content(self) -> str:
        """Extract common documentation patterns from section headings.

        Each section is assigned to the first keyword found in its heading;
        up to three sections are listed per keyword.
        """
        keywords = [
            "getting started",
            "installation",
            "configuration",
            "usage",
            "api",
            "examples",
            "tutorial",
            "guide",
            "best practices",
            "troubleshooting",
            "faq",
        ]
        patterns: list[dict] = []
        for s in self.extracted_data.get("pages", []):
            ht = s.get("heading", "").lower()
            for kw in keywords:
                if kw in ht:
                    patterns.append(
                        {
                            "type": kw.title(),
                            "heading": s.get("heading", ""),
                            "section": s.get("section_number", 0),
                        }
                    )
                    break
        if not patterns:
            return "*See reference files for detailed content*\n\n"

        by_type: dict[str, list] = {}
        for p in patterns:
            by_type.setdefault(p["type"], []).append(p)

        content = "*Common documentation patterns found:*\n\n"
        for pt in sorted(by_type):
            items = by_type[pt]
            content += f"**{pt}** ({len(items)} sections):\n"
            content += "".join(f"- {it['heading']} (section {it['section']})\n" for it in items[:3])
            content += "\n"
        return content

    # ------------------------------------------------------------------
    # Utilities
    # ------------------------------------------------------------------

    @staticmethod
    def _sanitize_filename(name: str) -> str:
        """Convert name to a safe filename slug."""
        safe = re.sub(r"[^\w\s-]", "", name.lower())
        return re.sub(r"[-\s]+", "_", safe)

    @staticmethod
    def _in_range(pos: int, ranges: list[tuple[int, int]]) -> bool:
        """Check whether pos falls within any consumed range."""
        return any(s <= pos < e for s, e in ranges)