feat: add ReStructuredText (RST) support to documentation extraction

Adds support for .rst and .rest files in codebase documentation extraction.

Problem:
The godot-docs repository contains 1,571 RST files but only 8 Markdown files. Previously only Markdown files were processed, so 99.5% of the documentation files were skipped.

Changes:
1. Added RST_EXTENSIONS = {".rst", ".rest"}
2. Created DOC_EXTENSIONS = MARKDOWN_EXTENSIONS | RST_EXTENSIONS
3. Implemented extract_rst_structure() function
   - Parses RST underline-style headers (===, ---, ~~~, etc.)
   - Extracts code blocks (.. code-block:: directive)
   - Extracts links (`text <url>`_ format)
   - Calculates word/line counts
4. Updated walk_markdown_files() to use DOC_EXTENSIONS
5. Updated doc processing (process_markdown_docs()) to call the appropriate parser based on file extension

RST Header Syntax:
    Title
    =====

    Section
    -------

    Subsection
    ~~~~~~~~~~

Result:
✅ Now processes BOTH Markdown AND RST documentation files
✅ Godot docs: 8 MD + 1,571 RST = 1,579 files (was 8, now all 1,579!)
✅ Supports Sphinx documentation, Python docs, Godot docs, etc.

Breakdown of Godot docs by RST files:
- classes/: 1,069 RST files (API reference)
- tutorials/: 393 RST files
- engine_details/: 61 RST files
- getting_started/: 33 RST files

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-02-15 21:33:42 +03:00
parent 140b571536
commit 3d84275314
1 changed files with 97 additions and 4 deletions
--- a/src/skill_seekers/cli/codebase_scraper.py
+++ b/src/skill_seekers/cli/codebase_scraper.py
@@ -80,8 +80,10 @@ LANGUAGE_EXTENSIONS = {
    ".php": "PHP",
 }

-# Markdown extension mapping
+# Documentation file extensions
 MARKDOWN_EXTENSIONS = {".md", ".markdown", ".mdown", ".mkd"}
+RST_EXTENSIONS = {".rst", ".rest"}  # ReStructuredText (Sphinx, Godot docs, etc.)
+DOC_EXTENSIONS = MARKDOWN_EXTENSIONS | RST_EXTENSIONS  # All supported doc formats

 # Common documentation folders to scan
 DOC_FOLDERS = {"docs", "doc", "documentation", "wiki", ".github"}
@@ -328,8 +330,8 @@ def walk_markdown_files(
                except ValueError:
                    continue

-            # Check if markdown file
-            if file_path.suffix.lower() not in MARKDOWN_EXTENSIONS:
+            # Check if documentation file (markdown or RST)
+            if file_path.suffix.lower() not in DOC_EXTENSIONS:
                continue

            files.append(file_path)
@@ -439,6 +441,93 @@ def extract_markdown_structure(content: str) -> dict[str, Any]:
    return structure


+def extract_rst_structure(content: str) -> dict[str, Any]:
+    """
+    Extract structure from ReStructuredText (RST) content.
+
+    RST uses underline-style headers:
+        Title
+        =====
+
+        Section
+        -------
+
+    Header levels are not tied to a character: as in the RST spec, each
+    underline character gets the next level the first time it appears.
+
+    Args:
+        content: RST file content
+
+    Returns:
+        Dictionary with "title" (first header text or None), "headers"
+        ("level", "text", 1-based "line"), "code_blocks" ("language", "code"
+        truncated to 500 chars, "full_length"), "links" ("text", "url"),
+        "word_count" and "line_count".
+    """
+    import re
+
+    lines = content.split("\n")
+    structure = {
+        "title": None,
+        "headers": [],
+        "code_blocks": [],
+        "links": [],
+        "word_count": len(content.split()),
+        "line_count": len(lines),
+    }
+
+    # Characters accepted as header underlines: those the RST spec recommends
+    # for titles, plus ".".
+    adornment_chars = set("=-~^\"'`:.#*+_<>")
+    seen_adornments: list[str] = []  # position + 1 is that character's level
+
+    # Extract headers (RST style: text on one line, underline on next)
+    for i in range(len(lines) - 1):
+        text = lines[i].strip()
+        underline = lines[i + 1].strip()
+        if (
+            not text
+            or not underline
+            or len(set(underline)) != 1  # Underline repeats a single character
+            or underline[0] not in adornment_chars
+            or len(underline) < len(text) - 2  # Underline roughly matches length
+            or (len(set(text)) == 1 and text[0] in adornment_chars)  # Overline/border
+        ):
+            continue
+
+        if underline[0] not in seen_adornments:
+            seen_adornments.append(underline[0])
+        level = seen_adornments.index(underline[0]) + 1
+        structure["headers"].append({"level": level, "text": text, "line": i + 1})
+
+        # First header is typically the title
+        if structure["title"] is None:
+            structure["title"] = text
+
+    # Extract ".. code-block::" directives. "[ \t]" instead of "\s" keeps the
+    # directive on one line, so a block without a language no longer reports its
+    # first code line as the language; "\S*" also accepts names such as "c++".
+    code_block_pattern = re.compile(
+        r"\.\.[ \t]+code-block::[ \t]*(\S*)[ \t\r]*\n\s+(.*?)(?=\n\S|\Z)", re.DOTALL
+    )
+    for match in code_block_pattern.finditer(content):
+        language = match.group(1) or "text"
+        code = match.group(2).strip()
+        if code:
+            structure["code_blocks"].append(
+                {"language": language, "code": code[:500], "full_length": len(code)}
+            )
+
+    # Extract links (RST uses `text <url>`_; :ref:`label` roles are not collected)
+    link_pattern = re.compile(r"`([^<`]+)\s+<([^>]+)>`_")
+    for match in link_pattern.finditer(content):
+        structure["links"].append(
+            {"text": match.group(1).strip(), "url": match.group(2)}
+        )
+
+    return structure
+
+
 def generate_markdown_summary(
    content: str, structure: dict[str, Any], max_length: int = 500
 ) -> str:
@@ -551,7 +640,11 @@ def process_markdown_docs(
                processed_docs.append(doc_data)
            else:
                # Deep/Full: extract structure and summary
-                structure = extract_markdown_structure(content)
+                # Use appropriate parser based on file extension
+                if md_path.suffix.lower() in RST_EXTENSIONS:
+                    structure = extract_rst_structure(content)
+                else:
+                    structure = extract_markdown_structure(content)
                summary = generate_markdown_summary(content, structure)

                doc_data.update(