From 3d842753142b4afe934f1f8b1dd4556ad799127a Mon Sep 17 00:00:00 2001
From: yusyus <yusufkaraaslan.yk@pm.me>
Date: Sun, 15 Feb 2026 21:33:42 +0300
Subject: [PATCH] feat: add ReStructuredText (RST) support to documentation
 extraction
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds support for .rst and .rest files in codebase documentation extraction.

Problem:
The godot-docs repository contains 1,571 RST files but only 8 Markdown files.
Previously only Markdown files were processed, missing 99.5% of documentation.

Changes:
1. Added RST_EXTENSIONS = {".rst", ".rest"}
2. Created DOC_EXTENSIONS = MARKDOWN_EXTENSIONS | RST_EXTENSIONS
3. Implemented extract_rst_structure() function
   - Parses RST underline-style headers (===, ---, ~~~, etc.)
   - Extracts code blocks (.. code-block:: directive)
   - Extracts links (`text <url>`_ format)
   - Calculates word/line counts
4. Updated walk_markdown_files() to use DOC_EXTENSIONS
5. Updated doc processing to call appropriate parser based on extension

RST Header Syntax:
  Title          Section        Subsection
  =====          -------        ~~~~~~~~~~

Result:
✅ Now processes BOTH Markdown AND RST documentation files
✅ Godot docs: 8 MD + 1,571 RST = 1,579 files (was 8, now all 1,579!)
✅ Supports Sphinx documentation, Python docs, Godot docs, etc.

Breakdown of Godot docs by RST files (largest directories, 1,556 of 1,571):
- classes/: 1,069 RST files (API reference)
- tutorials/: 393 RST files
- engine_details/: 61 RST files
- getting_started/: 33 RST files

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
---
 src/skill_seekers/cli/codebase_scraper.py | 101 +++++++++++++++++++++-
 1 file changed, 97 insertions(+), 4 deletions(-)
diff --git a/src/skill_seekers/cli/codebase_scraper.py b/src/skill_seekers/cli/codebase_scraper.py
index 7696d59..618498b 100644
--- a/src/skill_seekers/cli/codebase_scraper.py
+++ b/src/skill_seekers/cli/codebase_scraper.py
@@ -80,8 +80,10 @@ LANGUAGE_EXTENSIONS = {
     ".php": "PHP",
 }
 
-# Markdown extension mapping
+# Documentation file extensions
 MARKDOWN_EXTENSIONS = {".md", ".markdown", ".mdown", ".mkd"}
+RST_EXTENSIONS = {".rst", ".rest"}  # ReStructuredText (Sphinx, Godot docs, etc.)
+DOC_EXTENSIONS = MARKDOWN_EXTENSIONS | RST_EXTENSIONS  # All supported doc formats
 
 # Common documentation folders to scan
 DOC_FOLDERS = {"docs", "doc", "documentation", "wiki", ".github"}
@@ -328,8 +330,8 @@ def walk_markdown_files(
                 except ValueError:
                     continue
 
-            # Check if markdown file
-            if file_path.suffix.lower() not in MARKDOWN_EXTENSIONS:
+            # Check if documentation file (markdown or RST)
+            if file_path.suffix.lower() not in DOC_EXTENSIONS:
                 continue
 
             files.append(file_path)
@@ -439,6 +441,93 @@ def extract_markdown_structure(content: str) -> dict[str, Any]:
     return structure
 
 
+def extract_rst_structure(content: str) -> dict[str, Any]:
+    """
+    Extract headers, code blocks and links from reStructuredText content.
+
+    Section titles are a text line followed by an adornment line made of
+    one repeated punctuation character (optionally mirrored by an overline):
+
+        Title          Section        Subsection
+        =====          -------        ~~~~~~~~~~
+
+    RST attaches no fixed meaning to the characters: each adornment style
+    gets the next level in the order it is first seen in the document.
+
+    Args:
+        content: RST file content
+
+    Returns:
+        Dictionary with "title" (first header text or None), "headers"
+        ({"level", "text", "line"}; "line" is 1-based), "code_blocks"
+        ({"language", "code" capped at 500 chars, "full_length"}), "links"
+        ({"text", "url"}), "word_count" and "line_count".
+    """
+    import re
+
+    lines = content.split("\n")
+    structure: dict[str, Any] = {
+        "title": None,
+        "headers": [],
+        "code_blocks": [],
+        "links": [],
+        "word_count": len(content.split()),
+        "line_count": len(lines),
+    }
+
+    # Docutils' recommended adornment characters, plus "." (accepted before).
+    adornment_chars = set("=-~^\"'`:.#*+_<>")
+    seen_styles: list[tuple[str, bool]] = []  # index + 1 == header level
+    last_underline = -1  # index of the line consumed as the previous underline
+
+    def is_adornment(line: str) -> bool:
+        return bool(line) and len(set(line)) == 1 and line[0] in adornment_chars
+
+    for i in range(len(lines) - 1):
+        text = lines[i].strip()
+        underline = lines[i + 1].strip()
+        too_short = len(underline) < len(text) - 2  # tolerate slightly short underlines
+        # A title is non-empty text that is not itself an adornment line.
+        if not text or is_adornment(text) or not is_adornment(underline) or too_short:
+            continue
+        # Same character with and without an overline are distinct styles.
+        overlined = i > 0 and i - 1 != last_underline and lines[i - 1].strip() == underline
+        style = (underline[0], overlined)
+        if style not in seen_styles:
+            seen_styles.append(style)
+        last_underline = i + 1
+        structure["headers"].append(
+            {"level": seen_styles.index(style) + 1, "text": text, "line": i + 1}
+        )
+        if structure["title"] is None:  # first header doubles as the title
+            structure["title"] = text
+
+    # Code blocks: ".. code-block:: lang" (or "sourcecode"/"code"), optional
+    # ":option:" lines and blank lines, then an indented body. The body ends at
+    # the next column-0 line, so nested directives may capture trailing text.
+    code_block_pattern = re.compile(
+        r"\.\.[ \t]+(?:code-block|sourcecode|code)::[ \t]*(\S*)[ \t]*\n"
+        r"(?:[ \t]+:[\w-]+:[^\n]*\n)*(?:[ \t]*\n)*[ \t]+(.*?)(?=\n\S|\Z)",
+        re.DOTALL,
+    )
+    for match in code_block_pattern.finditer(content):
+        code = match.group(2).strip()
+        if code:
+            structure["code_blocks"].append(
+                {
+                    "language": match.group(1) or "text",
+                    "code": code[:500],  # Truncate long code blocks
+                    "full_length": len(code),
+                }
+            )
+
+    # Links: only inline `text <url>`_ hyperlinks; :ref: roles are ignored.
+    for match in re.finditer(r"`([^<`]+)\s+<([^>]+)>`_", content):
+        structure["links"].append({"text": match.group(1).strip(), "url": match.group(2)})
+
+    return structure
+
+
 def generate_markdown_summary(
     content: str, structure: dict[str, Any], max_length: int = 500
 ) -> str:
@@ -551,7 +640,11 @@ def process_markdown_docs(
                 processed_docs.append(doc_data)
             else:
                 # Deep/Full: extract structure and summary
-                structure = extract_markdown_structure(content)
+                # Use appropriate parser based on file extension
+                if md_path.suffix.lower() in RST_EXTENSIONS:
+                    structure = extract_rst_structure(content)
+                else:
+                    structure = extract_markdown_structure(content)
                 summary = generate_markdown_summary(content, structure)
 
                 doc_data.update(