feat: add ReStructuredText (RST) support to documentation extraction
Adds support for .rst and .rest files in codebase documentation extraction.
Problem:
The godot-docs repository contains 1,571 RST files but only 8 Markdown files.
Previously only Markdown files were processed, missing 99.5% of documentation.
Changes:
1. Added RST_EXTENSIONS = {".rst", ".rest"}
2. Created DOC_EXTENSIONS = MARKDOWN_EXTENSIONS | RST_EXTENSIONS
3. Implemented extract_rst_structure() function
- Parses RST underline-style headers (===, ---, ~~~, etc.)
- Extracts code blocks (.. code-block:: directive)
- Extracts links (`text <url>`_ format)
- Calculates word/line counts
4. Updated walk_markdown_files() to use DOC_EXTENSIONS
5. Updated doc processing to call appropriate parser based on extension
RST Header Syntax:
Title Section Subsection
===== ------- ~~~~~~~~~~
Result:
✅ Now processes BOTH Markdown AND RST documentation files
✅ Godot docs: 8 MD + 1,571 RST = 1,579 files (was 8, now all 1,579!)
✅ Supports Sphinx documentation, Python docs, Godot docs, etc.
Breakdown of Godot docs RST files by directory:
- classes/: 1,069 RST files (API reference)
- tutorials/: 393 RST files
- engine_details/: 61 RST files
- getting_started/: 33 RST files
Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -80,8 +80,10 @@ LANGUAGE_EXTENSIONS = {
|
||||
".php": "PHP",
|
||||
}
|
||||
|
||||
# Markdown extension mapping
|
||||
# Documentation file extensions
|
||||
MARKDOWN_EXTENSIONS = {".md", ".markdown", ".mdown", ".mkd"}
|
||||
RST_EXTENSIONS = {".rst", ".rest"} # ReStructuredText (Sphinx, Godot docs, etc.)
|
||||
DOC_EXTENSIONS = MARKDOWN_EXTENSIONS | RST_EXTENSIONS # All supported doc formats
|
||||
|
||||
# Common documentation folders to scan
|
||||
DOC_FOLDERS = {"docs", "doc", "documentation", "wiki", ".github"}
|
||||
@@ -328,8 +330,8 @@ def walk_markdown_files(
|
||||
except ValueError:
|
||||
continue
|
||||
|
||||
# Check if markdown file
|
||||
if file_path.suffix.lower() not in MARKDOWN_EXTENSIONS:
|
||||
# Check if documentation file (markdown or RST)
|
||||
if file_path.suffix.lower() not in DOC_EXTENSIONS:
|
||||
continue
|
||||
|
||||
files.append(file_path)
|
||||
@@ -439,6 +441,93 @@ def extract_markdown_structure(content: str) -> dict[str, Any]:
|
||||
return structure
|
||||
|
||||
|
||||
def extract_rst_structure(content: str) -> dict[str, Any]:
    """
    Extract structure from ReStructuredText (RST) content.

    RST uses underline-style headers:
        Title
        =====

        Section
        -------

        Subsection
        ~~~~~~~~~~

    Header levels are assigned from a fixed character-to-level table
    ("=" is level 1, "-" is level 2, "~" is level 3, ...). This is a
    heuristic: it does not track the order in which adornment styles
    first appear in the document.

    Args:
        content: RST file content

    Returns:
        Dictionary with extracted structure:
            title: text of the first detected header, or None
            headers: list of {"level", "text", "line"} (line is 1-based)
            code_blocks: list of {"language", "code", "full_length"};
                "code" is truncated to 500 characters
            links: list of {"text", "url"} for `text <url>`_ references
            word_count: number of whitespace-separated tokens
            line_count: number of "\\n"-separated lines
    """
    import re

    # Split once and reuse for the line count, headers and code blocks.
    lines = content.split("\n")

    structure: dict[str, Any] = {
        "title": None,
        "headers": [],
        "code_blocks": _extract_rst_code_blocks(lines),
        "links": [],
        "word_count": len(content.split()),
        "line_count": len(lines),
    }

    # RST header underline characters (ordered by common usage for levels)
    # Level 1: ===, Level 2: ---, Level 3: ~~~, Level 4: ^^^, etc.
    underline_chars = ["=", "-", "~", "^", '"', "'", "`", ":", "."]
    level_by_char = {char: level for level, char in enumerate(underline_chars, start=1)}

    # Extract headers (RST style: text on one line, underline on next)
    for line_no, (raw_title, raw_underline) in enumerate(zip(lines, lines[1:]), start=1):
        title = raw_title.strip()
        underline = raw_underline.strip()
        if not title or not underline:
            continue

        marker = underline[0]
        if marker not in level_by_char:
            continue
        # The underline must be one adornment character repeated.
        if underline != marker * len(underline):
            continue
        # Underline roughly matches the title length (2 characters of slack).
        if len(underline) < len(title) - 2:
            continue

        structure["headers"].append(
            {"level": level_by_char[marker], "text": title, "line": line_no}
        )

        # First header is typically the title
        if structure["title"] is None:
            structure["title"] = title

    # Extract links (RST uses `text <url>`_; :ref:`label` roles are not extracted)
    link_pattern = re.compile(r"`([^<`]+)\s+<([^>]+)>`_")
    structure["links"] = [
        {"text": match.group(1).strip(), "url": match.group(2)}
        for match in link_pattern.finditer(content)
    ]

    return structure


def _extract_rst_code_blocks(lines: list[str]) -> list[dict[str, Any]]:
    """
    Collect ``.. code-block::`` directives from RST lines.

    The directive body is every following line that is blank or indented
    deeper than the directive itself. Directive options such as
    ``:linenos:`` placed directly under the directive are skipped, and
    the body is dedented so the code keeps its own relative indentation.

    Args:
        lines: RST content split on "\\n"

    Returns:
        List of {"language", "code", "full_length"} dictionaries, in
        document order. "language" is "text" when the directive names no
        language; "code" is truncated to 500 characters while
        "full_length" is the untruncated length.
    """
    import re
    import textwrap

    # Group 1: directive indentation. Group 2: optional language, which may
    # contain non-word characters (e.g. "c++").
    directive_pattern = re.compile(r"^(\s*)\.\.\s+code-block::\s*(\S*)\s*$")
    # Directive option line, e.g. "   :linenos:" or "   :caption: Example".
    option_pattern = re.compile(r"^\s+:[\w-]+:(?:\s.*)?$")

    def indent_of(line: str) -> int:
        return len(line) - len(line.lstrip())

    blocks: list[dict[str, Any]] = []
    total = len(lines)
    index = 0
    while index < total:
        directive = directive_pattern.match(lines[index])
        index += 1
        if directive is None:
            continue

        directive_indent = len(directive.group(1))
        language = directive.group(2) or "text"

        # Options sit immediately under the directive, before the blank line
        # that separates them from the code; they are not part of the code.
        while (
            index < total
            and option_pattern.match(lines[index])
            and indent_of(lines[index]) > directive_indent
        ):
            index += 1

        # The body ends at the first non-blank line that is not indented
        # deeper than the directive. That line is left unconsumed so the
        # outer loop can inspect it (it may be another directive).
        body: list[str] = []
        while index < total:
            line = lines[index]
            if line.strip() and indent_of(line) <= directive_indent:
                break
            body.append(line)
            index += 1

        code = textwrap.dedent("\n".join(body)).strip()
        if code:
            blocks.append(
                {
                    "language": language,
                    "code": code[:500],  # Truncate long code blocks
                    "full_length": len(code),
                }
            )

    return blocks
|
||||
|
||||
|
||||
def generate_markdown_summary(
|
||||
content: str, structure: dict[str, Any], max_length: int = 500
|
||||
) -> str:
|
||||
@@ -551,7 +640,11 @@ def process_markdown_docs(
|
||||
processed_docs.append(doc_data)
|
||||
else:
|
||||
# Deep/Full: extract structure and summary
|
||||
structure = extract_markdown_structure(content)
|
||||
# Use appropriate parser based on file extension
|
||||
if md_path.suffix.lower() in RST_EXTENSIONS:
|
||||
structure = extract_rst_structure(content)
|
||||
else:
|
||||
structure = extract_markdown_structure(content)
|
||||
summary = generate_markdown_summary(content, structure)
|
||||
|
||||
doc_data.update(
|
||||
|
||||
Reference in New Issue
Block a user