From a18ea8cf68d397e3c0d301ad13fbf55bb012b19b Mon Sep 17 00:00:00 2001
From: "Edgar I." <eibrahimov@gmail.com>
Date: Fri, 24 Oct 2025 13:21:03 +0400
Subject: [PATCH] feat: add llms.txt markdown parser

---
 cli/llms_txt_parser.py        | 74 +++++++++++++++++++++++++++++++++++
 tests/test_llms_txt_parser.py | 34 ++++++++++++++++
 2 files changed, 108 insertions(+)
 create mode 100644 cli/llms_txt_parser.py
 create mode 100644 tests/test_llms_txt_parser.py

diff --git a/cli/llms_txt_parser.py b/cli/llms_txt_parser.py
new file mode 100644
index 0000000..e288c92
--- /dev/null
+++ b/cli/llms_txt_parser.py
@@ -0,0 +1,74 @@
+"""ABOUTME: Parses llms.txt markdown content into structured page data"""
+"""ABOUTME: Extracts titles, content, code samples, and headings from markdown"""
+
+import re
+from typing import List, Dict
+
+class LlmsTxtParser:
+    """Parse llms.txt markdown content into page structures.
+
+    Pages are delimited by H1 lines ("# Title"). Fenced code blocks are
+    tracked so that "#" lines inside a sample never start a new page or
+    get reported as headings.
+    """
+
+    # Fenced block: opening fence + optional info string, body, closing fence.
+    _FENCE = re.compile(r'^[ \t]*`{3,}([^\n`]*)\n(.*?)^[ \t]*`{3,}[ \t]*$',
+                        re.DOTALL | re.MULTILINE)
+    # h2/h3 heading; [ \t]+ (not \s+) keeps the match on a single line.
+    _HEADING = re.compile(r'^(#{2,3})[ \t]+(.+)$', re.MULTILINE)
+
+    def __init__(self, content: str):
+        self.content = content
+
+    def parse(self) -> List[Dict]:
+        """
+        Parse markdown content into page structures.
+
+        Text before the first H1 becomes a page titled by its first line.
+
+        Returns:
+            List of page dicts: title, content, code_samples, headings, url, links
+        """
+        # Normalise Windows line endings so paragraph splitting works.
+        text = self.content.replace('\r\n', '\n')
+
+        # Group lines into sections, opening a new one at every H1 that is
+        # outside a fenced code block.
+        sections, in_fence = [[]], False
+        for line in text.split('\n'):
+            if line.lstrip().startswith('```'):
+                in_fence = not in_fence
+            elif line.startswith('# ') and not in_fence:
+                sections.append([])
+            sections[-1].append(line)
+
+        pages = []
+        for lines in sections:
+            if not ''.join(lines).strip():
+                continue
+            # First line is the title; the remainder is the page body.
+            title = lines[0].strip('#').strip()
+            pages.append(self._parse_section('\n'.join(lines[1:]), title))
+        return pages
+
+    def _parse_section(self, content: str, title: str) -> Dict:
+        """Parse one section body (the text after its H1 line) into a page dict."""
+        # Remove code first so headings and paragraphs only see prose.
+        prose = self._FENCE.sub('', content)
+        return {
+            'title': title,
+            # Paragraphs of 20 characters or fewer are dropped.
+            'content': '\n\n'.join(
+                p.strip() for p in prose.split('\n\n') if len(p.strip()) > 20),
+            'code_samples': [
+                # Language is the first word of the fence info string.
+                {'code': code.strip(), 'language': (info.split() or ['unknown'])[0]}
+                for info, code in self._FENCE.findall(content)],
+            'headings': [
+                {'level': f'h{len(marks)}', 'text': text.strip(),
+                 'id': text.strip().lower().replace(' ', '-')}
+                for marks, text in self._HEADING.findall(prose)],
+            'url': f'llms-txt#{title.lower().replace(" ", "-")}',
+            'links': []
+        }
diff --git a/tests/test_llms_txt_parser.py b/tests/test_llms_txt_parser.py
new file mode 100644
index 0000000..8e8c7fa
--- /dev/null
+++ b/tests/test_llms_txt_parser.py
@@ -0,0 +1,34 @@
+import pytest
+from cli.llms_txt_parser import LlmsTxtParser
+
+def test_parse_markdown_sections():
+    """Pages split on H1 lines; code samples and h2 headings land on their page."""
+    sample_content = """# Getting Started
+
+Welcome to the docs.
+
+## Installation
+
+Run: npm install
+
+## Usage
+
+Import the library:
+
+```javascript
+import { app } from 'framework'
+```
+
+# API Reference
+
+Main API documentation here.
+"""
+
+    pages = LlmsTxtParser(sample_content).parse()
+
+    # Exactly two H1 sections, in document order (no spurious extra pages).
+    assert [page['title'] for page in pages] == ['Getting Started', 'API Reference']
+    assert pages[0]['code_samples'] == [
+        {'code': "import { app } from 'framework'", 'language': 'javascript'}]
+    assert [h['text'] for h in pages[0]['headings']] == ['Installation', 'Usage']
+    assert pages[1]['code_samples'] == [] and pages[1]['headings'] == []