feat: add llms.txt markdown parser

2025-10-24 13:21:03 +04:00
parent 60fefb6c0b
commit a18ea8cf68
2 changed files with 108 additions and 0 deletions
--- a/cli/llms_txt_parser.py
+++ b/cli/llms_txt_parser.py
@@ -0,0 +1,74 @@
 """ABOUTME: Parses llms.txt markdown content into structured page data"""
 """ABOUTME: Extracts titles, content, code samples, and headings from markdown"""
 import re
 from typing import List, Dict
 class LlmsTxtParser:
    """Parse llms.txt markdown content into page structures.

    The document is cut into pages at top-level (``# ``) headings. Lines
    inside fenced code blocks are never treated as headings, so shell or
    Python comments in code samples do not start new pages or show up in
    a page's heading list.
    """

    # A line that opens or closes a fenced code block: optional indent, three
    # or more backticks, then an info string that contains no backtick (which
    # excludes single-line spans such as ```x```).
    _FENCE_LINE = re.compile(r'^\s*`{3,}[^`]*$')
    # A complete fenced block. The info string may hold any characters except
    # a newline or backtick, so fences like ```c++ or ```python title="x" are
    # recognised as openers instead of derailing the pairing of fences.
    _CODE_BLOCK = re.compile(r'```([^\n`]*)\n(.*?)```', re.DOTALL)
    # Any backtick-delimited span, removed to obtain the plain-text view.
    _ANY_FENCED = re.compile(r'```.*?```', re.DOTALL)
    # h2/h3 headings; [ \t]+ keeps the match on a single line.
    _SUBHEADING = re.compile(r'^(#{2,3})[ \t]+(.+)$', re.MULTILINE)
    # Optional closing hashes of a heading ("# Title ##"); they must be
    # preceded by whitespace so that titles such as "C#" stay intact.
    _CLOSING_HASHES = re.compile(r'\s+#+$')

    def __init__(self, content: str):
        """Store the raw markdown text to be parsed."""
        self.content = content

    def parse(self) -> List[Dict]:
        """
        Parse markdown content into page structures.

        Text that precedes the first h1 heading becomes its own page whose
        title is the first line of that text. Blank sections are skipped.

        Returns:
            List of page dicts with title, content, code_samples, headings,
            url and links.
        """
        pages = []
        for section_lines in self._split_sections():
            if not '\n'.join(section_lines).strip():
                continue
            # First line is the title, the remainder is the page body
            title = self._clean_title(section_lines[0])
            body = '\n'.join(section_lines[1:])
            pages.append(self._parse_section(body, title))
        return pages

    def _split_sections(self) -> List[List[str]]:
        """Group the document's lines into sections that start at each h1."""
        sections = []
        current = []
        in_fence = False
        # Accept LF and CRLF line endings; missing content parses as empty.
        for line in re.split(r'\r?\n', self.content or ''):
            if self._FENCE_LINE.match(line):
                in_fence = not in_fence
            elif not in_fence and line.startswith('# '):
                sections.append(current)
                current = []
            current.append(line)
        sections.append(current)
        return sections

    def _clean_title(self, line: str) -> str:
        """Strip heading markers and surrounding whitespace from a title line."""
        title = line.strip().lstrip('#').strip()
        return self._CLOSING_HASHES.sub('', title)

    def _parse_section(self, content: str, title: str) -> Dict:
        """
        Parse a single section into page structure.

        Args:
            content: Markdown body of the section (without the title line).
            title: Title of the section.

        Returns:
            Page dict with title, content, code_samples, headings, url, links.
        """
        page = {
            'title': title,
            'content': '',
            'code_samples': [],
            'headings': [],
            'url': f'llms-txt#{title.lower().replace(" ", "-")}',
            'links': []
        }
        # Extract code blocks; the language is the first word of the info string
        for info, code in self._CODE_BLOCK.findall(content):
            info_words = info.split()
            page['code_samples'].append({
                'code': code.strip(),
                'language': info_words[0] if info_words else 'unknown'
            })
        # Remove code blocks from content for plain text
        content_no_code = self._ANY_FENCED.sub('', content)
        # Extract h2/h3 headings from the text outside code blocks only
        for level_markers, text in self._SUBHEADING.findall(content_no_code):
            heading_text = text.strip()
            page['headings'].append({
                'level': f'h{len(level_markers)}',
                'text': heading_text,
                'id': heading_text.lower().replace(' ', '-')
            })
        # Keep paragraphs longer than 20 characters
        paragraphs = [p.strip() for p in content_no_code.split('\n\n') if len(p.strip()) > 20]
        page['content'] = '\n\n'.join(paragraphs)
        return page
--- a/tests/test_llms_txt_parser.py
+++ b/tests/test_llms_txt_parser.py
@@ -0,0 +1,34 @@
 import pytest
 from cli.llms_txt_parser import LlmsTxtParser
 def test_parse_markdown_sections():
    """A two-section document yields both pages plus the fenced code sample."""
    # Fixture: two h1 sections, the first holding one javascript fence.
    markdown = """# Getting Started
 Welcome to the docs.
 ## Installation
 Run: npm install
 ## Usage
 Import the library:
 ```javascript
 import { app } from 'framework'
 ```
 # API Reference
 Main API documentation here.
 """
    parsed = LlmsTxtParser(markdown).parse()
    # Check the page count before indexing into the result.
    assert len(parsed) >= 2
    first_page, second_page = parsed[0], parsed[1]
    assert first_page['title'] == 'Getting Started'
    assert second_page['title'] == 'API Reference'
    samples = first_page['code_samples']
    assert len(samples) == 1
    assert samples[0]['language'] == 'javascript'