# Reconstructed from patch a18ea8cf68d397e3c0d301ad13fbf55bb012b19b
# ("feat: add llms.txt markdown parser"), which adds cli/llms_txt_parser.py
# and tests/test_llms_txt_parser.py. The parser comes first; the test from the
# patch follows at the bottom of this module.
"""Parse llms.txt markdown content into structured page data.

ABOUTME: Parses llms.txt markdown content into structured page data
ABOUTME: Extracts titles, content, code samples, and headings from markdown
"""

import re
from typing import Any, Dict, List

# A *closed* fenced code block: an opening run of 3+ backticks at the start of
# a line (optionally indented), an info string, the body, and a closing run of
# at least as many backticks alone on its own line. Unclosed fences never
# match, so a stray ``` cannot swallow the rest of the document.
_CODE_BLOCK_RE = re.compile(
    r'^[ \t]*(`{3,})([^\n`]*)\n(.*?)^[ \t]*\1`*[ \t]*$',
    re.DOTALL | re.MULTILINE,
)
# A level-1 heading: "# " at column 0, the rest of the line is the title.
_H1_RE = re.compile(r'^# (.*)$', re.MULTILINE)
# Level-2 / level-3 headings ("## x", "### x").
_SUBHEADING_RE = re.compile(r'^(#{2,3})[ \t]+(.+)$', re.MULTILINE)
# Optional ATX closing sequence, e.g. the trailing " ##" in "## Title ##".
_CLOSING_HASHES_RE = re.compile(r'[ \t]+#+[ \t]*$')
# One or more blank (or whitespace-only) lines separating paragraphs.
_PARAGRAPH_BREAK_RE = re.compile(r'\n\s*\n')


class LlmsTxtParser:
    """Parse llms.txt markdown content into page structures.

    The document is cut into pages at level-1 headings. Each page is a dict
    with the keys ``title``, ``content``, ``code_samples``, ``headings``,
    ``url`` and ``links``.
    """

    def __init__(self, content: str, min_paragraph_length: int = 20):
        """Store the markdown to parse.

        Args:
            content: Raw markdown text. ``None`` is treated as empty text.
            min_paragraph_length: A paragraph is kept in a page's ``content``
                only when it is strictly longer than this many characters.
        """
        self.content = content
        self.min_paragraph_length = min_paragraph_length

    def parse(self) -> List[Dict[str, Any]]:
        """Parse markdown content into page structures.

        Lines beginning with ``# `` start a new page unless they sit inside a
        fenced code block (where they are usually code comments). Non-blank
        text before the first level-1 heading becomes its own page whose
        title is its first non-blank line.

        Returns:
            List of page dicts with title, content, code_samples, headings,
            url and links. Empty input yields an empty list.
        """
        # Normalise line endings so CRLF input splits into paragraphs and
        # headings the same way LF input does.
        text = (self.content or '').replace('\r\n', '\n').replace('\r', '\n')

        # Blank out code blocks with same-length filler: "# ..." lines inside
        # them stop looking like headings while every offset still lines up
        # with `text`.
        masked = _CODE_BLOCK_RE.sub(lambda m: ' ' * len(m.group(0)), text)
        h1_matches = list(_H1_RE.finditer(masked))
        section_ends = [m.start() for m in h1_matches[1:]] + [len(text)]

        pages = []

        preamble = text[:h1_matches[0].start()] if h1_matches else text
        if preamble.strip():
            first_line, _, rest = preamble.strip().partition('\n')
            pages.append(
                self._parse_section(rest, self._clean_title(first_line))
            )

        for match, end in zip(h1_matches, section_ends):
            title = self._clean_title(match.group(1))
            body = text[match.end():end]
            if not title and not body.strip():
                # A bare "# " line with nothing under it carries no page.
                continue
            pages.append(self._parse_section(body, title))

        return pages

    @staticmethod
    def _clean_title(raw: str) -> str:
        """Return heading text without markers or a closing ``#`` sequence.

        Only a ``#`` run preceded by whitespace counts as a closing sequence,
        so titles such as ``C#`` keep their trailing character.
        """
        title = raw.strip().lstrip('#')
        return _CLOSING_HASHES_RE.sub('', title).strip()

    def _parse_section(self, content: str, title: str) -> Dict[str, Any]:
        """Parse a single section into a page structure.

        Args:
            content: Markdown body of the section (without its title line).
            title: Already-cleaned section title.

        Returns:
            Page dict. ``code_samples`` holds ``{'code', 'language'}`` dicts,
            ``headings`` holds ``{'level', 'text', 'id'}`` dicts for h2/h3
            headings, and ``content`` is the prose with code blocks removed.
        """
        page = {
            'title': title,
            'content': '',
            'code_samples': [],
            'headings': [],
            'url': f'llms-txt#{title.lower().replace(" ", "-")}',
            'links': []
        }

        for block in _CODE_BLOCK_RE.finditer(content):
            # The language is the first word of the info string, so
            # "c++" and "python title=x" are both handled.
            info_words = block.group(2).split()
            page['code_samples'].append({
                'code': block.group(3).strip(),
                'language': info_words[0] if info_words else 'unknown'
            })

        # Everything below works on prose only, so "## ..." lines inside code
        # are not mistaken for headings.
        prose = _CODE_BLOCK_RE.sub('', content)

        for level_markers, raw_text in _SUBHEADING_RE.findall(prose):
            text = _CLOSING_HASHES_RE.sub('', raw_text).strip()
            page['headings'].append({
                'level': f'h{len(level_markers)}',
                'text': text,
                'id': text.lower().replace(' ', '-')
            })

        paragraphs = [
            stripped
            for stripped in (p.strip() for p in _PARAGRAPH_BREAK_RE.split(prose))
            if len(stripped) > self.min_paragraph_length
        ]
        page['content'] = '\n\n'.join(paragraphs)

        return page


# --- tests/test_llms_txt_parser.py (from the same patch) ---

def test_parse_markdown_sections():
    """Test parsing markdown into page sections"""
    sample_content = """# Getting Started

Welcome to the docs.

## Installation

Run: npm install

## Usage

Import the library:

```javascript
import { app } from 'framework'
```

# API Reference

Main API documentation here.
"""

    parser = LlmsTxtParser(sample_content)
    pages = parser.parse()

    assert len(pages) >= 2
    assert pages[0]['title'] == 'Getting Started'
    assert pages[1]['title'] == 'API Reference'
    assert len(pages[0]['code_samples']) == 1
    assert pages[0]['code_samples'][0]['language'] == 'javascript'