feat: add llms.txt markdown parser
This commit is contained in:
74
cli/llms_txt_parser.py
Normal file
74
cli/llms_txt_parser.py
Normal file
@@ -0,0 +1,74 @@
|
|||||||
|
"""ABOUTME: Parses llms.txt markdown content into structured page data"""
|
||||||
|
"""ABOUTME: Extracts titles, content, code samples, and headings from markdown"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
from typing import List, Dict
|
||||||
|
|
||||||
|
class LlmsTxtParser:
    """Parse llms.txt markdown content into page structures.

    The document is cut into one page per top-level (``# ``) heading.  For
    every page the parser collects fenced code samples, h2/h3 headings and
    the remaining prose paragraphs.  Text inside triple-backtick fences is
    treated as code only: a ``# comment`` or ``## comment`` line inside a
    fence neither starts a new page nor counts as a heading.
    """

    # One left-to-right scan that yields either a complete fenced span or an
    # H1 marker at the start of a line.  Because a fenced span is consumed as
    # a whole, "# " lines inside it are never reported as page boundaries.
    _FENCE_OR_H1 = re.compile(r'```.*?```|^# ', re.DOTALL | re.MULTILINE)
    # A fenced span; group 1 is everything between the two fences.
    _FENCED_BLOCK = re.compile(r'```(.*?)```', re.DOTALL)
    # h2/h3 heading on a single line ([ \t] so the match cannot spill onto
    # the following line when the heading itself is empty).
    _SUBHEADING = re.compile(r'^(#{2,3})[ \t]+(.+)$', re.MULTILINE)
    # Optional closing sequence of an ATX heading, e.g. "# Title ##".
    _CLOSING_HASHES = re.compile(r'\s+#+$')
    # Paragraphs must be strictly longer than this to be kept as content.
    _MIN_PARAGRAPH_LEN = 20

    def __init__(self, content: str):
        """Store the raw markdown text to be parsed.

        Args:
            content: Full markdown text of the llms.txt document.
        """
        self.content = content

    def parse(self) -> List[Dict]:
        """
        Parse markdown content into page structures.

        Windows line endings are normalised first so that paragraph and
        heading detection behave the same for CRLF and LF input.  Text that
        precedes the first H1 (if any) forms its own page whose title is its
        first line.

        Returns:
            List of page dicts with title, content, code_samples, headings,
            url and links.  Empty or whitespace-only input yields ``[]``.
        """
        text = self.content.replace('\r\n', '\n')
        pages = []

        for section in self._split_sections(text):
            if not section.strip():
                continue

            # First line is the title, everything after it is the body.
            title_line, _, body = section.partition('\n')
            pages.append(self._parse_section(body, self._clean_title(title_line)))

        return pages

    def _split_sections(self, text: str) -> List[str]:
        """Cut *text* at every H1 line that lies outside a fenced code span."""
        cuts = [
            match.start()
            for match in self._FENCE_OR_H1.finditer(text)
            if match.group().startswith('#')
        ]
        bounds = [0] + cuts + [len(text)]
        return [text[start:end] for start, end in zip(bounds, bounds[1:])]

    def _clean_title(self, line: str) -> str:
        """Return the heading text of *line* without its ATX markers.

        Only the leading ``#`` markers and a whitespace-separated closing
        run of ``#`` are removed, so a title such as ``Intro to C#`` keeps
        its trailing ``#``.
        """
        text = line.lstrip('#').strip()
        return self._CLOSING_HASHES.sub('', text)

    def _parse_section(self, content: str, title: str) -> Dict:
        """Parse a single section into page structure.

        Args:
            content: Body of the section (everything after the title line).
            title: Already-cleaned page title.

        Returns:
            Dict with the keys ``title``, ``content``, ``code_samples``,
            ``headings``, ``url`` and ``links`` (``links`` is always an
            empty list here).
        """
        page = {
            'title': title,
            'content': '',
            'code_samples': [],
            'headings': [],
            'url': f'llms-txt#{title.lower().replace(" ", "-")}',
            'links': []
        }

        # Extract code blocks.  The first line after the opening fence is the
        # info string; its first word is the language ("c++", "python", ...).
        for block in self._FENCED_BLOCK.finditer(content):
            info, newline, code = block.group(1).partition('\n')
            if not newline:
                # Inline ```span``` without a body line: not a code sample.
                continue
            info_words = info.split()
            page['code_samples'].append({
                'code': code.strip(),
                'language': info_words[0] if info_words else 'unknown'
            })

        # Remove code blocks so that neither headings nor paragraphs are
        # picked up from inside code samples.
        content_no_code = self._FENCED_BLOCK.sub('', content)

        # Extract h2/h3 headings
        for level_markers, raw_text in self._SUBHEADING.findall(content_no_code):
            text = raw_text.strip()
            if not text:
                continue
            page['headings'].append({
                'level': f'h{len(level_markers)}',
                'text': text,
                # Built from the stripped text so trailing whitespace does
                # not leak into the slug.
                'id': text.lower().replace(' ', '-')
            })

        # Extract paragraphs (short fragments are dropped)
        paragraphs = [
            stripped
            for stripped in (chunk.strip() for chunk in content_no_code.split('\n\n'))
            if len(stripped) > self._MIN_PARAGRAPH_LEN
        ]
        page['content'] = '\n\n'.join(paragraphs)

        return page
|
||||||
34
tests/test_llms_txt_parser.py
Normal file
34
tests/test_llms_txt_parser.py
Normal file
@@ -0,0 +1,34 @@
|
|||||||
|
import pytest
|
||||||
|
from cli.llms_txt_parser import LlmsTxtParser
|
||||||
|
|
||||||
|
def test_parse_markdown_sections():
    """Two H1 sections become two pages; the fenced sample lands on the first."""
    markdown = """# Getting Started

Welcome to the docs.

## Installation

Run: npm install

## Usage

Import the library:

```javascript
import { app } from 'framework'
```

# API Reference

Main API documentation here.
"""

    pages = LlmsTxtParser(markdown).parse()

    # At least the two top-level sections must be present, in document order.
    assert len(pages) >= 2
    getting_started, api_reference = pages[0], pages[1]
    assert getting_started['title'] == 'Getting Started'
    assert api_reference['title'] == 'API Reference'

    # The single fenced block belongs to the first page and keeps its language.
    samples = getting_started['code_samples']
    assert len(samples) == 1
    assert samples[0]['language'] == 'javascript'
|
||||||
Reference in New Issue
Block a user