Files
skill-seekers-reference/cli/llms_txt_parser.py
2025-10-24 18:26:10 +04:00

75 lines
2.3 KiB
Python

"""ABOUTME: Parses llms.txt markdown content into structured page data"""
"""ABOUTME: Extracts titles, content, code samples, and headings from markdown"""
import re
from typing import List, Dict
class LlmsTxtParser:
"""Parse llms.txt markdown content into page structures"""
def __init__(self, content: str):
self.content = content
def parse(self) -> List[Dict]:
"""
Parse markdown content into page structures.
Returns:
List of page dicts with title, content, code_samples, headings
"""
pages = []
# Split by h1 headers (# Title)
sections = re.split(r'\n# ', self.content)
for section in sections:
if not section.strip():
continue
# First line is title
lines = section.split('\n')
title = lines[0].strip('#').strip()
# Parse content
page = self._parse_section('\n'.join(lines[1:]), title)
pages.append(page)
return pages
def _parse_section(self, content: str, title: str) -> Dict:
"""Parse a single section into page structure"""
page = {
'title': title,
'content': '',
'code_samples': [],
'headings': [],
'url': f'llms-txt#{title.lower().replace(" ", "-")}',
'links': []
}
# Extract code blocks
code_blocks = re.findall(r'```(\w+)?\n(.*?)```', content, re.DOTALL)
for lang, code in code_blocks:
page['code_samples'].append({
'code': code.strip(),
'language': lang or 'unknown'
})
# Extract h2/h3 headings
headings = re.findall(r'^(#{2,3})\s+(.+)$', content, re.MULTILINE)
for level_markers, text in headings:
page['headings'].append({
'level': f'h{len(level_markers)}',
'text': text.strip(),
'id': text.lower().replace(' ', '-')
})
# Remove code blocks from content for plain text
content_no_code = re.sub(r'```.*?```', '', content, flags=re.DOTALL)
# Extract paragraphs
paragraphs = [p.strip() for p in content_no_code.split('\n\n') if len(p.strip()) > 20]
page['content'] = '\n\n'.join(paragraphs)
return page