# skill-seekers-reference/cli/llms_txt_parser.py

"""ABOUTME: Parses llms.txt markdown content into structured page data"""
"""ABOUTME: Extracts titles, content, code samples, and headings from markdown"""

import re
from typing import List, Dict

class LlmsTxtParser:
    """Parse llms.txt markdown content into page structures.

    The document is cut into one page per top-level (``# ``) heading. Each
    page records its title, its prose content, its fenced code samples and
    its h2/h3 headings. Text inside fenced code blocks is never interpreted
    as markdown structure (page boundaries or headings).
    """

    # A paragraph is kept in 'content' only if its stripped length exceeds this.
    _SHORT_PARAGRAPH_LIMIT = 20

    # Any fenced block, opening fence through closing fence (non-greedy).
    _FENCED_BLOCK_RE = re.compile(r'```.*?```', re.DOTALL)
    # Fenced block with its info string (rest of the opening-fence line) and body.
    _CODE_SAMPLE_RE = re.compile(r'```([^\n`]*)\n(.*?)```', re.DOTALL)
    # A newline followed by "# " marks the start of a new page.
    _H1_BOUNDARY_RE = re.compile(r'\n# ')
    # h2/h3 heading on a single line; the text must start with a non-space
    # character so the match can never run onto the following line.
    _SUBHEADING_RE = re.compile(r'^(#{2,3})[ \t]+(\S.*)$', re.MULTILINE)
    # Optional closing run of '#' in a heading; only counts after whitespace.
    _CLOSING_HASHES_RE = re.compile(r'\s+#+$')

    def __init__(self, content: str):
        """Store the raw markdown text to be parsed.

        Args:
            content: Full text of the llms.txt document.
        """
        self.content = content

    def parse(self) -> List[Dict]:
        """Parse the markdown content into page structures.

        Sections that contain only whitespace are skipped. Text that appears
        before the first ``# `` heading becomes a page of its own, titled
        with its first line.

        Returns:
            List of page dicts with the keys 'title', 'content',
            'code_samples', 'headings', 'url' and 'links', in document order.
        """
        pages = []
        for section in self._split_sections():
            if not section.strip():
                continue
            # The first line of a section is its title; the rest is the body.
            heading_line, _, body = section.partition('\n')
            pages.append(self._parse_section(body, self._clean_title(heading_line)))
        return pages

    def _split_sections(self) -> List[str]:
        """Split the document at h1 boundaries that lie outside code fences.

        A line such as ``# comment`` inside a fenced code block looks like an
        h1 heading, so boundaries that fall within a fenced block are ignored.
        An opening fence with no closing fence is not treated as a block.
        """
        text = self.content
        fenced_spans = [m.span() for m in self._FENCED_BLOCK_RE.finditer(text)]

        sections = []
        section_start = 0
        span_index = 0
        for boundary in self._H1_BOUNDARY_RE.finditer(text):
            pos = boundary.start()
            # Both sequences are ordered, so one forward-moving cursor suffices.
            while span_index < len(fenced_spans) and fenced_spans[span_index][1] <= pos:
                span_index += 1
            if span_index < len(fenced_spans) and fenced_spans[span_index][0] <= pos:
                continue  # boundary is inside a fenced code block
            sections.append(text[section_start:pos])
            section_start = boundary.end()
        sections.append(text[section_start:])
        return sections

    @classmethod
    def _clean_title(cls, heading_line: str) -> str:
        """Return the title text of a heading line without its '#' markers.

        Leading '#' characters are removed, as is a closing run of '#' when
        it is separated from the text by whitespace (``# Title ##``). A '#'
        attached to the text, as in ``C#``, is kept.
        """
        title = heading_line.lstrip('#').strip()
        return cls._CLOSING_HASHES_RE.sub('', title)

    def _parse_section(self, content: str, title: str) -> Dict:
        """Parse a single section into a page structure.

        Args:
            content: Section body, i.e. everything after the title line.
            title: Already-cleaned section title.

        Returns:
            Dict with 'title', 'content' (paragraphs longer than 20 characters
            with code blocks removed, joined by blank lines), 'code_samples'
            (dicts with 'code' and 'language'), 'headings' (dicts with
            'level', 'text' and 'id'), 'url' and an empty 'links' list.
        """
        code_samples = []
        for info, code in self._CODE_SAMPLE_RE.findall(content):
            # The language is the first word of the info string, so fences
            # such as "c++" or 'python title="x"' are recognised too.
            info_words = info.split()
            code_samples.append({
                'code': code.strip(),
                'language': info_words[0] if info_words else 'unknown',
            })

        # Drop code blocks first so that headings and paragraphs are taken
        # from prose only.
        prose = self._FENCED_BLOCK_RE.sub('', content)

        headings = []
        for level_markers, raw_text in self._SUBHEADING_RE.findall(prose):
            text = raw_text.strip()
            headings.append({
                'level': f'h{len(level_markers)}',
                'text': text,
                'id': text.lower().replace(' ', '-'),
            })

        stripped_chunks = (chunk.strip() for chunk in prose.split('\n\n'))
        paragraphs = [p for p in stripped_chunks if len(p) > self._SHORT_PARAGRAPH_LIMIT]

        return {
            'title': title,
            'content': '\n\n'.join(paragraphs),
            'code_samples': code_samples,
            'headings': headings,
            'url': f'llms-txt#{title.lower().replace(" ", "-")}',
            'links': [],
        }