skill-seekers-reference/src/skill_seekers/cli/llms_txt_parser.py

"""ABOUTME: Parses llms.txt markdown content into structured page data"""
"""ABOUTME: Extracts titles, content, code samples, and headings from markdown"""

import re
from typing import List, Dict
from urllib.parse import urljoin

class LlmsTxtParser:
    """Parse llms.txt markdown content into page structures.

    The parser offers two independent views of the same markdown text:

    * ``extract_urls()`` collects the documentation URLs referenced by the
      file (markdown links and bare ``http(s)`` URLs).
    * ``parse()`` splits the text into one page dict per h1 section, each
      carrying its title, prose content, code samples and h2/h3 headings.

    Attributes:
        content: Raw llms.txt markdown text.
        base_url: Optional absolute URL used to resolve relative links.
    """

    # Paragraphs whose stripped length is not above this are dropped from
    # the plain-text content (this also filters out short heading lines).
    _MIN_PARAGRAPH_LENGTH = 20

    # Markdown link: [text](target)
    _MD_LINK_RE = re.compile(r'\[([^\]]*)\]\(([^)]+)\)')
    # Optional quoted title at the end of a link target: (url "Title")
    _LINK_TITLE_RE = re.compile(r'''\s+(?:"[^"]*"|'[^']*')\s*$''')
    # Bare http(s) URL, terminated by whitespace or common delimiters.
    _BARE_URL_RE = re.compile(r'https?://[^\s\)\]<>"\']+')
    # Leading URI scheme such as "mailto:" or "javascript:".
    _SCHEME_RE = re.compile(r'^[a-zA-Z][a-zA-Z0-9+.\-]*:')
    # A span delimited by triple backticks; group 1 is everything between.
    _FENCE_RE = re.compile(r'```(.*?)```', re.DOTALL)
    # Either a whole fenced span (consumed so its contents are skipped) or
    # an h1 boundary. Scanning left to right, a "\n# " inside a fence is
    # swallowed by the fence alternative and never treated as a boundary.
    _H1_SPLIT_RE = re.compile(r'(?P<fence>```.*?```)|\n# ', re.DOTALL)
    # h2/h3 heading line. [ \t]+ (not \s+) so an empty "##" line cannot
    # capture the text of a following line as its heading.
    _HEADING_RE = re.compile(r'^(#{2,3})[ \t]+(.+)$', re.MULTILINE)

    def __init__(self, content: str, base_url: str = None):
        """
        Store the markdown text and the optional base URL.

        Args:
            content: Raw llms.txt markdown text.
            base_url: Absolute URL used to resolve relative links, or None
                to ignore relative links entirely.
        """
        self.content = content
        self.base_url = base_url

    def extract_urls(self) -> List[str]:
        """
        Extract all URLs from the llms.txt content.

        Supports both markdown-style links [text](url) and bare URLs.
        Resolves relative URLs using base_url if provided.
        Filters out malformed URLs with invalid anchor patterns.

        Returns:
            List of unique, cleaned URLs in first-seen order (markdown
            links first, then bare URLs). Empty list if none are found.

        Note:
            - Markdown links: [Getting Started](./docs/guide.md)
            - Optional link titles are ignored: [x](url "Title")
            - Bare URLs: https://example.com/api.md
            - Relative paths resolved with base_url; skipped without it
            - In-page anchors (#top) and non-web schemes (mailto:, ...)
              are skipped
            - Invalid anchors (#section/path.md) are stripped
        """
        # A dict keeps insertion order, giving deterministic de-duplication
        # (a plain set would return the URLs in arbitrary order).
        found: Dict[str, None] = {}

        def remember(candidate: str) -> None:
            cleaned = self._clean_url(candidate)
            if cleaned:
                found.setdefault(cleaned, None)

        for _, target in self._MD_LINK_RE.findall(self.content):
            # Drop an optional quoted title and surrounding whitespace.
            url = self._LINK_TITLE_RE.sub('', target).strip()
            if not url:
                continue
            if url.startswith(('http://', 'https://')):
                remember(url)
            elif url.startswith('#') or self._SCHEME_RE.match(url):
                # In-page anchor, or a scheme that is not a fetchable web
                # page; urljoin would pass such targets through unchanged.
                continue
            elif self.base_url:
                remember(urljoin(self.base_url, url))

        for raw in self._BARE_URL_RE.findall(self.content):
            # Trailing sentence punctuation is not part of the URL.
            remember(raw.rstrip('.,;:'))

        return list(found)

    def _clean_url(self, url: str) -> str:
        """
        Clean and validate URL, removing invalid anchor patterns.

        Detects and strips malformed anchors that contain path separators.
        Valid: https://example.com/page.md#section
        Invalid: https://example.com/page#section/index.html.md

        Args:
            url: URL to clean (absolute or relative)

        Returns:
            The part of the URL before the first '#' if the fragment
            contains '/' (malformed); otherwise the URL unchanged.

        Example:
            >>> parser._clean_url("https://ex.com/page#sec/path.md")
            'https://ex.com/page'
            >>> parser._clean_url("https://ex.com/page.md#section")
            'https://ex.com/page.md#section'
        """
        # A path after the anchor (e.g. #section/index.html.md) is malformed
        # and such URLs return duplicate HTML content, so drop the fragment.
        base, _, fragment = url.partition('#')
        if '/' in fragment:
            return base
        return url

    def parse(self) -> List[Dict]:
        """
        Parse markdown content into page structures.

        The text is split at h1 headers (a line starting with "# ").
        Lines inside triple-backtick code blocks are never treated as
        headers, so "# comment" lines in code samples stay with their page.
        Text before the first h1 header becomes its own page whose title is
        its first line.

        Returns:
            List of page dicts with title, content, code_samples, headings
            (plus url and links), one per non-blank section.
        """
        sections = []
        start = 0
        for match in self._H1_SPLIT_RE.finditer(self.content):
            if match.group('fence') is not None:
                # Whole code block consumed: nothing inside it is a header.
                continue
            sections.append(self.content[start:match.start()])
            start = match.end()
        sections.append(self.content[start:])

        pages = []
        for section in sections:
            if not section.strip():
                continue

            # First line is the title; the very first section may still
            # carry its leading "# " marker, hence the strip('#').
            first_line, _, body = section.partition('\n')
            title = first_line.strip('#').strip()
            pages.append(self._parse_section(body, title))

        return pages

    def _parse_section(self, content: str, title: str) -> Dict:
        """
        Parse a single section into page structure.

        Args:
            content: Markdown body of the section (without its h1 line).
            title: Section title.

        Returns:
            Dict with keys:
                title: The given title.
                content: Paragraphs longer than 20 characters, with code
                    blocks removed, joined by blank lines.
                code_samples: List of {'code', 'language'} dicts, one per
                    fenced code block; language is 'unknown' if absent.
                headings: List of {'level', 'text', 'id'} dicts for h2/h3
                    headings found outside code blocks.
                url: Synthetic 'llms-txt#<slug>' identifier for the page.
                links: Always an empty list here.
        """
        page = {
            'title': title,
            'content': '',
            'code_samples': [],
            'headings': [],
            'url': f'llms-txt#{title.lower().replace(" ", "-")}',
            'links': []
        }

        # Extract code blocks. Fences are paired first and the info string
        # parsed afterwards, so an unusual language tag ("c++", " python",
        # CRLF endings) cannot make a closing fence look like an opening
        # one and turn the prose between two blocks into a "code sample".
        for match in self._FENCE_RE.finditer(content):
            info, newline, code = match.group(1).partition('\n')
            if not newline:
                # Inline ```code``` span on one line, not a fenced block.
                continue
            tokens = info.split()
            page['code_samples'].append({
                'code': code.strip(),
                'language': tokens[0] if tokens else 'unknown'
            })

        # Remove code blocks so code comments such as "## note" are neither
        # reported as headings nor included in the plain-text content.
        content_no_code = self._FENCE_RE.sub('', content)

        # Extract h2/h3 headings
        for level_markers, raw_text in self._HEADING_RE.findall(content_no_code):
            text = raw_text.strip()
            page['headings'].append({
                'level': f'h{len(level_markers)}',
                'text': text,
                # Slug from the stripped text so trailing whitespace or a
                # stray '\r' never ends up in the id.
                'id': text.lower().replace(' ', '-')
            })

        # Extract paragraphs
        paragraphs = [
            stripped
            for stripped in (p.strip() for p in content_no_code.split('\n\n'))
            if len(stripped) > self._MIN_PARAGRAPH_LENGTH
        ]
        page['content'] = '\n\n'.join(paragraphs)

        return page