skill-seekers-reference/src/skill_seekers/cli/llms_txt_parser.py

"""ABOUTME: Parses llms.txt markdown content into structured page data"""

import re
from urllib.parse import urljoin

from skill_seekers.cli.utils import sanitize_url


class LlmsTxtParser:
    """Parse llms.txt markdown content into page structures"""

    def __init__(self, content: str, base_url: str = None):
        self.content = content
        self.base_url = base_url

    def extract_urls(self) -> list[str]:
        """
        Extract all URLs from the llms.txt content.

        Supports both markdown-style links [text](url) and bare URLs.
        Resolves relative URLs using base_url if provided.
        Filters out malformed URLs with invalid anchor patterns.

        Returns:
            List of unique, cleaned URLs found in the content.
            Returns empty list if no valid URLs found.

        Note:
            - Markdown links: [Getting Started](./docs/guide.md)
            - Bare URLs: https://example.com/api.md
            - Relative paths resolved with base_url
            - Invalid anchors (#section/path.md) are stripped
        """
        urls = set()

        # Match markdown links: [text](url)
        md_links = re.findall(r"\[([^\]]*)\]\(([^)]+)\)", self.content)
        for _, url in md_links:
            if url.startswith("http"):
                clean_url = self._clean_url(url)
                if clean_url:
                    urls.add(clean_url)
            elif self.base_url and not url.startswith("#"):
                clean_url = self._clean_url(urljoin(self.base_url, url))
                if clean_url:
                    urls.add(clean_url)

        # Match bare URLs
        bare_urls = re.findall(r'https?://[^\s\)\]<>"\']+', self.content)
        for url in bare_urls:
            # Clean trailing punctuation
            url = url.rstrip(".,;:")
            clean_url = self._clean_url(url)
            if clean_url:
                urls.add(clean_url)

        return list(urls)

    def _clean_url(self, url: str) -> str:
        """
        Clean and validate URL, removing invalid anchor patterns and encoding
        square brackets in the URL path.

        Detects and strips malformed anchors that contain path separators.
        Percent-encodes [ and ] characters in the path so that httpx/urllib3
        do not misinterpret them as IPv6 address literals (fixes #284).

        Valid: https://example.com/page.md#section
        Invalid: https://example.com/page#section/index.html.md

        Args:
            url: URL to clean (absolute or relative)

        Returns:
            Cleaned URL with malformed anchors stripped and brackets encoded.
            Returns base URL if anchor contains '/' (malformed).
            Returns original URL if anchor is valid or no anchor present.

        Example:
            >>> parser._clean_url("https://ex.com/page#sec/path.md")
            "https://ex.com/page"
            >>> parser._clean_url("https://ex.com/page.md#section")
            "https://ex.com/page.md#section"
            >>> parser._clean_url("https://ex.com/api/[v1]/users")
            "https://ex.com/api/%5Bv1%5D/users"
        """
        # Skip URLs with path after anchor (e.g., #section/index.html.md)
        # These are malformed and return duplicate HTML content
        if "#" in url:
            anchor_pos = url.index("#")
            after_anchor = url[anchor_pos + 1 :]
            # If there's a path separator after anchor, it's invalid
            if "/" in after_anchor:
                # Extract the base URL without the malformed anchor
                url = url[:anchor_pos]

        # Percent-encode square brackets in the path/query (see #284).
        return sanitize_url(url)

    def parse(self) -> list[dict]:
        """
        Parse markdown content into page structures.

        Returns:
            List of page dicts with title, content, code_samples, headings
        """
        pages = []

        # Split by h1 headers (# Title)
        sections = re.split(r"\n# ", self.content)

        for section in sections:
            if not section.strip():
                continue

            # First line is title
            lines = section.split("\n")
            title = lines[0].strip("#").strip()

            # Parse content
            page = self._parse_section("\n".join(lines[1:]), title)
            pages.append(page)

        return pages

    def _parse_section(self, content: str, title: str) -> dict:
        """Parse a single section into page structure"""
        page = {
            "title": title,
            "content": "",
            "code_samples": [],
            "headings": [],
            "url": f"llms-txt#{title.lower().replace(' ', '-')}",
            "links": [],
        }

        # Extract code blocks
        code_blocks = re.findall(r"```(\w+)?\n(.*?)```", content, re.DOTALL)
        for lang, code in code_blocks:
            page["code_samples"].append({"code": code.strip(), "language": lang or "unknown"})

        # Extract h2/h3 headings
        headings = re.findall(r"^(#{2,3})\s+(.+)$", content, re.MULTILINE)
        for level_markers, text in headings:
            page["headings"].append(
                {
                    "level": f"h{len(level_markers)}",
                    "text": text.strip(),
                    "id": text.lower().replace(" ", "-"),
                }
            )

        # Remove code blocks from content for plain text
        content_no_code = re.sub(r"```.*?```", "", content, flags=re.DOTALL)

        # Extract paragraphs
        paragraphs = [p.strip() for p in content_no_code.split("\n\n") if len(p.strip()) > 20]
        page["content"] = "\n\n".join(paragraphs)

        return page