fix: percent-encode brackets in llms.txt URLs to prevent Invalid IPv6 URL (fixes #284)

Square brackets in URL paths (e.g. /api/[v1]/users from API reference docs) are technically invalid unencoded per RFC 3986. httpx interprets them as IPv6 address literals and raises "Invalid IPv6 URL", crashing the llms-full.md parse step. Fix _clean_url() in LlmsTxtParser to percent-encode [ and ] in the path and query components (-> %5B / %5D) using urlparse/urlunparse so only the path is touched, not the host. Anchor-stripping logic is preserved and runs first. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-21 23:14:18 +03:00
parent a24ee8dd9d
commit 741daf1c68
1 changed files with 21 additions and 3 deletions
--- a/src/skill_seekers/cli/llms_txt_parser.py
+++ b/src/skill_seekers/cli/llms_txt_parser.py
@@ -56,9 +56,13 @@ class LlmsTxtParser:

    def _clean_url(self, url: str) -> str:
        """
-        Clean and validate URL, removing invalid anchor patterns.
+        Clean and validate URL, removing invalid anchor patterns and encoding
+        square brackets in the URL path.

        Detects and strips malformed anchors that contain path separators.
+        Percent-encodes [ and ] characters in the path so that httpx/urllib3
+        do not misinterpret them as IPv6 address literals (fixes #284).
+
        Valid: https://example.com/page.md#section
        Invalid: https://example.com/page#section/index.html.md

@@ -66,7 +70,7 @@ class LlmsTxtParser:
            url: URL to clean (absolute or relative)

        Returns:
-            Cleaned URL with malformed anchors stripped.
+            Cleaned URL with malformed anchors stripped and brackets encoded.
            Returns base URL if anchor contains '/' (malformed).
            Returns original URL if anchor is valid or no anchor present.

@@ -75,6 +79,8 @@ class LlmsTxtParser:
            "https://ex.com/page"
            >>> parser._clean_url("https://ex.com/page.md#section")
            "https://ex.com/page.md#section"
+            >>> parser._clean_url("https://ex.com/api/[v1]/users")
+            "https://ex.com/api/%5Bv1%5D/users"
        """
        # Skip URLs with path after anchor (e.g., #section/index.html.md)
        # These are malformed and return duplicate HTML content
@@ -84,7 +90,19 @@ class LlmsTxtParser:
            # If there's a path separator after anchor, it's invalid
            if "/" in after_anchor:
                # Extract the base URL without the malformed anchor
-                return url[:anchor_pos]
+                url = url[:anchor_pos]
+
+        # Percent-encode square brackets in the path — they are only valid in
+        # the host portion of a URL (IPv6 literals). Leaving them unencoded
+        # causes httpx to raise "Invalid IPv6 URL" when the URL is fetched.
+        if "[" in url or "]" in url:
+            from urllib.parse import urlparse, urlunparse
+            parsed = urlparse(url)
+            # Only encode brackets in the path/query/fragment, not in the host
+            encoded_path = parsed.path.replace("[", "%5B").replace("]", "%5D")
+            encoded_query = parsed.query.replace("[", "%5B").replace("]", "%5D")
+            url = urlunparse(parsed._replace(path=encoded_path, query=encoded_query))
+
        return url

    def parse(self) -> list[dict]: