fix: centralize bracket-encoding to prevent 'Invalid IPv6 URL' on all code paths (#284)

The original fix (741daf1) only patched LlmsTxtParser._clean_url(),
which covers URLs extracted directly from llms.txt content. But URLs
discovered from .md files during BFS crawl (_extract_markdown_content)
and from HTML pages (extract_content) bypass _clean_url() entirely.
When those pages contain links with square brackets (e.g.
/api/[v1]/users), httpx raises 'Invalid IPv6 URL' on fetch.

Fix: add a shared sanitize_url() utility in cli/utils.py that
percent-encodes [ and ] in path/query components, and apply it at
every URL ingestion point:

- _enqueue_url(): main chokepoint — all discovered URLs pass through
- scrape_page(): safety net for start_urls that skip _enqueue_url
- scrape_page_async(): same for async mode
- dry-run sync/async paths: direct fetches that also bypass _enqueue_url

LlmsTxtParser._clean_url() now delegates bracket-encoding to the
shared sanitize_url() (DRY), keeping only its malformed-anchor
stripping logic.

Added 16 tests: sanitize_url unit tests, _clean_url bracket tests,
_enqueue_url sanitization tests, and integration test verifying
markdown content with bracket URLs is handled safely.

Fixes #284
This commit is contained in:
yusyus
2026-03-14 23:53:47 +03:00
parent f214976ccd
commit b25a6f7f53
5 changed files with 226 additions and 15 deletions

View File

@@ -47,7 +47,7 @@ from skill_seekers.cli.llms_txt_detector import LlmsTxtDetector
from skill_seekers.cli.llms_txt_downloader import LlmsTxtDownloader
from skill_seekers.cli.llms_txt_parser import LlmsTxtParser
from skill_seekers.cli.arguments.scrape import add_scrape_arguments
from skill_seekers.cli.utils import setup_logging
from skill_seekers.cli.utils import sanitize_url, setup_logging
# Configure logging
logger = logging.getLogger(__name__)
@@ -225,7 +225,12 @@ class DocToSkillConverter:
self.load_checkpoint()
def _enqueue_url(self, url: str) -> None:
"""Add a URL to the pending queue if not already visited or enqueued (O(1))."""
"""Add a URL to the pending queue if not already visited or enqueued (O(1)).
Applies :func:`sanitize_url` to percent-encode square brackets before
enqueueing, preventing ``Invalid IPv6 URL`` errors on fetch (see #284).
"""
url = sanitize_url(url)
if url not in self.visited_urls and url not in self._enqueued_urls:
self._enqueued_urls.add(url)
self.pending_urls.append(url)
@@ -699,6 +704,9 @@ class DocToSkillConverter:
Supports both HTML pages and Markdown (.md) files
"""
try:
# Sanitise brackets before fetching (safety net for start_urls; see #284)
url = sanitize_url(url)
# Scraping part (no lock needed - independent)
headers = {"User-Agent": "Mozilla/5.0 (Documentation Scraper)"}
response = requests.get(url, headers=headers, timeout=30)
@@ -755,6 +763,9 @@ class DocToSkillConverter:
"""
async with semaphore: # Limit concurrent requests
try:
# Sanitise brackets before fetching (safety net; see #284)
url = sanitize_url(url)
# Async HTTP request
headers = {"User-Agent": "Mozilla/5.0 (Documentation Scraper)"}
response = await client.get(url, headers=headers, timeout=30.0)
@@ -1112,6 +1123,7 @@ class DocToSkillConverter:
if self.dry_run:
# Just show what would be scraped
url = sanitize_url(url) # encode brackets before fetch (see #284)
logger.info(" [Preview] %s", url)
try:
headers = {"User-Agent": "Mozilla/5.0 (Documentation Scraper - Dry Run)"}
@@ -1293,6 +1305,7 @@ class DocToSkillConverter:
for url in batch:
if unlimited or len(self.visited_urls) <= preview_limit:
if self.dry_run:
url = sanitize_url(url) # encode brackets (see #284)
logger.info(" [Preview] %s", url)
# Discover links from full page (async dry-run)
try:

View File

@@ -3,6 +3,8 @@
import re
from urllib.parse import urljoin
from skill_seekers.cli.utils import sanitize_url
class LlmsTxtParser:
"""Parse llms.txt markdown content into page structures"""
@@ -92,19 +94,8 @@ class LlmsTxtParser:
# Extract the base URL without the malformed anchor
url = url[:anchor_pos]
# Percent-encode square brackets in the path — they are only valid in
# the host portion of a URL (IPv6 literals). Leaving them unencoded
# causes httpx to raise "Invalid IPv6 URL" when the URL is fetched.
if "[" in url or "]" in url:
from urllib.parse import urlparse, urlunparse
parsed = urlparse(url)
# Only encode brackets in the path/query/fragment, not in the host
encoded_path = parsed.path.replace("[", "%5B").replace("]", "%5D")
encoded_query = parsed.query.replace("[", "%5B").replace("]", "%5D")
url = urlunparse(parsed._replace(path=encoded_path, query=encoded_query))
return url
# Percent-encode square brackets in the path/query (see #284).
return sanitize_url(url)
def parse(self) -> list[dict]:
"""

View File

@@ -484,3 +484,43 @@ def offset_to_line(newline_offsets: list[int], offset: int) -> int:
1-based line number corresponding to *offset*.
"""
return bisect.bisect_left(newline_offsets, offset) + 1
# ---------------------------------------------------------------------------
# URL sanitisation
# ---------------------------------------------------------------------------
def sanitize_url(url: str) -> str:
    """Percent-encode square brackets in a URL's path and query components.

    Unencoded ``[`` and ``]`` in the path are technically invalid per
    RFC 3986 (they are only legal in the host for IPv6 literals). Libraries
    such as *httpx* and *urllib3* interpret them as IPv6 address markers and
    raise ``Invalid IPv6 URL``.

    This function encodes **only** the path and query — the scheme, host,
    and fragment are left untouched.

    Args:
        url: Absolute or scheme-relative URL to sanitise.

    Returns:
        The URL with ``[`` → ``%5B`` and ``]`` → ``%5D`` in its path/query.
        The original URL is returned unchanged when no brackets are present,
        or when it is too malformed to split into components at all.

    Examples:
        >>> sanitize_url("https://example.com/api/[v1]/users")
        'https://example.com/api/%5Bv1%5D/users'
        >>> sanitize_url("https://example.com/docs/guide")
        'https://example.com/docs/guide'
    """
    # Fast path: the overwhelming majority of URLs carry no brackets.
    if "[" not in url and "]" not in url:
        return url
    from urllib.parse import urlparse, urlunparse

    try:
        parsed = urlparse(url)
    except ValueError:
        # urlparse itself raises "Invalid IPv6 URL" when the authority
        # component contains an unbalanced bracket (e.g. scraped garbage
        # like "https://example.com]/x"). Such a URL cannot be repaired
        # here — return it unchanged as best-effort and let the caller's
        # fetch error handling surface the problem.
        return url
    # Encode only path and query: brackets in the netloc may be a legitimate
    # IPv6 literal ("https://[::1]/...") and must not be touched.
    encoded_path = parsed.path.replace("[", "%5B").replace("]", "%5D")
    encoded_query = parsed.query.replace("[", "%5B").replace("]", "%5D")
    return urlunparse(parsed._replace(path=encoded_path, query=encoded_query))