fix: centralize bracket-encoding to prevent 'Invalid IPv6 URL' on all code paths (#284)
The original fix (741daf1) only patched LlmsTxtParser._clean_url(),
which covers URLs extracted directly from llms.txt content. But URLs
discovered from .md files during BFS crawl (_extract_markdown_content)
and from HTML pages (extract_content) bypass _clean_url() entirely.
When those pages contain links with square brackets (e.g.
/api/[v1]/users), httpx raises 'Invalid IPv6 URL' on fetch.
Fix: add a shared sanitize_url() utility in cli/utils.py that
percent-encodes [ and ] in path/query components, and apply it at
every URL ingestion point:
- _enqueue_url(): main chokepoint — all discovered URLs pass through
- scrape_page(): safety net for start_urls that skip _enqueue_url
- scrape_page_async(): same for async mode
- dry-run sync/async paths: direct fetches that also bypass _enqueue_url
LlmsTxtParser._clean_url() now delegates bracket-encoding to the
shared sanitize_url() (DRY), keeping only its malformed-anchor
stripping logic.
Added 16 tests: sanitize_url unit tests, _clean_url bracket tests,
_enqueue_url sanitization tests, and integration test verifying
markdown content with bracket URLs is handled safely.
Fixes #284
This commit is contained in:
@@ -47,7 +47,7 @@ from skill_seekers.cli.llms_txt_detector import LlmsTxtDetector
|
||||
from skill_seekers.cli.llms_txt_downloader import LlmsTxtDownloader
|
||||
from skill_seekers.cli.llms_txt_parser import LlmsTxtParser
|
||||
from skill_seekers.cli.arguments.scrape import add_scrape_arguments
|
||||
from skill_seekers.cli.utils import setup_logging
|
||||
from skill_seekers.cli.utils import sanitize_url, setup_logging
|
||||
|
||||
# Configure logging
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -225,7 +225,12 @@ class DocToSkillConverter:
|
||||
self.load_checkpoint()
|
||||
|
||||
def _enqueue_url(self, url: str) -> None:
|
||||
"""Add a URL to the pending queue if not already visited or enqueued (O(1))."""
|
||||
"""Add a URL to the pending queue if not already visited or enqueued (O(1)).
|
||||
|
||||
Applies :func:`sanitize_url` to percent-encode square brackets before
|
||||
enqueueing, preventing ``Invalid IPv6 URL`` errors on fetch (see #284).
|
||||
"""
|
||||
url = sanitize_url(url)
|
||||
if url not in self.visited_urls and url not in self._enqueued_urls:
|
||||
self._enqueued_urls.add(url)
|
||||
self.pending_urls.append(url)
|
||||
@@ -699,6 +704,9 @@ class DocToSkillConverter:
|
||||
Supports both HTML pages and Markdown (.md) files
|
||||
"""
|
||||
try:
|
||||
# Sanitise brackets before fetching (safety net for start_urls; see #284)
|
||||
url = sanitize_url(url)
|
||||
|
||||
# Scraping part (no lock needed - independent)
|
||||
headers = {"User-Agent": "Mozilla/5.0 (Documentation Scraper)"}
|
||||
response = requests.get(url, headers=headers, timeout=30)
|
||||
@@ -755,6 +763,9 @@ class DocToSkillConverter:
|
||||
"""
|
||||
async with semaphore: # Limit concurrent requests
|
||||
try:
|
||||
# Sanitise brackets before fetching (safety net; see #284)
|
||||
url = sanitize_url(url)
|
||||
|
||||
# Async HTTP request
|
||||
headers = {"User-Agent": "Mozilla/5.0 (Documentation Scraper)"}
|
||||
response = await client.get(url, headers=headers, timeout=30.0)
|
||||
@@ -1112,6 +1123,7 @@ class DocToSkillConverter:
|
||||
|
||||
if self.dry_run:
|
||||
# Just show what would be scraped
|
||||
url = sanitize_url(url) # encode brackets before fetch (see #284)
|
||||
logger.info(" [Preview] %s", url)
|
||||
try:
|
||||
headers = {"User-Agent": "Mozilla/5.0 (Documentation Scraper - Dry Run)"}
|
||||
@@ -1293,6 +1305,7 @@ class DocToSkillConverter:
|
||||
for url in batch:
|
||||
if unlimited or len(self.visited_urls) <= preview_limit:
|
||||
if self.dry_run:
|
||||
url = sanitize_url(url) # encode brackets (see #284)
|
||||
logger.info(" [Preview] %s", url)
|
||||
# Discover links from full page (async dry-run)
|
||||
try:
|
||||
|
||||
@@ -3,6 +3,8 @@
|
||||
import re
|
||||
from urllib.parse import urljoin
|
||||
|
||||
from skill_seekers.cli.utils import sanitize_url
|
||||
|
||||
|
||||
class LlmsTxtParser:
|
||||
"""Parse llms.txt markdown content into page structures"""
|
||||
@@ -92,19 +94,8 @@ class LlmsTxtParser:
|
||||
# Extract the base URL without the malformed anchor
|
||||
url = url[:anchor_pos]
|
||||
|
||||
# Percent-encode square brackets in the path — they are only valid in
|
||||
# the host portion of a URL (IPv6 literals). Leaving them unencoded
|
||||
# causes httpx to raise "Invalid IPv6 URL" when the URL is fetched.
|
||||
if "[" in url or "]" in url:
|
||||
from urllib.parse import urlparse, urlunparse
|
||||
|
||||
parsed = urlparse(url)
|
||||
# Only encode brackets in the path/query/fragment, not in the host
|
||||
encoded_path = parsed.path.replace("[", "%5B").replace("]", "%5D")
|
||||
encoded_query = parsed.query.replace("[", "%5B").replace("]", "%5D")
|
||||
url = urlunparse(parsed._replace(path=encoded_path, query=encoded_query))
|
||||
|
||||
return url
|
||||
# Percent-encode square brackets in the path/query (see #284).
|
||||
return sanitize_url(url)
|
||||
|
||||
def parse(self) -> list[dict]:
|
||||
"""
|
||||
|
||||
@@ -484,3 +484,43 @@ def offset_to_line(newline_offsets: list[int], offset: int) -> int:
|
||||
1-based line number corresponding to *offset*.
|
||||
"""
|
||||
return bisect.bisect_left(newline_offsets, offset) + 1
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# URL sanitisation
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def sanitize_url(url: str) -> str:
|
||||
"""Percent-encode square brackets in a URL's path and query components.
|
||||
|
||||
Unencoded ``[`` and ``]`` in the path are technically invalid per
|
||||
RFC 3986 (they are only legal in the host for IPv6 literals). Libraries
|
||||
such as *httpx* and *urllib3* interpret them as IPv6 address markers and
|
||||
raise ``Invalid IPv6 URL``.
|
||||
|
||||
This function encodes **only** the path and query — the scheme, host,
|
||||
and fragment are left untouched.
|
||||
|
||||
Args:
|
||||
url: Absolute or scheme-relative URL to sanitise.
|
||||
|
||||
Returns:
|
||||
The URL with ``[`` → ``%5B`` and ``]`` → ``%5D`` in its path/query,
|
||||
or the original URL unchanged when no brackets are present.
|
||||
|
||||
Examples:
|
||||
>>> sanitize_url("https://example.com/api/[v1]/users")
|
||||
'https://example.com/api/%5Bv1%5D/users'
|
||||
>>> sanitize_url("https://example.com/docs/guide")
|
||||
'https://example.com/docs/guide'
|
||||
"""
|
||||
if "[" not in url and "]" not in url:
|
||||
return url
|
||||
|
||||
from urllib.parse import urlparse, urlunparse
|
||||
|
||||
parsed = urlparse(url)
|
||||
encoded_path = parsed.path.replace("[", "%5B").replace("]", "%5D")
|
||||
encoded_query = parsed.query.replace("[", "%5B").replace("]", "%5D")
|
||||
return urlunparse(parsed._replace(path=encoded_path, query=encoded_query))
|
||||
|
||||
@@ -270,6 +270,46 @@ API: https://example.com/api/reference.md
|
||||
result = parser._clean_url("https://example.com/docs/guide.md")
|
||||
self.assertEqual(result, "https://example.com/docs/guide.md")
|
||||
|
||||
def test_clean_url_bracket_encoding(self):
|
||||
"""Test that square brackets are percent-encoded in URL path (#284)."""
|
||||
from skill_seekers.cli.llms_txt_parser import LlmsTxtParser
|
||||
|
||||
parser = LlmsTxtParser("", base_url="https://example.com")
|
||||
|
||||
result = parser._clean_url("https://example.com/api/[v1]/users")
|
||||
self.assertEqual(result, "https://example.com/api/%5Bv1%5D/users")
|
||||
|
||||
def test_clean_url_bracket_encoding_preserves_host(self):
|
||||
"""Test that bracket encoding does not affect host (IPv6 literals)."""
|
||||
from skill_seekers.cli.llms_txt_parser import LlmsTxtParser
|
||||
|
||||
parser = LlmsTxtParser("", base_url="https://example.com")
|
||||
|
||||
# Brackets should only be encoded in path, not in host
|
||||
result = parser._clean_url("https://example.com/path/[param]/end")
|
||||
self.assertIn("%5B", result)
|
||||
self.assertIn("%5D", result)
|
||||
self.assertIn("example.com", result)
|
||||
|
||||
def test_clean_url_bracket_in_query(self):
|
||||
"""Test that brackets in query params are also encoded."""
|
||||
from skill_seekers.cli.llms_txt_parser import LlmsTxtParser
|
||||
|
||||
parser = LlmsTxtParser("", base_url="https://example.com")
|
||||
|
||||
result = parser._clean_url("https://example.com/search?filter=[active]")
|
||||
self.assertEqual(result, "https://example.com/search?filter=%5Bactive%5D")
|
||||
|
||||
def test_clean_url_malformed_anchor_with_brackets(self):
|
||||
"""Test combined malformed anchor stripping + bracket encoding."""
|
||||
from skill_seekers.cli.llms_txt_parser import LlmsTxtParser
|
||||
|
||||
parser = LlmsTxtParser("", base_url="https://example.com")
|
||||
|
||||
# Malformed anchor should be stripped, then brackets encoded
|
||||
result = parser._clean_url("https://example.com/api/[v1]/page#section/deep")
|
||||
self.assertEqual(result, "https://example.com/api/%5Bv1%5D/page")
|
||||
|
||||
def test_deduplicate_urls(self):
|
||||
"""Test that duplicate URLs are removed."""
|
||||
from skill_seekers.cli.llms_txt_parser import LlmsTxtParser
|
||||
|
||||
@@ -520,5 +520,132 @@ class TestTextCleaning(unittest.TestCase):
|
||||
self.assertEqual(cleaned, "Hello world")
|
||||
|
||||
|
||||
class TestSanitizeUrl(unittest.TestCase):
|
||||
"""Test the shared sanitize_url utility (see issue #284)."""
|
||||
|
||||
def test_no_brackets_unchanged(self):
|
||||
"""URLs without brackets should pass through unchanged."""
|
||||
from skill_seekers.cli.utils import sanitize_url
|
||||
|
||||
url = "https://docs.example.com/api/v1/users"
|
||||
self.assertEqual(sanitize_url(url), url)
|
||||
|
||||
def test_brackets_in_path_encoded(self):
|
||||
"""Square brackets in path should be percent-encoded."""
|
||||
from skill_seekers.cli.utils import sanitize_url
|
||||
|
||||
result = sanitize_url("https://example.com/api/[v1]/users")
|
||||
self.assertEqual(result, "https://example.com/api/%5Bv1%5D/users")
|
||||
|
||||
def test_brackets_in_query_encoded(self):
|
||||
"""Square brackets in query should be percent-encoded."""
|
||||
from skill_seekers.cli.utils import sanitize_url
|
||||
|
||||
result = sanitize_url("https://example.com/search?filter=[active]&sort=[name]")
|
||||
self.assertEqual(result, "https://example.com/search?filter=%5Bactive%5D&sort=%5Bname%5D")
|
||||
|
||||
def test_host_not_affected(self):
|
||||
"""Host portion should never be modified (IPv6 literals are valid there)."""
|
||||
from skill_seekers.cli.utils import sanitize_url
|
||||
|
||||
# URL with brackets only in path, host stays intact
|
||||
result = sanitize_url("https://example.com/[v1]/ref")
|
||||
self.assertTrue(result.startswith("https://example.com/"))
|
||||
|
||||
def test_already_encoded_brackets(self):
|
||||
"""Already-encoded brackets should not be double-encoded."""
|
||||
from skill_seekers.cli.utils import sanitize_url
|
||||
|
||||
url = "https://example.com/api/%5Bv1%5D/users"
|
||||
# No raw brackets present, should pass through unchanged
|
||||
self.assertEqual(sanitize_url(url), url)
|
||||
|
||||
def test_empty_and_simple_urls(self):
|
||||
"""Edge cases: empty string, simple URLs."""
|
||||
from skill_seekers.cli.utils import sanitize_url
|
||||
|
||||
self.assertEqual(sanitize_url(""), "")
|
||||
self.assertEqual(sanitize_url("https://example.com"), "https://example.com")
|
||||
self.assertEqual(sanitize_url("https://example.com/"), "https://example.com/")
|
||||
|
||||
|
||||
class TestEnqueueUrlSanitization(unittest.TestCase):
|
||||
"""Test that _enqueue_url sanitises bracket URLs before enqueueing (#284)."""
|
||||
|
||||
def setUp(self):
|
||||
"""Set up test converter."""
|
||||
self.config = {
|
||||
"name": "test",
|
||||
"base_url": "https://docs.example.com/",
|
||||
"url_patterns": {"include": [], "exclude": []},
|
||||
"selectors": {"main_content": "article", "title": "h1", "code_blocks": "pre code"},
|
||||
"rate_limit": 0,
|
||||
"max_pages": 100,
|
||||
}
|
||||
self.converter = DocToSkillConverter(self.config, dry_run=True)
|
||||
|
||||
def test_enqueue_sanitises_brackets(self):
|
||||
"""_enqueue_url should percent-encode brackets before adding to queue."""
|
||||
self.converter._enqueue_url("https://docs.example.com/api/[v1]/users")
|
||||
|
||||
# The URL in the queue should have encoded brackets
|
||||
queued_url = list(self.converter.pending_urls)[-1]
|
||||
self.assertNotIn("[", queued_url)
|
||||
self.assertNotIn("]", queued_url)
|
||||
self.assertIn("%5B", queued_url)
|
||||
self.assertIn("%5D", queued_url)
|
||||
|
||||
def test_enqueue_dedup_with_encoded_brackets(self):
|
||||
"""Encoded and raw bracket URLs should be treated as the same URL."""
|
||||
self.converter._enqueue_url("https://docs.example.com/api/[v1]/ref")
|
||||
initial_len = len(self.converter.pending_urls)
|
||||
|
||||
# Enqueueing the same URL again (raw brackets) should be a no-op
|
||||
self.converter._enqueue_url("https://docs.example.com/api/[v1]/ref")
|
||||
self.assertEqual(len(self.converter.pending_urls), initial_len)
|
||||
|
||||
def test_enqueue_normal_url_unchanged(self):
|
||||
"""Normal URLs without brackets should pass through unchanged."""
|
||||
self.converter._enqueue_url("https://docs.example.com/guide/intro")
|
||||
|
||||
queued_url = list(self.converter.pending_urls)[-1]
|
||||
self.assertEqual(queued_url, "https://docs.example.com/guide/intro")
|
||||
|
||||
|
||||
class TestMarkdownLinkBracketSanitization(unittest.TestCase):
|
||||
"""Integration test: markdown content with bracket URLs should not crash (#284)."""
|
||||
|
||||
def setUp(self):
|
||||
"""Set up test converter."""
|
||||
self.config = {
|
||||
"name": "test",
|
||||
"base_url": "https://docs.example.com/",
|
||||
"url_patterns": {"include": [], "exclude": []},
|
||||
"selectors": {"main_content": "article", "title": "h1", "code_blocks": "pre code"},
|
||||
"rate_limit": 0,
|
||||
"max_pages": 100,
|
||||
}
|
||||
self.converter = DocToSkillConverter(self.config, dry_run=True)
|
||||
|
||||
def test_extract_markdown_links_with_brackets(self):
|
||||
"""Links with brackets in .md content should be sanitised when enqueued."""
|
||||
# Simulate markdown content containing a link with brackets
|
||||
md_content = """# API Reference
|
||||
|
||||
See the [Users Endpoint](https://docs.example.com/api/[v1]/users.md) for details.
|
||||
Also check [Guide](https://docs.example.com/guide/intro.md).
|
||||
"""
|
||||
page = self.converter._extract_markdown_content(md_content, "https://docs.example.com/")
|
||||
|
||||
# Enqueue all extracted links (this is what scrape_page does)
|
||||
for link in page["links"]:
|
||||
self.converter._enqueue_url(link)
|
||||
|
||||
# All enqueued URLs should have brackets encoded
|
||||
for url in self.converter.pending_urls:
|
||||
self.assertNotIn("[", url, f"Raw bracket found in enqueued URL: {url}")
|
||||
self.assertNotIn("]", url, f"Raw bracket found in enqueued URL: {url}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
||||
Reference in New Issue
Block a user