diff --git a/src/skill_seekers/cli/doc_scraper.py b/src/skill_seekers/cli/doc_scraper.py index 4a61807..cbe4908 100755 --- a/src/skill_seekers/cli/doc_scraper.py +++ b/src/skill_seekers/cli/doc_scraper.py @@ -47,7 +47,7 @@ from skill_seekers.cli.llms_txt_detector import LlmsTxtDetector from skill_seekers.cli.llms_txt_downloader import LlmsTxtDownloader from skill_seekers.cli.llms_txt_parser import LlmsTxtParser from skill_seekers.cli.arguments.scrape import add_scrape_arguments -from skill_seekers.cli.utils import setup_logging +from skill_seekers.cli.utils import sanitize_url, setup_logging # Configure logging logger = logging.getLogger(__name__) @@ -225,7 +225,12 @@ class DocToSkillConverter: self.load_checkpoint() def _enqueue_url(self, url: str) -> None: - """Add a URL to the pending queue if not already visited or enqueued (O(1)).""" + """Add a URL to the pending queue if not already visited or enqueued (O(1)). + + Applies :func:`sanitize_url` to percent-encode square brackets before + enqueueing, preventing ``Invalid IPv6 URL`` errors on fetch (see #284). + """ + url = sanitize_url(url) if url not in self.visited_urls and url not in self._enqueued_urls: self._enqueued_urls.add(url) self.pending_urls.append(url) @@ -699,6 +704,9 @@ class DocToSkillConverter: Supports both HTML pages and Markdown (.md) files """ try: + # Sanitise brackets before fetching (safety net for start_urls; see #284) + url = sanitize_url(url) + # Scraping part (no lock needed - independent) headers = {"User-Agent": "Mozilla/5.0 (Documentation Scraper)"} response = requests.get(url, headers=headers, timeout=30) @@ -755,6 +763,9 @@ class DocToSkillConverter: """ async with semaphore: # Limit concurrent requests try: + # Sanitise brackets before fetching (safety net; see #284) + url = sanitize_url(url) + # Async HTTP request headers = {"User-Agent": "Mozilla/5.0 (Documentation Scraper)"} response = await client.get(url, headers=headers, timeout=30.0) @@ -1112,6 +1123,7 @@ class DocToSkillConverter: if self.dry_run: # Just show what would be scraped + url = sanitize_url(url) # encode brackets before fetch (see #284) logger.info(" [Preview] %s", url) try: headers = {"User-Agent": "Mozilla/5.0 (Documentation Scraper - Dry Run)"} @@ -1293,6 +1305,7 @@ class DocToSkillConverter: for url in batch: if unlimited or len(self.visited_urls) <= preview_limit: if self.dry_run: + url = sanitize_url(url) # encode brackets (see #284) logger.info(" [Preview] %s", url) # Discover links from full page (async dry-run) try: diff --git a/src/skill_seekers/cli/llms_txt_parser.py b/src/skill_seekers/cli/llms_txt_parser.py index 6f70b26..5d8a937 100644 --- a/src/skill_seekers/cli/llms_txt_parser.py +++ b/src/skill_seekers/cli/llms_txt_parser.py @@ -3,6 +3,8 @@ import re from urllib.parse import urljoin +from skill_seekers.cli.utils import sanitize_url + class LlmsTxtParser: """Parse llms.txt markdown content into page structures""" @@ -92,19 +94,8 @@ class LlmsTxtParser: # Extract the base URL without the malformed anchor url = url[:anchor_pos] - # Percent-encode square brackets in the path — they are only valid in - # the host portion of a URL (IPv6 literals). Leaving them unencoded - # causes httpx to raise "Invalid IPv6 URL" when the URL is fetched. - if "[" in url or "]" in url: - from urllib.parse import urlparse, urlunparse - - parsed = urlparse(url) - # Only encode brackets in the path/query/fragment, not in the host - encoded_path = parsed.path.replace("[", "%5B").replace("]", "%5D") - encoded_query = parsed.query.replace("[", "%5B").replace("]", "%5D") - url = urlunparse(parsed._replace(path=encoded_path, query=encoded_query)) - - return url + # Percent-encode square brackets in the path/query (see #284). + return sanitize_url(url) def parse(self) -> list[dict]: """ diff --git a/src/skill_seekers/cli/utils.py b/src/skill_seekers/cli/utils.py index cafb485..c7cc558 100755 --- a/src/skill_seekers/cli/utils.py +++ b/src/skill_seekers/cli/utils.py @@ -484,3 +484,43 @@ def offset_to_line(newline_offsets: list[int], offset: int) -> int: 1-based line number corresponding to *offset*. """ return bisect.bisect_left(newline_offsets, offset) + 1 + + +# --------------------------------------------------------------------------- +# URL sanitisation +# --------------------------------------------------------------------------- + + +def sanitize_url(url: str) -> str: + """Percent-encode square brackets in a URL's path and query components. + + Unencoded ``[`` and ``]`` in the path are technically invalid per + RFC 3986 (they are only legal in the host for IPv6 literals). Libraries + such as *httpx* and *urllib3* interpret them as IPv6 address markers and + raise ``Invalid IPv6 URL``. + + This function encodes **only** the path and query — the scheme, host, + and fragment are left untouched. + + Args: + url: Absolute or scheme-relative URL to sanitise. + + Returns: + The URL with ``[`` → ``%5B`` and ``]`` → ``%5D`` in its path/query, + or the original URL unchanged when no brackets are present. + + Examples: + >>> sanitize_url("https://example.com/api/[v1]/users") + 'https://example.com/api/%5Bv1%5D/users' + >>> sanitize_url("https://example.com/docs/guide") + 'https://example.com/docs/guide' + """ + if "[" not in url and "]" not in url: + return url + + from urllib.parse import urlparse, urlunparse + + parsed = urlparse(url) + encoded_path = parsed.path.replace("[", "%5B").replace("]", "%5D") + encoded_query = parsed.query.replace("[", "%5B").replace("]", "%5D") + return urlunparse(parsed._replace(path=encoded_path, query=encoded_query)) diff --git a/tests/test_markdown_parsing.py b/tests/test_markdown_parsing.py index e40a7c8..b855fca 100644 --- a/tests/test_markdown_parsing.py +++ b/tests/test_markdown_parsing.py @@ -270,6 +270,46 @@ API: https://example.com/api/reference.md result = parser._clean_url("https://example.com/docs/guide.md") self.assertEqual(result, "https://example.com/docs/guide.md") + def test_clean_url_bracket_encoding(self): + """Test that square brackets are percent-encoded in URL path (#284).""" + from skill_seekers.cli.llms_txt_parser import LlmsTxtParser + + parser = LlmsTxtParser("", base_url="https://example.com") + + result = parser._clean_url("https://example.com/api/[v1]/users") + self.assertEqual(result, "https://example.com/api/%5Bv1%5D/users") + + def test_clean_url_bracket_encoding_preserves_host(self): + """Test that bracket encoding does not affect host (IPv6 literals).""" + from skill_seekers.cli.llms_txt_parser import LlmsTxtParser + + parser = LlmsTxtParser("", base_url="https://example.com") + + # Brackets should only be encoded in path, not in host + result = parser._clean_url("https://example.com/path/[param]/end") + self.assertIn("%5B", result) + self.assertIn("%5D", result) + self.assertIn("example.com", result) + + def test_clean_url_bracket_in_query(self): + """Test that brackets in query params are also encoded.""" + from skill_seekers.cli.llms_txt_parser import LlmsTxtParser + + parser = LlmsTxtParser("", base_url="https://example.com") + + result = parser._clean_url("https://example.com/search?filter=[active]") + self.assertEqual(result, "https://example.com/search?filter=%5Bactive%5D") + + def test_clean_url_malformed_anchor_with_brackets(self): + """Test combined malformed anchor stripping + bracket encoding.""" + from skill_seekers.cli.llms_txt_parser import LlmsTxtParser + + parser = LlmsTxtParser("", base_url="https://example.com") + + # Malformed anchor should be stripped, then brackets encoded + result = parser._clean_url("https://example.com/api/[v1]/page#section/deep") + self.assertEqual(result, "https://example.com/api/%5Bv1%5D/page") + def test_deduplicate_urls(self): """Test that duplicate URLs are removed.""" from skill_seekers.cli.llms_txt_parser import LlmsTxtParser diff --git a/tests/test_scraper_features.py b/tests/test_scraper_features.py index 320d743..338e4ff 100644 --- a/tests/test_scraper_features.py +++ b/tests/test_scraper_features.py @@ -520,5 +520,132 @@ class TestTextCleaning(unittest.TestCase): self.assertEqual(cleaned, "Hello world") +class TestSanitizeUrl(unittest.TestCase): + """Test the shared sanitize_url utility (see issue #284).""" + + def test_no_brackets_unchanged(self): + """URLs without brackets should pass through unchanged.""" + from skill_seekers.cli.utils import sanitize_url + + url = "https://docs.example.com/api/v1/users" + self.assertEqual(sanitize_url(url), url) + + def test_brackets_in_path_encoded(self): + """Square brackets in path should be percent-encoded.""" + from skill_seekers.cli.utils import sanitize_url + + result = sanitize_url("https://example.com/api/[v1]/users") + self.assertEqual(result, "https://example.com/api/%5Bv1%5D/users") + + def test_brackets_in_query_encoded(self): + """Square brackets in query should be percent-encoded.""" + from skill_seekers.cli.utils import sanitize_url + + result = sanitize_url("https://example.com/search?filter=[active]&sort=[name]") + self.assertEqual(result, "https://example.com/search?filter=%5Bactive%5D&sort=%5Bname%5D") + + def test_host_not_affected(self): + """Host portion should never be modified (IPv6 literals are valid there).""" + from skill_seekers.cli.utils import sanitize_url + + # URL with brackets only in path, host stays intact + result = sanitize_url("https://example.com/[v1]/ref") + self.assertTrue(result.startswith("https://example.com/")) + + def test_already_encoded_brackets(self): + """Already-encoded brackets should not be double-encoded.""" + from skill_seekers.cli.utils import sanitize_url + + url = "https://example.com/api/%5Bv1%5D/users" + # No raw brackets present, should pass through unchanged + self.assertEqual(sanitize_url(url), url) + + def test_empty_and_simple_urls(self): + """Edge cases: empty string, simple URLs.""" + from skill_seekers.cli.utils import sanitize_url + + self.assertEqual(sanitize_url(""), "") + self.assertEqual(sanitize_url("https://example.com"), "https://example.com") + self.assertEqual(sanitize_url("https://example.com/"), "https://example.com/") + + +class TestEnqueueUrlSanitization(unittest.TestCase): + """Test that _enqueue_url sanitises bracket URLs before enqueueing (#284).""" + + def setUp(self): + """Set up test converter.""" + self.config = { + "name": "test", + "base_url": "https://docs.example.com/", + "url_patterns": {"include": [], "exclude": []}, + "selectors": {"main_content": "article", "title": "h1", "code_blocks": "pre code"}, + "rate_limit": 0, + "max_pages": 100, + } + self.converter = DocToSkillConverter(self.config, dry_run=True) + + def test_enqueue_sanitises_brackets(self): + """_enqueue_url should percent-encode brackets before adding to queue.""" + self.converter._enqueue_url("https://docs.example.com/api/[v1]/users") + + # The URL in the queue should have encoded brackets + queued_url = list(self.converter.pending_urls)[-1] + self.assertNotIn("[", queued_url) + self.assertNotIn("]", queued_url) + self.assertIn("%5B", queued_url) + self.assertIn("%5D", queued_url) + + def test_enqueue_dedup_with_encoded_brackets(self): + """Encoded and raw bracket URLs should be treated as the same URL.""" + self.converter._enqueue_url("https://docs.example.com/api/[v1]/ref") + initial_len = len(self.converter.pending_urls) + + # Enqueueing the same URL again (raw brackets) should be a no-op + self.converter._enqueue_url("https://docs.example.com/api/[v1]/ref") + self.assertEqual(len(self.converter.pending_urls), initial_len) + + def test_enqueue_normal_url_unchanged(self): + """Normal URLs without brackets should pass through unchanged.""" + self.converter._enqueue_url("https://docs.example.com/guide/intro") + + queued_url = list(self.converter.pending_urls)[-1] + self.assertEqual(queued_url, "https://docs.example.com/guide/intro") + + +class TestMarkdownLinkBracketSanitization(unittest.TestCase): + """Integration test: markdown content with bracket URLs should not crash (#284).""" + + def setUp(self): + """Set up test converter.""" + self.config = { + "name": "test", + "base_url": "https://docs.example.com/", + "url_patterns": {"include": [], "exclude": []}, + "selectors": {"main_content": "article", "title": "h1", "code_blocks": "pre code"}, + "rate_limit": 0, + "max_pages": 100, + } + self.converter = DocToSkillConverter(self.config, dry_run=True) + + def test_extract_markdown_links_with_brackets(self): + """Links with brackets in .md content should be sanitised when enqueued.""" + # Simulate markdown content containing a link with brackets + md_content = """# API Reference + +See the [Users Endpoint](https://docs.example.com/api/[v1]/users.md) for details. +Also check [Guide](https://docs.example.com/guide/intro.md). +""" + page = self.converter._extract_markdown_content(md_content, "https://docs.example.com/") + + # Enqueue all extracted links (this is what scrape_page does) + for link in page["links"]: + self.converter._enqueue_url(link) + + # All enqueued URLs should have brackets encoded + for url in self.converter.pending_urls: + self.assertNotIn("[", url, f"Raw bracket found in enqueued URL: {url}") + self.assertNotIn("]", url, f"Raw bracket found in enqueued URL: {url}") + + if __name__ == "__main__": unittest.main()