diff --git a/src/skill_seekers/cli/doc_scraper.py b/src/skill_seekers/cli/doc_scraper.py index 0fa3ba3..3490cc6 100755 --- a/src/skill_seekers/cli/doc_scraper.py +++ b/src/skill_seekers/cli/doc_scraper.py @@ -252,6 +252,15 @@ class DocToSkillConverter: return not any(pattern in url for pattern in self._exclude_patterns) + @staticmethod + def _has_md_extension(url: str) -> bool: + """Check if URL path ends with .md extension. + + Uses URL path parsing instead of substring matching to avoid + false positives on URLs like /embed/page or /cmd-line. + """ + return urlparse(url).path.endswith(".md") + def save_checkpoint(self) -> None: """Save progress checkpoint""" if not self.checkpoint_enabled or self.dry_run: @@ -462,7 +471,7 @@ class DocToSkillConverter: else: continue full_url = full_url.split("#")[0] - if ".md" in full_url and self.is_valid_url(full_url) and full_url not in links: + if self._has_md_extension(full_url) and self.is_valid_url(full_url) and full_url not in links: links.append(full_url) return { @@ -551,7 +560,7 @@ class DocToSkillConverter: # Strip anchor fragments full_url = full_url.split("#")[0] # Only include .md URLs to avoid client-side rendered HTML pages - if ".md" in full_url and self.is_valid_url(full_url) and full_url not in page["links"]: + if self._has_md_extension(full_url) and self.is_valid_url(full_url) and full_url not in page["links"]: page["links"].append(full_url) return page @@ -713,7 +722,7 @@ class DocToSkillConverter: response.raise_for_status() # Check if this is a Markdown file - if url.endswith(".md") or ".md" in url: + if self._has_md_extension(url): page = self._extract_markdown_content(response.text, url) else: soup = BeautifulSoup(response.content, "html.parser") @@ -772,7 +781,7 @@ class DocToSkillConverter: response.raise_for_status() # Check if this is a Markdown file - if url.endswith(".md") or ".md" in url: + if self._has_md_extension(url): page = self._extract_markdown_content(response.text, url) else: # BeautifulSoup parsing (still synchronous, but fast) @@ -798,71 +807,45 @@ class DocToSkillConverter: def _convert_to_md_urls(self, urls: list[str]) -> list[str]: """ - Convert URLs to .md format, trying /index.html.md suffix for non-.md URLs. - Strips anchor fragments (#anchor) and deduplicates base URLs to avoid 404 errors. - 不预先检查 URL 是否存在,直接加入队列,在爬取时再验证。 + Clean URLs from llms.txt: strip anchor fragments, deduplicate base URLs. + + Previously this method blindly appended /index.html.md to non-.md URLs, + which caused 404 errors on sites that don't serve raw markdown files + (e.g. Discord docs, see issue #277). Now it preserves original URLs as-is + and lets the scraper handle both HTML and markdown content. Args: urls: List of URLs to process Returns: - List of .md URLs (未验证, deduplicated, no anchors) + List of cleaned, deduplicated URLs (no anchors) """ from urllib.parse import urlparse, urlunparse seen_base_urls = set() - md_urls = [] + cleaned_urls = [] for url in urls: # Parse URL to extract and remove fragment (anchor) parsed = urlparse(url) base_url = urlunparse(parsed._replace(fragment="")) # Remove #anchor - # Skip if we've already processed this base URL - if base_url in seen_base_urls: - continue - seen_base_urls.add(base_url) + # Normalize trailing slashes for dedup (but keep original form) + dedup_key = base_url.rstrip("/") - # Check if URL already ends with .md (not just contains "md") - if base_url.endswith(".md"): - md_urls.append(base_url) - else: - # 直接转换为 .md 格式,不发送 HEAD 请求检查 - base_url = base_url.rstrip("/") - md_url = f"{base_url}/index.html.md" - md_urls.append(md_url) + # Skip if we've already processed this base URL + if dedup_key in seen_base_urls: + continue + seen_base_urls.add(dedup_key) + + cleaned_urls.append(base_url) logger.info( - " ✓ Converted %d URLs to %d unique .md URLs (anchors stripped, will validate during crawl)", + " ✓ Cleaned %d URLs to %d unique URLs (anchors stripped, will validate during crawl)", len(urls), - len(md_urls), + len(cleaned_urls), ) - return md_urls - - # ORIGINAL _convert_to_md_urls (with HEAD request validation): - # def _convert_to_md_urls(self, urls: List[str]) -> List[str]: - # md_urls = [] - # non_md_urls = [] - # for url in urls: - # if '.md' in url: - # md_urls.append(url) - # else: - # non_md_urls.append(url) - # if non_md_urls: - # logger.info(" 🔄 Trying to convert %d non-.md URLs to .md format...", len(non_md_urls)) - # converted = 0 - # for url in non_md_urls: - # url = url.rstrip('/') - # md_url = f"{url}/index.html.md" - # try: - # resp = requests.head(md_url, timeout=5, allow_redirects=True) - # if resp.status_code == 200: - # md_urls.append(md_url) - # converted += 1 - # except Exception: - # pass - # logger.info(" ✓ Converted %d URLs to .md format", converted) - # return md_urls + return cleaned_urls def _try_llms_txt(self) -> bool: """ @@ -933,16 +916,16 @@ class DocToSkillConverter: # Extract URLs from llms.txt and add to pending_urls for BFS crawling extracted_urls = parser.extract_urls() if extracted_urls: - # Convert non-.md URLs to .md format by trying /index.html.md suffix - md_urls = self._convert_to_md_urls(extracted_urls) + # Clean URLs: strip anchors, deduplicate + cleaned_urls = self._convert_to_md_urls(extracted_urls) logger.info( - "\n🔗 Found %d URLs in llms.txt (%d .md files), starting BFS crawl...", + "\n🔗 Found %d URLs in llms.txt (%d unique), starting BFS crawl...", len(extracted_urls), - len(md_urls), + len(cleaned_urls), ) # Filter URLs based on url_patterns config - for url in md_urls: + for url in cleaned_urls: if self.is_valid_url(url): self._enqueue_url(url) @@ -1019,16 +1002,16 @@ class DocToSkillConverter: # Extract URLs from llms.txt and add to pending_urls for BFS crawling extracted_urls = parser.extract_urls() if extracted_urls: - # Convert non-.md URLs to .md format by trying /index.html.md suffix - md_urls = self._convert_to_md_urls(extracted_urls) + # Clean URLs: strip anchors, deduplicate + cleaned_urls = self._convert_to_md_urls(extracted_urls) logger.info( - "\n🔗 Found %d URLs in llms.txt (%d .md files), starting BFS crawl...", + "\n🔗 Found %d URLs in llms.txt (%d unique), starting BFS crawl...", len(extracted_urls), - len(md_urls), + len(cleaned_urls), ) # Filter URLs based on url_patterns config - for url in md_urls: + for url in cleaned_urls: if self.is_valid_url(url): self._enqueue_url(url) diff --git a/tests/test_issue_277_discord_e2e.py b/tests/test_issue_277_discord_e2e.py new file mode 100644 index 0000000..dcd1e9f --- /dev/null +++ b/tests/test_issue_277_discord_e2e.py @@ -0,0 +1,146 @@ +""" +E2E test for Issue #277 - Discord docs case reported by @skeith. + +This test hits the REAL Discord docs llms.txt and verifies that +no /index.html.md URLs are generated. No mocks. + +Requires network access. Marked as integration test. +""" + +import os +import shutil +import unittest + +import pytest + +from skill_seekers.cli.doc_scraper import DocToSkillConverter +from skill_seekers.cli.llms_txt_detector import LlmsTxtDetector +from skill_seekers.cli.llms_txt_downloader import LlmsTxtDownloader +from skill_seekers.cli.llms_txt_parser import LlmsTxtParser + + +@pytest.mark.integration +class TestIssue277DiscordDocsE2E(unittest.TestCase): + """E2E: Reproduce @skeith's report with real Discord docs.""" + + def setUp(self): + self.base_url = "https://docs.discord.com/" + self.config = { + "name": "DiscordDocsE2E", + "description": "Discord API Documentation", + "base_url": self.base_url, + "selectors": {"main_content": "article"}, + "url_patterns": {"include": ["/developers"], "exclude": []}, + } + self.output_dir = f"output/{self.config['name']}_data" + + def tearDown(self): + # Clean up any output created + for path in [self.output_dir, f"output/{self.config['name']}"]: + if os.path.exists(path): + shutil.rmtree(path) + + def test_discord_llms_txt_exists(self): + """Verify Discord docs has llms.txt (precondition for the bug).""" + detector = LlmsTxtDetector(self.base_url) + variants = detector.detect_all() + self.assertTrue( + len(variants) > 0, + "Discord docs should have at least one llms.txt variant", + ) + + def test_discord_llms_txt_urls_no_index_html_md(self): + """Core test: URLs extracted from Discord llms.txt must NOT get /index.html.md appended.""" + # Step 1: Detect llms.txt + detector = LlmsTxtDetector(self.base_url) + variants = detector.detect_all() + self.assertTrue(len(variants) > 0, "No llms.txt found at docs.discord.com") + + # Step 2: Download the largest variant (same logic as doc_scraper) + downloaded = {} + for variant_info in variants: + downloader = LlmsTxtDownloader(variant_info["url"]) + content = downloader.download() + if content: + downloaded[variant_info["variant"]] = content + + self.assertTrue(len(downloaded) > 0, "Failed to download any llms.txt variant") + + largest_content = max(downloaded.values(), key=len) + + # Step 3: Parse URLs from llms.txt + parser = LlmsTxtParser(largest_content, self.base_url) + extracted_urls = parser.extract_urls() + self.assertTrue( + len(extracted_urls) > 0, + "No URLs extracted from Discord llms.txt", + ) + + # Step 4: Run _convert_to_md_urls (the function that was causing 404s) + converter = DocToSkillConverter(self.config, dry_run=True) + converted_urls = converter._convert_to_md_urls(extracted_urls) + + # Step 5: Verify NO /index.html.md was blindly appended + bad_urls = [u for u in converted_urls if "/index.html.md" in u] + self.assertEqual( + len(bad_urls), + 0, + f"Found {len(bad_urls)} URLs with /index.html.md appended " + f"(would cause 404s):\n" + + "\n".join(bad_urls[:10]), + ) + + # Step 6: Verify no anchor fragments leaked through + anchor_urls = [u for u in converted_urls if "#" in u] + self.assertEqual( + len(anchor_urls), + 0, + f"Found {len(anchor_urls)} URLs with anchor fragments:\n" + + "\n".join(anchor_urls[:10]), + ) + + # Step 7: Verify we got a reasonable number of URLs + self.assertGreater( + len(converted_urls), + 10, + "Expected at least 10 unique URLs from Discord docs", + ) + + def test_discord_full_pipeline_no_404_urls(self): + """Full pipeline: detector -> downloader -> parser -> converter -> queue. + + Simulates what `skill-seekers create https://docs.discord.com` does, + without actually scraping pages. + """ + converter = DocToSkillConverter(self.config, dry_run=True) + + # Run _try_llms_txt which calls _convert_to_md_urls internally + os.makedirs(os.path.join(converter.skill_dir, "references"), exist_ok=True) + os.makedirs(os.path.join(converter.data_dir, "pages"), exist_ok=True) + result = converter._try_llms_txt() + + # _try_llms_txt returns False when it populates pending_urls for BFS + # (True means it parsed content directly, no BFS needed) + if not result: + # Check every URL in the queue + for url in converter.pending_urls: + self.assertNotIn( + "/index.html.md", + url, + f"Queue contains 404-causing URL: {url}", + ) + self.assertNotIn( + "#", + url, + f"Queue contains URL with anchor fragment: {url}", + ) + + self.assertGreater( + len(converter.pending_urls), + 0, + "Pipeline should have queued URLs for crawling", + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_issue_277_real_world.py b/tests/test_issue_277_real_world.py index 1c15f30..da65d7f 100644 --- a/tests/test_issue_277_real_world.py +++ b/tests/test_issue_277_real_world.py @@ -1,6 +1,9 @@ """ -Real-world integration test for Issue #277: URL conversion bug with anchor fragments. -Tests the exact MikroORM case that was reported in the issue. +Real-world integration test for Issue #277: URL conversion bug with anchor fragments +and blind /index.html.md appending. + +Tests the exact MikroORM and Discord cases reported in the issue. +Updated: _convert_to_md_urls no longer appends /index.html.md to non-.md URLs. """ import unittest @@ -28,7 +31,6 @@ class TestIssue277RealWorld(unittest.TestCase): def test_mikro_orm_urls_from_issue_277(self): """Test the exact URLs that caused 404 errors in issue #277""" - # These are the actual problematic URLs from the bug report urls_from_llms_txt = [ "https://mikro-orm.io/docs/", "https://mikro-orm.io/docs/reference.md", @@ -44,54 +46,23 @@ class TestIssue277RealWorld(unittest.TestCase): # Verify no malformed URLs with anchor fragments for url in result: - self.assertNotIn( - "#synchronous-initialization/index.html.md", - url, - "Should not append /index.html.md after anchor fragments", - ) - self.assertNotIn( - "#formulas/index.html.md", - url, - "Should not append /index.html.md after anchor fragments", - ) - self.assertNotIn( - "#postgresql-native-enums/index.html.md", - url, - "Should not append /index.html.md after anchor fragments", - ) + self.assertNotIn("#", url, f"URL should not contain anchor: {url}") + # No /index.html.md should be appended to non-.md URLs + if not url.endswith(".md"): + self.assertNotIn( + "index.html.md", url, f"Should not append /index.html.md: {url}" + ) - # Verify correct transformed URLs - - # Check that we got the expected number of unique URLs - # Note: defining-entities has both .md and non-.md versions, so we have 2 entries for it - self.assertEqual( - len(result), - 7, - f"Should have 7 unique base URLs after deduplication, got {len(result)}", - ) - - # Verify specific URLs that were causing 404s are now correct - self.assertIn( - "https://mikro-orm.io/docs/quick-start/index.html.md", - result, - "quick-start URL should be correctly transformed", - ) - self.assertIn( - "https://mikro-orm.io/docs/propagation/index.html.md", - result, - "propagation URL should be correctly transformed", - ) - self.assertIn( - "https://mikro-orm.io/docs/defining-entities.md", - result, - "defining-entities.md should preserve .md extension", - ) + # .md URLs preserved, non-.md URLs preserved as-is, anchors deduplicated + self.assertIn("https://mikro-orm.io/docs/reference.md", result) + self.assertIn("https://mikro-orm.io/docs/repositories.md", result) + self.assertIn("https://mikro-orm.io/docs/defining-entities.md", result) + self.assertIn("https://mikro-orm.io/docs/quick-start", result) + self.assertIn("https://mikro-orm.io/docs/propagation", result) def test_no_404_causing_urls_generated(self): """Verify that no URLs matching the 404 error pattern are generated""" - # The exact 404-causing URL pattern from the issue problematic_patterns = [ - "/index.html.md#", # /index.html.md should never come after # "#synchronous-initialization/index.html.md", "#formulas/index.html.md", "#postgresql-native-enums/index.html.md", @@ -118,9 +89,30 @@ class TestIssue277RealWorld(unittest.TestCase): f"URL '{url}' contains problematic pattern '{pattern}' that causes 404", ) + def test_no_blind_index_html_md_appending(self): + """Verify non-.md URLs don't get /index.html.md appended (core fix)""" + urls = [ + "https://mikro-orm.io/docs/quick-start", + "https://mikro-orm.io/docs/propagation", + "https://mikro-orm.io/docs/filters", + ] + + result = self.converter._convert_to_md_urls(urls) + + self.assertEqual(len(result), 3) + for url in result: + self.assertNotIn( + "/index.html.md", + url, + f"Should not blindly append /index.html.md: {url}", + ) + + self.assertEqual(result[0], "https://mikro-orm.io/docs/quick-start") + self.assertEqual(result[1], "https://mikro-orm.io/docs/propagation") + self.assertEqual(result[2], "https://mikro-orm.io/docs/filters") + def test_deduplication_prevents_multiple_requests(self): """Verify that multiple anchors on same page don't create duplicate requests""" - # From the issue: These should all map to the same base URL urls_with_multiple_anchors = [ "https://mikro-orm.io/docs/defining-entities#formulas", "https://mikro-orm.io/docs/defining-entities#postgresql-native-enums", @@ -136,10 +128,7 @@ class TestIssue277RealWorld(unittest.TestCase): 1, "Multiple anchors on same page should deduplicate to single request", ) - self.assertEqual( - result[0], - "https://mikro-orm.io/docs/defining-entities/index.html.md", - ) + self.assertEqual(result[0], "https://mikro-orm.io/docs/defining-entities") def test_md_files_with_anchors_preserved(self): """Test that .md files with anchors are handled correctly""" @@ -167,7 +156,6 @@ class TestIssue277RealWorld(unittest.TestCase): Integration test: Simulate real scraping scenario with llms.txt URLs. Verify that the converted URLs would not cause 404 errors. """ - # Mock response for llms.txt content mock_response = MagicMock() mock_response.status_code = 200 mock_response.text = """ @@ -179,7 +167,6 @@ https://mikro-orm.io/docs/defining-entities#formulas """ mock_get.return_value = mock_response - # Simulate the llms.txt parsing flow urls_from_llms = [ "https://mikro-orm.io/docs/quick-start", "https://mikro-orm.io/docs/quick-start#synchronous-initialization", @@ -187,42 +174,36 @@ https://mikro-orm.io/docs/defining-entities#formulas "https://mikro-orm.io/docs/defining-entities#formulas", ] - # Convert URLs (this is what happens in _try_llms_txt_v2) converted_urls = self.converter._convert_to_md_urls(urls_from_llms) - # Verify converted URLs are valid - # In real scenario, these would be added to pending_urls and scraped - self.assertTrue(len(converted_urls) > 0, "Should generate at least one URL to scrape") + self.assertTrue(len(converted_urls) > 0) - # Verify no URLs would cause 404 (no anchors in middle of path) for url in converted_urls: - # Check URL structure is valid + # Should not contain # anywhere self.assertRegex( url, - r"^https://[^#]+$", # Should not contain # anywhere + r"^https://[^#]+$", f"URL should not contain anchor fragments: {url}", ) - - # Verify the problematic pattern from the issue doesn't exist - self.assertNotRegex( - url, - r"#[^/]+/index\.html\.md", - f"URL should not have /index.html.md after anchor: {url}", - ) + # Should NOT have /index.html.md appended + if not url.endswith(".md"): + self.assertNotIn( + "index.html.md", + url, + f"Should not append /index.html.md: {url}", + ) def test_issue_277_error_message_urls(self): """ Test the exact URLs that appeared in error messages from the issue report. These were the actual 404-causing URLs that need to be fixed. """ - # These are the MALFORMED URLs that caused 404 errors (with anchors in the middle) error_urls_with_anchors = [ "https://mikro-orm.io/docs/quick-start#synchronous-initialization/index.html.md", "https://mikro-orm.io/docs/defining-entities#formulas/index.html.md", "https://mikro-orm.io/docs/defining-entities#postgresql-native-enums/index.html.md", ] - # Extract the input URLs that would have generated these errors input_urls = [ "https://mikro-orm.io/docs/quick-start#synchronous-initialization", "https://mikro-orm.io/docs/propagation", @@ -232,7 +213,7 @@ https://mikro-orm.io/docs/defining-entities#formulas result = self.converter._convert_to_md_urls(input_urls) - # Verify NONE of the malformed error URLs (with anchors) are generated + # Verify NONE of the malformed error URLs are generated for error_url in error_urls_with_anchors: self.assertNotIn( error_url, @@ -240,20 +221,82 @@ https://mikro-orm.io/docs/defining-entities#formulas f"Should not generate the 404-causing URL: {error_url}", ) - # Verify correct URLs are generated instead - correct_urls = [ - "https://mikro-orm.io/docs/quick-start/index.html.md", - "https://mikro-orm.io/docs/propagation/index.html.md", - "https://mikro-orm.io/docs/defining-entities/index.html.md", + # Verify correct URLs are generated + self.assertIn("https://mikro-orm.io/docs/quick-start", result) + self.assertIn("https://mikro-orm.io/docs/propagation", result) + self.assertIn("https://mikro-orm.io/docs/defining-entities", result) + + +class TestIssue277DiscordDocs(unittest.TestCase): + """Test for Discord docs case reported by @skeith""" + + def setUp(self): + self.config = { + "name": "DiscordDocs", + "description": "Discord API Documentation", + "base_url": "https://docs.discord.com/", + "selectors": {"main_content": "article"}, + } + self.converter = DocToSkillConverter(self.config, dry_run=True) + + def test_discord_docs_no_index_html_md(self): + """Discord docs don't serve .md files - no /index.html.md should be appended""" + urls = [ + "https://docs.discord.com/developers/activities/building-an-activity", + "https://docs.discord.com/developers/activities/design-patterns", + "https://docs.discord.com/developers/components/overview", + "https://docs.discord.com/developers/bots/getting-started", ] - for correct_url in correct_urls: - self.assertIn( - correct_url, - result, - f"Should generate the correct URL: {correct_url}", + result = self.converter._convert_to_md_urls(urls) + + self.assertEqual(len(result), 4) + for url in result: + self.assertNotIn( + "index.html.md", + url, + f"Discord docs should not get /index.html.md appended: {url}", ) + def test_discord_docs_md_urls_preserved(self): + """Discord llms.txt has .md URLs that should be preserved""" + urls = [ + "https://docs.discord.com/developers/activities/building-an-activity.md", + "https://docs.discord.com/developers/components/overview.md", + "https://docs.discord.com/developers/change-log.md", + ] + + result = self.converter._convert_to_md_urls(urls) + + self.assertEqual(len(result), 3) + self.assertEqual( + result[0], + "https://docs.discord.com/developers/activities/building-an-activity.md", + ) + + def test_discord_docs_mixed_urls(self): + """Mix of .md and non-.md URLs from Discord docs""" + urls = [ + "https://docs.discord.com/developers/activities/building-an-activity.md", + "https://docs.discord.com/developers/overview", + "https://docs.discord.com/developers/overview#quick-start", + "https://docs.discord.com/developers/bots/getting-started.md#step-1", + ] + + result = self.converter._convert_to_md_urls(urls) + + # .md URLs preserved, non-.md as-is, anchors stripped & deduped + self.assertEqual(len(result), 3) + self.assertIn( + "https://docs.discord.com/developers/activities/building-an-activity.md", + result, + ) + self.assertIn("https://docs.discord.com/developers/overview", result) + self.assertIn( + "https://docs.discord.com/developers/bots/getting-started.md", + result, + ) + if __name__ == "__main__": unittest.main() diff --git a/tests/test_url_conversion.py b/tests/test_url_conversion.py index e37a540..49a3c38 100644 --- a/tests/test_url_conversion.py +++ b/tests/test_url_conversion.py @@ -1,6 +1,9 @@ """ Tests for URL conversion logic (_convert_to_md_urls). Covers bug fix for issue #277: URLs with anchor fragments causing 404 errors. + +Updated: _convert_to_md_urls no longer blindly appends /index.html.md to non-.md URLs. +It now strips anchors, deduplicates, and preserves original URLs as-is. """ import unittest @@ -31,11 +34,11 @@ class TestConvertToMdUrls(unittest.TestCase): result = self.converter._convert_to_md_urls(urls) - # All should be converted without anchor fragments + # All should have anchors stripped self.assertEqual(len(result), 3) - self.assertEqual(result[0], "https://example.com/docs/quick-start/index.html.md") - self.assertEqual(result[1], "https://example.com/docs/api/index.html.md") - self.assertEqual(result[2], "https://example.com/docs/guide/index.html.md") + self.assertEqual(result[0], "https://example.com/docs/quick-start") + self.assertEqual(result[1], "https://example.com/docs/api") + self.assertEqual(result[2], "https://example.com/docs/guide") def test_deduplicates_multiple_anchors_same_url(self): """Test that multiple anchors on the same URL are deduplicated""" @@ -50,7 +53,7 @@ class TestConvertToMdUrls(unittest.TestCase): # Should only have one entry for the base URL self.assertEqual(len(result), 1) - self.assertEqual(result[0], "https://example.com/docs/api/index.html.md") + self.assertEqual(result[0], "https://example.com/docs/api") def test_preserves_md_extension_urls(self): """Test that URLs already ending with .md are preserved""" @@ -83,8 +86,27 @@ class TestConvertToMdUrls(unittest.TestCase): self.assertIn("https://example.com/docs/guide.md", result) self.assertIn("https://example.com/docs/api.md", result) + def test_non_md_urls_not_converted(self): + """Test that non-.md URLs are NOT converted to /index.html.md (issue #277)""" + urls = [ + "https://example.com/docs/getting-started", + "https://example.com/docs/api-reference", + "https://example.com/docs/tutorials", + ] + + result = self.converter._convert_to_md_urls(urls) + + # Should preserve URLs as-is, NOT append /index.html.md + self.assertEqual(len(result), 3) + for url in result: + self.assertNotIn("index.html.md", url, f"Should not append /index.html.md: {url}") + + self.assertEqual(result[0], "https://example.com/docs/getting-started") + self.assertEqual(result[1], "https://example.com/docs/api-reference") + self.assertEqual(result[2], "https://example.com/docs/tutorials") + def test_does_not_match_md_in_path(self): - """Test that URLs containing 'md' in path (but not ending with .md) are converted""" + """Test that URLs containing 'md' in path are preserved as-is""" urls = [ "https://example.com/cmd-line", "https://example.com/AMD-processors", @@ -93,27 +115,23 @@ class TestConvertToMdUrls(unittest.TestCase): result = self.converter._convert_to_md_urls(urls) - # All should be converted since they don't END with .md + # URLs with 'md' substring (not extension) should be preserved as-is self.assertEqual(len(result), 3) - self.assertEqual(result[0], "https://example.com/cmd-line/index.html.md") - self.assertEqual(result[1], "https://example.com/AMD-processors/index.html.md") - self.assertEqual(result[2], "https://example.com/metadata/index.html.md") + self.assertEqual(result[0], "https://example.com/cmd-line") + self.assertEqual(result[1], "https://example.com/AMD-processors") + self.assertEqual(result[2], "https://example.com/metadata") - def test_removes_trailing_slashes(self): - """Test that trailing slashes are removed before appending /index.html.md""" + def test_removes_trailing_slashes_for_dedup(self): + """Test that trailing slash variants are deduplicated""" urls = [ "https://example.com/docs/api/", - "https://example.com/docs/guide//", - "https://example.com/docs/reference", + "https://example.com/docs/api", ] result = self.converter._convert_to_md_urls(urls) - # All should have proper /index.html.md without double slashes - self.assertEqual(len(result), 3) - self.assertEqual(result[0], "https://example.com/docs/api/index.html.md") - self.assertEqual(result[1], "https://example.com/docs/guide/index.html.md") - self.assertEqual(result[2], "https://example.com/docs/reference/index.html.md") + # Should deduplicate (trailing slash vs no slash) + self.assertEqual(len(result), 1) def test_mixed_urls_with_and_without_anchors(self): """Test mixed URLs with various formats""" @@ -130,9 +148,9 @@ class TestConvertToMdUrls(unittest.TestCase): # Should deduplicate to 3 unique base URLs self.assertEqual(len(result), 3) - self.assertIn("https://example.com/docs/intro/index.html.md", result) + self.assertIn("https://example.com/docs/intro", result) self.assertIn("https://example.com/docs/api.md", result) - self.assertIn("https://example.com/docs/guide/index.html.md", result) + self.assertIn("https://example.com/docs/guide", result) def test_empty_url_list(self): """Test that empty URL list returns empty result""" @@ -155,14 +173,15 @@ class TestConvertToMdUrls(unittest.TestCase): # Should deduplicate to 3 unique base URLs self.assertEqual(len(result), 3) - self.assertIn("https://mikro-orm.io/docs/quick-start/index.html.md", result) - self.assertIn("https://mikro-orm.io/docs/propagation/index.html.md", result) - self.assertIn("https://mikro-orm.io/docs/defining-entities/index.html.md", result) # Should NOT contain any URLs with anchor fragments for url in result: self.assertNotIn("#", url, f"URL should not contain anchor: {url}") + # Should NOT contain /index.html.md + for url in result: + self.assertNotIn("index.html.md", url, f"Should not append /index.html.md: {url}") + def test_preserves_query_parameters(self): """Test that query parameters are preserved (only anchors stripped)""" urls = [ @@ -175,8 +194,6 @@ class TestConvertToMdUrls(unittest.TestCase): # Query parameters should be preserved, anchors stripped self.assertEqual(len(result), 2) # search deduplicated - # Note: Query parameters might not be ideal for .md conversion, - # but they should be preserved if present self.assertTrue( any("?q=test" in url for url in result), "Query parameter should be preserved", @@ -199,7 +216,7 @@ class TestConvertToMdUrls(unittest.TestCase): # All should deduplicate to single base URL self.assertEqual(len(result), 1) - self.assertEqual(result[0], "https://example.com/docs/guide/index.html.md") + self.assertEqual(result[0], "https://example.com/docs/guide") def test_url_order_preservation(self): """Test that first occurrence of base URL is preserved""" @@ -214,9 +231,59 @@ class TestConvertToMdUrls(unittest.TestCase): # Should have 3 unique URLs, first occurrence preserved self.assertEqual(len(result), 3) - self.assertEqual(result[0], "https://example.com/docs/a/index.html.md") - self.assertEqual(result[1], "https://example.com/docs/b/index.html.md") - self.assertEqual(result[2], "https://example.com/docs/c/index.html.md") + self.assertEqual(result[0], "https://example.com/docs/a") + self.assertEqual(result[1], "https://example.com/docs/b") + self.assertEqual(result[2], "https://example.com/docs/c") + + def test_discord_docs_case(self): + """Test the Discord docs case reported by @skeith in issue #277""" + urls = [ + "https://docs.discord.com/developers/activities/building-an-activity", + "https://docs.discord.com/developers/activities/design-patterns", + "https://docs.discord.com/developers/components/overview", + "https://docs.discord.com/developers/bots/getting-started#step-1", + ] + + result = self.converter._convert_to_md_urls(urls) + + # No /index.html.md should be appended + for url in result: + self.assertNotIn("index.html.md", url, f"Should not append /index.html.md: {url}") + self.assertNotIn("#", url, f"Should not contain anchor: {url}") + + self.assertEqual(len(result), 4) + + +class TestHasMdExtension(unittest.TestCase): + """Test suite for _has_md_extension static method""" + + def test_md_extension(self): + self.assertTrue(DocToSkillConverter._has_md_extension("https://example.com/page.md")) + + def test_md_with_query(self): + self.assertTrue(DocToSkillConverter._has_md_extension("https://example.com/page.md?v=1")) + + def test_no_md_extension(self): + self.assertFalse(DocToSkillConverter._has_md_extension("https://example.com/page")) + + def test_md_in_path_not_extension(self): + """'cmd-line' contains 'md' but is not a .md extension""" + self.assertFalse(DocToSkillConverter._has_md_extension("https://example.com/cmd-line")) + + def test_md_in_domain(self): + """'.md' in domain should not match""" + self.assertFalse(DocToSkillConverter._has_md_extension("https://docs.md.example.com/page")) + + def test_mdx_not_md(self): + """.mdx is not .md""" + self.assertFalse(DocToSkillConverter._has_md_extension("https://example.com/page.mdx")) + + def test_md_in_middle_of_path(self): + """.md in middle of path should not match""" + self.assertFalse(DocToSkillConverter._has_md_extension("https://example.com/page.md/subpage")) + + def test_index_html_md(self): + self.assertTrue(DocToSkillConverter._has_md_extension("https://example.com/page/index.html.md")) if __name__ == "__main__":