fix: stop blindly appending /index.html.md to non-.md URLs (#277)

The previous fix (a82cf69) only addressed anchor fragment stripping but left the fundamental problem: _convert_to_md_urls() blindly appended /index.html.md to ALL non-.md URLs from llms.txt. This only works for Docusaurus sites — for sites like Discord docs it generates mass 404s. Changes: - _convert_to_md_urls() now strips anchors and deduplicates only, preserving original URLs as-is instead of appending /index.html.md - New _has_md_extension() helper uses urlparse().path.endswith(".md") instead of error-prone ".md" in url substring matching - Fixed ".md" in url checks at 4 locations (lines 465, 554, 716, 775) - Removed 24 lines of dead commented-out code - Added real-world e2e test against docs.discord.com (no mocks) - Updated unit tests for new behavior (32 tests) Fixes #277 Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-20 23:44:35 +03:00
parent f6131c6798
commit 2ef6e59d06
4 changed files with 408 additions and 169 deletions
--- a/src/skill_seekers/cli/doc_scraper.py
+++ b/src/skill_seekers/cli/doc_scraper.py
@@ -252,6 +252,15 @@ class DocToSkillConverter:
        return not any(pattern in url for pattern in self._exclude_patterns)
    @staticmethod
    def _has_md_extension(url: str) -> bool:
        """Check if URL path ends with .md extension.
        Uses URL path parsing instead of substring matching to avoid
        false positives on URLs like /embed/page or /cmd-line.
        """
        return urlparse(url).path.endswith(".md")
    def save_checkpoint(self) -> None:
        """Save progress checkpoint"""
        if not self.checkpoint_enabled or self.dry_run:
@@ -462,7 +471,7 @@ class DocToSkillConverter:
                    else:
                        continue
                    full_url = full_url.split("#")[0]
-                    if ".md" in full_url and self.is_valid_url(full_url) and full_url not in links:
+                    if self._has_md_extension(full_url) and self.is_valid_url(full_url) and full_url not in links:
                        links.append(full_url)
                return {
@@ -551,7 +560,7 @@ class DocToSkillConverter:
            # Strip anchor fragments
            full_url = full_url.split("#")[0]
            # Only include .md URLs to avoid client-side rendered HTML pages
-            if ".md" in full_url and self.is_valid_url(full_url) and full_url not in page["links"]:
+            if self._has_md_extension(full_url) and self.is_valid_url(full_url) and full_url not in page["links"]:
                page["links"].append(full_url)
        return page
@@ -713,7 +722,7 @@ class DocToSkillConverter:
            response.raise_for_status()
            # Check if this is a Markdown file
-            if url.endswith(".md") or ".md" in url:
+            if self._has_md_extension(url):
                page = self._extract_markdown_content(response.text, url)
            else:
                soup = BeautifulSoup(response.content, "html.parser")
@@ -772,7 +781,7 @@ class DocToSkillConverter:
                response.raise_for_status()
                # Check if this is a Markdown file
-                if url.endswith(".md") or ".md" in url:
+                if self._has_md_extension(url):
                    page = self._extract_markdown_content(response.text, url)
                else:
                    # BeautifulSoup parsing (still synchronous, but fast)
@@ -798,71 +807,45 @@ class DocToSkillConverter:
    def _convert_to_md_urls(self, urls: list[str]) -> list[str]:
        """
-        Convert URLs to .md format, trying /index.html.md suffix for non-.md URLs.
+        Clean URLs from llms.txt: strip anchor fragments, deduplicate base URLs.
-        Strips anchor fragments (#anchor) and deduplicates base URLs to avoid 404 errors.
+
-        不预先检查 URL 是否存在，直接加入队列，在爬取时再验证。
+        Previously this method blindly appended /index.html.md to non-.md URLs,
        which caused 404 errors on sites that don't serve raw markdown files
        (e.g. Discord docs, see issue #277). Now it preserves original URLs as-is
        and lets the scraper handle both HTML and markdown content.
        Args:
            urls: List of URLs to process
        Returns:
-            List of .md URLs (未验证, deduplicated, no anchors)
+            List of cleaned, deduplicated URLs (no anchors)
        """
        from urllib.parse import urlparse, urlunparse
        seen_base_urls = set()
-        md_urls = []
+        cleaned_urls = []
        for url in urls:
            # Parse URL to extract and remove fragment (anchor)
            parsed = urlparse(url)
            base_url = urlunparse(parsed._replace(fragment=""))  # Remove #anchor
-            # Skip if we've already processed this base URL
+            # Normalize trailing slashes for dedup (but keep original form)
-            if base_url in seen_base_urls:
+            dedup_key = base_url.rstrip("/")
                continue
            seen_base_urls.add(base_url)
-            # Check if URL already ends with .md (not just contains "md")
+            # Skip if we've already processed this base URL
-            if base_url.endswith(".md"):
+            if dedup_key in seen_base_urls:
-                md_urls.append(base_url)
+                continue
-            else:
+            seen_base_urls.add(dedup_key)
-                # 直接转换为 .md 格式，不发送 HEAD 请求检查
+
-                base_url = base_url.rstrip("/")
+            cleaned_urls.append(base_url)
                md_url = f"{base_url}/index.html.md"
                md_urls.append(md_url)
        logger.info(
-            "  ✓ Converted %d URLs to %d unique .md URLs (anchors stripped, will validate during crawl)",
+            "  ✓ Cleaned %d URLs to %d unique URLs (anchors stripped, will validate during crawl)",
            len(urls),
-            len(md_urls),
+            len(cleaned_urls),
        )
-        return md_urls
+        return cleaned_urls
    # ORIGINAL _convert_to_md_urls (with HEAD request validation):
    # def _convert_to_md_urls(self, urls: List[str]) -> List[str]:
    #     md_urls = []
    #     non_md_urls = []
    #     for url in urls:
    #         if '.md' in url:
    #             md_urls.append(url)
    #         else:
    #             non_md_urls.append(url)
    #     if non_md_urls:
    #         logger.info("  🔄 Trying to convert %d non-.md URLs to .md format...", len(non_md_urls))
    #         converted = 0
    #         for url in non_md_urls:
    #             url = url.rstrip('/')
    #             md_url = f"{url}/index.html.md"
    #             try:
    #                 resp = requests.head(md_url, timeout=5, allow_redirects=True)
    #                 if resp.status_code == 200:
    #                     md_urls.append(md_url)
    #                     converted += 1
    #             except Exception:
    #                 pass
    #         logger.info("  ✓ Converted %d URLs to .md format", converted)
    #     return md_urls
    def _try_llms_txt(self) -> bool:
        """
@@ -933,16 +916,16 @@ class DocToSkillConverter:
                # Extract URLs from llms.txt and add to pending_urls for BFS crawling
                extracted_urls = parser.extract_urls()
                if extracted_urls:
-                    # Convert non-.md URLs to .md format by trying /index.html.md suffix
+                    # Clean URLs: strip anchors, deduplicate
-                    md_urls = self._convert_to_md_urls(extracted_urls)
+                    cleaned_urls = self._convert_to_md_urls(extracted_urls)
                    logger.info(
-                        "\n🔗 Found %d URLs in llms.txt (%d .md files), starting BFS crawl...",
+                        "\n🔗 Found %d URLs in llms.txt (%d unique), starting BFS crawl...",
                        len(extracted_urls),
-                        len(md_urls),
+                        len(cleaned_urls),
                    )
                    # Filter URLs based on url_patterns config
-                    for url in md_urls:
+                    for url in cleaned_urls:
                        if self.is_valid_url(url):
                            self._enqueue_url(url)
@@ -1019,16 +1002,16 @@ class DocToSkillConverter:
        # Extract URLs from llms.txt and add to pending_urls for BFS crawling
        extracted_urls = parser.extract_urls()
        if extracted_urls:
-            # Convert non-.md URLs to .md format by trying /index.html.md suffix
+            # Clean URLs: strip anchors, deduplicate
-            md_urls = self._convert_to_md_urls(extracted_urls)
+            cleaned_urls = self._convert_to_md_urls(extracted_urls)
            logger.info(
-                "\n🔗 Found %d URLs in llms.txt (%d .md files), starting BFS crawl...",
+                "\n🔗 Found %d URLs in llms.txt (%d unique), starting BFS crawl...",
                len(extracted_urls),
-                len(md_urls),
+                len(cleaned_urls),
            )
            # Filter URLs based on url_patterns config
-            for url in md_urls:
+            for url in cleaned_urls:
                if self.is_valid_url(url):
                    self._enqueue_url(url)
--- a/tests/test_issue_277_discord_e2e.py
+++ b/tests/test_issue_277_discord_e2e.py
@@ -0,0 +1,146 @@
 """
 E2E test for Issue #277 - Discord docs case reported by @skeith.
 This test hits the REAL Discord docs llms.txt and verifies that
 no /index.html.md URLs are generated. No mocks.
 Requires network access. Marked as integration test.
 """
 import os
 import shutil
 import unittest
 import pytest
 from skill_seekers.cli.doc_scraper import DocToSkillConverter
 from skill_seekers.cli.llms_txt_detector import LlmsTxtDetector
 from skill_seekers.cli.llms_txt_downloader import LlmsTxtDownloader
 from skill_seekers.cli.llms_txt_parser import LlmsTxtParser
@pytest.mark.integration
 class TestIssue277DiscordDocsE2E(unittest.TestCase):
    """E2E: Reproduce @skeith's report with real Discord docs."""
    def setUp(self):
        self.base_url = "https://docs.discord.com/"
        self.config = {
            "name": "DiscordDocsE2E",
            "description": "Discord API Documentation",
            "base_url": self.base_url,
            "selectors": {"main_content": "article"},
            "url_patterns": {"include": ["/developers"], "exclude": []},
        }
        self.output_dir = f"output/{self.config['name']}_data"
    def tearDown(self):
        # Clean up any output created
        for path in [self.output_dir, f"output/{self.config['name']}"]:
            if os.path.exists(path):
                shutil.rmtree(path)
    def test_discord_llms_txt_exists(self):
        """Verify Discord docs has llms.txt (precondition for the bug)."""
        detector = LlmsTxtDetector(self.base_url)
        variants = detector.detect_all()
        self.assertTrue(
            len(variants) > 0,
            "Discord docs should have at least one llms.txt variant",
        )
    def test_discord_llms_txt_urls_no_index_html_md(self):
        """Core test: URLs extracted from Discord llms.txt must NOT get /index.html.md appended."""
        # Step 1: Detect llms.txt
        detector = LlmsTxtDetector(self.base_url)
        variants = detector.detect_all()
        self.assertTrue(len(variants) > 0, "No llms.txt found at docs.discord.com")
        # Step 2: Download the largest variant (same logic as doc_scraper)
        downloaded = {}
        for variant_info in variants:
            downloader = LlmsTxtDownloader(variant_info["url"])
            content = downloader.download()
            if content:
                downloaded[variant_info["variant"]] = content
        self.assertTrue(len(downloaded) > 0, "Failed to download any llms.txt variant")
        largest_content = max(downloaded.values(), key=len)
        # Step 3: Parse URLs from llms.txt
        parser = LlmsTxtParser(largest_content, self.base_url)
        extracted_urls = parser.extract_urls()
        self.assertTrue(
            len(extracted_urls) > 0,
            "No URLs extracted from Discord llms.txt",
        )
        # Step 4: Run _convert_to_md_urls (the function that was causing 404s)
        converter = DocToSkillConverter(self.config, dry_run=True)
        converted_urls = converter._convert_to_md_urls(extracted_urls)
        # Step 5: Verify NO /index.html.md was blindly appended
        bad_urls = [u for u in converted_urls if "/index.html.md" in u]
        self.assertEqual(
            len(bad_urls),
            0,
            f"Found {len(bad_urls)} URLs with /index.html.md appended "
            f"(would cause 404s):\n"
            + "\n".join(bad_urls[:10]),
        )
        # Step 6: Verify no anchor fragments leaked through
        anchor_urls = [u for u in converted_urls if "#" in u]
        self.assertEqual(
            len(anchor_urls),
            0,
            f"Found {len(anchor_urls)} URLs with anchor fragments:\n"
            + "\n".join(anchor_urls[:10]),
        )
        # Step 7: Verify we got a reasonable number of URLs
        self.assertGreater(
            len(converted_urls),
            10,
            "Expected at least 10 unique URLs from Discord docs",
        )
    def test_discord_full_pipeline_no_404_urls(self):
        """Full pipeline: detector -> downloader -> parser -> converter -> queue.
        Simulates what `skill-seekers create https://docs.discord.com` does,
        without actually scraping pages.
        """
        converter = DocToSkillConverter(self.config, dry_run=True)
        # Run _try_llms_txt which calls _convert_to_md_urls internally
        os.makedirs(os.path.join(converter.skill_dir, "references"), exist_ok=True)
        os.makedirs(os.path.join(converter.data_dir, "pages"), exist_ok=True)
        result = converter._try_llms_txt()
        # _try_llms_txt returns False when it populates pending_urls for BFS
        # (True means it parsed content directly, no BFS needed)
        if not result:
            # Check every URL in the queue
            for url in converter.pending_urls:
                self.assertNotIn(
                    "/index.html.md",
                    url,
                    f"Queue contains 404-causing URL: {url}",
                )
                self.assertNotIn(
                    "#",
                    url,
                    f"Queue contains URL with anchor fragment: {url}",
                )
            self.assertGreater(
                len(converter.pending_urls),
                0,
                "Pipeline should have queued URLs for crawling",
            )
 if __name__ == "__main__":
    unittest.main()
--- a/tests/test_issue_277_real_world.py
+++ b/tests/test_issue_277_real_world.py
@@ -1,6 +1,9 @@
 """
-Real-world integration test for Issue #277: URL conversion bug with anchor fragments.
+Real-world integration test for Issue #277: URL conversion bug with anchor fragments
-Tests the exact MikroORM case that was reported in the issue.
+and blind /index.html.md appending.
 Tests the exact MikroORM and Discord cases reported in the issue.
 Updated: _convert_to_md_urls no longer appends /index.html.md to non-.md URLs.
 """
 import unittest
@@ -28,7 +31,6 @@ class TestIssue277RealWorld(unittest.TestCase):
    def test_mikro_orm_urls_from_issue_277(self):
        """Test the exact URLs that caused 404 errors in issue #277"""
        # These are the actual problematic URLs from the bug report
        urls_from_llms_txt = [
            "https://mikro-orm.io/docs/",
            "https://mikro-orm.io/docs/reference.md",
@@ -44,54 +46,23 @@ class TestIssue277RealWorld(unittest.TestCase):
        # Verify no malformed URLs with anchor fragments
        for url in result:
-            self.assertNotIn(
+            self.assertNotIn("#", url, f"URL should not contain anchor: {url}")
-                "#synchronous-initialization/index.html.md",
+            # No /index.html.md should be appended to non-.md URLs
-                url,
+            if not url.endswith(".md"):
-                "Should not append /index.html.md after anchor fragments",
+                self.assertNotIn(
-            )
+                    "index.html.md", url, f"Should not append /index.html.md: {url}"
-            self.assertNotIn(
+                )
                "#formulas/index.html.md",
                url,
                "Should not append /index.html.md after anchor fragments",
            )
            self.assertNotIn(
                "#postgresql-native-enums/index.html.md",
                url,
                "Should not append /index.html.md after anchor fragments",
            )
-        # Verify correct transformed URLs
+        # .md URLs preserved, non-.md URLs preserved as-is, anchors deduplicated
-
+        self.assertIn("https://mikro-orm.io/docs/reference.md", result)
-        # Check that we got the expected number of unique URLs
+        self.assertIn("https://mikro-orm.io/docs/repositories.md", result)
-        # Note: defining-entities has both .md and non-.md versions, so we have 2 entries for it
+        self.assertIn("https://mikro-orm.io/docs/defining-entities.md", result)
-        self.assertEqual(
+        self.assertIn("https://mikro-orm.io/docs/quick-start", result)
-            len(result),
+        self.assertIn("https://mikro-orm.io/docs/propagation", result)
            7,
            f"Should have 7 unique base URLs after deduplication, got {len(result)}",
        )
        # Verify specific URLs that were causing 404s are now correct
        self.assertIn(
            "https://mikro-orm.io/docs/quick-start/index.html.md",
            result,
            "quick-start URL should be correctly transformed",
        )
        self.assertIn(
            "https://mikro-orm.io/docs/propagation/index.html.md",
            result,
            "propagation URL should be correctly transformed",
        )
        self.assertIn(
            "https://mikro-orm.io/docs/defining-entities.md",
            result,
            "defining-entities.md should preserve .md extension",
        )
    def test_no_404_causing_urls_generated(self):
        """Verify that no URLs matching the 404 error pattern are generated"""
        # The exact 404-causing URL pattern from the issue
        problematic_patterns = [
            "/index.html.md#",  # /index.html.md should never come after #
            "#synchronous-initialization/index.html.md",
            "#formulas/index.html.md",
            "#postgresql-native-enums/index.html.md",
@@ -118,9 +89,30 @@ class TestIssue277RealWorld(unittest.TestCase):
                    f"URL '{url}' contains problematic pattern '{pattern}' that causes 404",
                )
    def test_no_blind_index_html_md_appending(self):
        """Verify non-.md URLs don't get /index.html.md appended (core fix)"""
        urls = [
            "https://mikro-orm.io/docs/quick-start",
            "https://mikro-orm.io/docs/propagation",
            "https://mikro-orm.io/docs/filters",
        ]
        result = self.converter._convert_to_md_urls(urls)
        self.assertEqual(len(result), 3)
        for url in result:
            self.assertNotIn(
                "/index.html.md",
                url,
                f"Should not blindly append /index.html.md: {url}",
            )
        self.assertEqual(result[0], "https://mikro-orm.io/docs/quick-start")
        self.assertEqual(result[1], "https://mikro-orm.io/docs/propagation")
        self.assertEqual(result[2], "https://mikro-orm.io/docs/filters")
    def test_deduplication_prevents_multiple_requests(self):
        """Verify that multiple anchors on same page don't create duplicate requests"""
        # From the issue: These should all map to the same base URL
        urls_with_multiple_anchors = [
            "https://mikro-orm.io/docs/defining-entities#formulas",
            "https://mikro-orm.io/docs/defining-entities#postgresql-native-enums",
@@ -136,10 +128,7 @@ class TestIssue277RealWorld(unittest.TestCase):
            1,
            "Multiple anchors on same page should deduplicate to single request",
        )
-        self.assertEqual(
+        self.assertEqual(result[0], "https://mikro-orm.io/docs/defining-entities")
            result[0],
            "https://mikro-orm.io/docs/defining-entities/index.html.md",
        )
    def test_md_files_with_anchors_preserved(self):
        """Test that .md files with anchors are handled correctly"""
@@ -167,7 +156,6 @@ class TestIssue277RealWorld(unittest.TestCase):
        Integration test: Simulate real scraping scenario with llms.txt URLs.
        Verify that the converted URLs would not cause 404 errors.
        """
        # Mock response for llms.txt content
        mock_response = MagicMock()
        mock_response.status_code = 200
        mock_response.text = """
@@ -179,7 +167,6 @@ https://mikro-orm.io/docs/defining-entities#formulas
 """
        mock_get.return_value = mock_response
        # Simulate the llms.txt parsing flow
        urls_from_llms = [
            "https://mikro-orm.io/docs/quick-start",
            "https://mikro-orm.io/docs/quick-start#synchronous-initialization",
@@ -187,42 +174,36 @@ https://mikro-orm.io/docs/defining-entities#formulas
            "https://mikro-orm.io/docs/defining-entities#formulas",
        ]
        # Convert URLs (this is what happens in _try_llms_txt_v2)
        converted_urls = self.converter._convert_to_md_urls(urls_from_llms)
-        # Verify converted URLs are valid
+        self.assertTrue(len(converted_urls) > 0)
        # In real scenario, these would be added to pending_urls and scraped
        self.assertTrue(len(converted_urls) > 0, "Should generate at least one URL to scrape")
        # Verify no URLs would cause 404 (no anchors in middle of path)
        for url in converted_urls:
-            # Check URL structure is valid
+            # Should not contain # anywhere
            self.assertRegex(
                url,
-                r"^https://[^#]+$",  # Should not contain # anywhere
+                r"^https://[^#]+$",
                f"URL should not contain anchor fragments: {url}",
            )
-
+            # Should NOT have /index.html.md appended
-            # Verify the problematic pattern from the issue doesn't exist
+            if not url.endswith(".md"):
-            self.assertNotRegex(
+                self.assertNotIn(
-                url,
+                    "index.html.md",
-                r"#[^/]+/index\.html\.md",
+                    url,
-                f"URL should not have /index.html.md after anchor: {url}",
+                    f"Should not append /index.html.md: {url}",
-            )
+                )
    def test_issue_277_error_message_urls(self):
        """
        Test the exact URLs that appeared in error messages from the issue report.
        These were the actual 404-causing URLs that need to be fixed.
        """
        # These are the MALFORMED URLs that caused 404 errors (with anchors in the middle)
        error_urls_with_anchors = [
            "https://mikro-orm.io/docs/quick-start#synchronous-initialization/index.html.md",
            "https://mikro-orm.io/docs/defining-entities#formulas/index.html.md",
            "https://mikro-orm.io/docs/defining-entities#postgresql-native-enums/index.html.md",
        ]
        # Extract the input URLs that would have generated these errors
        input_urls = [
            "https://mikro-orm.io/docs/quick-start#synchronous-initialization",
            "https://mikro-orm.io/docs/propagation",
@@ -232,7 +213,7 @@ https://mikro-orm.io/docs/defining-entities#formulas
        result = self.converter._convert_to_md_urls(input_urls)
-        # Verify NONE of the malformed error URLs (with anchors) are generated
+        # Verify NONE of the malformed error URLs are generated
        for error_url in error_urls_with_anchors:
            self.assertNotIn(
                error_url,
@@ -240,20 +221,82 @@ https://mikro-orm.io/docs/defining-entities#formulas
                f"Should not generate the 404-causing URL: {error_url}",
            )
-        # Verify correct URLs are generated instead
+        # Verify correct URLs are generated
-        correct_urls = [
+        self.assertIn("https://mikro-orm.io/docs/quick-start", result)
-            "https://mikro-orm.io/docs/quick-start/index.html.md",
+        self.assertIn("https://mikro-orm.io/docs/propagation", result)
-            "https://mikro-orm.io/docs/propagation/index.html.md",
+        self.assertIn("https://mikro-orm.io/docs/defining-entities", result)
-            "https://mikro-orm.io/docs/defining-entities/index.html.md",
+
 class TestIssue277DiscordDocs(unittest.TestCase):
    """Test for Discord docs case reported by @skeith"""
    def setUp(self):
        self.config = {
            "name": "DiscordDocs",
            "description": "Discord API Documentation",
            "base_url": "https://docs.discord.com/",
            "selectors": {"main_content": "article"},
        }
        self.converter = DocToSkillConverter(self.config, dry_run=True)
    def test_discord_docs_no_index_html_md(self):
        """Discord docs don't serve .md files - no /index.html.md should be appended"""
        urls = [
            "https://docs.discord.com/developers/activities/building-an-activity",
            "https://docs.discord.com/developers/activities/design-patterns",
            "https://docs.discord.com/developers/components/overview",
            "https://docs.discord.com/developers/bots/getting-started",
        ]
-        for correct_url in correct_urls:
+        result = self.converter._convert_to_md_urls(urls)
-            self.assertIn(
+
-                correct_url,
+        self.assertEqual(len(result), 4)
-                result,
+        for url in result:
-                f"Should generate the correct URL: {correct_url}",
+            self.assertNotIn(
                "index.html.md",
                url,
                f"Discord docs should not get /index.html.md appended: {url}",
            )
    def test_discord_docs_md_urls_preserved(self):
        """Discord llms.txt has .md URLs that should be preserved"""
        urls = [
            "https://docs.discord.com/developers/activities/building-an-activity.md",
            "https://docs.discord.com/developers/components/overview.md",
            "https://docs.discord.com/developers/change-log.md",
        ]
        result = self.converter._convert_to_md_urls(urls)
        self.assertEqual(len(result), 3)
        self.assertEqual(
            result[0],
            "https://docs.discord.com/developers/activities/building-an-activity.md",
        )
    def test_discord_docs_mixed_urls(self):
        """Mix of .md and non-.md URLs from Discord docs"""
        urls = [
            "https://docs.discord.com/developers/activities/building-an-activity.md",
            "https://docs.discord.com/developers/overview",
            "https://docs.discord.com/developers/overview#quick-start",
            "https://docs.discord.com/developers/bots/getting-started.md#step-1",
        ]
        result = self.converter._convert_to_md_urls(urls)
        # .md URLs preserved, non-.md as-is, anchors stripped & deduped
        self.assertEqual(len(result), 3)
        self.assertIn(
            "https://docs.discord.com/developers/activities/building-an-activity.md",
            result,
        )
        self.assertIn("https://docs.discord.com/developers/overview", result)
        self.assertIn(
            "https://docs.discord.com/developers/bots/getting-started.md",
            result,
        )
 if __name__ == "__main__":
    unittest.main()
--- a/tests/test_url_conversion.py
+++ b/tests/test_url_conversion.py
@@ -1,6 +1,9 @@
 """
 Tests for URL conversion logic (_convert_to_md_urls).
 Covers bug fix for issue #277: URLs with anchor fragments causing 404 errors.
 Updated: _convert_to_md_urls no longer blindly appends /index.html.md to non-.md URLs.
 It now strips anchors, deduplicates, and preserves original URLs as-is.
 """
 import unittest
@@ -31,11 +34,11 @@ class TestConvertToMdUrls(unittest.TestCase):
        result = self.converter._convert_to_md_urls(urls)
-        # All should be converted without anchor fragments
+        # All should have anchors stripped
        self.assertEqual(len(result), 3)
-        self.assertEqual(result[0], "https://example.com/docs/quick-start/index.html.md")
+        self.assertEqual(result[0], "https://example.com/docs/quick-start")
-        self.assertEqual(result[1], "https://example.com/docs/api/index.html.md")
+        self.assertEqual(result[1], "https://example.com/docs/api")
-        self.assertEqual(result[2], "https://example.com/docs/guide/index.html.md")
+        self.assertEqual(result[2], "https://example.com/docs/guide")
    def test_deduplicates_multiple_anchors_same_url(self):
        """Test that multiple anchors on the same URL are deduplicated"""
@@ -50,7 +53,7 @@ class TestConvertToMdUrls(unittest.TestCase):
        # Should only have one entry for the base URL
        self.assertEqual(len(result), 1)
-        self.assertEqual(result[0], "https://example.com/docs/api/index.html.md")
+        self.assertEqual(result[0], "https://example.com/docs/api")
    def test_preserves_md_extension_urls(self):
        """Test that URLs already ending with .md are preserved"""
@@ -83,8 +86,27 @@ class TestConvertToMdUrls(unittest.TestCase):
        self.assertIn("https://example.com/docs/guide.md", result)
        self.assertIn("https://example.com/docs/api.md", result)
    def test_non_md_urls_not_converted(self):
        """Test that non-.md URLs are NOT converted to /index.html.md (issue #277)"""
        urls = [
            "https://example.com/docs/getting-started",
            "https://example.com/docs/api-reference",
            "https://example.com/docs/tutorials",
        ]
        result = self.converter._convert_to_md_urls(urls)
        # Should preserve URLs as-is, NOT append /index.html.md
        self.assertEqual(len(result), 3)
        for url in result:
            self.assertNotIn("index.html.md", url, f"Should not append /index.html.md: {url}")
        self.assertEqual(result[0], "https://example.com/docs/getting-started")
        self.assertEqual(result[1], "https://example.com/docs/api-reference")
        self.assertEqual(result[2], "https://example.com/docs/tutorials")
    def test_does_not_match_md_in_path(self):
-        """Test that URLs containing 'md' in path (but not ending with .md) are converted"""
+        """Test that URLs containing 'md' in path are preserved as-is"""
        urls = [
            "https://example.com/cmd-line",
            "https://example.com/AMD-processors",
@@ -93,27 +115,23 @@ class TestConvertToMdUrls(unittest.TestCase):
        result = self.converter._convert_to_md_urls(urls)
-        # All should be converted since they don't END with .md
+        # URLs with 'md' substring (not extension) should be preserved as-is
        self.assertEqual(len(result), 3)
-        self.assertEqual(result[0], "https://example.com/cmd-line/index.html.md")
+        self.assertEqual(result[0], "https://example.com/cmd-line")
-        self.assertEqual(result[1], "https://example.com/AMD-processors/index.html.md")
+        self.assertEqual(result[1], "https://example.com/AMD-processors")
-        self.assertEqual(result[2], "https://example.com/metadata/index.html.md")
+        self.assertEqual(result[2], "https://example.com/metadata")
-    def test_removes_trailing_slashes(self):
+    def test_removes_trailing_slashes_for_dedup(self):
-        """Test that trailing slashes are removed before appending /index.html.md"""
+        """Test that trailing slash variants are deduplicated"""
        urls = [
            "https://example.com/docs/api/",
-            "https://example.com/docs/guide//",
+            "https://example.com/docs/api",
            "https://example.com/docs/reference",
        ]
        result = self.converter._convert_to_md_urls(urls)
-        # All should have proper /index.html.md without double slashes
+        # Should deduplicate (trailing slash vs no slash)
-        self.assertEqual(len(result), 3)
+        self.assertEqual(len(result), 1)
        self.assertEqual(result[0], "https://example.com/docs/api/index.html.md")
        self.assertEqual(result[1], "https://example.com/docs/guide/index.html.md")
        self.assertEqual(result[2], "https://example.com/docs/reference/index.html.md")
    def test_mixed_urls_with_and_without_anchors(self):
        """Test mixed URLs with various formats"""
@@ -130,9 +148,9 @@ class TestConvertToMdUrls(unittest.TestCase):
        # Should deduplicate to 3 unique base URLs
        self.assertEqual(len(result), 3)
-        self.assertIn("https://example.com/docs/intro/index.html.md", result)
+        self.assertIn("https://example.com/docs/intro", result)
        self.assertIn("https://example.com/docs/api.md", result)
-        self.assertIn("https://example.com/docs/guide/index.html.md", result)
+        self.assertIn("https://example.com/docs/guide", result)
    def test_empty_url_list(self):
        """Test that empty URL list returns empty result"""
@@ -155,14 +173,15 @@ class TestConvertToMdUrls(unittest.TestCase):
        # Should deduplicate to 3 unique base URLs
        self.assertEqual(len(result), 3)
        self.assertIn("https://mikro-orm.io/docs/quick-start/index.html.md", result)
        self.assertIn("https://mikro-orm.io/docs/propagation/index.html.md", result)
        self.assertIn("https://mikro-orm.io/docs/defining-entities/index.html.md", result)
        # Should NOT contain any URLs with anchor fragments
        for url in result:
            self.assertNotIn("#", url, f"URL should not contain anchor: {url}")
        # Should NOT contain /index.html.md
        for url in result:
            self.assertNotIn("index.html.md", url, f"Should not append /index.html.md: {url}")
    def test_preserves_query_parameters(self):
        """Test that query parameters are preserved (only anchors stripped)"""
        urls = [
@@ -175,8 +194,6 @@ class TestConvertToMdUrls(unittest.TestCase):
        # Query parameters should be preserved, anchors stripped
        self.assertEqual(len(result), 2)  # search deduplicated
        # Note: Query parameters might not be ideal for .md conversion,
        # but they should be preserved if present
        self.assertTrue(
            any("?q=test" in url for url in result),
            "Query parameter should be preserved",
@@ -199,7 +216,7 @@ class TestConvertToMdUrls(unittest.TestCase):
        # All should deduplicate to single base URL
        self.assertEqual(len(result), 1)
-        self.assertEqual(result[0], "https://example.com/docs/guide/index.html.md")
+        self.assertEqual(result[0], "https://example.com/docs/guide")
    def test_url_order_preservation(self):
        """Test that first occurrence of base URL is preserved"""
@@ -214,9 +231,59 @@ class TestConvertToMdUrls(unittest.TestCase):
        # Should have 3 unique URLs, first occurrence preserved
        self.assertEqual(len(result), 3)
-        self.assertEqual(result[0], "https://example.com/docs/a/index.html.md")
+        self.assertEqual(result[0], "https://example.com/docs/a")
-        self.assertEqual(result[1], "https://example.com/docs/b/index.html.md")
+        self.assertEqual(result[1], "https://example.com/docs/b")
-        self.assertEqual(result[2], "https://example.com/docs/c/index.html.md")
+        self.assertEqual(result[2], "https://example.com/docs/c")
    def test_discord_docs_case(self):
        """Test the Discord docs case reported by @skeith in issue #277"""
        urls = [
            "https://docs.discord.com/developers/activities/building-an-activity",
            "https://docs.discord.com/developers/activities/design-patterns",
            "https://docs.discord.com/developers/components/overview",
            "https://docs.discord.com/developers/bots/getting-started#step-1",
        ]
        result = self.converter._convert_to_md_urls(urls)
        # No /index.html.md should be appended
        for url in result:
            self.assertNotIn("index.html.md", url, f"Should not append /index.html.md: {url}")
            self.assertNotIn("#", url, f"Should not contain anchor: {url}")
        self.assertEqual(len(result), 4)
 class TestHasMdExtension(unittest.TestCase):
    """Test suite for _has_md_extension static method"""
    def test_md_extension(self):
        self.assertTrue(DocToSkillConverter._has_md_extension("https://example.com/page.md"))
    def test_md_with_query(self):
        self.assertTrue(DocToSkillConverter._has_md_extension("https://example.com/page.md?v=1"))
    def test_no_md_extension(self):
        self.assertFalse(DocToSkillConverter._has_md_extension("https://example.com/page"))
    def test_md_in_path_not_extension(self):
        """'cmd-line' contains 'md' but is not a .md extension"""
        self.assertFalse(DocToSkillConverter._has_md_extension("https://example.com/cmd-line"))
    def test_md_in_domain(self):
        """'.md' in domain should not match"""
        self.assertFalse(DocToSkillConverter._has_md_extension("https://docs.md.example.com/page"))
    def test_mdx_not_md(self):
        """.mdx is not .md"""
        self.assertFalse(DocToSkillConverter._has_md_extension("https://example.com/page.mdx"))
    def test_md_in_middle_of_path(self):
        """.md in middle of path should not match"""
        self.assertFalse(DocToSkillConverter._has_md_extension("https://example.com/page.md/subpage"))
    def test_index_html_md(self):
        self.assertTrue(DocToSkillConverter._has_md_extension("https://example.com/page/index.html.md"))
 if __name__ == "__main__":