fix: stop blindly appending /index.html.md to non-.md URLs (#277)

The previous fix (a82cf69) only addressed anchor fragment stripping but left the fundamental problem: _convert_to_md_urls() blindly appended /index.html.md to ALL non-.md URLs from llms.txt. This only works for Docusaurus sites — for sites like Discord docs it generates mass 404s. Changes: - _convert_to_md_urls() now strips anchors and deduplicates only, preserving original URLs as-is instead of appending /index.html.md - New _has_md_extension() helper uses urlparse().path.endswith(".md") instead of error-prone ".md" in url substring matching - Fixed ".md" in url checks at 4 locations (lines 465, 554, 716, 775) - Removed 24 lines of dead commented-out code - Added real-world e2e test against docs.discord.com (no mocks) - Updated unit tests for new behavior (32 tests) Fixes #277 Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-20 23:44:35 +03:00
parent f6131c6798
commit 2ef6e59d06
4 changed files with 408 additions and 169 deletions
--- a/src/skill_seekers/cli/doc_scraper.py
+++ b/src/skill_seekers/cli/doc_scraper.py
@@ -252,6 +252,15 @@ class DocToSkillConverter:

        return not any(pattern in url for pattern in self._exclude_patterns)

+    @staticmethod
+    def _has_md_extension(url: str) -> bool:
+        """Check if URL path ends with .md extension.
+
+        Uses URL path parsing instead of substring matching to avoid
+        false positives on URLs like /embed/page or /cmd-line.
+        """
+        return urlparse(url).path.endswith(".md")
+
    def save_checkpoint(self) -> None:
        """Save progress checkpoint"""
        if not self.checkpoint_enabled or self.dry_run:
@@ -462,7 +471,7 @@ class DocToSkillConverter:
                    else:
                        continue
                    full_url = full_url.split("#")[0]
-                    if ".md" in full_url and self.is_valid_url(full_url) and full_url not in links:
+                    if self._has_md_extension(full_url) and self.is_valid_url(full_url) and full_url not in links:
                        links.append(full_url)

                return {
@@ -551,7 +560,7 @@ class DocToSkillConverter:
            # Strip anchor fragments
            full_url = full_url.split("#")[0]
            # Only include .md URLs to avoid client-side rendered HTML pages
-            if ".md" in full_url and self.is_valid_url(full_url) and full_url not in page["links"]:
+            if self._has_md_extension(full_url) and self.is_valid_url(full_url) and full_url not in page["links"]:
                page["links"].append(full_url)

        return page
@@ -713,7 +722,7 @@ class DocToSkillConverter:
            response.raise_for_status()

            # Check if this is a Markdown file
-            if url.endswith(".md") or ".md" in url:
+            if self._has_md_extension(url):
                page = self._extract_markdown_content(response.text, url)
            else:
                soup = BeautifulSoup(response.content, "html.parser")
@@ -772,7 +781,7 @@ class DocToSkillConverter:
                response.raise_for_status()

                # Check if this is a Markdown file
-                if url.endswith(".md") or ".md" in url:
+                if self._has_md_extension(url):
                    page = self._extract_markdown_content(response.text, url)
                else:
                    # BeautifulSoup parsing (still synchronous, but fast)
@@ -798,71 +807,45 @@ class DocToSkillConverter:

    def _convert_to_md_urls(self, urls: list[str]) -> list[str]:
        """
-        Convert URLs to .md format, trying /index.html.md suffix for non-.md URLs.
-        Strips anchor fragments (#anchor) and deduplicates base URLs to avoid 404 errors.
-        不预先检查 URL 是否存在，直接加入队列，在爬取时再验证。
+        Clean URLs from llms.txt: strip anchor fragments, deduplicate base URLs.
+
+        Previously this method blindly appended /index.html.md to non-.md URLs,
+        which caused 404 errors on sites that don't serve raw markdown files
+        (e.g. Discord docs, see issue #277). Now it preserves original URLs as-is
+        and lets the scraper handle both HTML and markdown content.

        Args:
            urls: List of URLs to process

        Returns:
-            List of .md URLs (未验证, deduplicated, no anchors)
+            List of cleaned, deduplicated URLs (no anchors)
        """
        from urllib.parse import urlparse, urlunparse

        seen_base_urls = set()
-        md_urls = []
+        cleaned_urls = []

        for url in urls:
            # Parse URL to extract and remove fragment (anchor)
            parsed = urlparse(url)
            base_url = urlunparse(parsed._replace(fragment=""))  # Remove #anchor

-            # Skip if we've already processed this base URL
-            if base_url in seen_base_urls:
-                continue
-            seen_base_urls.add(base_url)
+            # Normalize trailing slashes for dedup (but keep original form)
+            dedup_key = base_url.rstrip("/")

-            # Check if URL already ends with .md (not just contains "md")
-            if base_url.endswith(".md"):
-                md_urls.append(base_url)
-            else:
-                # 直接转换为 .md 格式，不发送 HEAD 请求检查
-                base_url = base_url.rstrip("/")
-                md_url = f"{base_url}/index.html.md"
-                md_urls.append(md_url)
+            # Skip if we've already processed this base URL
+            if dedup_key in seen_base_urls:
+                continue
+            seen_base_urls.add(dedup_key)
+
+            cleaned_urls.append(base_url)

        logger.info(
-            "  ✓ Converted %d URLs to %d unique .md URLs (anchors stripped, will validate during crawl)",
+            "  ✓ Cleaned %d URLs to %d unique URLs (anchors stripped, will validate during crawl)",
            len(urls),
-            len(md_urls),
+            len(cleaned_urls),
        )
-        return md_urls
-
-    # ORIGINAL _convert_to_md_urls (with HEAD request validation):
-    # def _convert_to_md_urls(self, urls: List[str]) -> List[str]:
-    #     md_urls = []
-    #     non_md_urls = []
-    #     for url in urls:
-    #         if '.md' in url:
-    #             md_urls.append(url)
-    #         else:
-    #             non_md_urls.append(url)
-    #     if non_md_urls:
-    #         logger.info("  🔄 Trying to convert %d non-.md URLs to .md format...", len(non_md_urls))
-    #         converted = 0
-    #         for url in non_md_urls:
-    #             url = url.rstrip('/')
-    #             md_url = f"{url}/index.html.md"
-    #             try:
-    #                 resp = requests.head(md_url, timeout=5, allow_redirects=True)
-    #                 if resp.status_code == 200:
-    #                     md_urls.append(md_url)
-    #                     converted += 1
-    #             except Exception:
-    #                 pass
-    #         logger.info("  ✓ Converted %d URLs to .md format", converted)
-    #     return md_urls
+        return cleaned_urls

    def _try_llms_txt(self) -> bool:
        """
@@ -933,16 +916,16 @@ class DocToSkillConverter:
                # Extract URLs from llms.txt and add to pending_urls for BFS crawling
                extracted_urls = parser.extract_urls()
                if extracted_urls:
-                    # Convert non-.md URLs to .md format by trying /index.html.md suffix
-                    md_urls = self._convert_to_md_urls(extracted_urls)
+                    # Clean URLs: strip anchors, deduplicate
+                    cleaned_urls = self._convert_to_md_urls(extracted_urls)
                    logger.info(
-                        "\n🔗 Found %d URLs in llms.txt (%d .md files), starting BFS crawl...",
+                        "\n🔗 Found %d URLs in llms.txt (%d unique), starting BFS crawl...",
                        len(extracted_urls),
-                        len(md_urls),
+                        len(cleaned_urls),
                    )

                    # Filter URLs based on url_patterns config
-                    for url in md_urls:
+                    for url in cleaned_urls:
                        if self.is_valid_url(url):
                            self._enqueue_url(url)

@@ -1019,16 +1002,16 @@ class DocToSkillConverter:
        # Extract URLs from llms.txt and add to pending_urls for BFS crawling
        extracted_urls = parser.extract_urls()
        if extracted_urls:
-            # Convert non-.md URLs to .md format by trying /index.html.md suffix
-            md_urls = self._convert_to_md_urls(extracted_urls)
+            # Clean URLs: strip anchors, deduplicate
+            cleaned_urls = self._convert_to_md_urls(extracted_urls)
            logger.info(
-                "\n🔗 Found %d URLs in llms.txt (%d .md files), starting BFS crawl...",
+                "\n🔗 Found %d URLs in llms.txt (%d unique), starting BFS crawl...",
                len(extracted_urls),
-                len(md_urls),
+                len(cleaned_urls),
            )

            # Filter URLs based on url_patterns config
-            for url in md_urls:
+            for url in cleaned_urls:
                if self.is_valid_url(url):
                    self._enqueue_url(url)

--- a/tests/test_issue_277_discord_e2e.py
+++ b/tests/test_issue_277_discord_e2e.py
@@ -0,0 +1,146 @@
+"""
+E2E test for Issue #277 - Discord docs case reported by @skeith.
+
+This test hits the REAL Discord docs llms.txt and verifies that
+no /index.html.md URLs are generated. No mocks.
+
+Requires network access. Marked as integration test.
+"""
+
+import os
+import shutil
+import unittest
+
+import pytest
+
+from skill_seekers.cli.doc_scraper import DocToSkillConverter
+from skill_seekers.cli.llms_txt_detector import LlmsTxtDetector
+from skill_seekers.cli.llms_txt_downloader import LlmsTxtDownloader
+from skill_seekers.cli.llms_txt_parser import LlmsTxtParser
+
+
+@pytest.mark.integration
+class TestIssue277DiscordDocsE2E(unittest.TestCase):
+    """E2E: Reproduce @skeith's report with real Discord docs."""
+
+    def setUp(self):
+        self.base_url = "https://docs.discord.com/"
+        self.config = {
+            "name": "DiscordDocsE2E",
+            "description": "Discord API Documentation",
+            "base_url": self.base_url,
+            "selectors": {"main_content": "article"},
+            "url_patterns": {"include": ["/developers"], "exclude": []},
+        }
+        self.output_dir = f"output/{self.config['name']}_data"
+
+    def tearDown(self):
+        # Clean up any output created
+        for path in [self.output_dir, f"output/{self.config['name']}"]:
+            if os.path.exists(path):
+                shutil.rmtree(path)
+
+    def test_discord_llms_txt_exists(self):
+        """Verify Discord docs has llms.txt (precondition for the bug)."""
+        detector = LlmsTxtDetector(self.base_url)
+        variants = detector.detect_all()
+        self.assertTrue(
+            len(variants) > 0,
+            "Discord docs should have at least one llms.txt variant",
+        )
+
+    def test_discord_llms_txt_urls_no_index_html_md(self):
+        """Core test: URLs extracted from Discord llms.txt must NOT get /index.html.md appended."""
+        # Step 1: Detect llms.txt
+        detector = LlmsTxtDetector(self.base_url)
+        variants = detector.detect_all()
+        self.assertTrue(len(variants) > 0, "No llms.txt found at docs.discord.com")
+
+        # Step 2: Download the largest variant (same logic as doc_scraper)
+        downloaded = {}
+        for variant_info in variants:
+            downloader = LlmsTxtDownloader(variant_info["url"])
+            content = downloader.download()
+            if content:
+                downloaded[variant_info["variant"]] = content
+
+        self.assertTrue(len(downloaded) > 0, "Failed to download any llms.txt variant")
+
+        largest_content = max(downloaded.values(), key=len)
+
+        # Step 3: Parse URLs from llms.txt
+        parser = LlmsTxtParser(largest_content, self.base_url)
+        extracted_urls = parser.extract_urls()
+        self.assertTrue(
+            len(extracted_urls) > 0,
+            "No URLs extracted from Discord llms.txt",
+        )
+
+        # Step 4: Run _convert_to_md_urls (the function that was causing 404s)
+        converter = DocToSkillConverter(self.config, dry_run=True)
+        converted_urls = converter._convert_to_md_urls(extracted_urls)
+
+        # Step 5: Verify NO /index.html.md was blindly appended
+        bad_urls = [u for u in converted_urls if "/index.html.md" in u]
+        self.assertEqual(
+            len(bad_urls),
+            0,
+            f"Found {len(bad_urls)} URLs with /index.html.md appended "
+            f"(would cause 404s):\n"
+            + "\n".join(bad_urls[:10]),
+        )
+
+        # Step 6: Verify no anchor fragments leaked through
+        anchor_urls = [u for u in converted_urls if "#" in u]
+        self.assertEqual(
+            len(anchor_urls),
+            0,
+            f"Found {len(anchor_urls)} URLs with anchor fragments:\n"
+            + "\n".join(anchor_urls[:10]),
+        )
+
+        # Step 7: Verify we got a reasonable number of URLs
+        self.assertGreater(
+            len(converted_urls),
+            10,
+            "Expected at least 10 unique URLs from Discord docs",
+        )
+
+    def test_discord_full_pipeline_no_404_urls(self):
+        """Full pipeline: detector -> downloader -> parser -> converter -> queue.
+
+        Simulates what `skill-seekers create https://docs.discord.com` does,
+        without actually scraping pages.
+        """
+        converter = DocToSkillConverter(self.config, dry_run=True)
+
+        # Run _try_llms_txt which calls _convert_to_md_urls internally
+        os.makedirs(os.path.join(converter.skill_dir, "references"), exist_ok=True)
+        os.makedirs(os.path.join(converter.data_dir, "pages"), exist_ok=True)
+        result = converter._try_llms_txt()
+
+        # _try_llms_txt returns False when it populates pending_urls for BFS
+        # (True means it parsed content directly, no BFS needed)
+        if not result:
+            # Check every URL in the queue
+            for url in converter.pending_urls:
+                self.assertNotIn(
+                    "/index.html.md",
+                    url,
+                    f"Queue contains 404-causing URL: {url}",
+                )
+                self.assertNotIn(
+                    "#",
+                    url,
+                    f"Queue contains URL with anchor fragment: {url}",
+                )
+
+            self.assertGreater(
+                len(converter.pending_urls),
+                0,
+                "Pipeline should have queued URLs for crawling",
+            )
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/tests/test_issue_277_real_world.py
+++ b/tests/test_issue_277_real_world.py
@@ -1,6 +1,9 @@
 """
-Real-world integration test for Issue #277: URL conversion bug with anchor fragments.
-Tests the exact MikroORM case that was reported in the issue.
+Real-world integration test for Issue #277: URL conversion bug with anchor fragments
+and blind /index.html.md appending.
+
+Tests the exact MikroORM and Discord cases reported in the issue.
+Updated: _convert_to_md_urls no longer appends /index.html.md to non-.md URLs.
 """

 import unittest
@@ -28,7 +31,6 @@ class TestIssue277RealWorld(unittest.TestCase):

    def test_mikro_orm_urls_from_issue_277(self):
        """Test the exact URLs that caused 404 errors in issue #277"""
-        # These are the actual problematic URLs from the bug report
        urls_from_llms_txt = [
            "https://mikro-orm.io/docs/",
            "https://mikro-orm.io/docs/reference.md",
@@ -44,54 +46,23 @@ class TestIssue277RealWorld(unittest.TestCase):

        # Verify no malformed URLs with anchor fragments
        for url in result:
+            self.assertNotIn("#", url, f"URL should not contain anchor: {url}")
+            # No /index.html.md should be appended to non-.md URLs
+            if not url.endswith(".md"):
                self.assertNotIn(
-                "#synchronous-initialization/index.html.md",
-                url,
-                "Should not append /index.html.md after anchor fragments",
-            )
-            self.assertNotIn(
-                "#formulas/index.html.md",
-                url,
-                "Should not append /index.html.md after anchor fragments",
-            )
-            self.assertNotIn(
-                "#postgresql-native-enums/index.html.md",
-                url,
-                "Should not append /index.html.md after anchor fragments",
+                    "index.html.md", url, f"Should not append /index.html.md: {url}"
                )

-        # Verify correct transformed URLs
-
-        # Check that we got the expected number of unique URLs
-        # Note: defining-entities has both .md and non-.md versions, so we have 2 entries for it
-        self.assertEqual(
-            len(result),
-            7,
-            f"Should have 7 unique base URLs after deduplication, got {len(result)}",
-        )
-
-        # Verify specific URLs that were causing 404s are now correct
-        self.assertIn(
-            "https://mikro-orm.io/docs/quick-start/index.html.md",
-            result,
-            "quick-start URL should be correctly transformed",
-        )
-        self.assertIn(
-            "https://mikro-orm.io/docs/propagation/index.html.md",
-            result,
-            "propagation URL should be correctly transformed",
-        )
-        self.assertIn(
-            "https://mikro-orm.io/docs/defining-entities.md",
-            result,
-            "defining-entities.md should preserve .md extension",
-        )
+        # .md URLs preserved, non-.md URLs preserved as-is, anchors deduplicated
+        self.assertIn("https://mikro-orm.io/docs/reference.md", result)
+        self.assertIn("https://mikro-orm.io/docs/repositories.md", result)
+        self.assertIn("https://mikro-orm.io/docs/defining-entities.md", result)
+        self.assertIn("https://mikro-orm.io/docs/quick-start", result)
+        self.assertIn("https://mikro-orm.io/docs/propagation", result)

    def test_no_404_causing_urls_generated(self):
        """Verify that no URLs matching the 404 error pattern are generated"""
-        # The exact 404-causing URL pattern from the issue
        problematic_patterns = [
-            "/index.html.md#",  # /index.html.md should never come after #
            "#synchronous-initialization/index.html.md",
            "#formulas/index.html.md",
            "#postgresql-native-enums/index.html.md",
@@ -118,9 +89,30 @@ class TestIssue277RealWorld(unittest.TestCase):
                    f"URL '{url}' contains problematic pattern '{pattern}' that causes 404",
                )

+    def test_no_blind_index_html_md_appending(self):
+        """Verify non-.md URLs don't get /index.html.md appended (core fix)"""
+        urls = [
+            "https://mikro-orm.io/docs/quick-start",
+            "https://mikro-orm.io/docs/propagation",
+            "https://mikro-orm.io/docs/filters",
+        ]
+
+        result = self.converter._convert_to_md_urls(urls)
+
+        self.assertEqual(len(result), 3)
+        for url in result:
+            self.assertNotIn(
+                "/index.html.md",
+                url,
+                f"Should not blindly append /index.html.md: {url}",
+            )
+
+        self.assertEqual(result[0], "https://mikro-orm.io/docs/quick-start")
+        self.assertEqual(result[1], "https://mikro-orm.io/docs/propagation")
+        self.assertEqual(result[2], "https://mikro-orm.io/docs/filters")
+
    def test_deduplication_prevents_multiple_requests(self):
        """Verify that multiple anchors on same page don't create duplicate requests"""
-        # From the issue: These should all map to the same base URL
        urls_with_multiple_anchors = [
            "https://mikro-orm.io/docs/defining-entities#formulas",
            "https://mikro-orm.io/docs/defining-entities#postgresql-native-enums",
@@ -136,10 +128,7 @@ class TestIssue277RealWorld(unittest.TestCase):
            1,
            "Multiple anchors on same page should deduplicate to single request",
        )
-        self.assertEqual(
-            result[0],
-            "https://mikro-orm.io/docs/defining-entities/index.html.md",
-        )
+        self.assertEqual(result[0], "https://mikro-orm.io/docs/defining-entities")

    def test_md_files_with_anchors_preserved(self):
        """Test that .md files with anchors are handled correctly"""
@@ -167,7 +156,6 @@ class TestIssue277RealWorld(unittest.TestCase):
        Integration test: Simulate real scraping scenario with llms.txt URLs.
        Verify that the converted URLs would not cause 404 errors.
        """
-        # Mock response for llms.txt content
        mock_response = MagicMock()
        mock_response.status_code = 200
        mock_response.text = """
@@ -179,7 +167,6 @@ https://mikro-orm.io/docs/defining-entities#formulas
 """
        mock_get.return_value = mock_response

-        # Simulate the llms.txt parsing flow
        urls_from_llms = [
            "https://mikro-orm.io/docs/quick-start",
            "https://mikro-orm.io/docs/quick-start#synchronous-initialization",
@@ -187,27 +174,23 @@ https://mikro-orm.io/docs/defining-entities#formulas
            "https://mikro-orm.io/docs/defining-entities#formulas",
        ]

-        # Convert URLs (this is what happens in _try_llms_txt_v2)
        converted_urls = self.converter._convert_to_md_urls(urls_from_llms)

-        # Verify converted URLs are valid
-        # In real scenario, these would be added to pending_urls and scraped
-        self.assertTrue(len(converted_urls) > 0, "Should generate at least one URL to scrape")
+        self.assertTrue(len(converted_urls) > 0)

-        # Verify no URLs would cause 404 (no anchors in middle of path)
        for url in converted_urls:
-            # Check URL structure is valid
+            # Should not contain # anywhere
            self.assertRegex(
                url,
-                r"^https://[^#]+$",  # Should not contain # anywhere
+                r"^https://[^#]+$",
                f"URL should not contain anchor fragments: {url}",
            )
-
-            # Verify the problematic pattern from the issue doesn't exist
-            self.assertNotRegex(
+            # Should NOT have /index.html.md appended
+            if not url.endswith(".md"):
+                self.assertNotIn(
+                    "index.html.md",
                    url,
-                r"#[^/]+/index\.html\.md",
-                f"URL should not have /index.html.md after anchor: {url}",
+                    f"Should not append /index.html.md: {url}",
                )

    def test_issue_277_error_message_urls(self):
@@ -215,14 +198,12 @@ https://mikro-orm.io/docs/defining-entities#formulas
        Test the exact URLs that appeared in error messages from the issue report.
        These were the actual 404-causing URLs that need to be fixed.
        """
-        # These are the MALFORMED URLs that caused 404 errors (with anchors in the middle)
        error_urls_with_anchors = [
            "https://mikro-orm.io/docs/quick-start#synchronous-initialization/index.html.md",
            "https://mikro-orm.io/docs/defining-entities#formulas/index.html.md",
            "https://mikro-orm.io/docs/defining-entities#postgresql-native-enums/index.html.md",
        ]

-        # Extract the input URLs that would have generated these errors
        input_urls = [
            "https://mikro-orm.io/docs/quick-start#synchronous-initialization",
            "https://mikro-orm.io/docs/propagation",
@@ -232,7 +213,7 @@ https://mikro-orm.io/docs/defining-entities#formulas

        result = self.converter._convert_to_md_urls(input_urls)

-        # Verify NONE of the malformed error URLs (with anchors) are generated
+        # Verify NONE of the malformed error URLs are generated
        for error_url in error_urls_with_anchors:
            self.assertNotIn(
                error_url,
@@ -240,18 +221,80 @@ https://mikro-orm.io/docs/defining-entities#formulas
                f"Should not generate the 404-causing URL: {error_url}",
            )

-        # Verify correct URLs are generated instead
-        correct_urls = [
-            "https://mikro-orm.io/docs/quick-start/index.html.md",
-            "https://mikro-orm.io/docs/propagation/index.html.md",
-            "https://mikro-orm.io/docs/defining-entities/index.html.md",
+        # Verify correct URLs are generated
+        self.assertIn("https://mikro-orm.io/docs/quick-start", result)
+        self.assertIn("https://mikro-orm.io/docs/propagation", result)
+        self.assertIn("https://mikro-orm.io/docs/defining-entities", result)
+
+
+class TestIssue277DiscordDocs(unittest.TestCase):
+    """Test for Discord docs case reported by @skeith"""
+
+    def setUp(self):
+        self.config = {
+            "name": "DiscordDocs",
+            "description": "Discord API Documentation",
+            "base_url": "https://docs.discord.com/",
+            "selectors": {"main_content": "article"},
+        }
+        self.converter = DocToSkillConverter(self.config, dry_run=True)
+
+    def test_discord_docs_no_index_html_md(self):
+        """Discord docs don't serve .md files - no /index.html.md should be appended"""
+        urls = [
+            "https://docs.discord.com/developers/activities/building-an-activity",
+            "https://docs.discord.com/developers/activities/design-patterns",
+            "https://docs.discord.com/developers/components/overview",
+            "https://docs.discord.com/developers/bots/getting-started",
        ]

-        for correct_url in correct_urls:
+        result = self.converter._convert_to_md_urls(urls)
+
+        self.assertEqual(len(result), 4)
+        for url in result:
+            self.assertNotIn(
+                "index.html.md",
+                url,
+                f"Discord docs should not get /index.html.md appended: {url}",
+            )
+
+    def test_discord_docs_md_urls_preserved(self):
+        """Discord llms.txt has .md URLs that should be preserved"""
+        urls = [
+            "https://docs.discord.com/developers/activities/building-an-activity.md",
+            "https://docs.discord.com/developers/components/overview.md",
+            "https://docs.discord.com/developers/change-log.md",
+        ]
+
+        result = self.converter._convert_to_md_urls(urls)
+
+        self.assertEqual(len(result), 3)
+        self.assertEqual(
+            result[0],
+            "https://docs.discord.com/developers/activities/building-an-activity.md",
+        )
+
+    def test_discord_docs_mixed_urls(self):
+        """Mix of .md and non-.md URLs from Discord docs"""
+        urls = [
+            "https://docs.discord.com/developers/activities/building-an-activity.md",
+            "https://docs.discord.com/developers/overview",
+            "https://docs.discord.com/developers/overview#quick-start",
+            "https://docs.discord.com/developers/bots/getting-started.md#step-1",
+        ]
+
+        result = self.converter._convert_to_md_urls(urls)
+
+        # .md URLs preserved, non-.md as-is, anchors stripped & deduped
+        self.assertEqual(len(result), 3)
        self.assertIn(
-                correct_url,
+            "https://docs.discord.com/developers/activities/building-an-activity.md",
+            result,
+        )
+        self.assertIn("https://docs.discord.com/developers/overview", result)
+        self.assertIn(
+            "https://docs.discord.com/developers/bots/getting-started.md",
            result,
-                f"Should generate the correct URL: {correct_url}",
        )


--- a/tests/test_url_conversion.py
+++ b/tests/test_url_conversion.py
@@ -1,6 +1,9 @@
 """
 Tests for URL conversion logic (_convert_to_md_urls).
 Covers bug fix for issue #277: URLs with anchor fragments causing 404 errors.
+
+Updated: _convert_to_md_urls no longer blindly appends /index.html.md to non-.md URLs.
+It now strips anchors, deduplicates, and preserves original URLs as-is.
 """

 import unittest
@@ -31,11 +34,11 @@ class TestConvertToMdUrls(unittest.TestCase):

        result = self.converter._convert_to_md_urls(urls)

-        # All should be converted without anchor fragments
+        # All should have anchors stripped
        self.assertEqual(len(result), 3)
-        self.assertEqual(result[0], "https://example.com/docs/quick-start/index.html.md")
-        self.assertEqual(result[1], "https://example.com/docs/api/index.html.md")
-        self.assertEqual(result[2], "https://example.com/docs/guide/index.html.md")
+        self.assertEqual(result[0], "https://example.com/docs/quick-start")
+        self.assertEqual(result[1], "https://example.com/docs/api")
+        self.assertEqual(result[2], "https://example.com/docs/guide")

    def test_deduplicates_multiple_anchors_same_url(self):
        """Test that multiple anchors on the same URL are deduplicated"""
@@ -50,7 +53,7 @@ class TestConvertToMdUrls(unittest.TestCase):

        # Should only have one entry for the base URL
        self.assertEqual(len(result), 1)
-        self.assertEqual(result[0], "https://example.com/docs/api/index.html.md")
+        self.assertEqual(result[0], "https://example.com/docs/api")

    def test_preserves_md_extension_urls(self):
        """Test that URLs already ending with .md are preserved"""
@@ -83,8 +86,27 @@ class TestConvertToMdUrls(unittest.TestCase):
        self.assertIn("https://example.com/docs/guide.md", result)
        self.assertIn("https://example.com/docs/api.md", result)

+    def test_non_md_urls_not_converted(self):
+        """Test that non-.md URLs are NOT converted to /index.html.md (issue #277)"""
+        urls = [
+            "https://example.com/docs/getting-started",
+            "https://example.com/docs/api-reference",
+            "https://example.com/docs/tutorials",
+        ]
+
+        result = self.converter._convert_to_md_urls(urls)
+
+        # Should preserve URLs as-is, NOT append /index.html.md
+        self.assertEqual(len(result), 3)
+        for url in result:
+            self.assertNotIn("index.html.md", url, f"Should not append /index.html.md: {url}")
+
+        self.assertEqual(result[0], "https://example.com/docs/getting-started")
+        self.assertEqual(result[1], "https://example.com/docs/api-reference")
+        self.assertEqual(result[2], "https://example.com/docs/tutorials")
+
    def test_does_not_match_md_in_path(self):
-        """Test that URLs containing 'md' in path (but not ending with .md) are converted"""
+        """Test that URLs containing 'md' in path are preserved as-is"""
        urls = [
            "https://example.com/cmd-line",
            "https://example.com/AMD-processors",
@@ -93,27 +115,23 @@ class TestConvertToMdUrls(unittest.TestCase):

        result = self.converter._convert_to_md_urls(urls)

-        # All should be converted since they don't END with .md
+        # URLs with 'md' substring (not extension) should be preserved as-is
        self.assertEqual(len(result), 3)
-        self.assertEqual(result[0], "https://example.com/cmd-line/index.html.md")
-        self.assertEqual(result[1], "https://example.com/AMD-processors/index.html.md")
-        self.assertEqual(result[2], "https://example.com/metadata/index.html.md")
+        self.assertEqual(result[0], "https://example.com/cmd-line")
+        self.assertEqual(result[1], "https://example.com/AMD-processors")
+        self.assertEqual(result[2], "https://example.com/metadata")

-    def test_removes_trailing_slashes(self):
-        """Test that trailing slashes are removed before appending /index.html.md"""
+    def test_removes_trailing_slashes_for_dedup(self):
+        """Test that trailing slash variants are deduplicated"""
        urls = [
            "https://example.com/docs/api/",
-            "https://example.com/docs/guide//",
-            "https://example.com/docs/reference",
+            "https://example.com/docs/api",
        ]

        result = self.converter._convert_to_md_urls(urls)

-        # All should have proper /index.html.md without double slashes
-        self.assertEqual(len(result), 3)
-        self.assertEqual(result[0], "https://example.com/docs/api/index.html.md")
-        self.assertEqual(result[1], "https://example.com/docs/guide/index.html.md")
-        self.assertEqual(result[2], "https://example.com/docs/reference/index.html.md")
+        # Should deduplicate (trailing slash vs no slash)
+        self.assertEqual(len(result), 1)

    def test_mixed_urls_with_and_without_anchors(self):
        """Test mixed URLs with various formats"""
@@ -130,9 +148,9 @@ class TestConvertToMdUrls(unittest.TestCase):

        # Should deduplicate to 3 unique base URLs
        self.assertEqual(len(result), 3)
-        self.assertIn("https://example.com/docs/intro/index.html.md", result)
+        self.assertIn("https://example.com/docs/intro", result)
        self.assertIn("https://example.com/docs/api.md", result)
-        self.assertIn("https://example.com/docs/guide/index.html.md", result)
+        self.assertIn("https://example.com/docs/guide", result)

    def test_empty_url_list(self):
        """Test that empty URL list returns empty result"""
@@ -155,14 +173,15 @@ class TestConvertToMdUrls(unittest.TestCase):

        # Should deduplicate to 3 unique base URLs
        self.assertEqual(len(result), 3)
-        self.assertIn("https://mikro-orm.io/docs/quick-start/index.html.md", result)
-        self.assertIn("https://mikro-orm.io/docs/propagation/index.html.md", result)
-        self.assertIn("https://mikro-orm.io/docs/defining-entities/index.html.md", result)

        # Should NOT contain any URLs with anchor fragments
        for url in result:
            self.assertNotIn("#", url, f"URL should not contain anchor: {url}")

+        # Should NOT contain /index.html.md
+        for url in result:
+            self.assertNotIn("index.html.md", url, f"Should not append /index.html.md: {url}")
+
    def test_preserves_query_parameters(self):
        """Test that query parameters are preserved (only anchors stripped)"""
        urls = [
@@ -175,8 +194,6 @@ class TestConvertToMdUrls(unittest.TestCase):

        # Query parameters should be preserved, anchors stripped
        self.assertEqual(len(result), 2)  # search deduplicated
-        # Note: Query parameters might not be ideal for .md conversion,
-        # but they should be preserved if present
        self.assertTrue(
            any("?q=test" in url for url in result),
            "Query parameter should be preserved",
@@ -199,7 +216,7 @@ class TestConvertToMdUrls(unittest.TestCase):

        # All should deduplicate to single base URL
        self.assertEqual(len(result), 1)
-        self.assertEqual(result[0], "https://example.com/docs/guide/index.html.md")
+        self.assertEqual(result[0], "https://example.com/docs/guide")

    def test_url_order_preservation(self):
        """Test that first occurrence of base URL is preserved"""
@@ -214,9 +231,59 @@ class TestConvertToMdUrls(unittest.TestCase):

        # Should have 3 unique URLs, first occurrence preserved
        self.assertEqual(len(result), 3)
-        self.assertEqual(result[0], "https://example.com/docs/a/index.html.md")
-        self.assertEqual(result[1], "https://example.com/docs/b/index.html.md")
-        self.assertEqual(result[2], "https://example.com/docs/c/index.html.md")
+        self.assertEqual(result[0], "https://example.com/docs/a")
+        self.assertEqual(result[1], "https://example.com/docs/b")
+        self.assertEqual(result[2], "https://example.com/docs/c")
+
+    def test_discord_docs_case(self):
+        """Test the Discord docs case reported by @skeith in issue #277"""
+        urls = [
+            "https://docs.discord.com/developers/activities/building-an-activity",
+            "https://docs.discord.com/developers/activities/design-patterns",
+            "https://docs.discord.com/developers/components/overview",
+            "https://docs.discord.com/developers/bots/getting-started#step-1",
+        ]
+
+        result = self.converter._convert_to_md_urls(urls)
+
+        # No /index.html.md should be appended
+        for url in result:
+            self.assertNotIn("index.html.md", url, f"Should not append /index.html.md: {url}")
+            self.assertNotIn("#", url, f"Should not contain anchor: {url}")
+
+        self.assertEqual(len(result), 4)
+
+
+class TestHasMdExtension(unittest.TestCase):
+    """Test suite for _has_md_extension static method"""
+
+    def test_md_extension(self):
+        self.assertTrue(DocToSkillConverter._has_md_extension("https://example.com/page.md"))
+
+    def test_md_with_query(self):
+        self.assertTrue(DocToSkillConverter._has_md_extension("https://example.com/page.md?v=1"))
+
+    def test_no_md_extension(self):
+        self.assertFalse(DocToSkillConverter._has_md_extension("https://example.com/page"))
+
+    def test_md_in_path_not_extension(self):
+        """'cmd-line' contains 'md' but is not a .md extension"""
+        self.assertFalse(DocToSkillConverter._has_md_extension("https://example.com/cmd-line"))
+
+    def test_md_in_domain(self):
+        """'.md' in domain should not match"""
+        self.assertFalse(DocToSkillConverter._has_md_extension("https://docs.md.example.com/page"))
+
+    def test_mdx_not_md(self):
+        """.mdx is not .md"""
+        self.assertFalse(DocToSkillConverter._has_md_extension("https://example.com/page.mdx"))
+
+    def test_md_in_middle_of_path(self):
+        """.md in middle of path should not match"""
+        self.assertFalse(DocToSkillConverter._has_md_extension("https://example.com/page.md/subpage"))
+
+    def test_index_html_md(self):
+        self.assertTrue(DocToSkillConverter._has_md_extension("https://example.com/page/index.html.md"))


 if __name__ == "__main__":