From a82cf6967a66b149e11150bddf8fa4e586edb360 Mon Sep 17 00:00:00 2001
From: yusyus <yusufkaraaslan.yk@pm.me>
Date: Wed, 4 Feb 2026 21:16:13 +0300
Subject: [PATCH] fix: Strip anchor fragments in URL conversion to prevent 404
 errors (fixes #277)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Critical bug fix for llms.txt URL parsing:

Problem:
- URLs with anchor fragments (e.g., #synchronous-initialization) were
  malformed when converting to .md format
- Example: https://example.com/api#method → https://example.com/api#method/index.html.md ❌
- Caused 404 errors and duplicate requests for same page with different anchors

Solution:
1. Parse URLs with urllib.parse.urlparse() to extract fragments
2. Strip anchor fragments before appending /index.html.md
3. Deduplicate base URLs (multiple anchors → single request)
4. Fix .md detection: '.md' in url → url.endswith('.md')
   - Prevents false matches on URLs like /cmd-line or /AMD-processors

Changes:
- src/skill_seekers/cli/doc_scraper.py (_convert_to_md_urls)
  - Added URL parsing to remove fragments
  - Added deduplication with seen_base_urls set
  - Fixed .md extension detection
  - Updated log message to show deduplicated count
- tests/test_url_conversion.py (NEW)
  - 12 comprehensive tests covering all edge cases
  - Real-world MikroORM case validation
  - 54/54 tests passing (42 existing + 12 new)
- CHANGELOG.md
  - Documented bug fix and solution

Reported-by: @devjones <https://github.com/yusufkaraaslan/Skill_Seekers/issues/277>
---
 CHANGELOG.md                         |  21 +++
 src/skill_seekers/cli/doc_scraper.py |  27 +++-
 tests/test_url_conversion.py         | 228 +++++++++++++++++++++++++++
 3 files changed, 270 insertions(+), 6 deletions(-)
 create mode 100644 tests/test_url_conversion.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index b861491..6186223 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,27 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Fixed
+
+#### URL Conversion Bug with Anchor Fragments (Issue #277)
+- **Critical Bug Fix**: Fixed 404 errors when scraping documentation with anchor links
+  - **Problem**: URLs with anchor fragments (e.g., `#synchronous-initialization`) were malformed
+    - Incorrect: `https://example.com/docs/api#method/index.html.md` ❌
+    - Correct: `https://example.com/docs/api/index.html.md` ✅
+  - **Root Cause**: `_convert_to_md_urls()` didn't strip anchor fragments before appending `/index.html.md`
+  - **Solution**: Parse URLs with `urllib.parse` to remove fragments and deduplicate base URLs
+  - **Impact**: Prevents duplicate requests for the same page with different anchors
+  - **Additional Fix**: Changed `.md` detection from `".md" in url` to `url.endswith('.md')`
+    - Prevents false matches on URLs like `/cmd-line` or `/AMD-processors`
+- **Test Coverage**: 12 comprehensive tests covering all edge cases
+  - Anchor fragment stripping
+  - Deduplication of multiple anchors on same URL
+  - Query parameter preservation
+  - Trailing slash handling
+  - Real-world MikroORM case validation
+  - 54/54 tests passing (42 existing + 12 new)
+- **Reported by**: @devjones via Issue #277
+
 ### Added
 
 #### Extended Language Detection (NEW)
diff --git a/src/skill_seekers/cli/doc_scraper.py b/src/skill_seekers/cli/doc_scraper.py
index 39e97fc..f976170 100755
--- a/src/skill_seekers/cli/doc_scraper.py
+++ b/src/skill_seekers/cli/doc_scraper.py
@@ -705,27 +705,42 @@ class DocToSkillConverter:
     def _convert_to_md_urls(self, urls: list[str]) -> list[str]:
         """
         Convert URLs to .md format, trying /index.html.md suffix for non-.md URLs.
+        Strips anchor fragments (#anchor) and deduplicates base URLs to avoid 404 errors.
         不预先检查 URL 是否存在，直接加入队列，在爬取时再验证。
 
         Args:
             urls: List of URLs to process
 
         Returns:
-            List of .md URLs (未验证)
+            List of .md URLs (未验证, deduplicated, no anchors)
         """
+        from urllib.parse import urlparse, urlunparse
+
+        seen_base_urls = set()
         md_urls = []
 
         for url in urls:
-            if ".md" in url:
-                md_urls.append(url)
+            # Parse URL to extract and remove fragment (anchor)
+            parsed = urlparse(url)
+            base_url = urlunparse(parsed._replace(fragment=""))  # Remove #anchor
+
+            # Skip if we've already processed this base URL
+            if base_url in seen_base_urls:
+                continue
+            seen_base_urls.add(base_url)
+
+            # Check if URL already ends with .md (not just contains "md")
+            if base_url.endswith(".md"):
+                md_urls.append(base_url)
             else:
                 # 直接转换为 .md 格式，不发送 HEAD 请求检查
-                url = url.rstrip("/")
-                md_url = f"{url}/index.html.md"
+                base_url = base_url.rstrip("/")
+                md_url = f"{base_url}/index.html.md"
                 md_urls.append(md_url)
 
         logger.info(
-            "  ✓ Converted %d URLs to .md format (will validate during crawl)",
+            "  ✓ Converted %d URLs to %d unique .md URLs (anchors stripped, will validate during crawl)",
+            len(urls),
             len(md_urls),
         )
         return md_urls
diff --git a/tests/test_url_conversion.py b/tests/test_url_conversion.py
new file mode 100644
index 0000000..5e40f67
--- /dev/null
+++ b/tests/test_url_conversion.py
@@ -0,0 +1,228 @@
+"""
+Tests for URL conversion logic (_convert_to_md_urls).
+Covers bug fix for issue #277: URLs with anchor fragments causing 404 errors.
+"""
+
+import unittest
+from unittest.mock import MagicMock
+
+from skill_seekers.cli.doc_scraper import DocToSkillConverter
+
+
+class TestConvertToMdUrls(unittest.TestCase):
+    """Test suite for _convert_to_md_urls method"""
+
+    def setUp(self):
+        """Set up test converter instance"""
+        config = {
+            "name": "test",
+            "description": "Test",
+            "base_url": "https://example.com/docs/",
+            "selectors": {"main_content": "article"},
+        }
+        self.converter = DocToSkillConverter(config, dry_run=True)
+
+    def test_strips_anchor_fragments(self):
+        """Test that anchor fragments (#anchor) are properly stripped from URLs"""
+        urls = [
+            "https://example.com/docs/quick-start#synchronous-initialization",
+            "https://example.com/docs/api#methods",
+            "https://example.com/docs/guide#installation",
+        ]
+
+        result = self.converter._convert_to_md_urls(urls)
+
+        # All should be converted without anchor fragments
+        self.assertEqual(len(result), 3)
+        self.assertEqual(result[0], "https://example.com/docs/quick-start/index.html.md")
+        self.assertEqual(result[1], "https://example.com/docs/api/index.html.md")
+        self.assertEqual(result[2], "https://example.com/docs/guide/index.html.md")
+
+    def test_deduplicates_multiple_anchors_same_url(self):
+        """Test that multiple anchors on the same URL are deduplicated"""
+        urls = [
+            "https://example.com/docs/api#method1",
+            "https://example.com/docs/api#method2",
+            "https://example.com/docs/api#method3",
+            "https://example.com/docs/api",  # Same URL without anchor
+        ]
+
+        result = self.converter._convert_to_md_urls(urls)
+
+        # Should only have one entry for the base URL
+        self.assertEqual(len(result), 1)
+        self.assertEqual(result[0], "https://example.com/docs/api/index.html.md")
+
+    def test_preserves_md_extension_urls(self):
+        """Test that URLs already ending with .md are preserved"""
+        urls = [
+            "https://example.com/docs/guide.md",
+            "https://example.com/docs/readme.md",
+            "https://example.com/docs/api-reference.md",
+        ]
+
+        result = self.converter._convert_to_md_urls(urls)
+
+        # Should preserve .md URLs without modification
+        self.assertEqual(len(result), 3)
+        self.assertEqual(result[0], "https://example.com/docs/guide.md")
+        self.assertEqual(result[1], "https://example.com/docs/readme.md")
+        self.assertEqual(result[2], "https://example.com/docs/api-reference.md")
+
+    def test_md_extension_with_anchor_fragments(self):
+        """Test that .md URLs with anchors are handled correctly"""
+        urls = [
+            "https://example.com/docs/guide.md#introduction",
+            "https://example.com/docs/guide.md#advanced",
+            "https://example.com/docs/api.md#methods",
+        ]
+
+        result = self.converter._convert_to_md_urls(urls)
+
+        # Should strip anchors but preserve .md extension
+        self.assertEqual(len(result), 2)  # guide.md deduplicated
+        self.assertIn("https://example.com/docs/guide.md", result)
+        self.assertIn("https://example.com/docs/api.md", result)
+
+    def test_does_not_match_md_in_path(self):
+        """Test that URLs containing 'md' in path (but not ending with .md) are converted"""
+        urls = [
+            "https://example.com/cmd-line",
+            "https://example.com/AMD-processors",
+            "https://example.com/metadata",
+        ]
+
+        result = self.converter._convert_to_md_urls(urls)
+
+        # All should be converted since they don't END with .md
+        self.assertEqual(len(result), 3)
+        self.assertEqual(result[0], "https://example.com/cmd-line/index.html.md")
+        self.assertEqual(result[1], "https://example.com/AMD-processors/index.html.md")
+        self.assertEqual(result[2], "https://example.com/metadata/index.html.md")
+
+    def test_removes_trailing_slashes(self):
+        """Test that trailing slashes are removed before appending /index.html.md"""
+        urls = [
+            "https://example.com/docs/api/",
+            "https://example.com/docs/guide//",
+            "https://example.com/docs/reference",
+        ]
+
+        result = self.converter._convert_to_md_urls(urls)
+
+        # All should have proper /index.html.md without double slashes
+        self.assertEqual(len(result), 3)
+        self.assertEqual(result[0], "https://example.com/docs/api/index.html.md")
+        self.assertEqual(result[1], "https://example.com/docs/guide/index.html.md")
+        self.assertEqual(result[2], "https://example.com/docs/reference/index.html.md")
+
+    def test_mixed_urls_with_and_without_anchors(self):
+        """Test mixed URLs with various formats"""
+        urls = [
+            "https://example.com/docs/intro",
+            "https://example.com/docs/intro#getting-started",
+            "https://example.com/docs/api.md",
+            "https://example.com/docs/api.md#methods",
+            "https://example.com/docs/guide#section1",
+            "https://example.com/docs/guide",
+        ]
+
+        result = self.converter._convert_to_md_urls(urls)
+
+        # Should deduplicate to 3 unique base URLs
+        self.assertEqual(len(result), 3)
+        self.assertIn("https://example.com/docs/intro/index.html.md", result)
+        self.assertIn("https://example.com/docs/api.md", result)
+        self.assertIn("https://example.com/docs/guide/index.html.md", result)
+
+    def test_empty_url_list(self):
+        """Test that empty URL list returns empty result"""
+        urls = []
+        result = self.converter._convert_to_md_urls(urls)
+        self.assertEqual(len(result), 0)
+        self.assertEqual(result, [])
+
+    def test_real_world_mikro_orm_case(self):
+        """Test the exact URLs from issue #277 (MikroORM case)"""
+        urls = [
+            "https://mikro-orm.io/docs/quick-start",
+            "https://mikro-orm.io/docs/quick-start#synchronous-initialization",
+            "https://mikro-orm.io/docs/propagation",
+            "https://mikro-orm.io/docs/defining-entities#formulas",
+            "https://mikro-orm.io/docs/defining-entities#postgresql-native-enums",
+        ]
+
+        result = self.converter._convert_to_md_urls(urls)
+
+        # Should deduplicate to 3 unique base URLs
+        self.assertEqual(len(result), 3)
+        self.assertIn(
+            "https://mikro-orm.io/docs/quick-start/index.html.md", result
+        )
+        self.assertIn("https://mikro-orm.io/docs/propagation/index.html.md", result)
+        self.assertIn(
+            "https://mikro-orm.io/docs/defining-entities/index.html.md", result
+        )
+
+        # Should NOT contain any URLs with anchor fragments
+        for url in result:
+            self.assertNotIn("#", url, f"URL should not contain anchor: {url}")
+
+    def test_preserves_query_parameters(self):
+        """Test that query parameters are preserved (only anchors stripped)"""
+        urls = [
+            "https://example.com/docs/search?q=test",
+            "https://example.com/docs/search?q=test#results",
+            "https://example.com/docs/api?version=2",
+        ]
+
+        result = self.converter._convert_to_md_urls(urls)
+
+        # Query parameters should be preserved, anchors stripped
+        self.assertEqual(len(result), 2)  # search deduplicated
+        # Note: Query parameters might not be ideal for .md conversion,
+        # but they should be preserved if present
+        self.assertTrue(
+            any("?q=test" in url for url in result),
+            "Query parameter should be preserved",
+        )
+        self.assertTrue(
+            any("?version=2" in url for url in result),
+            "Query parameter should be preserved",
+        )
+
+    def test_complex_anchor_formats(self):
+        """Test various anchor formats (encoded, with dashes, etc.)"""
+        urls = [
+            "https://example.com/docs/guide#section-one",
+            "https://example.com/docs/guide#section_two",
+            "https://example.com/docs/guide#section%20three",
+            "https://example.com/docs/guide#123",
+        ]
+
+        result = self.converter._convert_to_md_urls(urls)
+
+        # All should deduplicate to single base URL
+        self.assertEqual(len(result), 1)
+        self.assertEqual(result[0], "https://example.com/docs/guide/index.html.md")
+
+    def test_url_order_preservation(self):
+        """Test that first occurrence of base URL is preserved"""
+        urls = [
+            "https://example.com/docs/a",
+            "https://example.com/docs/b#anchor",
+            "https://example.com/docs/c",
+            "https://example.com/docs/a#different-anchor",  # Duplicate base
+        ]
+
+        result = self.converter._convert_to_md_urls(urls)
+
+        # Should have 3 unique URLs, first occurrence preserved
+        self.assertEqual(len(result), 3)
+        self.assertEqual(result[0], "https://example.com/docs/a/index.html.md")
+        self.assertEqual(result[1], "https://example.com/docs/b/index.html.md")
+        self.assertEqual(result[2], "https://example.com/docs/c/index.html.md")
+
+
+if __name__ == "__main__":
+    unittest.main()