fix: Strip anchor fragments in URL conversion to prevent 404 errors (fixes #277)
Critical bug fix for llms.txt URL parsing: Problem: - URLs with anchor fragments (e.g., #synchronous-initialization) were malformed when converting to .md format - Example: https://example.com/api#method → https://example.com/api#method/index.html.md ❌ - Caused 404 errors and duplicate requests for same page with different anchors Solution: 1. Parse URLs with urllib.parse.urlparse() to extract fragments 2. Strip anchor fragments before appending /index.html.md 3. Deduplicate base URLs (multiple anchors → single request) 4. Fix .md detection: '.md' in url → url.endswith('.md') - Prevents false matches on URLs like /cmd-line or /AMD-processors Changes: - src/skill_seekers/cli/doc_scraper.py (_convert_to_md_urls) - Added URL parsing to remove fragments - Added deduplication with seen_base_urls set - Fixed .md extension detection - Updated log message to show deduplicated count - tests/test_url_conversion.py (NEW) - 12 comprehensive tests covering all edge cases - Real-world MikroORM case validation - 54/54 tests passing (42 existing + 12 new) - CHANGELOG.md - Documented bug fix and solution Reported-by: @devjones <https://github.com/yusufkaraaslan/Skill_Seekers/issues/277>
This commit is contained in:
21
CHANGELOG.md
21
CHANGELOG.md
@@ -7,6 +7,27 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|||||||
|
|
||||||
## [Unreleased]
|
## [Unreleased]
|
||||||
|
|
||||||
|
### Fixed
|
||||||
|
|
||||||
|
#### URL Conversion Bug with Anchor Fragments (Issue #277)
|
||||||
|
- **Critical Bug Fix**: Fixed 404 errors when scraping documentation with anchor links
|
||||||
|
- **Problem**: URLs with anchor fragments (e.g., `#synchronous-initialization`) were malformed
|
||||||
|
- Incorrect: `https://example.com/docs/api#method/index.html.md` ❌
|
||||||
|
- Correct: `https://example.com/docs/api/index.html.md` ✅
|
||||||
|
- **Root Cause**: `_convert_to_md_urls()` didn't strip anchor fragments before appending `/index.html.md`
|
||||||
|
- **Solution**: Parse URLs with `urllib.parse` to remove fragments and deduplicate base URLs
|
||||||
|
- **Impact**: Prevents duplicate requests for the same page with different anchors
|
||||||
|
- **Additional Fix**: Changed `.md` detection from `".md" in url` to `url.endswith('.md')`
|
||||||
|
- Prevents false matches on URLs like `/cmd-line` or `/AMD-processors`
|
||||||
|
- **Test Coverage**: 12 comprehensive tests covering all edge cases
|
||||||
|
- Anchor fragment stripping
|
||||||
|
- Deduplication of multiple anchors on same URL
|
||||||
|
- Query parameter preservation
|
||||||
|
- Trailing slash handling
|
||||||
|
- Real-world MikroORM case validation
|
||||||
|
- 54/54 tests passing (42 existing + 12 new)
|
||||||
|
- **Reported by**: @devjones via Issue #277
|
||||||
|
|
||||||
### Added
|
### Added
|
||||||
|
|
||||||
#### Extended Language Detection (NEW)
|
#### Extended Language Detection (NEW)
|
||||||
|
|||||||
@@ -705,27 +705,42 @@ class DocToSkillConverter:
|
|||||||
def _convert_to_md_urls(self, urls: list[str]) -> list[str]:
|
def _convert_to_md_urls(self, urls: list[str]) -> list[str]:
|
||||||
"""
|
"""
|
||||||
Convert URLs to .md format, trying /index.html.md suffix for non-.md URLs.
|
Convert URLs to .md format, trying /index.html.md suffix for non-.md URLs.
|
||||||
|
Strips anchor fragments (#anchor) and deduplicates base URLs to avoid 404 errors.
|
||||||
不预先检查 URL 是否存在,直接加入队列,在爬取时再验证。
|
不预先检查 URL 是否存在,直接加入队列,在爬取时再验证。
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
urls: List of URLs to process
|
urls: List of URLs to process
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
List of .md URLs (未验证)
|
List of .md URLs (未验证, deduplicated, no anchors)
|
||||||
"""
|
"""
|
||||||
|
from urllib.parse import urlparse, urlunparse
|
||||||
|
|
||||||
|
seen_base_urls = set()
|
||||||
md_urls = []
|
md_urls = []
|
||||||
|
|
||||||
for url in urls:
|
for url in urls:
|
||||||
if ".md" in url:
|
# Parse URL to extract and remove fragment (anchor)
|
||||||
md_urls.append(url)
|
parsed = urlparse(url)
|
||||||
|
base_url = urlunparse(parsed._replace(fragment="")) # Remove #anchor
|
||||||
|
|
||||||
|
# Skip if we've already processed this base URL
|
||||||
|
if base_url in seen_base_urls:
|
||||||
|
continue
|
||||||
|
seen_base_urls.add(base_url)
|
||||||
|
|
||||||
|
# Check if URL already ends with .md (not just contains "md")
|
||||||
|
if base_url.endswith(".md"):
|
||||||
|
md_urls.append(base_url)
|
||||||
else:
|
else:
|
||||||
# 直接转换为 .md 格式,不发送 HEAD 请求检查
|
# 直接转换为 .md 格式,不发送 HEAD 请求检查
|
||||||
url = url.rstrip("/")
|
base_url = base_url.rstrip("/")
|
||||||
md_url = f"{url}/index.html.md"
|
md_url = f"{base_url}/index.html.md"
|
||||||
md_urls.append(md_url)
|
md_urls.append(md_url)
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
" ✓ Converted %d URLs to .md format (will validate during crawl)",
|
" ✓ Converted %d URLs to %d unique .md URLs (anchors stripped, will validate during crawl)",
|
||||||
|
len(urls),
|
||||||
len(md_urls),
|
len(md_urls),
|
||||||
)
|
)
|
||||||
return md_urls
|
return md_urls
|
||||||
|
|||||||
228
tests/test_url_conversion.py
Normal file
228
tests/test_url_conversion.py
Normal file
@@ -0,0 +1,228 @@
|
|||||||
|
"""
|
||||||
|
Tests for URL conversion logic (_convert_to_md_urls).
|
||||||
|
Covers bug fix for issue #277: URLs with anchor fragments causing 404 errors.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import unittest
|
||||||
|
from unittest.mock import MagicMock
|
||||||
|
|
||||||
|
from skill_seekers.cli.doc_scraper import DocToSkillConverter
|
||||||
|
|
||||||
|
|
||||||
|
class TestConvertToMdUrls(unittest.TestCase):
|
||||||
|
"""Test suite for _convert_to_md_urls method"""
|
||||||
|
|
||||||
|
def setUp(self):
|
||||||
|
"""Set up test converter instance"""
|
||||||
|
config = {
|
||||||
|
"name": "test",
|
||||||
|
"description": "Test",
|
||||||
|
"base_url": "https://example.com/docs/",
|
||||||
|
"selectors": {"main_content": "article"},
|
||||||
|
}
|
||||||
|
self.converter = DocToSkillConverter(config, dry_run=True)
|
||||||
|
|
||||||
|
def test_strips_anchor_fragments(self):
|
||||||
|
"""Test that anchor fragments (#anchor) are properly stripped from URLs"""
|
||||||
|
urls = [
|
||||||
|
"https://example.com/docs/quick-start#synchronous-initialization",
|
||||||
|
"https://example.com/docs/api#methods",
|
||||||
|
"https://example.com/docs/guide#installation",
|
||||||
|
]
|
||||||
|
|
||||||
|
result = self.converter._convert_to_md_urls(urls)
|
||||||
|
|
||||||
|
# All should be converted without anchor fragments
|
||||||
|
self.assertEqual(len(result), 3)
|
||||||
|
self.assertEqual(result[0], "https://example.com/docs/quick-start/index.html.md")
|
||||||
|
self.assertEqual(result[1], "https://example.com/docs/api/index.html.md")
|
||||||
|
self.assertEqual(result[2], "https://example.com/docs/guide/index.html.md")
|
||||||
|
|
||||||
|
def test_deduplicates_multiple_anchors_same_url(self):
|
||||||
|
"""Test that multiple anchors on the same URL are deduplicated"""
|
||||||
|
urls = [
|
||||||
|
"https://example.com/docs/api#method1",
|
||||||
|
"https://example.com/docs/api#method2",
|
||||||
|
"https://example.com/docs/api#method3",
|
||||||
|
"https://example.com/docs/api", # Same URL without anchor
|
||||||
|
]
|
||||||
|
|
||||||
|
result = self.converter._convert_to_md_urls(urls)
|
||||||
|
|
||||||
|
# Should only have one entry for the base URL
|
||||||
|
self.assertEqual(len(result), 1)
|
||||||
|
self.assertEqual(result[0], "https://example.com/docs/api/index.html.md")
|
||||||
|
|
||||||
|
def test_preserves_md_extension_urls(self):
|
||||||
|
"""Test that URLs already ending with .md are preserved"""
|
||||||
|
urls = [
|
||||||
|
"https://example.com/docs/guide.md",
|
||||||
|
"https://example.com/docs/readme.md",
|
||||||
|
"https://example.com/docs/api-reference.md",
|
||||||
|
]
|
||||||
|
|
||||||
|
result = self.converter._convert_to_md_urls(urls)
|
||||||
|
|
||||||
|
# Should preserve .md URLs without modification
|
||||||
|
self.assertEqual(len(result), 3)
|
||||||
|
self.assertEqual(result[0], "https://example.com/docs/guide.md")
|
||||||
|
self.assertEqual(result[1], "https://example.com/docs/readme.md")
|
||||||
|
self.assertEqual(result[2], "https://example.com/docs/api-reference.md")
|
||||||
|
|
||||||
|
def test_md_extension_with_anchor_fragments(self):
|
||||||
|
"""Test that .md URLs with anchors are handled correctly"""
|
||||||
|
urls = [
|
||||||
|
"https://example.com/docs/guide.md#introduction",
|
||||||
|
"https://example.com/docs/guide.md#advanced",
|
||||||
|
"https://example.com/docs/api.md#methods",
|
||||||
|
]
|
||||||
|
|
||||||
|
result = self.converter._convert_to_md_urls(urls)
|
||||||
|
|
||||||
|
# Should strip anchors but preserve .md extension
|
||||||
|
self.assertEqual(len(result), 2) # guide.md deduplicated
|
||||||
|
self.assertIn("https://example.com/docs/guide.md", result)
|
||||||
|
self.assertIn("https://example.com/docs/api.md", result)
|
||||||
|
|
||||||
|
def test_does_not_match_md_in_path(self):
|
||||||
|
"""Test that URLs containing 'md' in path (but not ending with .md) are converted"""
|
||||||
|
urls = [
|
||||||
|
"https://example.com/cmd-line",
|
||||||
|
"https://example.com/AMD-processors",
|
||||||
|
"https://example.com/metadata",
|
||||||
|
]
|
||||||
|
|
||||||
|
result = self.converter._convert_to_md_urls(urls)
|
||||||
|
|
||||||
|
# All should be converted since they don't END with .md
|
||||||
|
self.assertEqual(len(result), 3)
|
||||||
|
self.assertEqual(result[0], "https://example.com/cmd-line/index.html.md")
|
||||||
|
self.assertEqual(result[1], "https://example.com/AMD-processors/index.html.md")
|
||||||
|
self.assertEqual(result[2], "https://example.com/metadata/index.html.md")
|
||||||
|
|
||||||
|
def test_removes_trailing_slashes(self):
|
||||||
|
"""Test that trailing slashes are removed before appending /index.html.md"""
|
||||||
|
urls = [
|
||||||
|
"https://example.com/docs/api/",
|
||||||
|
"https://example.com/docs/guide//",
|
||||||
|
"https://example.com/docs/reference",
|
||||||
|
]
|
||||||
|
|
||||||
|
result = self.converter._convert_to_md_urls(urls)
|
||||||
|
|
||||||
|
# All should have proper /index.html.md without double slashes
|
||||||
|
self.assertEqual(len(result), 3)
|
||||||
|
self.assertEqual(result[0], "https://example.com/docs/api/index.html.md")
|
||||||
|
self.assertEqual(result[1], "https://example.com/docs/guide/index.html.md")
|
||||||
|
self.assertEqual(result[2], "https://example.com/docs/reference/index.html.md")
|
||||||
|
|
||||||
|
def test_mixed_urls_with_and_without_anchors(self):
|
||||||
|
"""Test mixed URLs with various formats"""
|
||||||
|
urls = [
|
||||||
|
"https://example.com/docs/intro",
|
||||||
|
"https://example.com/docs/intro#getting-started",
|
||||||
|
"https://example.com/docs/api.md",
|
||||||
|
"https://example.com/docs/api.md#methods",
|
||||||
|
"https://example.com/docs/guide#section1",
|
||||||
|
"https://example.com/docs/guide",
|
||||||
|
]
|
||||||
|
|
||||||
|
result = self.converter._convert_to_md_urls(urls)
|
||||||
|
|
||||||
|
# Should deduplicate to 3 unique base URLs
|
||||||
|
self.assertEqual(len(result), 3)
|
||||||
|
self.assertIn("https://example.com/docs/intro/index.html.md", result)
|
||||||
|
self.assertIn("https://example.com/docs/api.md", result)
|
||||||
|
self.assertIn("https://example.com/docs/guide/index.html.md", result)
|
||||||
|
|
||||||
|
def test_empty_url_list(self):
|
||||||
|
"""Test that empty URL list returns empty result"""
|
||||||
|
urls = []
|
||||||
|
result = self.converter._convert_to_md_urls(urls)
|
||||||
|
self.assertEqual(len(result), 0)
|
||||||
|
self.assertEqual(result, [])
|
||||||
|
|
||||||
|
def test_real_world_mikro_orm_case(self):
|
||||||
|
"""Test the exact URLs from issue #277 (MikroORM case)"""
|
||||||
|
urls = [
|
||||||
|
"https://mikro-orm.io/docs/quick-start",
|
||||||
|
"https://mikro-orm.io/docs/quick-start#synchronous-initialization",
|
||||||
|
"https://mikro-orm.io/docs/propagation",
|
||||||
|
"https://mikro-orm.io/docs/defining-entities#formulas",
|
||||||
|
"https://mikro-orm.io/docs/defining-entities#postgresql-native-enums",
|
||||||
|
]
|
||||||
|
|
||||||
|
result = self.converter._convert_to_md_urls(urls)
|
||||||
|
|
||||||
|
# Should deduplicate to 3 unique base URLs
|
||||||
|
self.assertEqual(len(result), 3)
|
||||||
|
self.assertIn(
|
||||||
|
"https://mikro-orm.io/docs/quick-start/index.html.md", result
|
||||||
|
)
|
||||||
|
self.assertIn("https://mikro-orm.io/docs/propagation/index.html.md", result)
|
||||||
|
self.assertIn(
|
||||||
|
"https://mikro-orm.io/docs/defining-entities/index.html.md", result
|
||||||
|
)
|
||||||
|
|
||||||
|
# Should NOT contain any URLs with anchor fragments
|
||||||
|
for url in result:
|
||||||
|
self.assertNotIn("#", url, f"URL should not contain anchor: {url}")
|
||||||
|
|
||||||
|
def test_preserves_query_parameters(self):
|
||||||
|
"""Test that query parameters are preserved (only anchors stripped)"""
|
||||||
|
urls = [
|
||||||
|
"https://example.com/docs/search?q=test",
|
||||||
|
"https://example.com/docs/search?q=test#results",
|
||||||
|
"https://example.com/docs/api?version=2",
|
||||||
|
]
|
||||||
|
|
||||||
|
result = self.converter._convert_to_md_urls(urls)
|
||||||
|
|
||||||
|
# Query parameters should be preserved, anchors stripped
|
||||||
|
self.assertEqual(len(result), 2) # search deduplicated
|
||||||
|
# Note: Query parameters might not be ideal for .md conversion,
|
||||||
|
# but they should be preserved if present
|
||||||
|
self.assertTrue(
|
||||||
|
any("?q=test" in url for url in result),
|
||||||
|
"Query parameter should be preserved",
|
||||||
|
)
|
||||||
|
self.assertTrue(
|
||||||
|
any("?version=2" in url for url in result),
|
||||||
|
"Query parameter should be preserved",
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_complex_anchor_formats(self):
|
||||||
|
"""Test various anchor formats (encoded, with dashes, etc.)"""
|
||||||
|
urls = [
|
||||||
|
"https://example.com/docs/guide#section-one",
|
||||||
|
"https://example.com/docs/guide#section_two",
|
||||||
|
"https://example.com/docs/guide#section%20three",
|
||||||
|
"https://example.com/docs/guide#123",
|
||||||
|
]
|
||||||
|
|
||||||
|
result = self.converter._convert_to_md_urls(urls)
|
||||||
|
|
||||||
|
# All should deduplicate to single base URL
|
||||||
|
self.assertEqual(len(result), 1)
|
||||||
|
self.assertEqual(result[0], "https://example.com/docs/guide/index.html.md")
|
||||||
|
|
||||||
|
def test_url_order_preservation(self):
|
||||||
|
"""Test that first occurrence of base URL is preserved"""
|
||||||
|
urls = [
|
||||||
|
"https://example.com/docs/a",
|
||||||
|
"https://example.com/docs/b#anchor",
|
||||||
|
"https://example.com/docs/c",
|
||||||
|
"https://example.com/docs/a#different-anchor", # Duplicate base
|
||||||
|
]
|
||||||
|
|
||||||
|
result = self.converter._convert_to_md_urls(urls)
|
||||||
|
|
||||||
|
# Should have 3 unique URLs, first occurrence preserved
|
||||||
|
self.assertEqual(len(result), 3)
|
||||||
|
self.assertEqual(result[0], "https://example.com/docs/a/index.html.md")
|
||||||
|
self.assertEqual(result[1], "https://example.com/docs/b/index.html.md")
|
||||||
|
self.assertEqual(result[2], "https://example.com/docs/c/index.html.md")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
unittest.main()
|
||||||
Reference in New Issue
Block a user