The previous fix (a82cf69) only addressed anchor fragment stripping but
left the fundamental problem: _convert_to_md_urls() blindly appended
/index.html.md to ALL non-.md URLs from llms.txt. This only works for
Docusaurus sites — for sites like Discord docs it generates mass 404s.
Changes:
- _convert_to_md_urls() now strips anchors and deduplicates only,
preserving original URLs as-is instead of appending /index.html.md
- New _has_md_extension() helper uses urlparse().path.endswith(".md")
instead of error-prone ".md" in url substring matching
- Fixed ".md" in url checks at 4 locations (lines 465, 554, 716, 775)
- Removed 24 lines of dead commented-out code
- Added real-world e2e test against docs.discord.com (no mocks)
- Updated unit tests for new behavior (32 tests)
Fixes #277
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
291 lines
11 KiB
Python
291 lines
11 KiB
Python
"""
|
|
Tests for URL conversion logic (_convert_to_md_urls).
|
|
Covers bug fix for issue #277: URLs with anchor fragments causing 404 errors.
|
|
|
|
Updated: _convert_to_md_urls no longer blindly appends /index.html.md to non-.md URLs.
|
|
It now strips anchors, deduplicates, and preserves original URLs as-is.
|
|
"""
|
|
|
|
import unittest
|
|
|
|
from skill_seekers.cli.doc_scraper import DocToSkillConverter
|
|
|
|
|
|
class TestConvertToMdUrls(unittest.TestCase):
|
|
"""Test suite for _convert_to_md_urls method"""
|
|
|
|
def setUp(self):
|
|
"""Set up test converter instance"""
|
|
config = {
|
|
"name": "test",
|
|
"description": "Test",
|
|
"base_url": "https://example.com/docs/",
|
|
"selectors": {"main_content": "article"},
|
|
}
|
|
self.converter = DocToSkillConverter(config, dry_run=True)
|
|
|
|
def test_strips_anchor_fragments(self):
|
|
"""Test that anchor fragments (#anchor) are properly stripped from URLs"""
|
|
urls = [
|
|
"https://example.com/docs/quick-start#synchronous-initialization",
|
|
"https://example.com/docs/api#methods",
|
|
"https://example.com/docs/guide#installation",
|
|
]
|
|
|
|
result = self.converter._convert_to_md_urls(urls)
|
|
|
|
# All should have anchors stripped
|
|
self.assertEqual(len(result), 3)
|
|
self.assertEqual(result[0], "https://example.com/docs/quick-start")
|
|
self.assertEqual(result[1], "https://example.com/docs/api")
|
|
self.assertEqual(result[2], "https://example.com/docs/guide")
|
|
|
|
def test_deduplicates_multiple_anchors_same_url(self):
|
|
"""Test that multiple anchors on the same URL are deduplicated"""
|
|
urls = [
|
|
"https://example.com/docs/api#method1",
|
|
"https://example.com/docs/api#method2",
|
|
"https://example.com/docs/api#method3",
|
|
"https://example.com/docs/api", # Same URL without anchor
|
|
]
|
|
|
|
result = self.converter._convert_to_md_urls(urls)
|
|
|
|
# Should only have one entry for the base URL
|
|
self.assertEqual(len(result), 1)
|
|
self.assertEqual(result[0], "https://example.com/docs/api")
|
|
|
|
def test_preserves_md_extension_urls(self):
|
|
"""Test that URLs already ending with .md are preserved"""
|
|
urls = [
|
|
"https://example.com/docs/guide.md",
|
|
"https://example.com/docs/readme.md",
|
|
"https://example.com/docs/api-reference.md",
|
|
]
|
|
|
|
result = self.converter._convert_to_md_urls(urls)
|
|
|
|
# Should preserve .md URLs without modification
|
|
self.assertEqual(len(result), 3)
|
|
self.assertEqual(result[0], "https://example.com/docs/guide.md")
|
|
self.assertEqual(result[1], "https://example.com/docs/readme.md")
|
|
self.assertEqual(result[2], "https://example.com/docs/api-reference.md")
|
|
|
|
def test_md_extension_with_anchor_fragments(self):
|
|
"""Test that .md URLs with anchors are handled correctly"""
|
|
urls = [
|
|
"https://example.com/docs/guide.md#introduction",
|
|
"https://example.com/docs/guide.md#advanced",
|
|
"https://example.com/docs/api.md#methods",
|
|
]
|
|
|
|
result = self.converter._convert_to_md_urls(urls)
|
|
|
|
# Should strip anchors but preserve .md extension
|
|
self.assertEqual(len(result), 2) # guide.md deduplicated
|
|
self.assertIn("https://example.com/docs/guide.md", result)
|
|
self.assertIn("https://example.com/docs/api.md", result)
|
|
|
|
def test_non_md_urls_not_converted(self):
|
|
"""Test that non-.md URLs are NOT converted to /index.html.md (issue #277)"""
|
|
urls = [
|
|
"https://example.com/docs/getting-started",
|
|
"https://example.com/docs/api-reference",
|
|
"https://example.com/docs/tutorials",
|
|
]
|
|
|
|
result = self.converter._convert_to_md_urls(urls)
|
|
|
|
# Should preserve URLs as-is, NOT append /index.html.md
|
|
self.assertEqual(len(result), 3)
|
|
for url in result:
|
|
self.assertNotIn("index.html.md", url, f"Should not append /index.html.md: {url}")
|
|
|
|
self.assertEqual(result[0], "https://example.com/docs/getting-started")
|
|
self.assertEqual(result[1], "https://example.com/docs/api-reference")
|
|
self.assertEqual(result[2], "https://example.com/docs/tutorials")
|
|
|
|
def test_does_not_match_md_in_path(self):
|
|
"""Test that URLs containing 'md' in path are preserved as-is"""
|
|
urls = [
|
|
"https://example.com/cmd-line",
|
|
"https://example.com/AMD-processors",
|
|
"https://example.com/metadata",
|
|
]
|
|
|
|
result = self.converter._convert_to_md_urls(urls)
|
|
|
|
# URLs with 'md' substring (not extension) should be preserved as-is
|
|
self.assertEqual(len(result), 3)
|
|
self.assertEqual(result[0], "https://example.com/cmd-line")
|
|
self.assertEqual(result[1], "https://example.com/AMD-processors")
|
|
self.assertEqual(result[2], "https://example.com/metadata")
|
|
|
|
def test_removes_trailing_slashes_for_dedup(self):
|
|
"""Test that trailing slash variants are deduplicated"""
|
|
urls = [
|
|
"https://example.com/docs/api/",
|
|
"https://example.com/docs/api",
|
|
]
|
|
|
|
result = self.converter._convert_to_md_urls(urls)
|
|
|
|
# Should deduplicate (trailing slash vs no slash)
|
|
self.assertEqual(len(result), 1)
|
|
|
|
def test_mixed_urls_with_and_without_anchors(self):
|
|
"""Test mixed URLs with various formats"""
|
|
urls = [
|
|
"https://example.com/docs/intro",
|
|
"https://example.com/docs/intro#getting-started",
|
|
"https://example.com/docs/api.md",
|
|
"https://example.com/docs/api.md#methods",
|
|
"https://example.com/docs/guide#section1",
|
|
"https://example.com/docs/guide",
|
|
]
|
|
|
|
result = self.converter._convert_to_md_urls(urls)
|
|
|
|
# Should deduplicate to 3 unique base URLs
|
|
self.assertEqual(len(result), 3)
|
|
self.assertIn("https://example.com/docs/intro", result)
|
|
self.assertIn("https://example.com/docs/api.md", result)
|
|
self.assertIn("https://example.com/docs/guide", result)
|
|
|
|
def test_empty_url_list(self):
|
|
"""Test that empty URL list returns empty result"""
|
|
urls = []
|
|
result = self.converter._convert_to_md_urls(urls)
|
|
self.assertEqual(len(result), 0)
|
|
self.assertEqual(result, [])
|
|
|
|
def test_real_world_mikro_orm_case(self):
|
|
"""Test the exact URLs from issue #277 (MikroORM case)"""
|
|
urls = [
|
|
"https://mikro-orm.io/docs/quick-start",
|
|
"https://mikro-orm.io/docs/quick-start#synchronous-initialization",
|
|
"https://mikro-orm.io/docs/propagation",
|
|
"https://mikro-orm.io/docs/defining-entities#formulas",
|
|
"https://mikro-orm.io/docs/defining-entities#postgresql-native-enums",
|
|
]
|
|
|
|
result = self.converter._convert_to_md_urls(urls)
|
|
|
|
# Should deduplicate to 3 unique base URLs
|
|
self.assertEqual(len(result), 3)
|
|
|
|
# Should NOT contain any URLs with anchor fragments
|
|
for url in result:
|
|
self.assertNotIn("#", url, f"URL should not contain anchor: {url}")
|
|
|
|
# Should NOT contain /index.html.md
|
|
for url in result:
|
|
self.assertNotIn("index.html.md", url, f"Should not append /index.html.md: {url}")
|
|
|
|
def test_preserves_query_parameters(self):
|
|
"""Test that query parameters are preserved (only anchors stripped)"""
|
|
urls = [
|
|
"https://example.com/docs/search?q=test",
|
|
"https://example.com/docs/search?q=test#results",
|
|
"https://example.com/docs/api?version=2",
|
|
]
|
|
|
|
result = self.converter._convert_to_md_urls(urls)
|
|
|
|
# Query parameters should be preserved, anchors stripped
|
|
self.assertEqual(len(result), 2) # search deduplicated
|
|
self.assertTrue(
|
|
any("?q=test" in url for url in result),
|
|
"Query parameter should be preserved",
|
|
)
|
|
self.assertTrue(
|
|
any("?version=2" in url for url in result),
|
|
"Query parameter should be preserved",
|
|
)
|
|
|
|
def test_complex_anchor_formats(self):
|
|
"""Test various anchor formats (encoded, with dashes, etc.)"""
|
|
urls = [
|
|
"https://example.com/docs/guide#section-one",
|
|
"https://example.com/docs/guide#section_two",
|
|
"https://example.com/docs/guide#section%20three",
|
|
"https://example.com/docs/guide#123",
|
|
]
|
|
|
|
result = self.converter._convert_to_md_urls(urls)
|
|
|
|
# All should deduplicate to single base URL
|
|
self.assertEqual(len(result), 1)
|
|
self.assertEqual(result[0], "https://example.com/docs/guide")
|
|
|
|
def test_url_order_preservation(self):
|
|
"""Test that first occurrence of base URL is preserved"""
|
|
urls = [
|
|
"https://example.com/docs/a",
|
|
"https://example.com/docs/b#anchor",
|
|
"https://example.com/docs/c",
|
|
"https://example.com/docs/a#different-anchor", # Duplicate base
|
|
]
|
|
|
|
result = self.converter._convert_to_md_urls(urls)
|
|
|
|
# Should have 3 unique URLs, first occurrence preserved
|
|
self.assertEqual(len(result), 3)
|
|
self.assertEqual(result[0], "https://example.com/docs/a")
|
|
self.assertEqual(result[1], "https://example.com/docs/b")
|
|
self.assertEqual(result[2], "https://example.com/docs/c")
|
|
|
|
def test_discord_docs_case(self):
|
|
"""Test the Discord docs case reported by @skeith in issue #277"""
|
|
urls = [
|
|
"https://docs.discord.com/developers/activities/building-an-activity",
|
|
"https://docs.discord.com/developers/activities/design-patterns",
|
|
"https://docs.discord.com/developers/components/overview",
|
|
"https://docs.discord.com/developers/bots/getting-started#step-1",
|
|
]
|
|
|
|
result = self.converter._convert_to_md_urls(urls)
|
|
|
|
# No /index.html.md should be appended
|
|
for url in result:
|
|
self.assertNotIn("index.html.md", url, f"Should not append /index.html.md: {url}")
|
|
self.assertNotIn("#", url, f"Should not contain anchor: {url}")
|
|
|
|
self.assertEqual(len(result), 4)
|
|
|
|
|
|
class TestHasMdExtension(unittest.TestCase):
|
|
"""Test suite for _has_md_extension static method"""
|
|
|
|
def test_md_extension(self):
|
|
self.assertTrue(DocToSkillConverter._has_md_extension("https://example.com/page.md"))
|
|
|
|
def test_md_with_query(self):
|
|
self.assertTrue(DocToSkillConverter._has_md_extension("https://example.com/page.md?v=1"))
|
|
|
|
def test_no_md_extension(self):
|
|
self.assertFalse(DocToSkillConverter._has_md_extension("https://example.com/page"))
|
|
|
|
def test_md_in_path_not_extension(self):
|
|
"""'cmd-line' contains 'md' but is not a .md extension"""
|
|
self.assertFalse(DocToSkillConverter._has_md_extension("https://example.com/cmd-line"))
|
|
|
|
def test_md_in_domain(self):
|
|
"""'.md' in domain should not match"""
|
|
self.assertFalse(DocToSkillConverter._has_md_extension("https://docs.md.example.com/page"))
|
|
|
|
def test_mdx_not_md(self):
|
|
""".mdx is not .md"""
|
|
self.assertFalse(DocToSkillConverter._has_md_extension("https://example.com/page.mdx"))
|
|
|
|
def test_md_in_middle_of_path(self):
|
|
""".md in middle of path should not match"""
|
|
self.assertFalse(DocToSkillConverter._has_md_extension("https://example.com/page.md/subpage"))
|
|
|
|
def test_index_html_md(self):
|
|
self.assertTrue(DocToSkillConverter._has_md_extension("https://example.com/page/index.html.md"))
|
|
|
|
|
|
if __name__ == "__main__":
|
|
unittest.main()
|