fix: stop blindly appending /index.html.md to non-.md URLs (#277)
The previous fix (a82cf69) only addressed anchor fragment stripping but
left the fundamental problem: _convert_to_md_urls() blindly appended
/index.html.md to ALL non-.md URLs from llms.txt. This only works for
Docusaurus sites — for sites like Discord docs it generates mass 404s.
Changes:
- _convert_to_md_urls() now strips anchors and deduplicates only,
preserving original URLs as-is instead of appending /index.html.md
- New _has_md_extension() helper uses urlparse().path.endswith(".md")
instead of error-prone ".md" in url substring matching
- Fixed ".md" in url checks at 4 locations (lines 465, 554, 716, 775)
- Removed 24 lines of dead commented-out code
- Added real-world e2e test against docs.discord.com (no mocks)
- Updated unit tests for new behavior (32 tests)
Fixes #277
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1,6 +1,9 @@
|
||||
"""
|
||||
Real-world integration test for Issue #277: URL conversion bug with anchor fragments.
|
||||
Tests the exact MikroORM case that was reported in the issue.
|
||||
Real-world integration test for Issue #277: URL conversion bug with anchor fragments
|
||||
and blind /index.html.md appending.
|
||||
|
||||
Tests the exact MikroORM and Discord cases reported in the issue.
|
||||
Updated: _convert_to_md_urls no longer appends /index.html.md to non-.md URLs.
|
||||
"""
|
||||
|
||||
import unittest
|
||||
@@ -28,7 +31,6 @@ class TestIssue277RealWorld(unittest.TestCase):
|
||||
|
||||
def test_mikro_orm_urls_from_issue_277(self):
|
||||
"""Test the exact URLs that caused 404 errors in issue #277"""
|
||||
# These are the actual problematic URLs from the bug report
|
||||
urls_from_llms_txt = [
|
||||
"https://mikro-orm.io/docs/",
|
||||
"https://mikro-orm.io/docs/reference.md",
|
||||
@@ -44,54 +46,23 @@ class TestIssue277RealWorld(unittest.TestCase):
|
||||
|
||||
# Verify no malformed URLs with anchor fragments
|
||||
for url in result:
|
||||
self.assertNotIn(
|
||||
"#synchronous-initialization/index.html.md",
|
||||
url,
|
||||
"Should not append /index.html.md after anchor fragments",
|
||||
)
|
||||
self.assertNotIn(
|
||||
"#formulas/index.html.md",
|
||||
url,
|
||||
"Should not append /index.html.md after anchor fragments",
|
||||
)
|
||||
self.assertNotIn(
|
||||
"#postgresql-native-enums/index.html.md",
|
||||
url,
|
||||
"Should not append /index.html.md after anchor fragments",
|
||||
)
|
||||
self.assertNotIn("#", url, f"URL should not contain anchor: {url}")
|
||||
# No /index.html.md should be appended to non-.md URLs
|
||||
if not url.endswith(".md"):
|
||||
self.assertNotIn(
|
||||
"index.html.md", url, f"Should not append /index.html.md: {url}"
|
||||
)
|
||||
|
||||
# Verify correct transformed URLs
|
||||
|
||||
# Check that we got the expected number of unique URLs
|
||||
# Note: defining-entities has both .md and non-.md versions, so we have 2 entries for it
|
||||
self.assertEqual(
|
||||
len(result),
|
||||
7,
|
||||
f"Should have 7 unique base URLs after deduplication, got {len(result)}",
|
||||
)
|
||||
|
||||
# Verify specific URLs that were causing 404s are now correct
|
||||
self.assertIn(
|
||||
"https://mikro-orm.io/docs/quick-start/index.html.md",
|
||||
result,
|
||||
"quick-start URL should be correctly transformed",
|
||||
)
|
||||
self.assertIn(
|
||||
"https://mikro-orm.io/docs/propagation/index.html.md",
|
||||
result,
|
||||
"propagation URL should be correctly transformed",
|
||||
)
|
||||
self.assertIn(
|
||||
"https://mikro-orm.io/docs/defining-entities.md",
|
||||
result,
|
||||
"defining-entities.md should preserve .md extension",
|
||||
)
|
||||
# .md URLs preserved, non-.md URLs preserved as-is, anchors deduplicated
|
||||
self.assertIn("https://mikro-orm.io/docs/reference.md", result)
|
||||
self.assertIn("https://mikro-orm.io/docs/repositories.md", result)
|
||||
self.assertIn("https://mikro-orm.io/docs/defining-entities.md", result)
|
||||
self.assertIn("https://mikro-orm.io/docs/quick-start", result)
|
||||
self.assertIn("https://mikro-orm.io/docs/propagation", result)
|
||||
|
||||
def test_no_404_causing_urls_generated(self):
|
||||
"""Verify that no URLs matching the 404 error pattern are generated"""
|
||||
# The exact 404-causing URL pattern from the issue
|
||||
problematic_patterns = [
|
||||
"/index.html.md#", # /index.html.md should never come after #
|
||||
"#synchronous-initialization/index.html.md",
|
||||
"#formulas/index.html.md",
|
||||
"#postgresql-native-enums/index.html.md",
|
||||
@@ -118,9 +89,30 @@ class TestIssue277RealWorld(unittest.TestCase):
|
||||
f"URL '{url}' contains problematic pattern '{pattern}' that causes 404",
|
||||
)
|
||||
|
||||
def test_no_blind_index_html_md_appending(self):
    """Non-.md URLs must come back unchanged, with no /index.html.md suffix (core fix)."""
    expected = (
        "https://mikro-orm.io/docs/quick-start",
        "https://mikro-orm.io/docs/propagation",
        "https://mikro-orm.io/docs/filters",
    )

    converted = self.converter._convert_to_md_urls(list(expected))

    # Three distinct inputs, so three outputs.
    self.assertEqual(len(converted), 3)

    # None of the outputs may carry the Docusaurus-style suffix.
    for candidate in converted:
        self.assertNotIn(
            "/index.html.md",
            candidate,
            f"Should not blindly append /index.html.md: {candidate}",
        )

    # Each URL is returned as-is, in input order.
    for position, wanted in enumerate(expected):
        self.assertEqual(converted[position], wanted)
|
||||
|
||||
def test_deduplication_prevents_multiple_requests(self):
|
||||
"""Verify that multiple anchors on same page don't create duplicate requests"""
|
||||
# From the issue: These should all map to the same base URL
|
||||
urls_with_multiple_anchors = [
|
||||
"https://mikro-orm.io/docs/defining-entities#formulas",
|
||||
"https://mikro-orm.io/docs/defining-entities#postgresql-native-enums",
|
||||
@@ -136,10 +128,7 @@ class TestIssue277RealWorld(unittest.TestCase):
|
||||
1,
|
||||
"Multiple anchors on same page should deduplicate to single request",
|
||||
)
|
||||
self.assertEqual(
|
||||
result[0],
|
||||
"https://mikro-orm.io/docs/defining-entities/index.html.md",
|
||||
)
|
||||
self.assertEqual(result[0], "https://mikro-orm.io/docs/defining-entities")
|
||||
|
||||
def test_md_files_with_anchors_preserved(self):
|
||||
"""Test that .md files with anchors are handled correctly"""
|
||||
@@ -167,7 +156,6 @@ class TestIssue277RealWorld(unittest.TestCase):
|
||||
Integration test: Simulate real scraping scenario with llms.txt URLs.
|
||||
Verify that the converted URLs would not cause 404 errors.
|
||||
"""
|
||||
# Mock response for llms.txt content
|
||||
mock_response = MagicMock()
|
||||
mock_response.status_code = 200
|
||||
mock_response.text = """
|
||||
@@ -179,7 +167,6 @@ https://mikro-orm.io/docs/defining-entities#formulas
|
||||
"""
|
||||
mock_get.return_value = mock_response
|
||||
|
||||
# Simulate the llms.txt parsing flow
|
||||
urls_from_llms = [
|
||||
"https://mikro-orm.io/docs/quick-start",
|
||||
"https://mikro-orm.io/docs/quick-start#synchronous-initialization",
|
||||
@@ -187,42 +174,36 @@ https://mikro-orm.io/docs/defining-entities#formulas
|
||||
"https://mikro-orm.io/docs/defining-entities#formulas",
|
||||
]
|
||||
|
||||
# Convert URLs (this is what happens in _try_llms_txt_v2)
|
||||
converted_urls = self.converter._convert_to_md_urls(urls_from_llms)
|
||||
|
||||
# Verify converted URLs are valid
|
||||
# In real scenario, these would be added to pending_urls and scraped
|
||||
self.assertTrue(len(converted_urls) > 0, "Should generate at least one URL to scrape")
|
||||
self.assertTrue(len(converted_urls) > 0)
|
||||
|
||||
# Verify no URLs would cause 404 (no anchors in middle of path)
|
||||
for url in converted_urls:
|
||||
# Check URL structure is valid
|
||||
# Should not contain # anywhere
|
||||
self.assertRegex(
|
||||
url,
|
||||
r"^https://[^#]+$", # Should not contain # anywhere
|
||||
r"^https://[^#]+$",
|
||||
f"URL should not contain anchor fragments: {url}",
|
||||
)
|
||||
|
||||
# Verify the problematic pattern from the issue doesn't exist
|
||||
self.assertNotRegex(
|
||||
url,
|
||||
r"#[^/]+/index\.html\.md",
|
||||
f"URL should not have /index.html.md after anchor: {url}",
|
||||
)
|
||||
# Should NOT have /index.html.md appended
|
||||
if not url.endswith(".md"):
|
||||
self.assertNotIn(
|
||||
"index.html.md",
|
||||
url,
|
||||
f"Should not append /index.html.md: {url}",
|
||||
)
|
||||
|
||||
def test_issue_277_error_message_urls(self):
|
||||
"""
|
||||
Test the exact URLs that appeared in error messages from the issue report.
|
||||
These were the actual 404-causing URLs that need to be fixed.
|
||||
"""
|
||||
# These are the MALFORMED URLs that caused 404 errors (with anchors in the middle)
|
||||
error_urls_with_anchors = [
|
||||
"https://mikro-orm.io/docs/quick-start#synchronous-initialization/index.html.md",
|
||||
"https://mikro-orm.io/docs/defining-entities#formulas/index.html.md",
|
||||
"https://mikro-orm.io/docs/defining-entities#postgresql-native-enums/index.html.md",
|
||||
]
|
||||
|
||||
# Extract the input URLs that would have generated these errors
|
||||
input_urls = [
|
||||
"https://mikro-orm.io/docs/quick-start#synchronous-initialization",
|
||||
"https://mikro-orm.io/docs/propagation",
|
||||
@@ -232,7 +213,7 @@ https://mikro-orm.io/docs/defining-entities#formulas
|
||||
|
||||
result = self.converter._convert_to_md_urls(input_urls)
|
||||
|
||||
# Verify NONE of the malformed error URLs (with anchors) are generated
|
||||
# Verify NONE of the malformed error URLs are generated
|
||||
for error_url in error_urls_with_anchors:
|
||||
self.assertNotIn(
|
||||
error_url,
|
||||
@@ -240,20 +221,82 @@ https://mikro-orm.io/docs/defining-entities#formulas
|
||||
f"Should not generate the 404-causing URL: {error_url}",
|
||||
)
|
||||
|
||||
# Verify correct URLs are generated instead
|
||||
correct_urls = [
|
||||
"https://mikro-orm.io/docs/quick-start/index.html.md",
|
||||
"https://mikro-orm.io/docs/propagation/index.html.md",
|
||||
"https://mikro-orm.io/docs/defining-entities/index.html.md",
|
||||
# Verify correct URLs are generated
|
||||
self.assertIn("https://mikro-orm.io/docs/quick-start", result)
|
||||
self.assertIn("https://mikro-orm.io/docs/propagation", result)
|
||||
self.assertIn("https://mikro-orm.io/docs/defining-entities", result)
|
||||
|
||||
|
||||
class TestIssue277DiscordDocs(unittest.TestCase):
    """Regression tests for the Discord docs case of issue #277 (reported by @skeith).

    Every test feeds Discord documentation URLs to
    ``DocToSkillConverter._convert_to_md_urls`` and checks that the URLs come
    back without ``/index.html.md`` appended, with ``#fragment`` parts
    stripped and with duplicates collapsed.
    """

    def setUp(self) -> None:
        """Build a dry-run converter configured for docs.discord.com."""
        self.config = {
            "name": "DiscordDocs",
            "description": "Discord API Documentation",
            "base_url": "https://docs.discord.com/",
            "selectors": {"main_content": "article"},
        }
        self.converter = DocToSkillConverter(self.config, dry_run=True)

    def test_discord_docs_no_index_html_md(self) -> None:
        """Non-.md Discord URLs must not get /index.html.md appended."""
        urls = [
            "https://docs.discord.com/developers/activities/building-an-activity",
            "https://docs.discord.com/developers/activities/design-patterns",
            "https://docs.discord.com/developers/components/overview",
            "https://docs.discord.com/developers/bots/getting-started",
        ]

        result = self.converter._convert_to_md_urls(urls)

        # Four distinct inputs, so four outputs.
        self.assertEqual(len(result), 4)
        for url in result:
            self.assertNotIn(
                "index.html.md",
                url,
                f"Discord docs should not get /index.html.md appended: {url}",
            )

    def test_discord_docs_md_urls_preserved(self) -> None:
        """URLs that already end in .md must be returned unchanged."""
        expected = [
            "https://docs.discord.com/developers/activities/building-an-activity.md",
            "https://docs.discord.com/developers/components/overview.md",
            "https://docs.discord.com/developers/change-log.md",
        ]

        # Pass a copy so the comparison below cannot be affected by the call.
        result = self.converter._convert_to_md_urls(list(expected))

        self.assertEqual(len(result), 3)
        # Check every entry, not just the first one.
        self.assertEqual(list(result), expected)

    def test_discord_docs_mixed_urls(self) -> None:
        """Mix of .md and non-.md URLs, some carrying anchors."""
        urls = [
            "https://docs.discord.com/developers/activities/building-an-activity.md",
            "https://docs.discord.com/developers/overview",
            "https://docs.discord.com/developers/overview#quick-start",
            "https://docs.discord.com/developers/bots/getting-started.md#step-1",
        ]

        result = self.converter._convert_to_md_urls(urls)

        # .md URLs preserved, non-.md as-is, anchors stripped & deduped:
        # the two "overview" entries collapse into one, leaving three URLs.
        self.assertEqual(len(result), 3)
        self.assertIn(
            "https://docs.discord.com/developers/activities/building-an-activity.md",
            result,
        )
        self.assertIn("https://docs.discord.com/developers/overview", result)
        self.assertIn(
            "https://docs.discord.com/developers/bots/getting-started.md",
            result,
        )
        # No output may still carry a fragment.
        for url in result:
            self.assertNotIn("#", url, f"URL should not contain anchor: {url}")
|
||||
|
||||
|
||||
# Run this module's tests when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
|
||||
|
||||
Reference in New Issue
Block a user