Files
skill-seekers-reference/tests/test_issue_277_real_world.py
yusyus 2ef6e59d06 fix: stop blindly appending /index.html.md to non-.md URLs (#277)
The previous fix (a82cf69) only addressed anchor fragment stripping but
left the fundamental problem: _convert_to_md_urls() blindly appended
/index.html.md to ALL non-.md URLs from llms.txt. This only works for
Docusaurus sites — for sites like Discord docs it generates mass 404s.

Changes:
- _convert_to_md_urls() now strips anchors and deduplicates only,
  preserving original URLs as-is instead of appending /index.html.md
- New _has_md_extension() helper uses urlparse().path.endswith(".md")
  instead of error-prone ".md" in url substring matching
- Fixed ".md" in url checks at 4 locations (lines 465, 554, 716, 775)
- Removed 24 lines of dead commented-out code
- Added real-world e2e test against docs.discord.com (no mocks)
- Updated unit tests for new behavior (32 tests)

Fixes #277

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-20 23:44:35 +03:00

303 lines
12 KiB
Python

"""
Real-world integration test for Issue #277: URL conversion bug with anchor fragments
and blind /index.html.md appending.
Tests the exact MikroORM and Discord cases reported in the issue.
Updated: _convert_to_md_urls no longer appends /index.html.md to non-.md URLs.
"""
import unittest
from unittest.mock import MagicMock, patch
from skill_seekers.cli.doc_scraper import DocToSkillConverter
class TestIssue277RealWorld(unittest.TestCase):
    """Regression tests for Issue #277 using the MikroORM URLs from the report.

    Every test feeds a list of URLs to ``_convert_to_md_urls`` and checks the
    returned list: anchor fragments must be gone, URLs that differ only by
    anchor must collapse to one entry, and ``/index.html.md`` must never be
    appended.
    """

    def setUp(self):
        """Create a converter (``dry_run=True``) with a MikroORM-like config."""
        self.config = {
            "name": "MikroORM",
            "description": "ORM",
            "base_url": "https://mikro-orm.io/docs/",
            "selectors": {"main_content": "article"},
            "url_patterns": {
                "include": ["/docs"],
                "exclude": [],
            },
        }
        self.converter = DocToSkillConverter(self.config, dry_run=True)

    def test_mikro_orm_urls_from_issue_277(self):
        """Test the exact URLs that caused 404 errors in issue #277."""
        urls_from_llms_txt = [
            "https://mikro-orm.io/docs/",
            "https://mikro-orm.io/docs/reference.md",
            "https://mikro-orm.io/docs/quick-start#synchronous-initialization",
            "https://mikro-orm.io/docs/repositories.md#custom-repository",
            "https://mikro-orm.io/docs/propagation",
            "https://mikro-orm.io/docs/defining-entities.md#check-constraints",
            "https://mikro-orm.io/docs/defining-entities#formulas",
            "https://mikro-orm.io/docs/defining-entities#postgresql-native-enums",
        ]
        result = self.converter._convert_to_md_urls(urls_from_llms_txt)

        for url in result:
            # Verify no malformed URLs with anchor fragments
            self.assertNotIn("#", url, f"URL should not contain anchor: {url}")
            # Checked for EVERY URL: a wrongly appended "/index.html.md" makes
            # the URL end in ".md", so guarding this assertion with
            # `not url.endswith(".md")` would skip exactly the broken URLs.
            # None of the inputs contain "index.html.md", so it is safe.
            self.assertNotIn(
                "index.html.md", url, f"Should not append /index.html.md: {url}"
            )

        # .md URLs preserved, non-.md URLs preserved as-is, anchors deduplicated
        self.assertIn("https://mikro-orm.io/docs/reference.md", result)
        self.assertIn("https://mikro-orm.io/docs/repositories.md", result)
        self.assertIn("https://mikro-orm.io/docs/defining-entities.md", result)
        self.assertIn("https://mikro-orm.io/docs/quick-start", result)
        self.assertIn("https://mikro-orm.io/docs/propagation", result)

    def test_no_404_causing_urls_generated(self):
        """Verify that no URLs matching the 404 error pattern are generated."""
        problematic_patterns = [
            "#synchronous-initialization/index.html.md",
            "#formulas/index.html.md",
            "#postgresql-native-enums/index.html.md",
            "#custom-repository/index.html.md",
            "#check-constraints/index.html.md",
        ]
        urls = [
            "https://mikro-orm.io/docs/quick-start#synchronous-initialization",
            "https://mikro-orm.io/docs/defining-entities#formulas",
            "https://mikro-orm.io/docs/defining-entities#postgresql-native-enums",
            "https://mikro-orm.io/docs/repositories.md#custom-repository",
            "https://mikro-orm.io/docs/defining-entities.md#check-constraints",
        ]
        result = self.converter._convert_to_md_urls(urls)

        # Verify NONE of the problematic patterns exist
        for url in result:
            for pattern in problematic_patterns:
                self.assertNotIn(
                    pattern,
                    url,
                    f"URL '{url}' contains problematic pattern '{pattern}' that causes 404",
                )

    def test_no_blind_index_html_md_appending(self):
        """Verify non-.md URLs don't get /index.html.md appended (core fix)."""
        urls = [
            "https://mikro-orm.io/docs/quick-start",
            "https://mikro-orm.io/docs/propagation",
            "https://mikro-orm.io/docs/filters",
        ]
        result = self.converter._convert_to_md_urls(urls)

        self.assertEqual(len(result), 3)
        for url in result:
            self.assertNotIn(
                "/index.html.md",
                url,
                f"Should not blindly append /index.html.md: {url}",
            )
        # Input order is expected to be kept and each URL returned unchanged.
        self.assertEqual(result[0], "https://mikro-orm.io/docs/quick-start")
        self.assertEqual(result[1], "https://mikro-orm.io/docs/propagation")
        self.assertEqual(result[2], "https://mikro-orm.io/docs/filters")

    def test_deduplication_prevents_multiple_requests(self):
        """Verify that multiple anchors on same page don't create duplicate requests."""
        urls_with_multiple_anchors = [
            "https://mikro-orm.io/docs/defining-entities#formulas",
            "https://mikro-orm.io/docs/defining-entities#postgresql-native-enums",
            "https://mikro-orm.io/docs/defining-entities#indexes",
            "https://mikro-orm.io/docs/defining-entities#check-constraints",
        ]
        result = self.converter._convert_to_md_urls(urls_with_multiple_anchors)

        # Should deduplicate to single URL
        self.assertEqual(
            len(result),
            1,
            "Multiple anchors on same page should deduplicate to single request",
        )
        self.assertEqual(result[0], "https://mikro-orm.io/docs/defining-entities")

    def test_md_files_with_anchors_preserved(self):
        """Test that .md files with anchors are handled correctly."""
        urls = [
            "https://mikro-orm.io/docs/repositories.md#custom-repository",
            "https://mikro-orm.io/docs/defining-entities.md#check-constraints",
            "https://mikro-orm.io/docs/inheritance-mapping.md#single-table-inheritance",
        ]
        result = self.converter._convert_to_md_urls(urls)

        # Should preserve .md extension, strip anchors, deduplicate
        self.assertEqual(len(result), 3)
        self.assertIn("https://mikro-orm.io/docs/repositories.md", result)
        self.assertIn("https://mikro-orm.io/docs/defining-entities.md", result)
        self.assertIn("https://mikro-orm.io/docs/inheritance-mapping.md", result)

        # Verify no anchors in results
        for url in result:
            self.assertNotIn("#", url, "Result should not contain anchor fragments")

    @patch("skill_seekers.cli.doc_scraper.requests.get")
    def test_real_scraping_scenario_no_404s(self, mock_get):
        """
        Integration test: Simulate real scraping scenario with llms.txt URLs.
        Verify that the converted URLs would not cause 404 errors.

        ``requests.get`` is patched with a canned llms.txt response; the
        conversion itself is invoked directly on an explicit URL list.
        """
        mock_response = MagicMock()
        mock_response.status_code = 200
        mock_response.text = """
# MikroORM Documentation
https://mikro-orm.io/docs/quick-start
https://mikro-orm.io/docs/quick-start#synchronous-initialization
https://mikro-orm.io/docs/propagation
https://mikro-orm.io/docs/defining-entities#formulas
"""
        mock_get.return_value = mock_response

        urls_from_llms = [
            "https://mikro-orm.io/docs/quick-start",
            "https://mikro-orm.io/docs/quick-start#synchronous-initialization",
            "https://mikro-orm.io/docs/propagation",
            "https://mikro-orm.io/docs/defining-entities#formulas",
        ]
        converted_urls = self.converter._convert_to_md_urls(urls_from_llms)

        self.assertGreater(len(converted_urls), 0)
        for url in converted_urls:
            # Should not contain # anywhere
            self.assertRegex(
                url,
                r"^https://[^#]+$",
                f"URL should not contain anchor fragments: {url}",
            )
            # Should NOT have /index.html.md appended. Asserted without an
            # `endswith(".md")` guard, because an appended "/index.html.md"
            # would itself satisfy that guard and hide the regression.
            self.assertNotIn(
                "index.html.md",
                url,
                f"Should not append /index.html.md: {url}",
            )

    def test_issue_277_error_message_urls(self):
        """
        Test the exact URLs that appeared in error messages from the issue report.
        These were the actual 404-causing URLs that need to be fixed.
        """
        error_urls_with_anchors = [
            "https://mikro-orm.io/docs/quick-start#synchronous-initialization/index.html.md",
            "https://mikro-orm.io/docs/defining-entities#formulas/index.html.md",
            "https://mikro-orm.io/docs/defining-entities#postgresql-native-enums/index.html.md",
        ]
        input_urls = [
            "https://mikro-orm.io/docs/quick-start#synchronous-initialization",
            "https://mikro-orm.io/docs/propagation",
            "https://mikro-orm.io/docs/defining-entities#formulas",
            "https://mikro-orm.io/docs/defining-entities#postgresql-native-enums",
        ]
        result = self.converter._convert_to_md_urls(input_urls)

        # Verify NONE of the malformed error URLs are generated
        for error_url in error_urls_with_anchors:
            self.assertNotIn(
                error_url,
                result,
                f"Should not generate the 404-causing URL: {error_url}",
            )

        # Verify correct URLs are generated
        self.assertIn("https://mikro-orm.io/docs/quick-start", result)
        self.assertIn("https://mikro-orm.io/docs/propagation", result)
        self.assertIn("https://mikro-orm.io/docs/defining-entities", result)
class TestIssue277DiscordDocs(unittest.TestCase):
    """Checks for the Discord docs case reported by @skeith.

    Each test passes Discord documentation URLs to ``_convert_to_md_urls``
    and inspects the returned list.
    """

    def setUp(self):
        """Create a converter (``dry_run=True``) pointed at docs.discord.com."""
        discord_config = {
            "name": "DiscordDocs",
            "description": "Discord API Documentation",
            "base_url": "https://docs.discord.com/",
            "selectors": {"main_content": "article"},
        }
        self.config = discord_config
        self.converter = DocToSkillConverter(discord_config, dry_run=True)

    def test_discord_docs_no_index_html_md(self):
        """Plain (non-.md) Discord URLs must come back without /index.html.md."""
        plain_pages = [
            "https://docs.discord.com/developers/activities/building-an-activity",
            "https://docs.discord.com/developers/activities/design-patterns",
            "https://docs.discord.com/developers/components/overview",
            "https://docs.discord.com/developers/bots/getting-started",
        ]
        converted = self.converter._convert_to_md_urls(plain_pages)

        # Four distinct pages in, four entries out.
        self.assertEqual(len(converted), 4)
        for url in converted:
            self.assertNotIn(
                "index.html.md",
                url,
                f"Discord docs should not get /index.html.md appended: {url}",
            )

    def test_discord_docs_md_urls_preserved(self):
        """URLs that already end in .md are expected back unchanged."""
        first_md_page = (
            "https://docs.discord.com/developers/activities/building-an-activity.md"
        )
        md_pages = [
            first_md_page,
            "https://docs.discord.com/developers/components/overview.md",
            "https://docs.discord.com/developers/change-log.md",
        ]
        converted = self.converter._convert_to_md_urls(md_pages)

        self.assertEqual(len(converted), 3)
        # The first entry must be the first input, untouched.
        self.assertEqual(converted[0], first_md_page)

    def test_discord_docs_mixed_urls(self):
        """A mix of .md, plain and anchored URLs collapses to three entries."""
        mixed_pages = [
            "https://docs.discord.com/developers/activities/building-an-activity.md",
            "https://docs.discord.com/developers/overview",
            "https://docs.discord.com/developers/overview#quick-start",
            "https://docs.discord.com/developers/bots/getting-started.md#step-1",
        ]
        converted = self.converter._convert_to_md_urls(mixed_pages)

        # "overview" and "overview#quick-start" are the same page once the
        # anchor is dropped, so four inputs yield three outputs.
        self.assertEqual(len(converted), 3)

        expected_members = (
            "https://docs.discord.com/developers/activities/building-an-activity.md",
            "https://docs.discord.com/developers/overview",
            "https://docs.discord.com/developers/bots/getting-started.md",
        )
        for expected in expected_members:
            self.assertIn(expected, converted)
# Run this module's test cases with the unittest runner when executed as a script.
if __name__ == "__main__":
unittest.main()