fix: stop blindly appending /index.html.md to non-.md URLs (#277)

The previous fix (a82cf69) only addressed anchor fragment stripping but
left the fundamental problem: _convert_to_md_urls() blindly appended
/index.html.md to ALL non-.md URLs from llms.txt. This only works for
Docusaurus sites — for sites like Discord docs it generates mass 404s.

Changes:
- _convert_to_md_urls() now strips anchors and deduplicates only,
  preserving original URLs as-is instead of appending /index.html.md
- New _has_md_extension() helper uses urlparse().path.endswith(".md")
  instead of error-prone ".md" in url substring matching
- Fixed ".md" in url checks at 4 locations (lines 465, 554, 716, 775)
- Removed 24 lines of dead commented-out code
- Added real-world e2e test against docs.discord.com (no mocks)
- Updated unit tests for new behavior (32 tests)

Fixes #277

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
yusyus
2026-03-20 23:44:35 +03:00
parent f6131c6798
commit 2ef6e59d06
4 changed files with 408 additions and 169 deletions

View File

@@ -252,6 +252,15 @@ class DocToSkillConverter:
return not any(pattern in url for pattern in self._exclude_patterns) return not any(pattern in url for pattern in self._exclude_patterns)
@staticmethod
def _has_md_extension(url: str) -> bool:
"""Check if URL path ends with .md extension.
Uses URL path parsing instead of substring matching to avoid
false positives on URLs like /embed/page or /cmd-line.
"""
return urlparse(url).path.endswith(".md")
def save_checkpoint(self) -> None: def save_checkpoint(self) -> None:
"""Save progress checkpoint""" """Save progress checkpoint"""
if not self.checkpoint_enabled or self.dry_run: if not self.checkpoint_enabled or self.dry_run:
@@ -462,7 +471,7 @@ class DocToSkillConverter:
else: else:
continue continue
full_url = full_url.split("#")[0] full_url = full_url.split("#")[0]
if ".md" in full_url and self.is_valid_url(full_url) and full_url not in links: if self._has_md_extension(full_url) and self.is_valid_url(full_url) and full_url not in links:
links.append(full_url) links.append(full_url)
return { return {
@@ -551,7 +560,7 @@ class DocToSkillConverter:
# Strip anchor fragments # Strip anchor fragments
full_url = full_url.split("#")[0] full_url = full_url.split("#")[0]
# Only include .md URLs to avoid client-side rendered HTML pages # Only include .md URLs to avoid client-side rendered HTML pages
if ".md" in full_url and self.is_valid_url(full_url) and full_url not in page["links"]: if self._has_md_extension(full_url) and self.is_valid_url(full_url) and full_url not in page["links"]:
page["links"].append(full_url) page["links"].append(full_url)
return page return page
@@ -713,7 +722,7 @@ class DocToSkillConverter:
response.raise_for_status() response.raise_for_status()
# Check if this is a Markdown file # Check if this is a Markdown file
if url.endswith(".md") or ".md" in url: if self._has_md_extension(url):
page = self._extract_markdown_content(response.text, url) page = self._extract_markdown_content(response.text, url)
else: else:
soup = BeautifulSoup(response.content, "html.parser") soup = BeautifulSoup(response.content, "html.parser")
@@ -772,7 +781,7 @@ class DocToSkillConverter:
response.raise_for_status() response.raise_for_status()
# Check if this is a Markdown file # Check if this is a Markdown file
if url.endswith(".md") or ".md" in url: if self._has_md_extension(url):
page = self._extract_markdown_content(response.text, url) page = self._extract_markdown_content(response.text, url)
else: else:
# BeautifulSoup parsing (still synchronous, but fast) # BeautifulSoup parsing (still synchronous, but fast)
@@ -798,71 +807,45 @@ class DocToSkillConverter:
def _convert_to_md_urls(self, urls: list[str]) -> list[str]: def _convert_to_md_urls(self, urls: list[str]) -> list[str]:
""" """
Convert URLs to .md format, trying /index.html.md suffix for non-.md URLs. Clean URLs from llms.txt: strip anchor fragments, deduplicate base URLs.
Strips anchor fragments (#anchor) and deduplicates base URLs to avoid 404 errors.
不预先检查 URL 是否存在,直接加入队列,在爬取时再验证。 Previously this method blindly appended /index.html.md to non-.md URLs,
which caused 404 errors on sites that don't serve raw markdown files
(e.g. Discord docs, see issue #277). Now it preserves original URLs as-is
and lets the scraper handle both HTML and markdown content.
Args: Args:
urls: List of URLs to process urls: List of URLs to process
Returns: Returns:
List of .md URLs (未验证, deduplicated, no anchors) List of cleaned, deduplicated URLs (no anchors)
""" """
from urllib.parse import urlparse, urlunparse from urllib.parse import urlparse, urlunparse
seen_base_urls = set() seen_base_urls = set()
md_urls = [] cleaned_urls = []
for url in urls: for url in urls:
# Parse URL to extract and remove fragment (anchor) # Parse URL to extract and remove fragment (anchor)
parsed = urlparse(url) parsed = urlparse(url)
base_url = urlunparse(parsed._replace(fragment="")) # Remove #anchor base_url = urlunparse(parsed._replace(fragment="")) # Remove #anchor
# Skip if we've already processed this base URL # Normalize trailing slashes for dedup (but keep original form)
if base_url in seen_base_urls: dedup_key = base_url.rstrip("/")
continue
seen_base_urls.add(base_url)
# Check if URL already ends with .md (not just contains "md") # Skip if we've already processed this base URL
if base_url.endswith(".md"): if dedup_key in seen_base_urls:
md_urls.append(base_url) continue
else: seen_base_urls.add(dedup_key)
# 直接转换为 .md 格式,不发送 HEAD 请求检查
base_url = base_url.rstrip("/") cleaned_urls.append(base_url)
md_url = f"{base_url}/index.html.md"
md_urls.append(md_url)
logger.info( logger.info(
" ✓ Converted %d URLs to %d unique .md URLs (anchors stripped, will validate during crawl)", " ✓ Cleaned %d URLs to %d unique URLs (anchors stripped, will validate during crawl)",
len(urls), len(urls),
len(md_urls), len(cleaned_urls),
) )
return md_urls return cleaned_urls
# ORIGINAL _convert_to_md_urls (with HEAD request validation):
# def _convert_to_md_urls(self, urls: List[str]) -> List[str]:
# md_urls = []
# non_md_urls = []
# for url in urls:
# if '.md' in url:
# md_urls.append(url)
# else:
# non_md_urls.append(url)
# if non_md_urls:
# logger.info(" 🔄 Trying to convert %d non-.md URLs to .md format...", len(non_md_urls))
# converted = 0
# for url in non_md_urls:
# url = url.rstrip('/')
# md_url = f"{url}/index.html.md"
# try:
# resp = requests.head(md_url, timeout=5, allow_redirects=True)
# if resp.status_code == 200:
# md_urls.append(md_url)
# converted += 1
# except Exception:
# pass
# logger.info(" ✓ Converted %d URLs to .md format", converted)
# return md_urls
def _try_llms_txt(self) -> bool: def _try_llms_txt(self) -> bool:
""" """
@@ -933,16 +916,16 @@ class DocToSkillConverter:
# Extract URLs from llms.txt and add to pending_urls for BFS crawling # Extract URLs from llms.txt and add to pending_urls for BFS crawling
extracted_urls = parser.extract_urls() extracted_urls = parser.extract_urls()
if extracted_urls: if extracted_urls:
# Convert non-.md URLs to .md format by trying /index.html.md suffix # Clean URLs: strip anchors, deduplicate
md_urls = self._convert_to_md_urls(extracted_urls) cleaned_urls = self._convert_to_md_urls(extracted_urls)
logger.info( logger.info(
"\n🔗 Found %d URLs in llms.txt (%d .md files), starting BFS crawl...", "\n🔗 Found %d URLs in llms.txt (%d unique), starting BFS crawl...",
len(extracted_urls), len(extracted_urls),
len(md_urls), len(cleaned_urls),
) )
# Filter URLs based on url_patterns config # Filter URLs based on url_patterns config
for url in md_urls: for url in cleaned_urls:
if self.is_valid_url(url): if self.is_valid_url(url):
self._enqueue_url(url) self._enqueue_url(url)
@@ -1019,16 +1002,16 @@ class DocToSkillConverter:
# Extract URLs from llms.txt and add to pending_urls for BFS crawling # Extract URLs from llms.txt and add to pending_urls for BFS crawling
extracted_urls = parser.extract_urls() extracted_urls = parser.extract_urls()
if extracted_urls: if extracted_urls:
# Convert non-.md URLs to .md format by trying /index.html.md suffix # Clean URLs: strip anchors, deduplicate
md_urls = self._convert_to_md_urls(extracted_urls) cleaned_urls = self._convert_to_md_urls(extracted_urls)
logger.info( logger.info(
"\n🔗 Found %d URLs in llms.txt (%d .md files), starting BFS crawl...", "\n🔗 Found %d URLs in llms.txt (%d unique), starting BFS crawl...",
len(extracted_urls), len(extracted_urls),
len(md_urls), len(cleaned_urls),
) )
# Filter URLs based on url_patterns config # Filter URLs based on url_patterns config
for url in md_urls: for url in cleaned_urls:
if self.is_valid_url(url): if self.is_valid_url(url):
self._enqueue_url(url) self._enqueue_url(url)

View File

@@ -0,0 +1,146 @@
"""
E2E test for Issue #277 - Discord docs case reported by @skeith.
This test hits the REAL Discord docs llms.txt and verifies that
no /index.html.md URLs are generated. No mocks.
Requires network access. Marked as integration test.
"""
import os
import shutil
import unittest
import pytest
from skill_seekers.cli.doc_scraper import DocToSkillConverter
from skill_seekers.cli.llms_txt_detector import LlmsTxtDetector
from skill_seekers.cli.llms_txt_downloader import LlmsTxtDownloader
from skill_seekers.cli.llms_txt_parser import LlmsTxtParser
@pytest.mark.integration
class TestIssue277DiscordDocsE2E(unittest.TestCase):
"""E2E: Reproduce @skeith's report with real Discord docs."""
def setUp(self):
self.base_url = "https://docs.discord.com/"
self.config = {
"name": "DiscordDocsE2E",
"description": "Discord API Documentation",
"base_url": self.base_url,
"selectors": {"main_content": "article"},
"url_patterns": {"include": ["/developers"], "exclude": []},
}
self.output_dir = f"output/{self.config['name']}_data"
def tearDown(self):
# Clean up any output created
for path in [self.output_dir, f"output/{self.config['name']}"]:
if os.path.exists(path):
shutil.rmtree(path)
def test_discord_llms_txt_exists(self):
"""Verify Discord docs has llms.txt (precondition for the bug)."""
detector = LlmsTxtDetector(self.base_url)
variants = detector.detect_all()
self.assertTrue(
len(variants) > 0,
"Discord docs should have at least one llms.txt variant",
)
def test_discord_llms_txt_urls_no_index_html_md(self):
"""Core test: URLs extracted from Discord llms.txt must NOT get /index.html.md appended."""
# Step 1: Detect llms.txt
detector = LlmsTxtDetector(self.base_url)
variants = detector.detect_all()
self.assertTrue(len(variants) > 0, "No llms.txt found at docs.discord.com")
# Step 2: Download the largest variant (same logic as doc_scraper)
downloaded = {}
for variant_info in variants:
downloader = LlmsTxtDownloader(variant_info["url"])
content = downloader.download()
if content:
downloaded[variant_info["variant"]] = content
self.assertTrue(len(downloaded) > 0, "Failed to download any llms.txt variant")
largest_content = max(downloaded.values(), key=len)
# Step 3: Parse URLs from llms.txt
parser = LlmsTxtParser(largest_content, self.base_url)
extracted_urls = parser.extract_urls()
self.assertTrue(
len(extracted_urls) > 0,
"No URLs extracted from Discord llms.txt",
)
# Step 4: Run _convert_to_md_urls (the function that was causing 404s)
converter = DocToSkillConverter(self.config, dry_run=True)
converted_urls = converter._convert_to_md_urls(extracted_urls)
# Step 5: Verify NO /index.html.md was blindly appended
bad_urls = [u for u in converted_urls if "/index.html.md" in u]
self.assertEqual(
len(bad_urls),
0,
f"Found {len(bad_urls)} URLs with /index.html.md appended "
f"(would cause 404s):\n"
+ "\n".join(bad_urls[:10]),
)
# Step 6: Verify no anchor fragments leaked through
anchor_urls = [u for u in converted_urls if "#" in u]
self.assertEqual(
len(anchor_urls),
0,
f"Found {len(anchor_urls)} URLs with anchor fragments:\n"
+ "\n".join(anchor_urls[:10]),
)
# Step 7: Verify we got a reasonable number of URLs
self.assertGreater(
len(converted_urls),
10,
"Expected at least 10 unique URLs from Discord docs",
)
def test_discord_full_pipeline_no_404_urls(self):
"""Full pipeline: detector -> downloader -> parser -> converter -> queue.
Simulates what `skill-seekers create https://docs.discord.com` does,
without actually scraping pages.
"""
converter = DocToSkillConverter(self.config, dry_run=True)
# Run _try_llms_txt which calls _convert_to_md_urls internally
os.makedirs(os.path.join(converter.skill_dir, "references"), exist_ok=True)
os.makedirs(os.path.join(converter.data_dir, "pages"), exist_ok=True)
result = converter._try_llms_txt()
# _try_llms_txt returns False when it populates pending_urls for BFS
# (True means it parsed content directly, no BFS needed)
if not result:
# Check every URL in the queue
for url in converter.pending_urls:
self.assertNotIn(
"/index.html.md",
url,
f"Queue contains 404-causing URL: {url}",
)
self.assertNotIn(
"#",
url,
f"Queue contains URL with anchor fragment: {url}",
)
self.assertGreater(
len(converter.pending_urls),
0,
"Pipeline should have queued URLs for crawling",
)
if __name__ == "__main__":
unittest.main()

View File

@@ -1,6 +1,9 @@
""" """
Real-world integration test for Issue #277: URL conversion bug with anchor fragments. Real-world integration test for Issue #277: URL conversion bug with anchor fragments
Tests the exact MikroORM case that was reported in the issue. and blind /index.html.md appending.
Tests the exact MikroORM and Discord cases reported in the issue.
Updated: _convert_to_md_urls no longer appends /index.html.md to non-.md URLs.
""" """
import unittest import unittest
@@ -28,7 +31,6 @@ class TestIssue277RealWorld(unittest.TestCase):
def test_mikro_orm_urls_from_issue_277(self): def test_mikro_orm_urls_from_issue_277(self):
"""Test the exact URLs that caused 404 errors in issue #277""" """Test the exact URLs that caused 404 errors in issue #277"""
# These are the actual problematic URLs from the bug report
urls_from_llms_txt = [ urls_from_llms_txt = [
"https://mikro-orm.io/docs/", "https://mikro-orm.io/docs/",
"https://mikro-orm.io/docs/reference.md", "https://mikro-orm.io/docs/reference.md",
@@ -44,54 +46,23 @@ class TestIssue277RealWorld(unittest.TestCase):
# Verify no malformed URLs with anchor fragments # Verify no malformed URLs with anchor fragments
for url in result: for url in result:
self.assertNotIn( self.assertNotIn("#", url, f"URL should not contain anchor: {url}")
"#synchronous-initialization/index.html.md", # No /index.html.md should be appended to non-.md URLs
url, if not url.endswith(".md"):
"Should not append /index.html.md after anchor fragments", self.assertNotIn(
) "index.html.md", url, f"Should not append /index.html.md: {url}"
self.assertNotIn( )
"#formulas/index.html.md",
url,
"Should not append /index.html.md after anchor fragments",
)
self.assertNotIn(
"#postgresql-native-enums/index.html.md",
url,
"Should not append /index.html.md after anchor fragments",
)
# Verify correct transformed URLs # .md URLs preserved, non-.md URLs preserved as-is, anchors deduplicated
self.assertIn("https://mikro-orm.io/docs/reference.md", result)
# Check that we got the expected number of unique URLs self.assertIn("https://mikro-orm.io/docs/repositories.md", result)
# Note: defining-entities has both .md and non-.md versions, so we have 2 entries for it self.assertIn("https://mikro-orm.io/docs/defining-entities.md", result)
self.assertEqual( self.assertIn("https://mikro-orm.io/docs/quick-start", result)
len(result), self.assertIn("https://mikro-orm.io/docs/propagation", result)
7,
f"Should have 7 unique base URLs after deduplication, got {len(result)}",
)
# Verify specific URLs that were causing 404s are now correct
self.assertIn(
"https://mikro-orm.io/docs/quick-start/index.html.md",
result,
"quick-start URL should be correctly transformed",
)
self.assertIn(
"https://mikro-orm.io/docs/propagation/index.html.md",
result,
"propagation URL should be correctly transformed",
)
self.assertIn(
"https://mikro-orm.io/docs/defining-entities.md",
result,
"defining-entities.md should preserve .md extension",
)
def test_no_404_causing_urls_generated(self): def test_no_404_causing_urls_generated(self):
"""Verify that no URLs matching the 404 error pattern are generated""" """Verify that no URLs matching the 404 error pattern are generated"""
# The exact 404-causing URL pattern from the issue
problematic_patterns = [ problematic_patterns = [
"/index.html.md#", # /index.html.md should never come after #
"#synchronous-initialization/index.html.md", "#synchronous-initialization/index.html.md",
"#formulas/index.html.md", "#formulas/index.html.md",
"#postgresql-native-enums/index.html.md", "#postgresql-native-enums/index.html.md",
@@ -118,9 +89,30 @@ class TestIssue277RealWorld(unittest.TestCase):
f"URL '{url}' contains problematic pattern '{pattern}' that causes 404", f"URL '{url}' contains problematic pattern '{pattern}' that causes 404",
) )
def test_no_blind_index_html_md_appending(self):
"""Verify non-.md URLs don't get /index.html.md appended (core fix)"""
urls = [
"https://mikro-orm.io/docs/quick-start",
"https://mikro-orm.io/docs/propagation",
"https://mikro-orm.io/docs/filters",
]
result = self.converter._convert_to_md_urls(urls)
self.assertEqual(len(result), 3)
for url in result:
self.assertNotIn(
"/index.html.md",
url,
f"Should not blindly append /index.html.md: {url}",
)
self.assertEqual(result[0], "https://mikro-orm.io/docs/quick-start")
self.assertEqual(result[1], "https://mikro-orm.io/docs/propagation")
self.assertEqual(result[2], "https://mikro-orm.io/docs/filters")
def test_deduplication_prevents_multiple_requests(self): def test_deduplication_prevents_multiple_requests(self):
"""Verify that multiple anchors on same page don't create duplicate requests""" """Verify that multiple anchors on same page don't create duplicate requests"""
# From the issue: These should all map to the same base URL
urls_with_multiple_anchors = [ urls_with_multiple_anchors = [
"https://mikro-orm.io/docs/defining-entities#formulas", "https://mikro-orm.io/docs/defining-entities#formulas",
"https://mikro-orm.io/docs/defining-entities#postgresql-native-enums", "https://mikro-orm.io/docs/defining-entities#postgresql-native-enums",
@@ -136,10 +128,7 @@ class TestIssue277RealWorld(unittest.TestCase):
1, 1,
"Multiple anchors on same page should deduplicate to single request", "Multiple anchors on same page should deduplicate to single request",
) )
self.assertEqual( self.assertEqual(result[0], "https://mikro-orm.io/docs/defining-entities")
result[0],
"https://mikro-orm.io/docs/defining-entities/index.html.md",
)
def test_md_files_with_anchors_preserved(self): def test_md_files_with_anchors_preserved(self):
"""Test that .md files with anchors are handled correctly""" """Test that .md files with anchors are handled correctly"""
@@ -167,7 +156,6 @@ class TestIssue277RealWorld(unittest.TestCase):
Integration test: Simulate real scraping scenario with llms.txt URLs. Integration test: Simulate real scraping scenario with llms.txt URLs.
Verify that the converted URLs would not cause 404 errors. Verify that the converted URLs would not cause 404 errors.
""" """
# Mock response for llms.txt content
mock_response = MagicMock() mock_response = MagicMock()
mock_response.status_code = 200 mock_response.status_code = 200
mock_response.text = """ mock_response.text = """
@@ -179,7 +167,6 @@ https://mikro-orm.io/docs/defining-entities#formulas
""" """
mock_get.return_value = mock_response mock_get.return_value = mock_response
# Simulate the llms.txt parsing flow
urls_from_llms = [ urls_from_llms = [
"https://mikro-orm.io/docs/quick-start", "https://mikro-orm.io/docs/quick-start",
"https://mikro-orm.io/docs/quick-start#synchronous-initialization", "https://mikro-orm.io/docs/quick-start#synchronous-initialization",
@@ -187,42 +174,36 @@ https://mikro-orm.io/docs/defining-entities#formulas
"https://mikro-orm.io/docs/defining-entities#formulas", "https://mikro-orm.io/docs/defining-entities#formulas",
] ]
# Convert URLs (this is what happens in _try_llms_txt_v2)
converted_urls = self.converter._convert_to_md_urls(urls_from_llms) converted_urls = self.converter._convert_to_md_urls(urls_from_llms)
# Verify converted URLs are valid self.assertTrue(len(converted_urls) > 0)
# In real scenario, these would be added to pending_urls and scraped
self.assertTrue(len(converted_urls) > 0, "Should generate at least one URL to scrape")
# Verify no URLs would cause 404 (no anchors in middle of path)
for url in converted_urls: for url in converted_urls:
# Check URL structure is valid # Should not contain # anywhere
self.assertRegex( self.assertRegex(
url, url,
r"^https://[^#]+$", # Should not contain # anywhere r"^https://[^#]+$",
f"URL should not contain anchor fragments: {url}", f"URL should not contain anchor fragments: {url}",
) )
# Should NOT have /index.html.md appended
# Verify the problematic pattern from the issue doesn't exist if not url.endswith(".md"):
self.assertNotRegex( self.assertNotIn(
url, "index.html.md",
r"#[^/]+/index\.html\.md", url,
f"URL should not have /index.html.md after anchor: {url}", f"Should not append /index.html.md: {url}",
) )
def test_issue_277_error_message_urls(self): def test_issue_277_error_message_urls(self):
""" """
Test the exact URLs that appeared in error messages from the issue report. Test the exact URLs that appeared in error messages from the issue report.
These were the actual 404-causing URLs that need to be fixed. These were the actual 404-causing URLs that need to be fixed.
""" """
# These are the MALFORMED URLs that caused 404 errors (with anchors in the middle)
error_urls_with_anchors = [ error_urls_with_anchors = [
"https://mikro-orm.io/docs/quick-start#synchronous-initialization/index.html.md", "https://mikro-orm.io/docs/quick-start#synchronous-initialization/index.html.md",
"https://mikro-orm.io/docs/defining-entities#formulas/index.html.md", "https://mikro-orm.io/docs/defining-entities#formulas/index.html.md",
"https://mikro-orm.io/docs/defining-entities#postgresql-native-enums/index.html.md", "https://mikro-orm.io/docs/defining-entities#postgresql-native-enums/index.html.md",
] ]
# Extract the input URLs that would have generated these errors
input_urls = [ input_urls = [
"https://mikro-orm.io/docs/quick-start#synchronous-initialization", "https://mikro-orm.io/docs/quick-start#synchronous-initialization",
"https://mikro-orm.io/docs/propagation", "https://mikro-orm.io/docs/propagation",
@@ -232,7 +213,7 @@ https://mikro-orm.io/docs/defining-entities#formulas
result = self.converter._convert_to_md_urls(input_urls) result = self.converter._convert_to_md_urls(input_urls)
# Verify NONE of the malformed error URLs (with anchors) are generated # Verify NONE of the malformed error URLs are generated
for error_url in error_urls_with_anchors: for error_url in error_urls_with_anchors:
self.assertNotIn( self.assertNotIn(
error_url, error_url,
@@ -240,20 +221,82 @@ https://mikro-orm.io/docs/defining-entities#formulas
f"Should not generate the 404-causing URL: {error_url}", f"Should not generate the 404-causing URL: {error_url}",
) )
# Verify correct URLs are generated instead # Verify correct URLs are generated
correct_urls = [ self.assertIn("https://mikro-orm.io/docs/quick-start", result)
"https://mikro-orm.io/docs/quick-start/index.html.md", self.assertIn("https://mikro-orm.io/docs/propagation", result)
"https://mikro-orm.io/docs/propagation/index.html.md", self.assertIn("https://mikro-orm.io/docs/defining-entities", result)
"https://mikro-orm.io/docs/defining-entities/index.html.md",
class TestIssue277DiscordDocs(unittest.TestCase):
"""Test for Discord docs case reported by @skeith"""
def setUp(self):
self.config = {
"name": "DiscordDocs",
"description": "Discord API Documentation",
"base_url": "https://docs.discord.com/",
"selectors": {"main_content": "article"},
}
self.converter = DocToSkillConverter(self.config, dry_run=True)
def test_discord_docs_no_index_html_md(self):
"""Discord docs don't serve .md files - no /index.html.md should be appended"""
urls = [
"https://docs.discord.com/developers/activities/building-an-activity",
"https://docs.discord.com/developers/activities/design-patterns",
"https://docs.discord.com/developers/components/overview",
"https://docs.discord.com/developers/bots/getting-started",
] ]
for correct_url in correct_urls: result = self.converter._convert_to_md_urls(urls)
self.assertIn(
correct_url, self.assertEqual(len(result), 4)
result, for url in result:
f"Should generate the correct URL: {correct_url}", self.assertNotIn(
"index.html.md",
url,
f"Discord docs should not get /index.html.md appended: {url}",
) )
def test_discord_docs_md_urls_preserved(self):
"""Discord llms.txt has .md URLs that should be preserved"""
urls = [
"https://docs.discord.com/developers/activities/building-an-activity.md",
"https://docs.discord.com/developers/components/overview.md",
"https://docs.discord.com/developers/change-log.md",
]
result = self.converter._convert_to_md_urls(urls)
self.assertEqual(len(result), 3)
self.assertEqual(
result[0],
"https://docs.discord.com/developers/activities/building-an-activity.md",
)
def test_discord_docs_mixed_urls(self):
"""Mix of .md and non-.md URLs from Discord docs"""
urls = [
"https://docs.discord.com/developers/activities/building-an-activity.md",
"https://docs.discord.com/developers/overview",
"https://docs.discord.com/developers/overview#quick-start",
"https://docs.discord.com/developers/bots/getting-started.md#step-1",
]
result = self.converter._convert_to_md_urls(urls)
# .md URLs preserved, non-.md as-is, anchors stripped & deduped
self.assertEqual(len(result), 3)
self.assertIn(
"https://docs.discord.com/developers/activities/building-an-activity.md",
result,
)
self.assertIn("https://docs.discord.com/developers/overview", result)
self.assertIn(
"https://docs.discord.com/developers/bots/getting-started.md",
result,
)
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()

View File

@@ -1,6 +1,9 @@
""" """
Tests for URL conversion logic (_convert_to_md_urls). Tests for URL conversion logic (_convert_to_md_urls).
Covers bug fix for issue #277: URLs with anchor fragments causing 404 errors. Covers bug fix for issue #277: URLs with anchor fragments causing 404 errors.
Updated: _convert_to_md_urls no longer blindly appends /index.html.md to non-.md URLs.
It now strips anchors, deduplicates, and preserves original URLs as-is.
""" """
import unittest import unittest
@@ -31,11 +34,11 @@ class TestConvertToMdUrls(unittest.TestCase):
result = self.converter._convert_to_md_urls(urls) result = self.converter._convert_to_md_urls(urls)
# All should be converted without anchor fragments # All should have anchors stripped
self.assertEqual(len(result), 3) self.assertEqual(len(result), 3)
self.assertEqual(result[0], "https://example.com/docs/quick-start/index.html.md") self.assertEqual(result[0], "https://example.com/docs/quick-start")
self.assertEqual(result[1], "https://example.com/docs/api/index.html.md") self.assertEqual(result[1], "https://example.com/docs/api")
self.assertEqual(result[2], "https://example.com/docs/guide/index.html.md") self.assertEqual(result[2], "https://example.com/docs/guide")
def test_deduplicates_multiple_anchors_same_url(self): def test_deduplicates_multiple_anchors_same_url(self):
"""Test that multiple anchors on the same URL are deduplicated""" """Test that multiple anchors on the same URL are deduplicated"""
@@ -50,7 +53,7 @@ class TestConvertToMdUrls(unittest.TestCase):
# Should only have one entry for the base URL # Should only have one entry for the base URL
self.assertEqual(len(result), 1) self.assertEqual(len(result), 1)
self.assertEqual(result[0], "https://example.com/docs/api/index.html.md") self.assertEqual(result[0], "https://example.com/docs/api")
def test_preserves_md_extension_urls(self): def test_preserves_md_extension_urls(self):
"""Test that URLs already ending with .md are preserved""" """Test that URLs already ending with .md are preserved"""
@@ -83,8 +86,27 @@ class TestConvertToMdUrls(unittest.TestCase):
self.assertIn("https://example.com/docs/guide.md", result) self.assertIn("https://example.com/docs/guide.md", result)
self.assertIn("https://example.com/docs/api.md", result) self.assertIn("https://example.com/docs/api.md", result)
def test_non_md_urls_not_converted(self):
"""Test that non-.md URLs are NOT converted to /index.html.md (issue #277)"""
urls = [
"https://example.com/docs/getting-started",
"https://example.com/docs/api-reference",
"https://example.com/docs/tutorials",
]
result = self.converter._convert_to_md_urls(urls)
# Should preserve URLs as-is, NOT append /index.html.md
self.assertEqual(len(result), 3)
for url in result:
self.assertNotIn("index.html.md", url, f"Should not append /index.html.md: {url}")
self.assertEqual(result[0], "https://example.com/docs/getting-started")
self.assertEqual(result[1], "https://example.com/docs/api-reference")
self.assertEqual(result[2], "https://example.com/docs/tutorials")
def test_does_not_match_md_in_path(self): def test_does_not_match_md_in_path(self):
"""Test that URLs containing 'md' in path (but not ending with .md) are converted""" """Test that URLs containing 'md' in path are preserved as-is"""
urls = [ urls = [
"https://example.com/cmd-line", "https://example.com/cmd-line",
"https://example.com/AMD-processors", "https://example.com/AMD-processors",
@@ -93,27 +115,23 @@ class TestConvertToMdUrls(unittest.TestCase):
result = self.converter._convert_to_md_urls(urls) result = self.converter._convert_to_md_urls(urls)
# All should be converted since they don't END with .md # URLs with 'md' substring (not extension) should be preserved as-is
self.assertEqual(len(result), 3) self.assertEqual(len(result), 3)
self.assertEqual(result[0], "https://example.com/cmd-line/index.html.md") self.assertEqual(result[0], "https://example.com/cmd-line")
self.assertEqual(result[1], "https://example.com/AMD-processors/index.html.md") self.assertEqual(result[1], "https://example.com/AMD-processors")
self.assertEqual(result[2], "https://example.com/metadata/index.html.md") self.assertEqual(result[2], "https://example.com/metadata")
def test_removes_trailing_slashes(self): def test_removes_trailing_slashes_for_dedup(self):
"""Test that trailing slashes are removed before appending /index.html.md""" """Test that trailing slash variants are deduplicated"""
urls = [ urls = [
"https://example.com/docs/api/", "https://example.com/docs/api/",
"https://example.com/docs/guide//", "https://example.com/docs/api",
"https://example.com/docs/reference",
] ]
result = self.converter._convert_to_md_urls(urls) result = self.converter._convert_to_md_urls(urls)
# All should have proper /index.html.md without double slashes # Should deduplicate (trailing slash vs no slash)
self.assertEqual(len(result), 3) self.assertEqual(len(result), 1)
self.assertEqual(result[0], "https://example.com/docs/api/index.html.md")
self.assertEqual(result[1], "https://example.com/docs/guide/index.html.md")
self.assertEqual(result[2], "https://example.com/docs/reference/index.html.md")
def test_mixed_urls_with_and_without_anchors(self): def test_mixed_urls_with_and_without_anchors(self):
"""Test mixed URLs with various formats""" """Test mixed URLs with various formats"""
@@ -130,9 +148,9 @@ class TestConvertToMdUrls(unittest.TestCase):
# Should deduplicate to 3 unique base URLs # Should deduplicate to 3 unique base URLs
self.assertEqual(len(result), 3) self.assertEqual(len(result), 3)
self.assertIn("https://example.com/docs/intro/index.html.md", result) self.assertIn("https://example.com/docs/intro", result)
self.assertIn("https://example.com/docs/api.md", result) self.assertIn("https://example.com/docs/api.md", result)
self.assertIn("https://example.com/docs/guide/index.html.md", result) self.assertIn("https://example.com/docs/guide", result)
def test_empty_url_list(self): def test_empty_url_list(self):
"""Test that empty URL list returns empty result""" """Test that empty URL list returns empty result"""
@@ -155,14 +173,15 @@ class TestConvertToMdUrls(unittest.TestCase):
# Should deduplicate to 3 unique base URLs # Should deduplicate to 3 unique base URLs
self.assertEqual(len(result), 3) self.assertEqual(len(result), 3)
self.assertIn("https://mikro-orm.io/docs/quick-start/index.html.md", result)
self.assertIn("https://mikro-orm.io/docs/propagation/index.html.md", result)
self.assertIn("https://mikro-orm.io/docs/defining-entities/index.html.md", result)
# Should NOT contain any URLs with anchor fragments # Should NOT contain any URLs with anchor fragments
for url in result: for url in result:
self.assertNotIn("#", url, f"URL should not contain anchor: {url}") self.assertNotIn("#", url, f"URL should not contain anchor: {url}")
# Should NOT contain /index.html.md
for url in result:
self.assertNotIn("index.html.md", url, f"Should not append /index.html.md: {url}")
def test_preserves_query_parameters(self): def test_preserves_query_parameters(self):
"""Test that query parameters are preserved (only anchors stripped)""" """Test that query parameters are preserved (only anchors stripped)"""
urls = [ urls = [
@@ -175,8 +194,6 @@ class TestConvertToMdUrls(unittest.TestCase):
# Query parameters should be preserved, anchors stripped # Query parameters should be preserved, anchors stripped
self.assertEqual(len(result), 2) # search deduplicated self.assertEqual(len(result), 2) # search deduplicated
# Note: Query parameters might not be ideal for .md conversion,
# but they should be preserved if present
self.assertTrue( self.assertTrue(
any("?q=test" in url for url in result), any("?q=test" in url for url in result),
"Query parameter should be preserved", "Query parameter should be preserved",
@@ -199,7 +216,7 @@ class TestConvertToMdUrls(unittest.TestCase):
# All should deduplicate to single base URL # All should deduplicate to single base URL
self.assertEqual(len(result), 1) self.assertEqual(len(result), 1)
self.assertEqual(result[0], "https://example.com/docs/guide/index.html.md") self.assertEqual(result[0], "https://example.com/docs/guide")
def test_url_order_preservation(self): def test_url_order_preservation(self):
"""Test that first occurrence of base URL is preserved""" """Test that first occurrence of base URL is preserved"""
@@ -214,9 +231,59 @@ class TestConvertToMdUrls(unittest.TestCase):
# Should have 3 unique URLs, first occurrence preserved # Should have 3 unique URLs, first occurrence preserved
self.assertEqual(len(result), 3) self.assertEqual(len(result), 3)
self.assertEqual(result[0], "https://example.com/docs/a/index.html.md") self.assertEqual(result[0], "https://example.com/docs/a")
self.assertEqual(result[1], "https://example.com/docs/b/index.html.md") self.assertEqual(result[1], "https://example.com/docs/b")
self.assertEqual(result[2], "https://example.com/docs/c/index.html.md") self.assertEqual(result[2], "https://example.com/docs/c")
def test_discord_docs_case(self):
"""Test the Discord docs case reported by @skeith in issue #277"""
urls = [
"https://docs.discord.com/developers/activities/building-an-activity",
"https://docs.discord.com/developers/activities/design-patterns",
"https://docs.discord.com/developers/components/overview",
"https://docs.discord.com/developers/bots/getting-started#step-1",
]
result = self.converter._convert_to_md_urls(urls)
# No /index.html.md should be appended
for url in result:
self.assertNotIn("index.html.md", url, f"Should not append /index.html.md: {url}")
self.assertNotIn("#", url, f"Should not contain anchor: {url}")
self.assertEqual(len(result), 4)
class TestHasMdExtension(unittest.TestCase):
"""Test suite for _has_md_extension static method"""
def test_md_extension(self):
self.assertTrue(DocToSkillConverter._has_md_extension("https://example.com/page.md"))
def test_md_with_query(self):
self.assertTrue(DocToSkillConverter._has_md_extension("https://example.com/page.md?v=1"))
def test_no_md_extension(self):
self.assertFalse(DocToSkillConverter._has_md_extension("https://example.com/page"))
def test_md_in_path_not_extension(self):
"""'cmd-line' contains 'md' but is not a .md extension"""
self.assertFalse(DocToSkillConverter._has_md_extension("https://example.com/cmd-line"))
def test_md_in_domain(self):
"""'.md' in domain should not match"""
self.assertFalse(DocToSkillConverter._has_md_extension("https://docs.md.example.com/page"))
def test_mdx_not_md(self):
""".mdx is not .md"""
self.assertFalse(DocToSkillConverter._has_md_extension("https://example.com/page.mdx"))
def test_md_in_middle_of_path(self):
""".md in middle of path should not match"""
self.assertFalse(DocToSkillConverter._has_md_extension("https://example.com/page.md/subpage"))
def test_index_html_md(self):
self.assertTrue(DocToSkillConverter._has_md_extension("https://example.com/page/index.html.md"))
if __name__ == "__main__": if __name__ == "__main__":