fix: stop blindly appending /index.html.md to non-.md URLs (#277)

The previous fix (a82cf69) only addressed anchor fragment stripping but
left the fundamental problem: _convert_to_md_urls() blindly appended
/index.html.md to ALL non-.md URLs from llms.txt. This only works for
Docusaurus sites — for sites like Discord docs it generates mass 404s.

Changes:
- _convert_to_md_urls() now strips anchors and deduplicates only,
  preserving original URLs as-is instead of appending /index.html.md
- New _has_md_extension() helper uses urlparse().path.endswith(".md")
  instead of error-prone ".md" in url substring matching
- Fixed ".md" in url checks at 4 locations (lines 465, 554, 716, 775)
- Removed 24 lines of dead commented-out code
- Added real-world e2e test against docs.discord.com (no mocks)
- Updated unit tests for new behavior (32 tests)

Fixes #277

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
yusyus
2026-03-20 23:44:35 +03:00
parent f6131c6798
commit 2ef6e59d06
4 changed files with 408 additions and 169 deletions

View File

@@ -252,6 +252,15 @@ class DocToSkillConverter:
return not any(pattern in url for pattern in self._exclude_patterns)
@staticmethod
def _has_md_extension(url: str) -> bool:
"""Check if URL path ends with .md extension.
Uses URL path parsing instead of substring matching to avoid
false positives on URLs like /embed/page or /cmd-line.
"""
return urlparse(url).path.endswith(".md")
def save_checkpoint(self) -> None:
"""Save progress checkpoint"""
if not self.checkpoint_enabled or self.dry_run:
@@ -462,7 +471,7 @@ class DocToSkillConverter:
else:
continue
full_url = full_url.split("#")[0]
if ".md" in full_url and self.is_valid_url(full_url) and full_url not in links:
if self._has_md_extension(full_url) and self.is_valid_url(full_url) and full_url not in links:
links.append(full_url)
return {
@@ -551,7 +560,7 @@ class DocToSkillConverter:
# Strip anchor fragments
full_url = full_url.split("#")[0]
# Only include .md URLs to avoid client-side rendered HTML pages
if ".md" in full_url and self.is_valid_url(full_url) and full_url not in page["links"]:
if self._has_md_extension(full_url) and self.is_valid_url(full_url) and full_url not in page["links"]:
page["links"].append(full_url)
return page
@@ -713,7 +722,7 @@ class DocToSkillConverter:
response.raise_for_status()
# Check if this is a Markdown file
if url.endswith(".md") or ".md" in url:
if self._has_md_extension(url):
page = self._extract_markdown_content(response.text, url)
else:
soup = BeautifulSoup(response.content, "html.parser")
@@ -772,7 +781,7 @@ class DocToSkillConverter:
response.raise_for_status()
# Check if this is a Markdown file
if url.endswith(".md") or ".md" in url:
if self._has_md_extension(url):
page = self._extract_markdown_content(response.text, url)
else:
# BeautifulSoup parsing (still synchronous, but fast)
@@ -798,71 +807,45 @@ class DocToSkillConverter:
def _convert_to_md_urls(self, urls: list[str]) -> list[str]:
"""
Convert URLs to .md format, trying /index.html.md suffix for non-.md URLs.
Strips anchor fragments (#anchor) and deduplicates base URLs to avoid 404 errors.
不预先检查 URL 是否存在,直接加入队列,在爬取时再验证。
Clean URLs from llms.txt: strip anchor fragments, deduplicate base URLs.
Previously this method blindly appended /index.html.md to non-.md URLs,
which caused 404 errors on sites that don't serve raw markdown files
(e.g. Discord docs, see issue #277). Now it preserves original URLs as-is
and lets the scraper handle both HTML and markdown content.
Args:
urls: List of URLs to process
Returns:
List of .md URLs (未验证, deduplicated, no anchors)
List of cleaned, deduplicated URLs (no anchors)
"""
from urllib.parse import urlparse, urlunparse
seen_base_urls = set()
md_urls = []
cleaned_urls = []
for url in urls:
# Parse URL to extract and remove fragment (anchor)
parsed = urlparse(url)
base_url = urlunparse(parsed._replace(fragment="")) # Remove #anchor
# Skip if we've already processed this base URL
if base_url in seen_base_urls:
continue
seen_base_urls.add(base_url)
# Normalize trailing slashes for dedup (but keep original form)
dedup_key = base_url.rstrip("/")
# Check if URL already ends with .md (not just contains "md")
if base_url.endswith(".md"):
md_urls.append(base_url)
else:
# 直接转换为 .md 格式,不发送 HEAD 请求检查
base_url = base_url.rstrip("/")
md_url = f"{base_url}/index.html.md"
md_urls.append(md_url)
# Skip if we've already processed this base URL
if dedup_key in seen_base_urls:
continue
seen_base_urls.add(dedup_key)
cleaned_urls.append(base_url)
logger.info(
" ✓ Converted %d URLs to %d unique .md URLs (anchors stripped, will validate during crawl)",
" ✓ Cleaned %d URLs to %d unique URLs (anchors stripped, will validate during crawl)",
len(urls),
len(md_urls),
len(cleaned_urls),
)
return md_urls
# ORIGINAL _convert_to_md_urls (with HEAD request validation):
# def _convert_to_md_urls(self, urls: List[str]) -> List[str]:
# md_urls = []
# non_md_urls = []
# for url in urls:
# if '.md' in url:
# md_urls.append(url)
# else:
# non_md_urls.append(url)
# if non_md_urls:
# logger.info(" 🔄 Trying to convert %d non-.md URLs to .md format...", len(non_md_urls))
# converted = 0
# for url in non_md_urls:
# url = url.rstrip('/')
# md_url = f"{url}/index.html.md"
# try:
# resp = requests.head(md_url, timeout=5, allow_redirects=True)
# if resp.status_code == 200:
# md_urls.append(md_url)
# converted += 1
# except Exception:
# pass
# logger.info(" ✓ Converted %d URLs to .md format", converted)
# return md_urls
return cleaned_urls
def _try_llms_txt(self) -> bool:
"""
@@ -933,16 +916,16 @@ class DocToSkillConverter:
# Extract URLs from llms.txt and add to pending_urls for BFS crawling
extracted_urls = parser.extract_urls()
if extracted_urls:
# Convert non-.md URLs to .md format by trying /index.html.md suffix
md_urls = self._convert_to_md_urls(extracted_urls)
# Clean URLs: strip anchors, deduplicate
cleaned_urls = self._convert_to_md_urls(extracted_urls)
logger.info(
"\n🔗 Found %d URLs in llms.txt (%d .md files), starting BFS crawl...",
"\n🔗 Found %d URLs in llms.txt (%d unique), starting BFS crawl...",
len(extracted_urls),
len(md_urls),
len(cleaned_urls),
)
# Filter URLs based on url_patterns config
for url in md_urls:
for url in cleaned_urls:
if self.is_valid_url(url):
self._enqueue_url(url)
@@ -1019,16 +1002,16 @@ class DocToSkillConverter:
# Extract URLs from llms.txt and add to pending_urls for BFS crawling
extracted_urls = parser.extract_urls()
if extracted_urls:
# Convert non-.md URLs to .md format by trying /index.html.md suffix
md_urls = self._convert_to_md_urls(extracted_urls)
# Clean URLs: strip anchors, deduplicate
cleaned_urls = self._convert_to_md_urls(extracted_urls)
logger.info(
"\n🔗 Found %d URLs in llms.txt (%d .md files), starting BFS crawl...",
"\n🔗 Found %d URLs in llms.txt (%d unique), starting BFS crawl...",
len(extracted_urls),
len(md_urls),
len(cleaned_urls),
)
# Filter URLs based on url_patterns config
for url in md_urls:
for url in cleaned_urls:
if self.is_valid_url(url):
self._enqueue_url(url)

View File

@@ -0,0 +1,146 @@
"""
E2E test for Issue #277 - Discord docs case reported by @skeith.
This test hits the REAL Discord docs llms.txt and verifies that
no /index.html.md URLs are generated. No mocks.
Requires network access. Marked as integration test.
"""
import os
import shutil
import unittest
import pytest
from skill_seekers.cli.doc_scraper import DocToSkillConverter
from skill_seekers.cli.llms_txt_detector import LlmsTxtDetector
from skill_seekers.cli.llms_txt_downloader import LlmsTxtDownloader
from skill_seekers.cli.llms_txt_parser import LlmsTxtParser
@pytest.mark.integration
class TestIssue277DiscordDocsE2E(unittest.TestCase):
"""E2E: Reproduce @skeith's report with real Discord docs."""
def setUp(self):
self.base_url = "https://docs.discord.com/"
self.config = {
"name": "DiscordDocsE2E",
"description": "Discord API Documentation",
"base_url": self.base_url,
"selectors": {"main_content": "article"},
"url_patterns": {"include": ["/developers"], "exclude": []},
}
self.output_dir = f"output/{self.config['name']}_data"
def tearDown(self):
# Clean up any output created
for path in [self.output_dir, f"output/{self.config['name']}"]:
if os.path.exists(path):
shutil.rmtree(path)
def test_discord_llms_txt_exists(self):
"""Verify Discord docs has llms.txt (precondition for the bug)."""
detector = LlmsTxtDetector(self.base_url)
variants = detector.detect_all()
self.assertTrue(
len(variants) > 0,
"Discord docs should have at least one llms.txt variant",
)
def test_discord_llms_txt_urls_no_index_html_md(self):
"""Core test: URLs extracted from Discord llms.txt must NOT get /index.html.md appended."""
# Step 1: Detect llms.txt
detector = LlmsTxtDetector(self.base_url)
variants = detector.detect_all()
self.assertTrue(len(variants) > 0, "No llms.txt found at docs.discord.com")
# Step 2: Download the largest variant (same logic as doc_scraper)
downloaded = {}
for variant_info in variants:
downloader = LlmsTxtDownloader(variant_info["url"])
content = downloader.download()
if content:
downloaded[variant_info["variant"]] = content
self.assertTrue(len(downloaded) > 0, "Failed to download any llms.txt variant")
largest_content = max(downloaded.values(), key=len)
# Step 3: Parse URLs from llms.txt
parser = LlmsTxtParser(largest_content, self.base_url)
extracted_urls = parser.extract_urls()
self.assertTrue(
len(extracted_urls) > 0,
"No URLs extracted from Discord llms.txt",
)
# Step 4: Run _convert_to_md_urls (the function that was causing 404s)
converter = DocToSkillConverter(self.config, dry_run=True)
converted_urls = converter._convert_to_md_urls(extracted_urls)
# Step 5: Verify NO /index.html.md was blindly appended
bad_urls = [u for u in converted_urls if "/index.html.md" in u]
self.assertEqual(
len(bad_urls),
0,
f"Found {len(bad_urls)} URLs with /index.html.md appended "
f"(would cause 404s):\n"
+ "\n".join(bad_urls[:10]),
)
# Step 6: Verify no anchor fragments leaked through
anchor_urls = [u for u in converted_urls if "#" in u]
self.assertEqual(
len(anchor_urls),
0,
f"Found {len(anchor_urls)} URLs with anchor fragments:\n"
+ "\n".join(anchor_urls[:10]),
)
# Step 7: Verify we got a reasonable number of URLs
self.assertGreater(
len(converted_urls),
10,
"Expected at least 10 unique URLs from Discord docs",
)
def test_discord_full_pipeline_no_404_urls(self):
"""Full pipeline: detector -> downloader -> parser -> converter -> queue.
Simulates what `skill-seekers create https://docs.discord.com` does,
without actually scraping pages.
"""
converter = DocToSkillConverter(self.config, dry_run=True)
# Run _try_llms_txt which calls _convert_to_md_urls internally
os.makedirs(os.path.join(converter.skill_dir, "references"), exist_ok=True)
os.makedirs(os.path.join(converter.data_dir, "pages"), exist_ok=True)
result = converter._try_llms_txt()
# _try_llms_txt returns False when it populates pending_urls for BFS
# (True means it parsed content directly, no BFS needed)
if not result:
# Check every URL in the queue
for url in converter.pending_urls:
self.assertNotIn(
"/index.html.md",
url,
f"Queue contains 404-causing URL: {url}",
)
self.assertNotIn(
"#",
url,
f"Queue contains URL with anchor fragment: {url}",
)
self.assertGreater(
len(converter.pending_urls),
0,
"Pipeline should have queued URLs for crawling",
)
if __name__ == "__main__":
unittest.main()

View File

@@ -1,6 +1,9 @@
"""
Real-world integration test for Issue #277: URL conversion bug with anchor fragments.
Tests the exact MikroORM case that was reported in the issue.
Real-world integration test for Issue #277: URL conversion bug with anchor fragments
and blind /index.html.md appending.
Tests the exact MikroORM and Discord cases reported in the issue.
Updated: _convert_to_md_urls no longer appends /index.html.md to non-.md URLs.
"""
import unittest
@@ -28,7 +31,6 @@ class TestIssue277RealWorld(unittest.TestCase):
def test_mikro_orm_urls_from_issue_277(self):
"""Test the exact URLs that caused 404 errors in issue #277"""
# These are the actual problematic URLs from the bug report
urls_from_llms_txt = [
"https://mikro-orm.io/docs/",
"https://mikro-orm.io/docs/reference.md",
@@ -44,54 +46,23 @@ class TestIssue277RealWorld(unittest.TestCase):
# Verify no malformed URLs with anchor fragments
for url in result:
self.assertNotIn("#", url, f"URL should not contain anchor: {url}")
# No /index.html.md should be appended to non-.md URLs
if not url.endswith(".md"):
self.assertNotIn(
"#synchronous-initialization/index.html.md",
url,
"Should not append /index.html.md after anchor fragments",
)
self.assertNotIn(
"#formulas/index.html.md",
url,
"Should not append /index.html.md after anchor fragments",
)
self.assertNotIn(
"#postgresql-native-enums/index.html.md",
url,
"Should not append /index.html.md after anchor fragments",
"index.html.md", url, f"Should not append /index.html.md: {url}"
)
# Verify correct transformed URLs
# Check that we got the expected number of unique URLs
# Note: defining-entities has both .md and non-.md versions, so we have 2 entries for it
self.assertEqual(
len(result),
7,
f"Should have 7 unique base URLs after deduplication, got {len(result)}",
)
# Verify specific URLs that were causing 404s are now correct
self.assertIn(
"https://mikro-orm.io/docs/quick-start/index.html.md",
result,
"quick-start URL should be correctly transformed",
)
self.assertIn(
"https://mikro-orm.io/docs/propagation/index.html.md",
result,
"propagation URL should be correctly transformed",
)
self.assertIn(
"https://mikro-orm.io/docs/defining-entities.md",
result,
"defining-entities.md should preserve .md extension",
)
# .md URLs preserved, non-.md URLs preserved as-is, anchors deduplicated
self.assertIn("https://mikro-orm.io/docs/reference.md", result)
self.assertIn("https://mikro-orm.io/docs/repositories.md", result)
self.assertIn("https://mikro-orm.io/docs/defining-entities.md", result)
self.assertIn("https://mikro-orm.io/docs/quick-start", result)
self.assertIn("https://mikro-orm.io/docs/propagation", result)
def test_no_404_causing_urls_generated(self):
"""Verify that no URLs matching the 404 error pattern are generated"""
# The exact 404-causing URL pattern from the issue
problematic_patterns = [
"/index.html.md#", # /index.html.md should never come after #
"#synchronous-initialization/index.html.md",
"#formulas/index.html.md",
"#postgresql-native-enums/index.html.md",
@@ -118,9 +89,30 @@ class TestIssue277RealWorld(unittest.TestCase):
f"URL '{url}' contains problematic pattern '{pattern}' that causes 404",
)
def test_no_blind_index_html_md_appending(self):
"""Verify non-.md URLs don't get /index.html.md appended (core fix)"""
urls = [
"https://mikro-orm.io/docs/quick-start",
"https://mikro-orm.io/docs/propagation",
"https://mikro-orm.io/docs/filters",
]
result = self.converter._convert_to_md_urls(urls)
self.assertEqual(len(result), 3)
for url in result:
self.assertNotIn(
"/index.html.md",
url,
f"Should not blindly append /index.html.md: {url}",
)
self.assertEqual(result[0], "https://mikro-orm.io/docs/quick-start")
self.assertEqual(result[1], "https://mikro-orm.io/docs/propagation")
self.assertEqual(result[2], "https://mikro-orm.io/docs/filters")
def test_deduplication_prevents_multiple_requests(self):
"""Verify that multiple anchors on same page don't create duplicate requests"""
# From the issue: These should all map to the same base URL
urls_with_multiple_anchors = [
"https://mikro-orm.io/docs/defining-entities#formulas",
"https://mikro-orm.io/docs/defining-entities#postgresql-native-enums",
@@ -136,10 +128,7 @@ class TestIssue277RealWorld(unittest.TestCase):
1,
"Multiple anchors on same page should deduplicate to single request",
)
self.assertEqual(
result[0],
"https://mikro-orm.io/docs/defining-entities/index.html.md",
)
self.assertEqual(result[0], "https://mikro-orm.io/docs/defining-entities")
def test_md_files_with_anchors_preserved(self):
"""Test that .md files with anchors are handled correctly"""
@@ -167,7 +156,6 @@ class TestIssue277RealWorld(unittest.TestCase):
Integration test: Simulate real scraping scenario with llms.txt URLs.
Verify that the converted URLs would not cause 404 errors.
"""
# Mock response for llms.txt content
mock_response = MagicMock()
mock_response.status_code = 200
mock_response.text = """
@@ -179,7 +167,6 @@ https://mikro-orm.io/docs/defining-entities#formulas
"""
mock_get.return_value = mock_response
# Simulate the llms.txt parsing flow
urls_from_llms = [
"https://mikro-orm.io/docs/quick-start",
"https://mikro-orm.io/docs/quick-start#synchronous-initialization",
@@ -187,27 +174,23 @@ https://mikro-orm.io/docs/defining-entities#formulas
"https://mikro-orm.io/docs/defining-entities#formulas",
]
# Convert URLs (this is what happens in _try_llms_txt_v2)
converted_urls = self.converter._convert_to_md_urls(urls_from_llms)
# Verify converted URLs are valid
# In real scenario, these would be added to pending_urls and scraped
self.assertTrue(len(converted_urls) > 0, "Should generate at least one URL to scrape")
self.assertTrue(len(converted_urls) > 0)
# Verify no URLs would cause 404 (no anchors in middle of path)
for url in converted_urls:
# Check URL structure is valid
# Should not contain # anywhere
self.assertRegex(
url,
r"^https://[^#]+$", # Should not contain # anywhere
r"^https://[^#]+$",
f"URL should not contain anchor fragments: {url}",
)
# Verify the problematic pattern from the issue doesn't exist
self.assertNotRegex(
# Should NOT have /index.html.md appended
if not url.endswith(".md"):
self.assertNotIn(
"index.html.md",
url,
r"#[^/]+/index\.html\.md",
f"URL should not have /index.html.md after anchor: {url}",
f"Should not append /index.html.md: {url}",
)
def test_issue_277_error_message_urls(self):
@@ -215,14 +198,12 @@ https://mikro-orm.io/docs/defining-entities#formulas
Test the exact URLs that appeared in error messages from the issue report.
These were the actual 404-causing URLs that need to be fixed.
"""
# These are the MALFORMED URLs that caused 404 errors (with anchors in the middle)
error_urls_with_anchors = [
"https://mikro-orm.io/docs/quick-start#synchronous-initialization/index.html.md",
"https://mikro-orm.io/docs/defining-entities#formulas/index.html.md",
"https://mikro-orm.io/docs/defining-entities#postgresql-native-enums/index.html.md",
]
# Extract the input URLs that would have generated these errors
input_urls = [
"https://mikro-orm.io/docs/quick-start#synchronous-initialization",
"https://mikro-orm.io/docs/propagation",
@@ -232,7 +213,7 @@ https://mikro-orm.io/docs/defining-entities#formulas
result = self.converter._convert_to_md_urls(input_urls)
# Verify NONE of the malformed error URLs (with anchors) are generated
# Verify NONE of the malformed error URLs are generated
for error_url in error_urls_with_anchors:
self.assertNotIn(
error_url,
@@ -240,18 +221,80 @@ https://mikro-orm.io/docs/defining-entities#formulas
f"Should not generate the 404-causing URL: {error_url}",
)
# Verify correct URLs are generated instead
correct_urls = [
"https://mikro-orm.io/docs/quick-start/index.html.md",
"https://mikro-orm.io/docs/propagation/index.html.md",
"https://mikro-orm.io/docs/defining-entities/index.html.md",
# Verify correct URLs are generated
self.assertIn("https://mikro-orm.io/docs/quick-start", result)
self.assertIn("https://mikro-orm.io/docs/propagation", result)
self.assertIn("https://mikro-orm.io/docs/defining-entities", result)
class TestIssue277DiscordDocs(unittest.TestCase):
"""Test for Discord docs case reported by @skeith"""
def setUp(self):
self.config = {
"name": "DiscordDocs",
"description": "Discord API Documentation",
"base_url": "https://docs.discord.com/",
"selectors": {"main_content": "article"},
}
self.converter = DocToSkillConverter(self.config, dry_run=True)
def test_discord_docs_no_index_html_md(self):
"""Discord docs don't serve .md files - no /index.html.md should be appended"""
urls = [
"https://docs.discord.com/developers/activities/building-an-activity",
"https://docs.discord.com/developers/activities/design-patterns",
"https://docs.discord.com/developers/components/overview",
"https://docs.discord.com/developers/bots/getting-started",
]
for correct_url in correct_urls:
result = self.converter._convert_to_md_urls(urls)
self.assertEqual(len(result), 4)
for url in result:
self.assertNotIn(
"index.html.md",
url,
f"Discord docs should not get /index.html.md appended: {url}",
)
def test_discord_docs_md_urls_preserved(self):
"""Discord llms.txt has .md URLs that should be preserved"""
urls = [
"https://docs.discord.com/developers/activities/building-an-activity.md",
"https://docs.discord.com/developers/components/overview.md",
"https://docs.discord.com/developers/change-log.md",
]
result = self.converter._convert_to_md_urls(urls)
self.assertEqual(len(result), 3)
self.assertEqual(
result[0],
"https://docs.discord.com/developers/activities/building-an-activity.md",
)
def test_discord_docs_mixed_urls(self):
"""Mix of .md and non-.md URLs from Discord docs"""
urls = [
"https://docs.discord.com/developers/activities/building-an-activity.md",
"https://docs.discord.com/developers/overview",
"https://docs.discord.com/developers/overview#quick-start",
"https://docs.discord.com/developers/bots/getting-started.md#step-1",
]
result = self.converter._convert_to_md_urls(urls)
# .md URLs preserved, non-.md as-is, anchors stripped & deduped
self.assertEqual(len(result), 3)
self.assertIn(
correct_url,
"https://docs.discord.com/developers/activities/building-an-activity.md",
result,
)
self.assertIn("https://docs.discord.com/developers/overview", result)
self.assertIn(
"https://docs.discord.com/developers/bots/getting-started.md",
result,
f"Should generate the correct URL: {correct_url}",
)

View File

@@ -1,6 +1,9 @@
"""
Tests for URL conversion logic (_convert_to_md_urls).
Covers bug fix for issue #277: URLs with anchor fragments causing 404 errors.
Updated: _convert_to_md_urls no longer blindly appends /index.html.md to non-.md URLs.
It now strips anchors, deduplicates, and preserves original URLs as-is.
"""
import unittest
@@ -31,11 +34,11 @@ class TestConvertToMdUrls(unittest.TestCase):
result = self.converter._convert_to_md_urls(urls)
# All should be converted without anchor fragments
# All should have anchors stripped
self.assertEqual(len(result), 3)
self.assertEqual(result[0], "https://example.com/docs/quick-start/index.html.md")
self.assertEqual(result[1], "https://example.com/docs/api/index.html.md")
self.assertEqual(result[2], "https://example.com/docs/guide/index.html.md")
self.assertEqual(result[0], "https://example.com/docs/quick-start")
self.assertEqual(result[1], "https://example.com/docs/api")
self.assertEqual(result[2], "https://example.com/docs/guide")
def test_deduplicates_multiple_anchors_same_url(self):
"""Test that multiple anchors on the same URL are deduplicated"""
@@ -50,7 +53,7 @@ class TestConvertToMdUrls(unittest.TestCase):
# Should only have one entry for the base URL
self.assertEqual(len(result), 1)
self.assertEqual(result[0], "https://example.com/docs/api/index.html.md")
self.assertEqual(result[0], "https://example.com/docs/api")
def test_preserves_md_extension_urls(self):
"""Test that URLs already ending with .md are preserved"""
@@ -83,8 +86,27 @@ class TestConvertToMdUrls(unittest.TestCase):
self.assertIn("https://example.com/docs/guide.md", result)
self.assertIn("https://example.com/docs/api.md", result)
def test_non_md_urls_not_converted(self):
"""Test that non-.md URLs are NOT converted to /index.html.md (issue #277)"""
urls = [
"https://example.com/docs/getting-started",
"https://example.com/docs/api-reference",
"https://example.com/docs/tutorials",
]
result = self.converter._convert_to_md_urls(urls)
# Should preserve URLs as-is, NOT append /index.html.md
self.assertEqual(len(result), 3)
for url in result:
self.assertNotIn("index.html.md", url, f"Should not append /index.html.md: {url}")
self.assertEqual(result[0], "https://example.com/docs/getting-started")
self.assertEqual(result[1], "https://example.com/docs/api-reference")
self.assertEqual(result[2], "https://example.com/docs/tutorials")
def test_does_not_match_md_in_path(self):
"""Test that URLs containing 'md' in path (but not ending with .md) are converted"""
"""Test that URLs containing 'md' in path are preserved as-is"""
urls = [
"https://example.com/cmd-line",
"https://example.com/AMD-processors",
@@ -93,27 +115,23 @@ class TestConvertToMdUrls(unittest.TestCase):
result = self.converter._convert_to_md_urls(urls)
# All should be converted since they don't END with .md
# URLs with 'md' substring (not extension) should be preserved as-is
self.assertEqual(len(result), 3)
self.assertEqual(result[0], "https://example.com/cmd-line/index.html.md")
self.assertEqual(result[1], "https://example.com/AMD-processors/index.html.md")
self.assertEqual(result[2], "https://example.com/metadata/index.html.md")
self.assertEqual(result[0], "https://example.com/cmd-line")
self.assertEqual(result[1], "https://example.com/AMD-processors")
self.assertEqual(result[2], "https://example.com/metadata")
def test_removes_trailing_slashes(self):
"""Test that trailing slashes are removed before appending /index.html.md"""
def test_removes_trailing_slashes_for_dedup(self):
"""Test that trailing slash variants are deduplicated"""
urls = [
"https://example.com/docs/api/",
"https://example.com/docs/guide//",
"https://example.com/docs/reference",
"https://example.com/docs/api",
]
result = self.converter._convert_to_md_urls(urls)
# All should have proper /index.html.md without double slashes
self.assertEqual(len(result), 3)
self.assertEqual(result[0], "https://example.com/docs/api/index.html.md")
self.assertEqual(result[1], "https://example.com/docs/guide/index.html.md")
self.assertEqual(result[2], "https://example.com/docs/reference/index.html.md")
# Should deduplicate (trailing slash vs no slash)
self.assertEqual(len(result), 1)
def test_mixed_urls_with_and_without_anchors(self):
"""Test mixed URLs with various formats"""
@@ -130,9 +148,9 @@ class TestConvertToMdUrls(unittest.TestCase):
# Should deduplicate to 3 unique base URLs
self.assertEqual(len(result), 3)
self.assertIn("https://example.com/docs/intro/index.html.md", result)
self.assertIn("https://example.com/docs/intro", result)
self.assertIn("https://example.com/docs/api.md", result)
self.assertIn("https://example.com/docs/guide/index.html.md", result)
self.assertIn("https://example.com/docs/guide", result)
def test_empty_url_list(self):
"""Test that empty URL list returns empty result"""
@@ -155,14 +173,15 @@ class TestConvertToMdUrls(unittest.TestCase):
# Should deduplicate to 3 unique base URLs
self.assertEqual(len(result), 3)
self.assertIn("https://mikro-orm.io/docs/quick-start/index.html.md", result)
self.assertIn("https://mikro-orm.io/docs/propagation/index.html.md", result)
self.assertIn("https://mikro-orm.io/docs/defining-entities/index.html.md", result)
# Should NOT contain any URLs with anchor fragments
for url in result:
self.assertNotIn("#", url, f"URL should not contain anchor: {url}")
# Should NOT contain /index.html.md
for url in result:
self.assertNotIn("index.html.md", url, f"Should not append /index.html.md: {url}")
def test_preserves_query_parameters(self):
"""Test that query parameters are preserved (only anchors stripped)"""
urls = [
@@ -175,8 +194,6 @@ class TestConvertToMdUrls(unittest.TestCase):
# Query parameters should be preserved, anchors stripped
self.assertEqual(len(result), 2) # search deduplicated
# Note: Query parameters might not be ideal for .md conversion,
# but they should be preserved if present
self.assertTrue(
any("?q=test" in url for url in result),
"Query parameter should be preserved",
@@ -199,7 +216,7 @@ class TestConvertToMdUrls(unittest.TestCase):
# All should deduplicate to single base URL
self.assertEqual(len(result), 1)
self.assertEqual(result[0], "https://example.com/docs/guide/index.html.md")
self.assertEqual(result[0], "https://example.com/docs/guide")
def test_url_order_preservation(self):
"""Test that first occurrence of base URL is preserved"""
@@ -214,9 +231,59 @@ class TestConvertToMdUrls(unittest.TestCase):
# Should have 3 unique URLs, first occurrence preserved
self.assertEqual(len(result), 3)
self.assertEqual(result[0], "https://example.com/docs/a/index.html.md")
self.assertEqual(result[1], "https://example.com/docs/b/index.html.md")
self.assertEqual(result[2], "https://example.com/docs/c/index.html.md")
self.assertEqual(result[0], "https://example.com/docs/a")
self.assertEqual(result[1], "https://example.com/docs/b")
self.assertEqual(result[2], "https://example.com/docs/c")
def test_discord_docs_case(self):
"""Test the Discord docs case reported by @skeith in issue #277"""
urls = [
"https://docs.discord.com/developers/activities/building-an-activity",
"https://docs.discord.com/developers/activities/design-patterns",
"https://docs.discord.com/developers/components/overview",
"https://docs.discord.com/developers/bots/getting-started#step-1",
]
result = self.converter._convert_to_md_urls(urls)
# No /index.html.md should be appended
for url in result:
self.assertNotIn("index.html.md", url, f"Should not append /index.html.md: {url}")
self.assertNotIn("#", url, f"Should not contain anchor: {url}")
self.assertEqual(len(result), 4)
class TestHasMdExtension(unittest.TestCase):
"""Test suite for _has_md_extension static method"""
def test_md_extension(self):
self.assertTrue(DocToSkillConverter._has_md_extension("https://example.com/page.md"))
def test_md_with_query(self):
self.assertTrue(DocToSkillConverter._has_md_extension("https://example.com/page.md?v=1"))
def test_no_md_extension(self):
self.assertFalse(DocToSkillConverter._has_md_extension("https://example.com/page"))
def test_md_in_path_not_extension(self):
"""'cmd-line' contains 'md' but is not a .md extension"""
self.assertFalse(DocToSkillConverter._has_md_extension("https://example.com/cmd-line"))
def test_md_in_domain(self):
"""'.md' in domain should not match"""
self.assertFalse(DocToSkillConverter._has_md_extension("https://docs.md.example.com/page"))
def test_mdx_not_md(self):
""".mdx is not .md"""
self.assertFalse(DocToSkillConverter._has_md_extension("https://example.com/page.mdx"))
def test_md_in_middle_of_path(self):
""".md in middle of path should not match"""
self.assertFalse(DocToSkillConverter._has_md_extension("https://example.com/page.md/subpage"))
def test_index_html_md(self):
self.assertTrue(DocToSkillConverter._has_md_extension("https://example.com/page/index.html.md"))
if __name__ == "__main__":