fix: stop blindly appending /index.html.md to non-.md URLs (#277)
The previous fix (a82cf69) only addressed anchor fragment stripping but
left the fundamental problem: _convert_to_md_urls() blindly appended
/index.html.md to ALL non-.md URLs from llms.txt. This only works for
Docusaurus sites — for sites like Discord docs it generates mass 404s.
Changes:
- _convert_to_md_urls() now strips anchors and deduplicates only,
preserving original URLs as-is instead of appending /index.html.md
- New _has_md_extension() helper uses urlparse().path.endswith(".md")
instead of error-prone ".md" in url substring matching
- Replaced the `".md" in url` substring checks with _has_md_extension() at 4 call sites (old-file lines 465, 554, 716, 775)
- Removed 24 lines of dead commented-out code
- Added real-world e2e test against docs.discord.com (no mocks)
- Updated unit tests for new behavior (32 tests)
Fixes #277
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -252,6 +252,15 @@ class DocToSkillConverter:
|
||||
|
||||
return not any(pattern in url for pattern in self._exclude_patterns)
|
||||
|
||||
@staticmethod
|
||||
def _has_md_extension(url: str) -> bool:
|
||||
"""Check if URL path ends with .md extension.
|
||||
|
||||
Uses URL path parsing instead of substring matching to avoid
|
||||
false positives on URLs like /guide.mdx, /page.md.html, or hosts under the .md TLD.
|
||||
"""
|
||||
return urlparse(url).path.endswith(".md")
|
||||
|
||||
def save_checkpoint(self) -> None:
|
||||
"""Save progress checkpoint"""
|
||||
if not self.checkpoint_enabled or self.dry_run:
|
||||
@@ -462,7 +471,7 @@ class DocToSkillConverter:
|
||||
else:
|
||||
continue
|
||||
full_url = full_url.split("#")[0]
|
||||
if ".md" in full_url and self.is_valid_url(full_url) and full_url not in links:
|
||||
if self._has_md_extension(full_url) and self.is_valid_url(full_url) and full_url not in links:
|
||||
links.append(full_url)
|
||||
|
||||
return {
|
||||
@@ -551,7 +560,7 @@ class DocToSkillConverter:
|
||||
# Strip anchor fragments
|
||||
full_url = full_url.split("#")[0]
|
||||
# Only include .md URLs to avoid client-side rendered HTML pages
|
||||
if ".md" in full_url and self.is_valid_url(full_url) and full_url not in page["links"]:
|
||||
if self._has_md_extension(full_url) and self.is_valid_url(full_url) and full_url not in page["links"]:
|
||||
page["links"].append(full_url)
|
||||
|
||||
return page
|
||||
@@ -713,7 +722,7 @@ class DocToSkillConverter:
|
||||
response.raise_for_status()
|
||||
|
||||
# Check if this is a Markdown file
|
||||
if url.endswith(".md") or ".md" in url:
|
||||
if self._has_md_extension(url):
|
||||
page = self._extract_markdown_content(response.text, url)
|
||||
else:
|
||||
soup = BeautifulSoup(response.content, "html.parser")
|
||||
@@ -772,7 +781,7 @@ class DocToSkillConverter:
|
||||
response.raise_for_status()
|
||||
|
||||
# Check if this is a Markdown file
|
||||
if url.endswith(".md") or ".md" in url:
|
||||
if self._has_md_extension(url):
|
||||
page = self._extract_markdown_content(response.text, url)
|
||||
else:
|
||||
# BeautifulSoup parsing (still synchronous, but fast)
|
||||
@@ -798,71 +807,45 @@ class DocToSkillConverter:
|
||||
|
||||
def _convert_to_md_urls(self, urls: list[str]) -> list[str]:
|
||||
"""
|
||||
Convert URLs to .md format, trying /index.html.md suffix for non-.md URLs.
|
||||
Strips anchor fragments (#anchor) and deduplicates base URLs to avoid 404 errors.
|
||||
不预先检查 URL 是否存在,直接加入队列,在爬取时再验证。
|
||||
Clean URLs from llms.txt: strip anchor fragments, deduplicate base URLs.
|
||||
|
||||
Previously this method blindly appended /index.html.md to non-.md URLs,
|
||||
which caused 404 errors on sites that don't serve raw markdown files
|
||||
(e.g. Discord docs, see issue #277). Now it preserves original URLs as-is
|
||||
and lets the scraper handle both HTML and markdown content.
|
||||
|
||||
Args:
|
||||
urls: List of URLs to process
|
||||
|
||||
Returns:
|
||||
List of .md URLs (未验证, deduplicated, no anchors)
|
||||
List of cleaned, deduplicated URLs (no anchors)
|
||||
"""
|
||||
from urllib.parse import urlparse, urlunparse
|
||||
|
||||
seen_base_urls = set()
|
||||
md_urls = []
|
||||
cleaned_urls = []
|
||||
|
||||
for url in urls:
|
||||
# Parse URL to extract and remove fragment (anchor)
|
||||
parsed = urlparse(url)
|
||||
base_url = urlunparse(parsed._replace(fragment="")) # Remove #anchor
|
||||
|
||||
# Skip if we've already processed this base URL
|
||||
if base_url in seen_base_urls:
|
||||
continue
|
||||
seen_base_urls.add(base_url)
|
||||
# Normalize trailing slashes for dedup (but keep original form)
|
||||
dedup_key = base_url.rstrip("/")
|
||||
|
||||
# Check if URL already ends with .md (not just contains "md")
|
||||
if base_url.endswith(".md"):
|
||||
md_urls.append(base_url)
|
||||
else:
|
||||
# 直接转换为 .md 格式,不发送 HEAD 请求检查
|
||||
base_url = base_url.rstrip("/")
|
||||
md_url = f"{base_url}/index.html.md"
|
||||
md_urls.append(md_url)
|
||||
# Skip if we've already processed this base URL
|
||||
if dedup_key in seen_base_urls:
|
||||
continue
|
||||
seen_base_urls.add(dedup_key)
|
||||
|
||||
cleaned_urls.append(base_url)
|
||||
|
||||
logger.info(
|
||||
" ✓ Converted %d URLs to %d unique .md URLs (anchors stripped, will validate during crawl)",
|
||||
" ✓ Cleaned %d URLs to %d unique URLs (anchors stripped, will validate during crawl)",
|
||||
len(urls),
|
||||
len(md_urls),
|
||||
len(cleaned_urls),
|
||||
)
|
||||
return md_urls
|
||||
|
||||
# ORIGINAL _convert_to_md_urls (with HEAD request validation):
|
||||
# def _convert_to_md_urls(self, urls: List[str]) -> List[str]:
|
||||
# md_urls = []
|
||||
# non_md_urls = []
|
||||
# for url in urls:
|
||||
# if '.md' in url:
|
||||
# md_urls.append(url)
|
||||
# else:
|
||||
# non_md_urls.append(url)
|
||||
# if non_md_urls:
|
||||
# logger.info(" 🔄 Trying to convert %d non-.md URLs to .md format...", len(non_md_urls))
|
||||
# converted = 0
|
||||
# for url in non_md_urls:
|
||||
# url = url.rstrip('/')
|
||||
# md_url = f"{url}/index.html.md"
|
||||
# try:
|
||||
# resp = requests.head(md_url, timeout=5, allow_redirects=True)
|
||||
# if resp.status_code == 200:
|
||||
# md_urls.append(md_url)
|
||||
# converted += 1
|
||||
# except Exception:
|
||||
# pass
|
||||
# logger.info(" ✓ Converted %d URLs to .md format", converted)
|
||||
# return md_urls
|
||||
return cleaned_urls
|
||||
|
||||
def _try_llms_txt(self) -> bool:
|
||||
"""
|
||||
@@ -933,16 +916,16 @@ class DocToSkillConverter:
|
||||
# Extract URLs from llms.txt and add to pending_urls for BFS crawling
|
||||
extracted_urls = parser.extract_urls()
|
||||
if extracted_urls:
|
||||
# Convert non-.md URLs to .md format by trying /index.html.md suffix
|
||||
md_urls = self._convert_to_md_urls(extracted_urls)
|
||||
# Clean URLs: strip anchors, deduplicate
|
||||
cleaned_urls = self._convert_to_md_urls(extracted_urls)
|
||||
logger.info(
|
||||
"\n🔗 Found %d URLs in llms.txt (%d .md files), starting BFS crawl...",
|
||||
"\n🔗 Found %d URLs in llms.txt (%d unique), starting BFS crawl...",
|
||||
len(extracted_urls),
|
||||
len(md_urls),
|
||||
len(cleaned_urls),
|
||||
)
|
||||
|
||||
# Filter URLs based on url_patterns config
|
||||
for url in md_urls:
|
||||
for url in cleaned_urls:
|
||||
if self.is_valid_url(url):
|
||||
self._enqueue_url(url)
|
||||
|
||||
@@ -1019,16 +1002,16 @@ class DocToSkillConverter:
|
||||
# Extract URLs from llms.txt and add to pending_urls for BFS crawling
|
||||
extracted_urls = parser.extract_urls()
|
||||
if extracted_urls:
|
||||
# Convert non-.md URLs to .md format by trying /index.html.md suffix
|
||||
md_urls = self._convert_to_md_urls(extracted_urls)
|
||||
# Clean URLs: strip anchors, deduplicate
|
||||
cleaned_urls = self._convert_to_md_urls(extracted_urls)
|
||||
logger.info(
|
||||
"\n🔗 Found %d URLs in llms.txt (%d .md files), starting BFS crawl...",
|
||||
"\n🔗 Found %d URLs in llms.txt (%d unique), starting BFS crawl...",
|
||||
len(extracted_urls),
|
||||
len(md_urls),
|
||||
len(cleaned_urls),
|
||||
)
|
||||
|
||||
# Filter URLs based on url_patterns config
|
||||
for url in md_urls:
|
||||
for url in cleaned_urls:
|
||||
if self.is_valid_url(url):
|
||||
self._enqueue_url(url)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user