Python 3.14's urlparse() raises ValueError on URLs with unencoded brackets that look like malformed IPv6 (e.g. http://[fdaa:x:x:x::x from docs.openclaw.ai llms-full.txt). sanitize_url() called urlparse() BEFORE encoding brackets, so it crashed before it could fix them.

Fix: catch ValueError from urlparse, encode ALL brackets, then retry. This is safe because if urlparse rejected the brackets, they are NOT valid IPv6 host literals and should be encoded anyway.

Also fixed Discord e2e tests to skip gracefully on network issues.

Fixes #284

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
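For reference, the catch-encode-retry pattern described above looks roughly like the sketch below. sanitize_url() is named in the commit message, but this body is a minimal illustration of the pattern, not the actual implementation in skill_seekers:

from urllib.parse import urlparse, urlunparse

def sanitize_url(url: str) -> str:
    # Minimal sketch of the fix described above; the real sanitize_url()
    # does more than this.
    try:
        parsed = urlparse(url)
    except ValueError:
        # Python 3.14's urlparse rejects bracket sequences that are not
        # valid IPv6 host literals (e.g. "http://[fdaa:x:x:x::x"). Since
        # urlparse refused them, they cannot be an IPv6 host, so
        # percent-encode every bracket and parse again.
        url = url.replace("[", "%5B").replace("]", "%5D")
        parsed = urlparse(url)
    return urlunparse(parsed)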
"""
|
|
E2E test for Issue #277 - Discord docs case reported by @skeith.
|
|
|
|
This test hits the REAL Discord docs llms.txt and verifies that
|
|
no /index.html.md URLs are generated. No mocks.
|
|
|
|
Requires network access. Marked as integration test.
|
|
"""
|
|
|
|
import os
|
|
import shutil
|
|
import unittest
|
|
|
|
import pytest
|
|
|
|
from skill_seekers.cli.doc_scraper import DocToSkillConverter
|
|
from skill_seekers.cli.llms_txt_detector import LlmsTxtDetector
|
|
from skill_seekers.cli.llms_txt_downloader import LlmsTxtDownloader
|
|
from skill_seekers.cli.llms_txt_parser import LlmsTxtParser
|
|
|
|
|
|
@pytest.mark.integration
|
|
class TestIssue277DiscordDocsE2E(unittest.TestCase):
|
|
"""E2E: Reproduce @skeith's report with real Discord docs."""
|
|
|
|
def setUp(self):
|
|
self.base_url = "https://docs.discord.com/"
|
|
self.config = {
|
|
"name": "DiscordDocsE2E",
|
|
"description": "Discord API Documentation",
|
|
"base_url": self.base_url,
|
|
"selectors": {"main_content": "article"},
|
|
"url_patterns": {"include": ["/developers"], "exclude": []},
|
|
}
|
|
self.output_dir = f"output/{self.config['name']}_data"
|
|
|
|
def tearDown(self):
|
|
# Clean up any output created
|
|
for path in [self.output_dir, f"output/{self.config['name']}"]:
|
|
if os.path.exists(path):
|
|
shutil.rmtree(path)
|
|
|
|
def _detect_variants(self):
|
|
"""Helper: detect llms.txt variants, skip test if site unreachable."""
|
|
detector = LlmsTxtDetector(self.base_url)
|
|
variants = detector.detect_all()
|
|
if not variants:
|
|
self.skipTest("Discord docs llms.txt not reachable (network/rate-limit)")
|
|
return variants
|
|
|
|
def test_discord_llms_txt_exists(self):
|
|
"""Verify Discord docs has llms.txt (precondition for the bug)."""
|
|
variants = self._detect_variants()
|
|
self.assertGreater(len(variants), 0)
|
|
|
|
def test_discord_llms_txt_urls_no_index_html_md(self):
|
|
"""Core test: URLs extracted from Discord llms.txt must NOT get /index.html.md appended."""
|
|
# Step 1: Detect llms.txt
|
|
variants = self._detect_variants()
|
|
|
|
# Step 2: Download the largest variant (same logic as doc_scraper)
|
|
downloaded = {}
|
|
for variant_info in variants:
|
|
downloader = LlmsTxtDownloader(variant_info["url"])
|
|
content = downloader.download()
|
|
if content:
|
|
downloaded[variant_info["variant"]] = content
|
|
|
|
self.assertTrue(len(downloaded) > 0, "Failed to download any llms.txt variant")
|
|
|
|
largest_content = max(downloaded.values(), key=len)
|
|
|
|
# Step 3: Parse URLs from llms.txt
|
|
parser = LlmsTxtParser(largest_content, self.base_url)
|
|
extracted_urls = parser.extract_urls()
|
|
self.assertTrue(
|
|
len(extracted_urls) > 0,
|
|
"No URLs extracted from Discord llms.txt",
|
|
)
|
|
|
|
# Step 4: Run _convert_to_md_urls (the function that was causing 404s)
|
|
converter = DocToSkillConverter(self.config, dry_run=True)
|
|
converted_urls = converter._convert_to_md_urls(extracted_urls)
|
|
|
|
# Step 5: Verify NO /index.html.md was blindly appended
|
|
bad_urls = [u for u in converted_urls if "/index.html.md" in u]
|
|
self.assertEqual(
|
|
len(bad_urls),
|
|
0,
|
|
f"Found {len(bad_urls)} URLs with /index.html.md appended "
|
|
f"(would cause 404s):\n"
|
|
+ "\n".join(bad_urls[:10]),
|
|
)
|
|
|
|
# Step 6: Verify no anchor fragments leaked through
|
|
anchor_urls = [u for u in converted_urls if "#" in u]
|
|
self.assertEqual(
|
|
len(anchor_urls),
|
|
0,
|
|
f"Found {len(anchor_urls)} URLs with anchor fragments:\n"
|
|
+ "\n".join(anchor_urls[:10]),
|
|
)
|
|
|
|
# Step 7: Verify we got a reasonable number of URLs
|
|
self.assertGreater(
|
|
len(converted_urls),
|
|
10,
|
|
"Expected at least 10 unique URLs from Discord docs",
|
|
)
|
|
|
|
def test_discord_full_pipeline_no_404_urls(self):
|
|
"""Full pipeline: detector -> downloader -> parser -> converter -> queue.
|
|
|
|
Simulates what `skill-seekers create https://docs.discord.com` does,
|
|
without actually scraping pages.
|
|
"""
|
|
converter = DocToSkillConverter(self.config, dry_run=True)
|
|
|
|
# Run _try_llms_txt which calls _convert_to_md_urls internally
|
|
os.makedirs(os.path.join(converter.skill_dir, "references"), exist_ok=True)
|
|
os.makedirs(os.path.join(converter.data_dir, "pages"), exist_ok=True)
|
|
result = converter._try_llms_txt()
|
|
|
|
# _try_llms_txt returns False when it populates pending_urls for BFS
|
|
# (True means it parsed content directly, no BFS needed)
|
|
if not result:
|
|
# Check every URL in the queue
|
|
for url in converter.pending_urls:
|
|
self.assertNotIn(
|
|
"/index.html.md",
|
|
url,
|
|
f"Queue contains 404-causing URL: {url}",
|
|
)
|
|
self.assertNotIn(
|
|
"#",
|
|
url,
|
|
f"Queue contains URL with anchor fragment: {url}",
|
|
)
|
|
|
|
self.assertGreater(
|
|
len(converter.pending_urls),
|
|
0,
|
|
"Pipeline should have queued URLs for crawling",
|
|
)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
unittest.main()
|
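These are live-network integration tests: assuming the `integration` marker is registered in the project's pytest configuration, they can be run selectively with `pytest -m integration <path-to-this-file>`, and they skip rather than fail when Discord's docs are unreachable.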