Files
skill-seekers-reference/tests/test_issue_277_discord_e2e.py
yusyus 1d3d7389d7 fix: sanitize_url crashes on Python 3.14 strict urlparse (#284)
Python 3.14's urlparse() raises ValueError on URLs with unencoded
brackets that look like malformed IPv6 (e.g. http://[fdaa:x:x:x::x
from docs.openclaw.ai llms-full.txt). sanitize_url() called urlparse()
BEFORE encoding brackets, so it crashed before it could fix them.

Fix: catch ValueError from urlparse, encode ALL brackets, then retry.
This is safe because if urlparse rejected the brackets, they are NOT
valid IPv6 host literals and should be encoded anyway.

Also fixed Discord e2e tests to skip gracefully on network issues.

Fixes #284

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-21 00:30:48 +03:00

149 lines
5.3 KiB
Python

"""
E2E test for Issue #277 - Discord docs case reported by @skeith.
This test hits the REAL Discord docs llms.txt and verifies that
no /index.html.md URLs are generated. No mocks.
Requires network access. Marked as integration test.
"""
import os
import shutil
import unittest
import pytest
from skill_seekers.cli.doc_scraper import DocToSkillConverter
from skill_seekers.cli.llms_txt_detector import LlmsTxtDetector
from skill_seekers.cli.llms_txt_downloader import LlmsTxtDownloader
from skill_seekers.cli.llms_txt_parser import LlmsTxtParser
@pytest.mark.integration
class TestIssue277DiscordDocsE2E(unittest.TestCase):
    """E2E: Reproduce @skeith's report with real Discord docs."""

    def setUp(self):
        # Real-site configuration mirroring the original bug report.
        self.base_url = "https://docs.discord.com/"
        self.config = {
            "name": "DiscordDocsE2E",
            "description": "Discord API Documentation",
            "base_url": self.base_url,
            "selectors": {"main_content": "article"},
            "url_patterns": {"include": ["/developers"], "exclude": []},
        }
        self.output_dir = f"output/{self.config['name']}_data"

    def tearDown(self):
        # Remove any artifacts the converter may have written during a test.
        for candidate in (self.output_dir, f"output/{self.config['name']}"):
            if os.path.exists(candidate):
                shutil.rmtree(candidate)

    def _detect_variants(self):
        """Helper: detect llms.txt variants, skip test if site unreachable."""
        found = LlmsTxtDetector(self.base_url).detect_all()
        if not found:
            self.skipTest("Discord docs llms.txt not reachable (network/rate-limit)")
        return found

    def test_discord_llms_txt_exists(self):
        """Verify Discord docs has llms.txt (precondition for the bug)."""
        self.assertGreater(len(self._detect_variants()), 0)

    def test_discord_llms_txt_urls_no_index_html_md(self):
        """Core test: URLs extracted from Discord llms.txt must NOT get /index.html.md appended."""
        # Step 1: Detect llms.txt
        variants = self._detect_variants()
        # Step 2: Download the largest variant (same logic as doc_scraper)
        downloaded = {
            info["variant"]: body
            for info in variants
            if (body := LlmsTxtDownloader(info["url"]).download())
        }
        self.assertTrue(len(downloaded) > 0, "Failed to download any llms.txt variant")
        largest_content = max(downloaded.values(), key=len)
        # Step 3: Parse URLs from llms.txt
        extracted_urls = LlmsTxtParser(largest_content, self.base_url).extract_urls()
        self.assertTrue(
            len(extracted_urls) > 0,
            "No URLs extracted from Discord llms.txt",
        )
        # Step 4: Run _convert_to_md_urls (the function that was causing 404s)
        converter = DocToSkillConverter(self.config, dry_run=True)
        converted_urls = converter._convert_to_md_urls(extracted_urls)
        # Step 5: Verify NO /index.html.md was blindly appended
        bad_urls = [candidate for candidate in converted_urls if "/index.html.md" in candidate]
        self.assertEqual(
            len(bad_urls),
            0,
            f"Found {len(bad_urls)} URLs with /index.html.md appended "
            f"(would cause 404s):\n"
            + "\n".join(bad_urls[:10]),
        )
        # Step 6: Verify no anchor fragments leaked through
        anchor_urls = [candidate for candidate in converted_urls if "#" in candidate]
        self.assertEqual(
            len(anchor_urls),
            0,
            f"Found {len(anchor_urls)} URLs with anchor fragments:\n"
            + "\n".join(anchor_urls[:10]),
        )
        # Step 7: Verify we got a reasonable number of URLs
        self.assertGreater(
            len(converted_urls),
            10,
            "Expected at least 10 unique URLs from Discord docs",
        )

    def test_discord_full_pipeline_no_404_urls(self):
        """Full pipeline: detector -> downloader -> parser -> converter -> queue.
        Simulates what `skill-seekers create https://docs.discord.com` does,
        without actually scraping pages.
        """
        converter = DocToSkillConverter(self.config, dry_run=True)
        # Run _try_llms_txt which calls _convert_to_md_urls internally
        os.makedirs(os.path.join(converter.skill_dir, "references"), exist_ok=True)
        os.makedirs(os.path.join(converter.data_dir, "pages"), exist_ok=True)
        parsed_directly = converter._try_llms_txt()
        # _try_llms_txt returns False when it populates pending_urls for BFS
        # (True means it parsed content directly, no BFS needed)
        if parsed_directly:
            return
        # Check every URL in the queue
        for url in converter.pending_urls:
            self.assertNotIn(
                "/index.html.md",
                url,
                f"Queue contains 404-causing URL: {url}",
            )
            self.assertNotIn(
                "#",
                url,
                f"Queue contains URL with anchor fragment: {url}",
            )
        self.assertGreater(
            len(converter.pending_urls),
            0,
            "Pipeline should have queued URLs for crawling",
        )
# Allow running this integration test module directly (outside pytest).
if __name__ == "__main__":
    unittest.main()