Files
skill-seekers-reference/tests/test_issue_277_discord_e2e.py
yusyus 0fa99641aa style: fix pre-existing ruff format issues in 5 files
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-21 21:24:21 +03:00

147 lines
5.3 KiB
Python

"""
E2E test for Issue #277 - Discord docs case reported by @skeith.
This test hits the REAL Discord docs llms.txt and verifies that
no /index.html.md URLs are generated. No mocks.
Requires network access. Marked as integration test.
"""
import os
import shutil
import unittest
import pytest
from skill_seekers.cli.doc_scraper import DocToSkillConverter
from skill_seekers.cli.llms_txt_detector import LlmsTxtDetector
from skill_seekers.cli.llms_txt_downloader import LlmsTxtDownloader
from skill_seekers.cli.llms_txt_parser import LlmsTxtParser
@pytest.mark.integration
class TestIssue277DiscordDocsE2E(unittest.TestCase):
"""E2E: Reproduce @skeith's report with real Discord docs."""
def setUp(self):
self.base_url = "https://docs.discord.com/"
self.config = {
"name": "DiscordDocsE2E",
"description": "Discord API Documentation",
"base_url": self.base_url,
"selectors": {"main_content": "article"},
"url_patterns": {"include": ["/developers"], "exclude": []},
}
self.output_dir = f"output/{self.config['name']}_data"
def tearDown(self):
# Clean up any output created
for path in [self.output_dir, f"output/{self.config['name']}"]:
if os.path.exists(path):
shutil.rmtree(path)
def _detect_variants(self):
"""Helper: detect llms.txt variants, skip test if site unreachable."""
detector = LlmsTxtDetector(self.base_url)
variants = detector.detect_all()
if not variants:
self.skipTest("Discord docs llms.txt not reachable (network/rate-limit)")
return variants
def test_discord_llms_txt_exists(self):
"""Verify Discord docs has llms.txt (precondition for the bug)."""
variants = self._detect_variants()
self.assertGreater(len(variants), 0)
def test_discord_llms_txt_urls_no_index_html_md(self):
"""Core test: URLs extracted from Discord llms.txt must NOT get /index.html.md appended."""
# Step 1: Detect llms.txt
variants = self._detect_variants()
# Step 2: Download the largest variant (same logic as doc_scraper)
downloaded = {}
for variant_info in variants:
downloader = LlmsTxtDownloader(variant_info["url"])
content = downloader.download()
if content:
downloaded[variant_info["variant"]] = content
self.assertTrue(len(downloaded) > 0, "Failed to download any llms.txt variant")
largest_content = max(downloaded.values(), key=len)
# Step 3: Parse URLs from llms.txt
parser = LlmsTxtParser(largest_content, self.base_url)
extracted_urls = parser.extract_urls()
self.assertTrue(
len(extracted_urls) > 0,
"No URLs extracted from Discord llms.txt",
)
# Step 4: Run _convert_to_md_urls (the function that was causing 404s)
converter = DocToSkillConverter(self.config, dry_run=True)
converted_urls = converter._convert_to_md_urls(extracted_urls)
# Step 5: Verify NO /index.html.md was blindly appended
bad_urls = [u for u in converted_urls if "/index.html.md" in u]
self.assertEqual(
len(bad_urls),
0,
f"Found {len(bad_urls)} URLs with /index.html.md appended "
f"(would cause 404s):\n" + "\n".join(bad_urls[:10]),
)
# Step 6: Verify no anchor fragments leaked through
anchor_urls = [u for u in converted_urls if "#" in u]
self.assertEqual(
len(anchor_urls),
0,
f"Found {len(anchor_urls)} URLs with anchor fragments:\n" + "\n".join(anchor_urls[:10]),
)
# Step 7: Verify we got a reasonable number of URLs
self.assertGreater(
len(converted_urls),
10,
"Expected at least 10 unique URLs from Discord docs",
)
def test_discord_full_pipeline_no_404_urls(self):
"""Full pipeline: detector -> downloader -> parser -> converter -> queue.
Simulates what `skill-seekers create https://docs.discord.com` does,
without actually scraping pages.
"""
converter = DocToSkillConverter(self.config, dry_run=True)
# Run _try_llms_txt which calls _convert_to_md_urls internally
os.makedirs(os.path.join(converter.skill_dir, "references"), exist_ok=True)
os.makedirs(os.path.join(converter.data_dir, "pages"), exist_ok=True)
result = converter._try_llms_txt()
# _try_llms_txt returns False when it populates pending_urls for BFS
# (True means it parsed content directly, no BFS needed)
if not result:
# Check every URL in the queue
for url in converter.pending_urls:
self.assertNotIn(
"/index.html.md",
url,
f"Queue contains 404-causing URL: {url}",
)
self.assertNotIn(
"#",
url,
f"Queue contains URL with anchor fragment: {url}",
)
self.assertGreater(
len(converter.pending_urls),
0,
"Pipeline should have queued URLs for crawling",
)
if __name__ == "__main__":
unittest.main()