Python 3.14's urlparse() raises ValueError on URLs with unencoded brackets that look like malformed IPv6 (e.g. http://[fdaa:x:x:x::x from docs.openclaw.ai llms-full.txt). sanitize_url() called urlparse() BEFORE encoding brackets, so it crashed before it could fix them.

Fix: catch ValueError from urlparse, encode ALL brackets, then retry. This is safe because if urlparse rejected the brackets, they are NOT valid IPv6 host literals and should be encoded anyway.

Also fixed Discord e2e tests to skip gracefully on network issues.

Fixes #284

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
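For reference, the catch-encode-retry pattern described above looks roughly like the sketch below. sanitize_url() is named in the commit message, but this body is a minimal illustration of the pattern, not the actual implementation in skill_seekers:

from urllib.parse import urlparse, urlunparse

def sanitize_url(url: str) -> str:
    # Minimal sketch of the fix described above; the real sanitize_url()
    # does more than this.
    try:
        parsed = urlparse(url)
    except ValueError:
        # Python 3.14's urlparse rejects bracket sequences that are not
        # valid IPv6 host literals (e.g. "http://[fdaa:x:x:x::x"). Since
        # urlparse refused them, they cannot be an IPv6 host, so
        # percent-encode every bracket and parse again.
        url = url.replace("[", "%5B").replace("]", "%5D")
        parsed = urlparse(url)
    return urlunparse(parsed)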
"""
|
|
E2E test for Issue #277 - Discord docs case reported by @skeith.
|
|
|
|
This test hits the REAL Discord docs llms.txt and verifies that
|
|
no /index.html.md URLs are generated. No mocks.
|
|
|
|
Requires network access. Marked as integration test.
|
|
"""
|
|
|
|
import os
|
|
import shutil
|
|
import unittest
|
|
|
|
import pytest
|
|
|
|
from skill_seekers.cli.doc_scraper import DocToSkillConverter
|
|
from skill_seekers.cli.llms_txt_detector import LlmsTxtDetector
|
|
from skill_seekers.cli.llms_txt_downloader import LlmsTxtDownloader
|
|
from skill_seekers.cli.llms_txt_parser import LlmsTxtParser
|
|
|
|
|
|
@pytest.mark.integration
|
|
class TestIssue277DiscordDocsE2E(unittest.TestCase):
|
|
"""E2E: Reproduce @skeith's report with real Discord docs."""
|
|
|
|
def setUp(self):
|
|
self.base_url = "https://docs.discord.com/"
|
|
self.config = {
|
|
"name": "DiscordDocsE2E",
|
|
"description": "Discord API Documentation",
|
|
"base_url": self.base_url,
|
|
"selectors": {"main_content": "article"},
|
|
"url_patterns": {"include": ["/developers"], "exclude": []},
|
|
}
|
|
self.output_dir = f"output/{self.config['name']}_data"
|
|
|
|
def tearDown(self):
|
|
# Clean up any output created
|
|
for path in [self.output_dir, f"output/{self.config['name']}"]:
|
|
if os.path.exists(path):
|
|
shutil.rmtree(path)
|
|
|
|
def _detect_variants(self):
|
|
"""Helper: detect llms.txt variants, skip test if site unreachable."""
|
|
detector = LlmsTxtDetector(self.base_url)
|
|
variants = detector.detect_all()
|
|
if not variants:
|
|
self.skipTest("Discord docs llms.txt not reachable (network/rate-limit)")
|
|
return variants
|
|
|
|
def test_discord_llms_txt_exists(self):
|
|
"""Verify Discord docs has llms.txt (precondition for the bug)."""
|
|
variants = self._detect_variants()
|
|
self.assertGreater(len(variants), 0)
|
|
|
|
def test_discord_llms_txt_urls_no_index_html_md(self):
|
|
"""Core test: URLs extracted from Discord llms.txt must NOT get /index.html.md appended."""
|
|
# Step 1: Detect llms.txt
|
|
variants = self._detect_variants()
|
|
|
|
# Step 2: Download the largest variant (same logic as doc_scraper)
|
|
downloaded = {}
|
|
for variant_info in variants:
|
|
downloader = LlmsTxtDownloader(variant_info["url"])
|
|
content = downloader.download()
|
|
if content:
|
|
downloaded[variant_info["variant"]] = content
|
|
|
|
self.assertTrue(len(downloaded) > 0, "Failed to download any llms.txt variant")
|
|
|
|
largest_content = max(downloaded.values(), key=len)
|
|
|
|
# Step 3: Parse URLs from llms.txt
|
|
parser = LlmsTxtParser(largest_content, self.base_url)
|
|
extracted_urls = parser.extract_urls()
|
|
self.assertTrue(
|
|
len(extracted_urls) > 0,
|
|
"No URLs extracted from Discord llms.txt",
|
|
)
|
|
|
|
# Step 4: Run _convert_to_md_urls (the function that was causing 404s)
|
|
converter = DocToSkillConverter(self.config, dry_run=True)
|
|
converted_urls = converter._convert_to_md_urls(extracted_urls)
|
|
|
|
# Step 5: Verify NO /index.html.md was blindly appended
|
|
bad_urls = [u for u in converted_urls if "/index.html.md" in u]
|
|
self.assertEqual(
|
|
len(bad_urls),
|
|
0,
|
|
f"Found {len(bad_urls)} URLs with /index.html.md appended "
|
|
f"(would cause 404s):\n"
|
|
+ "\n".join(bad_urls[:10]),
|
|
)
|
|
|
|
# Step 6: Verify no anchor fragments leaked through
|
|
anchor_urls = [u for u in converted_urls if "#" in u]
|
|
self.assertEqual(
|
|
len(anchor_urls),
|
|
0,
|
|
f"Found {len(anchor_urls)} URLs with anchor fragments:\n"
|
|
+ "\n".join(anchor_urls[:10]),
|
|
)
|
|
|
|
# Step 7: Verify we got a reasonable number of URLs
|
|
self.assertGreater(
|
|
len(converted_urls),
|
|
10,
|
|
"Expected at least 10 unique URLs from Discord docs",
|
|
)
|
|
|
|
def test_discord_full_pipeline_no_404_urls(self):
|
|
"""Full pipeline: detector -> downloader -> parser -> converter -> queue.
|
|
|
|
Simulates what `skill-seekers create https://docs.discord.com` does,
|
|
without actually scraping pages.
|
|
"""
|
|
converter = DocToSkillConverter(self.config, dry_run=True)
|
|
|
|
# Run _try_llms_txt which calls _convert_to_md_urls internally
|
|
os.makedirs(os.path.join(converter.skill_dir, "references"), exist_ok=True)
|
|
os.makedirs(os.path.join(converter.data_dir, "pages"), exist_ok=True)
|
|
result = converter._try_llms_txt()
|
|
|
|
# _try_llms_txt returns False when it populates pending_urls for BFS
|
|
# (True means it parsed content directly, no BFS needed)
|
|
if not result:
|
|
# Check every URL in the queue
|
|
for url in converter.pending_urls:
|
|
self.assertNotIn(
|
|
"/index.html.md",
|
|
url,
|
|
f"Queue contains 404-causing URL: {url}",
|
|
)
|
|
self.assertNotIn(
|
|
"#",
|
|
url,
|
|
f"Queue contains URL with anchor fragment: {url}",
|
|
)
|
|
|
|
self.assertGreater(
|
|
len(converter.pending_urls),
|
|
0,
|
|
"Pipeline should have queued URLs for crawling",
|
|
)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
unittest.main()
|
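These are live-network integration tests: assuming the `integration` marker is registered in the project's pytest configuration, they can be run selectively with `pytest -m integration <path-to-this-file>`, and they skip rather than fail when Discord's docs are unreachable.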