Python 3.14's urlparse() raises ValueError on URLs with unencoded brackets that look like malformed IPv6 (e.g. http://[fdaa:x:x:x::x from docs.openclaw.ai llms-full.txt). sanitize_url() called urlparse() BEFORE encoding brackets, so it crashed before it could fix them.

Fix: catch ValueError from urlparse, encode ALL brackets, then retry. This is safe because if urlparse rejected the brackets, they are NOT valid IPv6 host literals and should be encoded anyway.

Also fixed Discord e2e tests to skip gracefully on network issues.

Fixes #284

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
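A minimal sketch of the retry approach described above, assuming a helper shaped roughly like the scraper's URL sanitizer (the name `sanitize_url` is taken from this description; the actual signature and placement in the codebase may differ):

```python
from urllib.parse import urlparse


def sanitize_url(url: str) -> str:
    """Percent-encode stray brackets when urlparse() rejects them (issue #284)."""
    try:
        urlparse(url)
    except ValueError:
        # urlparse only rejects bracket sequences that are NOT valid IPv6 host
        # literals, so encoding every bracket and retrying is safe.
        url = url.replace("[", "%5B").replace("]", "%5D")
        urlparse(url)  # re-raise if the URL is still unparseable
    return url
```

For example, `sanitize_url("http://[fdaa:x:x:x::x")` returns the URL with `%5B` in place of the opening bracket instead of raising.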
"""
|
|
Tests for Markdown parsing and BFS URL crawling features.
|
|
|
|
Tests the following functionality:
|
|
1. Markdown file content extraction (_extract_markdown_content)
|
|
2. HTML fallback when .md URL returns HTML (_extract_html_as_markdown)
|
|
3. URL extraction from llms.txt (extract_urls, _clean_url)
|
|
4. Empty/short content filtering in save_page
|
|
"""
|
|
|
|
import os
|
|
import shutil
|
|
import unittest
|
|
|
|
|
|
class TestMarkdownContentExtraction(unittest.TestCase):
|
|
"""Test Markdown file parsing in doc_scraper."""
|
|
|
|
def setUp(self):
|
|
"""Set up test fixtures."""
|
|
from skill_seekers.cli.doc_scraper import DocToSkillConverter
|
|
|
|
self.config = {
|
|
"name": "test_md_parsing",
|
|
"base_url": "https://example.com",
|
|
"selectors": {},
|
|
"url_patterns": {"include": [], "exclude": []},
|
|
"categories": {},
|
|
}
|
|
self.converter = DocToSkillConverter(self.config)
|
|
|
|
def tearDown(self):
|
|
"""Clean up output directory."""
|
|
output_dir = f"output/{self.config['name']}_data"
|
|
if os.path.exists(output_dir):
|
|
shutil.rmtree(output_dir)
|
|
|
|
def test_extract_title_from_h1(self):
|
|
"""Test extracting title from first h1."""
|
|
content = "# My Documentation Title\n\nSome content here."
|
|
result = self.converter._extract_markdown_content(content, "https://example.com/test.md")
|
|
self.assertEqual(result["title"], "My Documentation Title")
|
|
|
|
def test_extract_headings_h2_to_h6(self):
|
|
"""Test extracting h2-h6 headings (not h1)."""
|
|
content = """# Title
|
|
|
|
## Section One
|
|
### Subsection A
|
|
#### Deep Section
|
|
##### Deeper
|
|
###### Deepest
|
|
|
|
Content here.
|
|
"""
|
|
result = self.converter._extract_markdown_content(content, "https://example.com/test.md")
|
|
# Should have 5 headings (h2-h6), not h1
|
|
self.assertEqual(len(result["headings"]), 5)
|
|
self.assertEqual(result["headings"][0]["level"], "h2")
|
|
self.assertEqual(result["headings"][0]["text"], "Section One")
|
|
|
|
def test_extract_code_blocks_with_language(self):
|
|
"""Test extracting code blocks with language tags."""
|
|
content = """# API Guide
|
|
|
|
```python
|
|
def hello():
|
|
return "Hello, World!"
|
|
```
|
|
|
|
Some explanation.
|
|
|
|
```javascript
|
|
const greet = () => console.log("Hi");
|
|
```
|
|
|
|
```
|
|
plain code without language
|
|
```
|
|
"""
|
|
result = self.converter._extract_markdown_content(content, "https://example.com/test.md")
|
|
self.assertEqual(len(result["code_samples"]), 3)
|
|
self.assertEqual(result["code_samples"][0]["language"], "python")
|
|
self.assertEqual(result["code_samples"][1]["language"], "javascript")
|
|
self.assertIn(result["code_samples"][2]["language"], ("unknown", "text"))
|
|
|
|
def test_extract_markdown_links_only_md_files(self):
|
|
"""Test that only .md links are extracted."""
|
|
content = """# Links
|
|
|
|
- [Markdown Doc](./guide.md)
|
|
- [Another MD](https://example.com/api.md)
|
|
- [HTML Page](./page.html)
|
|
- [External](https://google.com)
|
|
"""
|
|
result = self.converter._extract_markdown_content(
|
|
content, "https://example.com/docs/test.md"
|
|
)
|
|
# Should only include .md links
|
|
md_links = [link for link in result["links"] if ".md" in link]
|
|
self.assertEqual(len(md_links), len(result["links"]))
|
|
|
|
def test_extract_content_paragraphs(self):
|
|
"""Test extracting paragraph content."""
|
|
content = """# Title
|
|
|
|
This is a paragraph with enough content to pass the minimum length filter.
|
|
|
|
Short.
|
|
|
|
Another paragraph that should be included in the final content output.
|
|
"""
|
|
result = self.converter._extract_markdown_content(content, "https://example.com/test.md")
|
|
self.assertIn("paragraph with enough content", result["content"])
|
|
self.assertNotIn("Short.", result["content"])
|
|
|
|
def test_detect_html_in_md_url(self):
|
|
"""Test that HTML content is detected when .md URL returns HTML."""
|
|
html_content = "<!DOCTYPE html><html><head><title>Page</title></head><body><h1>Hello</h1></body></html>"
|
|
result = self.converter._extract_markdown_content(
|
|
html_content, "https://example.com/test.md"
|
|
)
|
|
self.assertEqual(result["title"], "Page")
|
|
|
|
|
|
class TestHtmlAsMarkdownExtraction(unittest.TestCase):
|
|
"""Test HTML to markdown-like extraction."""
|
|
|
|
def setUp(self):
|
|
"""Set up test fixtures."""
|
|
from skill_seekers.cli.doc_scraper import DocToSkillConverter
|
|
|
|
self.config = {
|
|
"name": "test_html_fallback",
|
|
"base_url": "https://example.com",
|
|
"selectors": {},
|
|
"url_patterns": {"include": [], "exclude": []},
|
|
"categories": {},
|
|
}
|
|
self.converter = DocToSkillConverter(self.config)
|
|
|
|
def tearDown(self):
|
|
"""Clean up output directory."""
|
|
output_dir = f"output/{self.config['name']}_data"
|
|
if os.path.exists(output_dir):
|
|
shutil.rmtree(output_dir)
|
|
|
|
def test_extract_title_from_html(self):
|
|
"""Test extracting title from HTML title tag."""
|
|
html = "<html><head><title>My Page Title</title></head><body></body></html>"
|
|
result = self.converter._extract_html_as_markdown(html, "https://example.com/test.md")
|
|
self.assertEqual(result["title"], "My Page Title")
|
|
|
|
def test_find_main_content_area(self):
|
|
"""Test finding main content from various selectors."""
|
|
html = """
|
|
<html><body>
|
|
<nav>Navigation</nav>
|
|
<main>
|
|
<h1>Main Content</h1>
|
|
<p>This is the main content area with enough text to pass filters.</p>
|
|
</main>
|
|
<footer>Footer</footer>
|
|
</body></html>
|
|
"""
|
|
result = self.converter._extract_html_as_markdown(html, "https://example.com/test.md")
|
|
self.assertIn("main content area", result["content"].lower())
|
|
|
|
def test_extract_code_blocks_from_html(self):
|
|
"""Test extracting code blocks from HTML pre/code tags."""
|
|
html = """
|
|
<html><body>
|
|
<main>
|
|
<pre><code class="language-python">print("hello")</code></pre>
|
|
</main>
|
|
</body></html>
|
|
"""
|
|
result = self.converter._extract_html_as_markdown(html, "https://example.com/test.md")
|
|
self.assertTrue(len(result["code_samples"]) > 0)
|
|
|
|
def test_fallback_to_body_when_no_main(self):
|
|
"""Test fallback to body when no main/article element."""
|
|
html = """
|
|
<html><body>
|
|
<div>
|
|
<h2>Section</h2>
|
|
<p>Content in body without main element, long enough to pass filter.</p>
|
|
</div>
|
|
</body></html>
|
|
"""
|
|
result = self.converter._extract_html_as_markdown(html, "https://example.com/test.md")
|
|
self.assertTrue(len(result["headings"]) > 0 or len(result["content"]) > 0)
|
|
|
|
|
|
class TestLlmsTxtUrlExtraction(unittest.TestCase):
|
|
"""Test URL extraction from llms.txt content."""
|
|
|
|
def test_extract_markdown_style_links(self):
|
|
"""Test extracting [text](url) style links."""
|
|
from skill_seekers.cli.llms_txt_parser import LlmsTxtParser
|
|
|
|
content = """
|
|
# Documentation Index
|
|
|
|
- [Getting Started](https://docs.example.com/start.md)
|
|
- [API Reference](https://docs.example.com/api/index.md)
|
|
- [Advanced Guide](https://docs.example.com/advanced.md)
|
|
"""
|
|
parser = LlmsTxtParser(content, base_url="https://docs.example.com")
|
|
urls = parser.extract_urls()
|
|
|
|
self.assertIn("https://docs.example.com/start.md", urls)
|
|
self.assertIn("https://docs.example.com/api/index.md", urls)
|
|
self.assertIn("https://docs.example.com/advanced.md", urls)
|
|
|
|
def test_extract_bare_urls(self):
|
|
"""Test extracting bare URLs without markdown syntax."""
|
|
from skill_seekers.cli.llms_txt_parser import LlmsTxtParser
|
|
|
|
content = """
|
|
Documentation: https://example.com/docs/guide.md
|
|
API: https://example.com/api/reference.md
|
|
"""
|
|
parser = LlmsTxtParser(content)
|
|
urls = parser.extract_urls()
|
|
|
|
self.assertIn("https://example.com/docs/guide.md", urls)
|
|
self.assertIn("https://example.com/api/reference.md", urls)
|
|
|
|
def test_resolve_relative_urls(self):
|
|
"""Test resolving relative URLs with base_url."""
|
|
from skill_seekers.cli.llms_txt_parser import LlmsTxtParser
|
|
|
|
content = """
|
|
- [Local Doc](./docs/guide.md)
|
|
- [Parent](../api/ref.md)
|
|
"""
|
|
parser = LlmsTxtParser(content, base_url="https://example.com/learn/")
|
|
urls = parser.extract_urls()
|
|
|
|
# Should resolve relative paths
|
|
self.assertTrue(any("docs/guide.md" in url for url in urls))
|
|
|
|
def test_clean_url_invalid_anchor_pattern(self):
|
|
"""Test cleaning URLs with invalid anchor patterns."""
|
|
from skill_seekers.cli.llms_txt_parser import LlmsTxtParser
|
|
|
|
parser = LlmsTxtParser("", base_url="https://example.com")
|
|
|
|
# Invalid: path after anchor
|
|
result = parser._clean_url("https://example.com/page#section/index.html.md")
|
|
self.assertEqual(result, "https://example.com/page")
|
|
|
|
def test_clean_url_valid_anchor(self):
|
|
"""Test that valid anchors are preserved."""
|
|
from skill_seekers.cli.llms_txt_parser import LlmsTxtParser
|
|
|
|
parser = LlmsTxtParser("", base_url="https://example.com")
|
|
|
|
# Valid anchor should be unchanged
|
|
result = parser._clean_url("https://example.com/page.md#section")
|
|
self.assertEqual(result, "https://example.com/page.md#section")
|
|
|
|
def test_clean_url_no_anchor(self):
|
|
"""Test that URLs without anchors are unchanged."""
|
|
from skill_seekers.cli.llms_txt_parser import LlmsTxtParser
|
|
|
|
parser = LlmsTxtParser("", base_url="https://example.com")
|
|
|
|
result = parser._clean_url("https://example.com/docs/guide.md")
|
|
self.assertEqual(result, "https://example.com/docs/guide.md")
|
|
|
|
def test_clean_url_bracket_encoding(self):
|
|
"""Test that square brackets are percent-encoded in URL path (#284)."""
|
|
from skill_seekers.cli.llms_txt_parser import LlmsTxtParser
|
|
|
|
parser = LlmsTxtParser("", base_url="https://example.com")
|
|
|
|
result = parser._clean_url("https://example.com/api/[v1]/users")
|
|
self.assertEqual(result, "https://example.com/api/%5Bv1%5D/users")
|
|
|
|
def test_clean_url_bracket_encoding_preserves_host(self):
|
|
"""Test that bracket encoding does not affect host (IPv6 literals)."""
|
|
from skill_seekers.cli.llms_txt_parser import LlmsTxtParser
|
|
|
|
parser = LlmsTxtParser("", base_url="https://example.com")
|
|
|
|
# Brackets should only be encoded in path, not in host
|
|
result = parser._clean_url("https://example.com/path/[param]/end")
|
|
self.assertIn("%5B", result)
|
|
self.assertIn("%5D", result)
|
|
self.assertIn("example.com", result)
|
|
|
|
def test_clean_url_bracket_in_query(self):
|
|
"""Test that brackets in query params are also encoded."""
|
|
from skill_seekers.cli.llms_txt_parser import LlmsTxtParser
|
|
|
|
parser = LlmsTxtParser("", base_url="https://example.com")
|
|
|
|
result = parser._clean_url("https://example.com/search?filter=[active]")
|
|
self.assertEqual(result, "https://example.com/search?filter=%5Bactive%5D")
|
|
|
|
def test_clean_url_malformed_anchor_with_brackets(self):
|
|
"""Test combined malformed anchor stripping + bracket encoding."""
|
|
from skill_seekers.cli.llms_txt_parser import LlmsTxtParser
|
|
|
|
parser = LlmsTxtParser("", base_url="https://example.com")
|
|
|
|
# Malformed anchor should be stripped, then brackets encoded
|
|
result = parser._clean_url("https://example.com/api/[v1]/page#section/deep")
|
|
self.assertEqual(result, "https://example.com/api/%5Bv1%5D/page")
|
|
|
|
def test_clean_url_malformed_ipv6_no_crash(self):
|
|
"""Test that incomplete IPv6 placeholder URLs don't crash (issue #284).
|
|
|
|
Python 3.14 raises ValueError from urlparse() on these URLs.
|
|
Seen in real-world llms-full.txt from docs.openclaw.ai.
|
|
"""
|
|
from skill_seekers.cli.llms_txt_parser import LlmsTxtParser
|
|
|
|
parser = LlmsTxtParser("", base_url="https://example.com")
|
|
|
|
# Must not raise ValueError
|
|
result = parser._clean_url("http://[fdaa:x:x:x:x::x")
|
|
self.assertIn("%5B", result)
|
|
self.assertNotIn("[", result)
|
|
|
|
def test_extract_urls_with_ipv6_placeholder_no_crash(self):
|
|
"""Test that extract_urls handles content with broken IPv6 URLs (issue #284)."""
|
|
from skill_seekers.cli.llms_txt_parser import LlmsTxtParser
|
|
|
|
content = """# Docs
|
|
- [Guide](https://example.com/guide.md)
|
|
- Connect to http://[fdaa:x:x:x:x::x for private networking
|
|
- [API](https://example.com/api.md)
|
|
"""
|
|
parser = LlmsTxtParser(content, base_url="https://example.com")
|
|
|
|
# Must not raise ValueError
|
|
urls = parser.extract_urls()
|
|
# Should still extract the valid URLs
|
|
valid = [u for u in urls if "example.com" in u]
|
|
self.assertGreaterEqual(len(valid), 2)
|
|
|
|
def test_deduplicate_urls(self):
|
|
"""Test that duplicate URLs are removed."""
|
|
from skill_seekers.cli.llms_txt_parser import LlmsTxtParser
|
|
|
|
content = """
|
|
- [Doc 1](https://example.com/doc.md)
|
|
- [Doc 2](https://example.com/doc.md)
|
|
https://example.com/doc.md
|
|
"""
|
|
parser = LlmsTxtParser(content)
|
|
urls = parser.extract_urls()
|
|
|
|
# Should only have one instance
|
|
count = sum(1 for u in urls if u == "https://example.com/doc.md")
|
|
self.assertEqual(count, 1)
|
|
|
|
|
|
class TestSavePageContentFiltering(unittest.TestCase):
|
|
"""Test content filtering in save_page."""
|
|
|
|
def setUp(self):
|
|
"""Set up test fixtures."""
|
|
from skill_seekers.cli.doc_scraper import DocToSkillConverter
|
|
|
|
self.config = {
|
|
"name": "test_save_filter",
|
|
"base_url": "https://example.com",
|
|
"selectors": {},
|
|
"url_patterns": {"include": [], "exclude": []},
|
|
"categories": {},
|
|
}
|
|
self.converter = DocToSkillConverter(self.config)
|
|
|
|
def tearDown(self):
|
|
"""Clean up output directory."""
|
|
output_dir = f"output/{self.config['name']}_data"
|
|
if os.path.exists(output_dir):
|
|
shutil.rmtree(output_dir)
|
|
|
|
def test_skip_empty_content(self):
|
|
"""Test that pages with empty content are skipped."""
|
|
page = {
|
|
"url": "https://example.com/empty",
|
|
"title": "Empty Page",
|
|
"content": "",
|
|
"headings": [],
|
|
"code_samples": [],
|
|
}
|
|
|
|
self.converter.save_page(page)
|
|
|
|
pages_dir = os.path.join(self.converter.data_dir, "pages")
|
|
if os.path.exists(pages_dir):
|
|
self.assertEqual(len(os.listdir(pages_dir)), 0)
|
|
|
|
def test_skip_short_content_under_50_chars(self):
|
|
"""Test that pages with content < 50 chars are skipped."""
|
|
page = {
|
|
"url": "https://example.com/short",
|
|
"title": "Short",
|
|
"content": "This is too short.", # 18 chars
|
|
"headings": [],
|
|
"code_samples": [],
|
|
}
|
|
|
|
self.converter.save_page(page)
|
|
|
|
pages_dir = os.path.join(self.converter.data_dir, "pages")
|
|
if os.path.exists(pages_dir):
|
|
self.assertEqual(len(os.listdir(pages_dir)), 0)
|
|
|
|
def test_save_content_over_50_chars(self):
|
|
"""Test that pages with content >= 50 chars are saved."""
|
|
page = {
|
|
"url": "https://example.com/valid",
|
|
"title": "Valid Page",
|
|
"content": "A" * 60, # 60 chars, should pass
|
|
"headings": [],
|
|
"code_samples": [],
|
|
}
|
|
|
|
self.converter.save_page(page)
|
|
|
|
pages_dir = os.path.join(self.converter.data_dir, "pages")
|
|
self.assertTrue(os.path.exists(pages_dir))
|
|
self.assertEqual(len(os.listdir(pages_dir)), 1)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
unittest.main()
|