Files
skill-seekers-reference/tests/test_markdown_parsing.py
yusyus 1d3d7389d7 fix: sanitize_url crashes on Python 3.14 strict urlparse (#284)
Python 3.14's urlparse() raises ValueError on URLs with unencoded
brackets that look like malformed IPv6 (e.g. http://[fdaa:x:x:x:x::x
from docs.openclaw.ai llms-full.txt). sanitize_url() called urlparse()
BEFORE encoding brackets, so it crashed before it could fix them.

Fix: catch ValueError from urlparse, encode ALL brackets, then retry.
This is safe because if urlparse rejected the brackets, they are NOT
valid IPv6 host literals and should be encoded anyway.

Also fixed Discord e2e tests to skip gracefully on network issues.

Fixes #284

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-21 00:30:48 +03:00

435 lines
16 KiB
Python

"""
Tests for Markdown parsing and BFS URL crawling features.
Tests the following functionality:
1. Markdown file content extraction (_extract_markdown_content)
2. HTML fallback when .md URL returns HTML (_extract_html_as_markdown)
3. URL extraction from llms.txt (extract_urls, _clean_url)
4. Empty/short content filtering in save_page
"""
import os
import shutil
import unittest
class TestMarkdownContentExtraction(unittest.TestCase):
    """Exercise DocToSkillConverter._extract_markdown_content on raw Markdown input."""

    def setUp(self):
        """Build a converter backed by a throwaway test configuration."""
        from skill_seekers.cli.doc_scraper import DocToSkillConverter

        self.config = {
            "name": "test_md_parsing",
            "base_url": "https://example.com",
            "selectors": {},
            "url_patterns": {"include": [], "exclude": []},
            "categories": {},
        }
        self.converter = DocToSkillConverter(self.config)

    def tearDown(self):
        """Drop the converter's output directory, if one was created."""
        data_dir = f"output/{self.config['name']}_data"
        if os.path.exists(data_dir):
            shutil.rmtree(data_dir)

    def test_extract_title_from_h1(self):
        """The first h1 becomes the page title."""
        source = "# My Documentation Title\n\nSome content here."
        parsed = self.converter._extract_markdown_content(source, "https://example.com/test.md")
        self.assertEqual(parsed["title"], "My Documentation Title")

    def test_extract_headings_h2_to_h6(self):
        """Headings h2 through h6 are collected; the h1 itself is excluded."""
        source = """# Title
## Section One
### Subsection A
#### Deep Section
##### Deeper
###### Deepest
Content here.
"""
        parsed = self.converter._extract_markdown_content(source, "https://example.com/test.md")
        headings = parsed["headings"]
        # Five headings expected: everything below the h1.
        self.assertEqual(len(headings), 5)
        first = headings[0]
        self.assertEqual(first["level"], "h2")
        self.assertEqual(first["text"], "Section One")

    def test_extract_code_blocks_with_language(self):
        """Fenced code blocks are captured along with their language tags."""
        source = """# API Guide
```python
def hello():
return "Hello, World!"
```
Some explanation.
```javascript
const greet = () => console.log("Hi");
```
```
plain code without language
```
"""
        parsed = self.converter._extract_markdown_content(source, "https://example.com/test.md")
        samples = parsed["code_samples"]
        self.assertEqual(len(samples), 3)
        self.assertEqual(samples[0]["language"], "python")
        self.assertEqual(samples[1]["language"], "javascript")
        # A fence with no language tag is reported as either "unknown" or "text".
        self.assertIn(samples[2]["language"], ("unknown", "text"))

    def test_extract_markdown_links_only_md_files(self):
        """Only links that point at .md files survive extraction."""
        source = """# Links
- [Markdown Doc](./guide.md)
- [Another MD](https://example.com/api.md)
- [HTML Page](./page.html)
- [External](https://google.com)
"""
        parsed = self.converter._extract_markdown_content(
            source, "https://example.com/docs/test.md"
        )
        # Every extracted link must reference a .md file.
        self.assertTrue(all(".md" in link for link in parsed["links"]))

    def test_extract_content_paragraphs(self):
        """Paragraphs above the minimum length are kept; short ones are dropped."""
        source = """# Title
This is a paragraph with enough content to pass the minimum length filter.
Short.
Another paragraph that should be included in the final content output.
"""
        parsed = self.converter._extract_markdown_content(source, "https://example.com/test.md")
        body = parsed["content"]
        self.assertIn("paragraph with enough content", body)
        self.assertNotIn("Short.", body)

    def test_detect_html_in_md_url(self):
        """An .md URL that actually serves HTML falls back to HTML title extraction."""
        page = "<!DOCTYPE html><html><head><title>Page</title></head><body><h1>Hello</h1></body></html>"
        parsed = self.converter._extract_markdown_content(page, "https://example.com/test.md")
        self.assertEqual(parsed["title"], "Page")
class TestHtmlAsMarkdownExtraction(unittest.TestCase):
    """Exercise DocToSkillConverter._extract_html_as_markdown (HTML fallback path)."""

    def setUp(self):
        """Build a converter backed by a throwaway test configuration."""
        from skill_seekers.cli.doc_scraper import DocToSkillConverter

        self.config = {
            "name": "test_html_fallback",
            "base_url": "https://example.com",
            "selectors": {},
            "url_patterns": {"include": [], "exclude": []},
            "categories": {},
        }
        self.converter = DocToSkillConverter(self.config)

    def tearDown(self):
        """Drop the converter's output directory, if one was created."""
        data_dir = f"output/{self.config['name']}_data"
        if os.path.exists(data_dir):
            shutil.rmtree(data_dir)

    def test_extract_title_from_html(self):
        """The <title> tag supplies the page title."""
        markup = "<html><head><title>My Page Title</title></head><body></body></html>"
        parsed = self.converter._extract_html_as_markdown(markup, "https://example.com/test.md")
        self.assertEqual(parsed["title"], "My Page Title")

    def test_find_main_content_area(self):
        """Content is pulled from the <main> element, not nav/footer chrome."""
        markup = """
<html><body>
<nav>Navigation</nav>
<main>
<h1>Main Content</h1>
<p>This is the main content area with enough text to pass filters.</p>
</main>
<footer>Footer</footer>
</body></html>
"""
        parsed = self.converter._extract_html_as_markdown(markup, "https://example.com/test.md")
        self.assertIn("main content area", parsed["content"].lower())

    def test_extract_code_blocks_from_html(self):
        """pre/code tags yield code samples."""
        markup = """
<html><body>
<main>
<pre><code class="language-python">print("hello")</code></pre>
</main>
</body></html>
"""
        parsed = self.converter._extract_html_as_markdown(markup, "https://example.com/test.md")
        self.assertGreater(len(parsed["code_samples"]), 0)

    def test_fallback_to_body_when_no_main(self):
        """Without a <main>/<article> element, extraction falls back to <body>."""
        markup = """
<html><body>
<div>
<h2>Section</h2>
<p>Content in body without main element, long enough to pass filter.</p>
</div>
</body></html>
"""
        parsed = self.converter._extract_html_as_markdown(markup, "https://example.com/test.md")
        got_something = bool(parsed["headings"]) or bool(parsed["content"])
        self.assertTrue(got_something)
class TestLlmsTxtUrlExtraction(unittest.TestCase):
    """Exercise LlmsTxtParser.extract_urls and LlmsTxtParser._clean_url."""

    def test_extract_markdown_style_links(self):
        """[text](url) style links are pulled out of llms.txt content."""
        from skill_seekers.cli.llms_txt_parser import LlmsTxtParser

        text = """
# Documentation Index
- [Getting Started](https://docs.example.com/start.md)
- [API Reference](https://docs.example.com/api/index.md)
- [Advanced Guide](https://docs.example.com/advanced.md)
"""
        extracted = LlmsTxtParser(text, base_url="https://docs.example.com").extract_urls()
        for expected in (
            "https://docs.example.com/start.md",
            "https://docs.example.com/api/index.md",
            "https://docs.example.com/advanced.md",
        ):
            self.assertIn(expected, extracted)

    def test_extract_bare_urls(self):
        """Bare URLs with no surrounding markdown syntax are also extracted."""
        from skill_seekers.cli.llms_txt_parser import LlmsTxtParser

        text = """
Documentation: https://example.com/docs/guide.md
API: https://example.com/api/reference.md
"""
        extracted = LlmsTxtParser(text).extract_urls()
        self.assertIn("https://example.com/docs/guide.md", extracted)
        self.assertIn("https://example.com/api/reference.md", extracted)

    def test_resolve_relative_urls(self):
        """Relative links are resolved against the configured base_url."""
        from skill_seekers.cli.llms_txt_parser import LlmsTxtParser

        text = """
- [Local Doc](./docs/guide.md)
- [Parent](../api/ref.md)
"""
        extracted = LlmsTxtParser(text, base_url="https://example.com/learn/").extract_urls()
        self.assertTrue(any("docs/guide.md" in url for url in extracted))

    def test_clean_url_invalid_anchor_pattern(self):
        """A path-like suffix after '#' marks a malformed anchor and is stripped."""
        from skill_seekers.cli.llms_txt_parser import LlmsTxtParser

        parser = LlmsTxtParser("", base_url="https://example.com")
        cleaned = parser._clean_url("https://example.com/page#section/index.html.md")
        self.assertEqual(cleaned, "https://example.com/page")

    def test_clean_url_valid_anchor(self):
        """A plain fragment anchor is left untouched."""
        from skill_seekers.cli.llms_txt_parser import LlmsTxtParser

        parser = LlmsTxtParser("", base_url="https://example.com")
        cleaned = parser._clean_url("https://example.com/page.md#section")
        self.assertEqual(cleaned, "https://example.com/page.md#section")

    def test_clean_url_no_anchor(self):
        """URLs without any anchor pass through unchanged."""
        from skill_seekers.cli.llms_txt_parser import LlmsTxtParser

        parser = LlmsTxtParser("", base_url="https://example.com")
        cleaned = parser._clean_url("https://example.com/docs/guide.md")
        self.assertEqual(cleaned, "https://example.com/docs/guide.md")

    def test_clean_url_bracket_encoding(self):
        """Square brackets in the path are percent-encoded (#284)."""
        from skill_seekers.cli.llms_txt_parser import LlmsTxtParser

        parser = LlmsTxtParser("", base_url="https://example.com")
        cleaned = parser._clean_url("https://example.com/api/[v1]/users")
        self.assertEqual(cleaned, "https://example.com/api/%5Bv1%5D/users")

    def test_clean_url_bracket_encoding_preserves_host(self):
        """Bracket encoding applies to the path while the host stays intact."""
        from skill_seekers.cli.llms_txt_parser import LlmsTxtParser

        parser = LlmsTxtParser("", base_url="https://example.com")
        cleaned = parser._clean_url("https://example.com/path/[param]/end")
        self.assertIn("%5B", cleaned)
        self.assertIn("%5D", cleaned)
        self.assertIn("example.com", cleaned)

    def test_clean_url_bracket_in_query(self):
        """Brackets appearing in query parameters are encoded too."""
        from skill_seekers.cli.llms_txt_parser import LlmsTxtParser

        parser = LlmsTxtParser("", base_url="https://example.com")
        cleaned = parser._clean_url("https://example.com/search?filter=[active]")
        self.assertEqual(cleaned, "https://example.com/search?filter=%5Bactive%5D")

    def test_clean_url_malformed_anchor_with_brackets(self):
        """Malformed anchor stripping and bracket encoding compose correctly."""
        from skill_seekers.cli.llms_txt_parser import LlmsTxtParser

        parser = LlmsTxtParser("", base_url="https://example.com")
        cleaned = parser._clean_url("https://example.com/api/[v1]/page#section/deep")
        self.assertEqual(cleaned, "https://example.com/api/%5Bv1%5D/page")

    def test_clean_url_malformed_ipv6_no_crash(self):
        """Incomplete IPv6 placeholder URLs must not crash _clean_url (issue #284).

        Python 3.14's urlparse() raises ValueError on these URLs; seen in a
        real-world llms-full.txt from docs.openclaw.ai.
        """
        from skill_seekers.cli.llms_txt_parser import LlmsTxtParser

        parser = LlmsTxtParser("", base_url="https://example.com")
        # Must not raise ValueError; brackets should come back encoded.
        cleaned = parser._clean_url("http://[fdaa:x:x:x:x::x")
        self.assertIn("%5B", cleaned)
        self.assertNotIn("[", cleaned)

    def test_extract_urls_with_ipv6_placeholder_no_crash(self):
        """extract_urls survives content containing broken IPv6 URLs (issue #284)."""
        from skill_seekers.cli.llms_txt_parser import LlmsTxtParser

        text = """# Docs
- [Guide](https://example.com/guide.md)
- Connect to http://[fdaa:x:x:x:x::x for private networking
- [API](https://example.com/api.md)
"""
        # Must not raise ValueError.
        extracted = LlmsTxtParser(text, base_url="https://example.com").extract_urls()
        # The well-formed URLs in the document should still be extracted.
        matching = sum(1 for url in extracted if "example.com" in url)
        self.assertGreaterEqual(matching, 2)

    def test_deduplicate_urls(self):
        """A URL mentioned several times appears only once in the result."""
        from skill_seekers.cli.llms_txt_parser import LlmsTxtParser

        text = """
- [Doc 1](https://example.com/doc.md)
- [Doc 2](https://example.com/doc.md)
https://example.com/doc.md
"""
        extracted = LlmsTxtParser(text).extract_urls()
        occurrences = [url for url in extracted if url == "https://example.com/doc.md"]
        self.assertEqual(len(occurrences), 1)
class TestSavePageContentFiltering(unittest.TestCase):
    """Exercise the minimum-content filter inside DocToSkillConverter.save_page."""

    def setUp(self):
        """Build a converter backed by a throwaway test configuration."""
        from skill_seekers.cli.doc_scraper import DocToSkillConverter

        self.config = {
            "name": "test_save_filter",
            "base_url": "https://example.com",
            "selectors": {},
            "url_patterns": {"include": [], "exclude": []},
            "categories": {},
        }
        self.converter = DocToSkillConverter(self.config)

    def tearDown(self):
        """Drop the converter's output directory, if one was created."""
        data_dir = f"output/{self.config['name']}_data"
        if os.path.exists(data_dir):
            shutil.rmtree(data_dir)

    def test_skip_empty_content(self):
        """A page whose content is empty is never written to disk."""
        record = {
            "url": "https://example.com/empty",
            "title": "Empty Page",
            "content": "",
            "headings": [],
            "code_samples": [],
        }
        self.converter.save_page(record)
        pages_dir = os.path.join(self.converter.data_dir, "pages")
        if os.path.exists(pages_dir):
            self.assertEqual(os.listdir(pages_dir), [])

    def test_skip_short_content_under_50_chars(self):
        """A page with fewer than 50 content characters is skipped."""
        record = {
            "url": "https://example.com/short",
            "title": "Short",
            "content": "This is too short.",  # well under the 50-char minimum
            "headings": [],
            "code_samples": [],
        }
        self.converter.save_page(record)
        pages_dir = os.path.join(self.converter.data_dir, "pages")
        if os.path.exists(pages_dir):
            self.assertEqual(os.listdir(pages_dir), [])

    def test_save_content_over_50_chars(self):
        """A page with at least 50 content characters is written out."""
        record = {
            "url": "https://example.com/valid",
            "title": "Valid Page",
            "content": "A" * 60,  # comfortably above the 50-char minimum
            "headings": [],
            "code_samples": [],
        }
        self.converter.save_page(record)
        pages_dir = os.path.join(self.converter.data_dir, "pages")
        self.assertTrue(os.path.exists(pages_dir))
        self.assertEqual(len(os.listdir(pages_dir)), 1)
# Allow running this test module directly (python test_markdown_parsing.py).
if __name__ == "__main__":
    unittest.main()