Python 3.14's urlparse() raises ValueError on URLs with unencoded brackets that look like malformed IPv6 (e.g. http://[fdaa:x:x:x::x from docs.openclaw.ai llms-full.txt). sanitize_url() called urlparse() BEFORE encoding brackets, so it crashed before it could fix them.

Fix: catch ValueError from urlparse, encode ALL brackets, then retry. This is safe because if urlparse rejected the brackets, they are NOT valid IPv6 host literals and should be encoded anyway.

Also fixed Discord e2e tests to skip gracefully on network issues.

Fixes #284

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
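A minimal sketch of the retry approach described above, assuming a helper shaped roughly like the scraper's URL sanitizer (the name `sanitize_url` is taken from this description; the actual signature and placement in the codebase may differ):

```python
from urllib.parse import urlparse


def sanitize_url(url: str) -> str:
    """Percent-encode stray brackets when urlparse() rejects them (issue #284)."""
    try:
        urlparse(url)
    except ValueError:
        # urlparse only rejects bracket sequences that are NOT valid IPv6 host
        # literals, so encoding every bracket and retrying is safe.
        url = url.replace("[", "%5B").replace("]", "%5D")
        urlparse(url)  # re-raise if the URL is still unparseable
    return url
```

For example, `sanitize_url("http://[fdaa:x:x:x::x")` returns the URL with `%5B` in place of the opening bracket instead of raising.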
"""
|
|
Tests for Markdown parsing and BFS URL crawling features.
|
|
|
|
Tests the following functionality:
|
|
1. Markdown file content extraction (_extract_markdown_content)
|
|
2. HTML fallback when .md URL returns HTML (_extract_html_as_markdown)
|
|
3. URL extraction from llms.txt (extract_urls, _clean_url)
|
|
4. Empty/short content filtering in save_page
|
|
"""
|
|
|
|
import os
|
|
import shutil
|
|
import unittest
|
|
|
|
|
|
class TestMarkdownContentExtraction(unittest.TestCase):
|
|
"""Test Markdown file parsing in doc_scraper."""
|
|
|
|
def setUp(self):
|
|
"""Set up test fixtures."""
|
|
from skill_seekers.cli.doc_scraper import DocToSkillConverter
|
|
|
|
self.config = {
|
|
"name": "test_md_parsing",
|
|
"base_url": "https://example.com",
|
|
"selectors": {},
|
|
"url_patterns": {"include": [], "exclude": []},
|
|
"categories": {},
|
|
}
|
|
self.converter = DocToSkillConverter(self.config)
|
|
|
|
def tearDown(self):
|
|
"""Clean up output directory."""
|
|
output_dir = f"output/{self.config['name']}_data"
|
|
if os.path.exists(output_dir):
|
|
shutil.rmtree(output_dir)
|
|
|
|
def test_extract_title_from_h1(self):
|
|
"""Test extracting title from first h1."""
|
|
content = "# My Documentation Title\n\nSome content here."
|
|
result = self.converter._extract_markdown_content(content, "https://example.com/test.md")
|
|
self.assertEqual(result["title"], "My Documentation Title")
|
|
|
|
def test_extract_headings_h2_to_h6(self):
|
|
"""Test extracting h2-h6 headings (not h1)."""
|
|
content = """# Title
|
|
|
|
## Section One
|
|
### Subsection A
|
|
#### Deep Section
|
|
##### Deeper
|
|
###### Deepest
|
|
|
|
Content here.
|
|
"""
|
|
result = self.converter._extract_markdown_content(content, "https://example.com/test.md")
|
|
# Should have 5 headings (h2-h6), not h1
|
|
self.assertEqual(len(result["headings"]), 5)
|
|
self.assertEqual(result["headings"][0]["level"], "h2")
|
|
self.assertEqual(result["headings"][0]["text"], "Section One")
|
|
|
|
def test_extract_code_blocks_with_language(self):
|
|
"""Test extracting code blocks with language tags."""
|
|
content = """# API Guide
|
|
|
|
```python
|
|
def hello():
|
|
return "Hello, World!"
|
|
```
|
|
|
|
Some explanation.
|
|
|
|
```javascript
|
|
const greet = () => console.log("Hi");
|
|
```
|
|
|
|
```
|
|
plain code without language
|
|
```
|
|
"""
|
|
result = self.converter._extract_markdown_content(content, "https://example.com/test.md")
|
|
self.assertEqual(len(result["code_samples"]), 3)
|
|
self.assertEqual(result["code_samples"][0]["language"], "python")
|
|
self.assertEqual(result["code_samples"][1]["language"], "javascript")
|
|
self.assertIn(result["code_samples"][2]["language"], ("unknown", "text"))
|
|
|
|
def test_extract_markdown_links_only_md_files(self):
|
|
"""Test that only .md links are extracted."""
|
|
content = """# Links
|
|
|
|
- [Markdown Doc](./guide.md)
|
|
- [Another MD](https://example.com/api.md)
|
|
- [HTML Page](./page.html)
|
|
- [External](https://google.com)
|
|
"""
|
|
result = self.converter._extract_markdown_content(
|
|
content, "https://example.com/docs/test.md"
|
|
)
|
|
# Should only include .md links
|
|
md_links = [link for link in result["links"] if ".md" in link]
|
|
self.assertEqual(len(md_links), len(result["links"]))
|
|
|
|
def test_extract_content_paragraphs(self):
|
|
"""Test extracting paragraph content."""
|
|
content = """# Title
|
|
|
|
This is a paragraph with enough content to pass the minimum length filter.
|
|
|
|
Short.
|
|
|
|
Another paragraph that should be included in the final content output.
|
|
"""
|
|
result = self.converter._extract_markdown_content(content, "https://example.com/test.md")
|
|
self.assertIn("paragraph with enough content", result["content"])
|
|
self.assertNotIn("Short.", result["content"])
|
|
|
|
def test_detect_html_in_md_url(self):
|
|
"""Test that HTML content is detected when .md URL returns HTML."""
|
|
html_content = "<!DOCTYPE html><html><head><title>Page</title></head><body><h1>Hello</h1></body></html>"
|
|
result = self.converter._extract_markdown_content(
|
|
html_content, "https://example.com/test.md"
|
|
)
|
|
self.assertEqual(result["title"], "Page")
|
|
|
|
|
|
class TestHtmlAsMarkdownExtraction(unittest.TestCase):
|
|
"""Test HTML to markdown-like extraction."""
|
|
|
|
def setUp(self):
|
|
"""Set up test fixtures."""
|
|
from skill_seekers.cli.doc_scraper import DocToSkillConverter
|
|
|
|
self.config = {
|
|
"name": "test_html_fallback",
|
|
"base_url": "https://example.com",
|
|
"selectors": {},
|
|
"url_patterns": {"include": [], "exclude": []},
|
|
"categories": {},
|
|
}
|
|
self.converter = DocToSkillConverter(self.config)
|
|
|
|
def tearDown(self):
|
|
"""Clean up output directory."""
|
|
output_dir = f"output/{self.config['name']}_data"
|
|
if os.path.exists(output_dir):
|
|
shutil.rmtree(output_dir)
|
|
|
|
def test_extract_title_from_html(self):
|
|
"""Test extracting title from HTML title tag."""
|
|
html = "<html><head><title>My Page Title</title></head><body></body></html>"
|
|
result = self.converter._extract_html_as_markdown(html, "https://example.com/test.md")
|
|
self.assertEqual(result["title"], "My Page Title")
|
|
|
|
def test_find_main_content_area(self):
|
|
"""Test finding main content from various selectors."""
|
|
html = """
|
|
<html><body>
|
|
<nav>Navigation</nav>
|
|
<main>
|
|
<h1>Main Content</h1>
|
|
<p>This is the main content area with enough text to pass filters.</p>
|
|
</main>
|
|
<footer>Footer</footer>
|
|
</body></html>
|
|
"""
|
|
result = self.converter._extract_html_as_markdown(html, "https://example.com/test.md")
|
|
self.assertIn("main content area", result["content"].lower())
|
|
|
|
def test_extract_code_blocks_from_html(self):
|
|
"""Test extracting code blocks from HTML pre/code tags."""
|
|
html = """
|
|
<html><body>
|
|
<main>
|
|
<pre><code class="language-python">print("hello")</code></pre>
|
|
</main>
|
|
</body></html>
|
|
"""
|
|
result = self.converter._extract_html_as_markdown(html, "https://example.com/test.md")
|
|
self.assertTrue(len(result["code_samples"]) > 0)
|
|
|
|
def test_fallback_to_body_when_no_main(self):
|
|
"""Test fallback to body when no main/article element."""
|
|
html = """
|
|
<html><body>
|
|
<div>
|
|
<h2>Section</h2>
|
|
<p>Content in body without main element, long enough to pass filter.</p>
|
|
</div>
|
|
</body></html>
|
|
"""
|
|
result = self.converter._extract_html_as_markdown(html, "https://example.com/test.md")
|
|
self.assertTrue(len(result["headings"]) > 0 or len(result["content"]) > 0)
|
|
|
|
|
|
class TestLlmsTxtUrlExtraction(unittest.TestCase):
|
|
"""Test URL extraction from llms.txt content."""
|
|
|
|
def test_extract_markdown_style_links(self):
|
|
"""Test extracting [text](url) style links."""
|
|
from skill_seekers.cli.llms_txt_parser import LlmsTxtParser
|
|
|
|
content = """
|
|
# Documentation Index
|
|
|
|
- [Getting Started](https://docs.example.com/start.md)
|
|
- [API Reference](https://docs.example.com/api/index.md)
|
|
- [Advanced Guide](https://docs.example.com/advanced.md)
|
|
"""
|
|
parser = LlmsTxtParser(content, base_url="https://docs.example.com")
|
|
urls = parser.extract_urls()
|
|
|
|
self.assertIn("https://docs.example.com/start.md", urls)
|
|
self.assertIn("https://docs.example.com/api/index.md", urls)
|
|
self.assertIn("https://docs.example.com/advanced.md", urls)
|
|
|
|
def test_extract_bare_urls(self):
|
|
"""Test extracting bare URLs without markdown syntax."""
|
|
from skill_seekers.cli.llms_txt_parser import LlmsTxtParser
|
|
|
|
content = """
|
|
Documentation: https://example.com/docs/guide.md
|
|
API: https://example.com/api/reference.md
|
|
"""
|
|
parser = LlmsTxtParser(content)
|
|
urls = parser.extract_urls()
|
|
|
|
self.assertIn("https://example.com/docs/guide.md", urls)
|
|
self.assertIn("https://example.com/api/reference.md", urls)
|
|
|
|
def test_resolve_relative_urls(self):
|
|
"""Test resolving relative URLs with base_url."""
|
|
from skill_seekers.cli.llms_txt_parser import LlmsTxtParser
|
|
|
|
content = """
|
|
- [Local Doc](./docs/guide.md)
|
|
- [Parent](../api/ref.md)
|
|
"""
|
|
parser = LlmsTxtParser(content, base_url="https://example.com/learn/")
|
|
urls = parser.extract_urls()
|
|
|
|
# Should resolve relative paths
|
|
self.assertTrue(any("docs/guide.md" in url for url in urls))
|
|
|
|
def test_clean_url_invalid_anchor_pattern(self):
|
|
"""Test cleaning URLs with invalid anchor patterns."""
|
|
from skill_seekers.cli.llms_txt_parser import LlmsTxtParser
|
|
|
|
parser = LlmsTxtParser("", base_url="https://example.com")
|
|
|
|
# Invalid: path after anchor
|
|
result = parser._clean_url("https://example.com/page#section/index.html.md")
|
|
self.assertEqual(result, "https://example.com/page")
|
|
|
|
def test_clean_url_valid_anchor(self):
|
|
"""Test that valid anchors are preserved."""
|
|
from skill_seekers.cli.llms_txt_parser import LlmsTxtParser
|
|
|
|
parser = LlmsTxtParser("", base_url="https://example.com")
|
|
|
|
# Valid anchor should be unchanged
|
|
result = parser._clean_url("https://example.com/page.md#section")
|
|
self.assertEqual(result, "https://example.com/page.md#section")
|
|
|
|
def test_clean_url_no_anchor(self):
|
|
"""Test that URLs without anchors are unchanged."""
|
|
from skill_seekers.cli.llms_txt_parser import LlmsTxtParser
|
|
|
|
parser = LlmsTxtParser("", base_url="https://example.com")
|
|
|
|
result = parser._clean_url("https://example.com/docs/guide.md")
|
|
self.assertEqual(result, "https://example.com/docs/guide.md")
|
|
|
|
def test_clean_url_bracket_encoding(self):
|
|
"""Test that square brackets are percent-encoded in URL path (#284)."""
|
|
from skill_seekers.cli.llms_txt_parser import LlmsTxtParser
|
|
|
|
parser = LlmsTxtParser("", base_url="https://example.com")
|
|
|
|
result = parser._clean_url("https://example.com/api/[v1]/users")
|
|
self.assertEqual(result, "https://example.com/api/%5Bv1%5D/users")
|
|
|
|
def test_clean_url_bracket_encoding_preserves_host(self):
|
|
"""Test that bracket encoding does not affect host (IPv6 literals)."""
|
|
from skill_seekers.cli.llms_txt_parser import LlmsTxtParser
|
|
|
|
parser = LlmsTxtParser("", base_url="https://example.com")
|
|
|
|
# Brackets should only be encoded in path, not in host
|
|
result = parser._clean_url("https://example.com/path/[param]/end")
|
|
self.assertIn("%5B", result)
|
|
self.assertIn("%5D", result)
|
|
self.assertIn("example.com", result)
|
|
|
|
def test_clean_url_bracket_in_query(self):
|
|
"""Test that brackets in query params are also encoded."""
|
|
from skill_seekers.cli.llms_txt_parser import LlmsTxtParser
|
|
|
|
parser = LlmsTxtParser("", base_url="https://example.com")
|
|
|
|
result = parser._clean_url("https://example.com/search?filter=[active]")
|
|
self.assertEqual(result, "https://example.com/search?filter=%5Bactive%5D")
|
|
|
|
def test_clean_url_malformed_anchor_with_brackets(self):
|
|
"""Test combined malformed anchor stripping + bracket encoding."""
|
|
from skill_seekers.cli.llms_txt_parser import LlmsTxtParser
|
|
|
|
parser = LlmsTxtParser("", base_url="https://example.com")
|
|
|
|
# Malformed anchor should be stripped, then brackets encoded
|
|
result = parser._clean_url("https://example.com/api/[v1]/page#section/deep")
|
|
self.assertEqual(result, "https://example.com/api/%5Bv1%5D/page")
|
|
|
|
def test_clean_url_malformed_ipv6_no_crash(self):
|
|
"""Test that incomplete IPv6 placeholder URLs don't crash (issue #284).
|
|
|
|
Python 3.14 raises ValueError from urlparse() on these URLs.
|
|
Seen in real-world llms-full.txt from docs.openclaw.ai.
|
|
"""
|
|
from skill_seekers.cli.llms_txt_parser import LlmsTxtParser
|
|
|
|
parser = LlmsTxtParser("", base_url="https://example.com")
|
|
|
|
# Must not raise ValueError
|
|
result = parser._clean_url("http://[fdaa:x:x:x:x::x")
|
|
self.assertIn("%5B", result)
|
|
self.assertNotIn("[", result)
|
|
|
|
def test_extract_urls_with_ipv6_placeholder_no_crash(self):
|
|
"""Test that extract_urls handles content with broken IPv6 URLs (issue #284)."""
|
|
from skill_seekers.cli.llms_txt_parser import LlmsTxtParser
|
|
|
|
content = """# Docs
|
|
- [Guide](https://example.com/guide.md)
|
|
- Connect to http://[fdaa:x:x:x:x::x for private networking
|
|
- [API](https://example.com/api.md)
|
|
"""
|
|
parser = LlmsTxtParser(content, base_url="https://example.com")
|
|
|
|
# Must not raise ValueError
|
|
urls = parser.extract_urls()
|
|
# Should still extract the valid URLs
|
|
valid = [u for u in urls if "example.com" in u]
|
|
self.assertGreaterEqual(len(valid), 2)
|
|
|
|
def test_deduplicate_urls(self):
|
|
"""Test that duplicate URLs are removed."""
|
|
from skill_seekers.cli.llms_txt_parser import LlmsTxtParser
|
|
|
|
content = """
|
|
- [Doc 1](https://example.com/doc.md)
|
|
- [Doc 2](https://example.com/doc.md)
|
|
https://example.com/doc.md
|
|
"""
|
|
parser = LlmsTxtParser(content)
|
|
urls = parser.extract_urls()
|
|
|
|
# Should only have one instance
|
|
count = sum(1 for u in urls if u == "https://example.com/doc.md")
|
|
self.assertEqual(count, 1)
|
|
|
|
|
|
class TestSavePageContentFiltering(unittest.TestCase):
|
|
"""Test content filtering in save_page."""
|
|
|
|
def setUp(self):
|
|
"""Set up test fixtures."""
|
|
from skill_seekers.cli.doc_scraper import DocToSkillConverter
|
|
|
|
self.config = {
|
|
"name": "test_save_filter",
|
|
"base_url": "https://example.com",
|
|
"selectors": {},
|
|
"url_patterns": {"include": [], "exclude": []},
|
|
"categories": {},
|
|
}
|
|
self.converter = DocToSkillConverter(self.config)
|
|
|
|
def tearDown(self):
|
|
"""Clean up output directory."""
|
|
output_dir = f"output/{self.config['name']}_data"
|
|
if os.path.exists(output_dir):
|
|
shutil.rmtree(output_dir)
|
|
|
|
def test_skip_empty_content(self):
|
|
"""Test that pages with empty content are skipped."""
|
|
page = {
|
|
"url": "https://example.com/empty",
|
|
"title": "Empty Page",
|
|
"content": "",
|
|
"headings": [],
|
|
"code_samples": [],
|
|
}
|
|
|
|
self.converter.save_page(page)
|
|
|
|
pages_dir = os.path.join(self.converter.data_dir, "pages")
|
|
if os.path.exists(pages_dir):
|
|
self.assertEqual(len(os.listdir(pages_dir)), 0)
|
|
|
|
def test_skip_short_content_under_50_chars(self):
|
|
"""Test that pages with content < 50 chars are skipped."""
|
|
page = {
|
|
"url": "https://example.com/short",
|
|
"title": "Short",
|
|
"content": "This is too short.", # 18 chars
|
|
"headings": [],
|
|
"code_samples": [],
|
|
}
|
|
|
|
self.converter.save_page(page)
|
|
|
|
pages_dir = os.path.join(self.converter.data_dir, "pages")
|
|
if os.path.exists(pages_dir):
|
|
self.assertEqual(len(os.listdir(pages_dir)), 0)
|
|
|
|
def test_save_content_over_50_chars(self):
|
|
"""Test that pages with content >= 50 chars are saved."""
|
|
page = {
|
|
"url": "https://example.com/valid",
|
|
"title": "Valid Page",
|
|
"content": "A" * 60, # 60 chars, should pass
|
|
"headings": [],
|
|
"code_samples": [],
|
|
}
|
|
|
|
self.converter.save_page(page)
|
|
|
|
pages_dir = os.path.join(self.converter.data_dir, "pages")
|
|
self.assertTrue(os.path.exists(pages_dir))
|
|
self.assertEqual(len(os.listdir(pages_dir)), 1)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
unittest.main()
|