- Add test_markdown_parsing.py with 20 tests covering:
  - Markdown content extraction (titles, headings, code blocks, links)
  - HTML fallback when .md URL returns HTML
  - llms.txt URL extraction and cleaning
  - Empty/short content filtering
- Add test_multi_source.py with 12 tests covering:
  - List-based scraped_data structure
  - Per-source subdirectory generation for docs/github/pdf
  - Index file generation for each source type

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
360 lines
12 KiB
Python
360 lines
12 KiB
Python
"""
|
|
Tests for Markdown parsing and BFS URL crawling features.

Tests the following functionality:
1. Markdown file content extraction (_extract_markdown_content)
2. HTML fallback when .md URL returns HTML (_extract_html_as_markdown)
3. URL extraction from llms.txt (extract_urls, _clean_url)
4. Empty/short content filtering in save_page
|
|
"""
|
|
|
|
import unittest
|
|
import tempfile
|
|
import os
|
|
import shutil
|
|
|
|
|
|
class TestMarkdownContentExtraction(unittest.TestCase):
    """Checks on ``DocToSkillConverter._extract_markdown_content``.

    Each test feeds a small Markdown (or HTML) document to the converter and
    inspects the returned mapping's ``title``, ``headings``, ``code_samples``,
    ``links`` or ``content`` entry.
    """

    def setUp(self):
        """Create a converter from a minimal configuration."""
        # Imported lazily so a missing package fails the test, not collection.
        from skill_seekers.cli.doc_scraper import DocToSkillConverter

        self.config = {
            'name': 'test_md_parsing',
            'base_url': 'https://example.com',
            'selectors': {},
            'url_patterns': {'include': [], 'exclude': []},
            'categories': {},
        }
        self.converter = DocToSkillConverter(self.config)

    def tearDown(self):
        """Delete the per-test output directory if one was created."""
        leftovers = f"output/{self.config['name']}_data"
        if not os.path.exists(leftovers):
            return
        shutil.rmtree(leftovers)

    def _parse(self, text, url="https://example.com/test.md"):
        """Run the extractor under test on *text* as if fetched from *url*."""
        return self.converter._extract_markdown_content(text, url)

    def test_extract_title_from_h1(self):
        """The first h1 becomes the page title."""
        page = self._parse("# My Documentation Title\n\nSome content here.")

        self.assertEqual(page['title'], "My Documentation Title")

    def test_extract_headings_h2_to_h6(self):
        """h2 through h6 are reported as headings; the h1 is not."""
        markdown = (
            "# Title\n"
            "\n"
            "## Section One\n"
            "### Subsection A\n"
            "#### Deep Section\n"
            "##### Deeper\n"
            "###### Deepest\n"
            "\n"
            "Content here.\n"
        )

        headings = self._parse(markdown)['headings']

        # Five entries expected: one per level h2..h6, h1 excluded.
        self.assertEqual(len(headings), 5)
        first = headings[0]
        self.assertEqual(first['level'], 'h2')
        self.assertEqual(first['text'], 'Section One')

    def test_extract_code_blocks_with_language(self):
        """Fenced blocks keep their language tag; untagged ones are 'unknown'."""
        markdown = (
            "# API Guide\n"
            "\n"
            "```python\n"
            "def hello():\n"
            '    return "Hello, World!"\n'
            "```\n"
            "\n"
            "Some explanation.\n"
            "\n"
            "```javascript\n"
            'const greet = () => console.log("Hi");\n'
            "```\n"
            "\n"
            "```\n"
            "plain code without language\n"
            "```\n"
        )

        samples = self._parse(markdown)['code_samples']

        self.assertEqual(len(samples), 3)
        self.assertEqual(samples[0]['language'], 'python')
        self.assertEqual(samples[1]['language'], 'javascript')
        self.assertEqual(samples[2]['language'], 'unknown')

    def test_extract_markdown_links_only_md_files(self):
        """Every extracted link must point at a ``.md`` resource."""
        markdown = (
            "# Links\n"
            "\n"
            "- [Markdown Doc](./guide.md)\n"
            "- [Another MD](https://example.com/api.md)\n"
            "- [HTML Page](./page.html)\n"
            "- [External](https://google.com)\n"
        )

        links = self._parse(markdown, "https://example.com/docs/test.md")['links']

        # Equal lengths means no non-.md link slipped through.
        md_only = [link for link in links if '.md' in link]
        self.assertEqual(len(md_only), len(links))

    def test_extract_content_paragraphs(self):
        """Long paragraphs appear in ``content``; the very short one does not."""
        markdown = (
            "# Title\n"
            "\n"
            "This is a paragraph with enough content to pass the minimum length filter.\n"
            "\n"
            "Short.\n"
            "\n"
            "Another paragraph that should be included in the final content output.\n"
        )

        body = self._parse(markdown)['content']

        self.assertIn("paragraph with enough content", body)
        self.assertNotIn("Short.", body)

    def test_detect_html_in_md_url(self):
        """An HTML payload served from a ``.md`` URL yields the <title> text."""
        html_content = (
            "<!DOCTYPE html><html><head><title>Page</title></head>"
            "<body><h1>Hello</h1></body></html>"
        )

        page = self._parse(html_content)

        self.assertEqual(page['title'], "Page")
|
|
|
|
|
|
class TestHtmlAsMarkdownExtraction(unittest.TestCase):
    """Checks on ``DocToSkillConverter._extract_html_as_markdown``.

    Each test passes an HTML document to the converter and inspects the
    ``title``, ``content``, ``code_samples`` or ``headings`` entry it returns.
    """

    def setUp(self):
        """Create a converter from a minimal configuration."""
        # Imported lazily so a missing package fails the test, not collection.
        from skill_seekers.cli.doc_scraper import DocToSkillConverter

        self.config = {
            'name': 'test_html_fallback',
            'base_url': 'https://example.com',
            'selectors': {},
            'url_patterns': {'include': [], 'exclude': []},
            'categories': {},
        }
        self.converter = DocToSkillConverter(self.config)

    def tearDown(self):
        """Delete the per-test output directory if one was created."""
        leftovers = f"output/{self.config['name']}_data"
        if not os.path.exists(leftovers):
            return
        shutil.rmtree(leftovers)

    def _convert(self, markup):
        """Run the extractor under test on *markup* with a fixed ``.md`` URL."""
        return self.converter._extract_html_as_markdown(
            markup, "https://example.com/test.md"
        )

    def test_extract_title_from_html(self):
        """The <title> element supplies the page title."""
        page = self._convert(
            "<html><head><title>My Page Title</title></head><body></body></html>"
        )

        self.assertEqual(page['title'], "My Page Title")

    def test_find_main_content_area(self):
        """Text inside <main> ends up in the extracted content."""
        html = (
            "\n"
            "<html><body>\n"
            "<nav>Navigation</nav>\n"
            "<main>\n"
            "<h1>Main Content</h1>\n"
            "<p>This is the main content area with enough text to pass filters.</p>\n"
            "</main>\n"
            "<footer>Footer</footer>\n"
            "</body></html>\n"
        )

        extracted = self._convert(html)['content']

        self.assertIn("main content area", extracted.lower())

    def test_extract_code_blocks_from_html(self):
        """A <pre><code> element produces at least one code sample."""
        html = (
            "\n"
            "<html><body>\n"
            "<main>\n"
            '<pre><code class="language-python">print("hello")</code></pre>\n'
            "</main>\n"
            "</body></html>\n"
        )

        samples = self._convert(html)['code_samples']

        self.assertTrue(len(samples) > 0)

    def test_fallback_to_body_when_no_main(self):
        """Without <main>/<article>, something is still extracted from <body>."""
        html = (
            "\n"
            "<html><body>\n"
            "<div>\n"
            "<h2>Section</h2>\n"
            "<p>Content in body without main element, long enough to pass filter.</p>\n"
            "</div>\n"
            "</body></html>\n"
        )

        page = self._convert(html)

        # Passes when either headings or content is non-empty.
        nothing_found = len(page['headings']) == 0 and len(page['content']) == 0
        self.assertFalse(nothing_found)
|
|
|
|
|
|
class TestLlmsTxtUrlExtraction(unittest.TestCase):
    """Checks on ``LlmsTxtParser.extract_urls`` and ``LlmsTxtParser._clean_url``."""

    @staticmethod
    def _make_parser(text, **options):
        """Build an ``LlmsTxtParser`` for *text*, forwarding keyword options."""
        # Imported lazily so a missing package fails the test, not collection.
        from skill_seekers.cli.llms_txt_parser import LlmsTxtParser

        return LlmsTxtParser(text, **options)

    def test_extract_markdown_style_links(self):
        """``[text](url)`` links are all returned."""
        content = (
            "\n"
            "# Documentation Index\n"
            "\n"
            "- [Getting Started](https://docs.example.com/start.md)\n"
            "- [API Reference](https://docs.example.com/api/index.md)\n"
            "- [Advanced Guide](https://docs.example.com/advanced.md)\n"
        )

        found = self._make_parser(
            content, base_url="https://docs.example.com"
        ).extract_urls()

        expected_urls = (
            "https://docs.example.com/start.md",
            "https://docs.example.com/api/index.md",
            "https://docs.example.com/advanced.md",
        )
        for expected in expected_urls:
            self.assertIn(expected, found)

    def test_extract_bare_urls(self):
        """URLs written without Markdown link syntax are returned."""
        content = (
            "\n"
            "Documentation: https://example.com/docs/guide.md\n"
            "API: https://example.com/api/reference.md\n"
        )

        found = self._make_parser(content).extract_urls()

        self.assertIn("https://example.com/docs/guide.md", found)
        self.assertIn("https://example.com/api/reference.md", found)

    def test_resolve_relative_urls(self):
        """A relative link shows up in the results when a base_url is given."""
        content = (
            "\n"
            "- [Local Doc](./docs/guide.md)\n"
            "- [Parent](../api/ref.md)\n"
        )

        found = self._make_parser(
            content, base_url="https://example.com/learn/"
        ).extract_urls()

        # At least one returned URL must contain the relative path.
        resolved = [candidate for candidate in found if "docs/guide.md" in candidate]
        self.assertTrue(resolved)

    def test_clean_url_invalid_anchor_pattern(self):
        """A fragment that contains a path is cut off at the ``#``."""
        parser = self._make_parser("", base_url="https://example.com")

        # Invalid: path after anchor
        cleaned = parser._clean_url("https://example.com/page#section/index.html.md")

        self.assertEqual(cleaned, "https://example.com/page")

    def test_clean_url_valid_anchor(self):
        """A plain fragment is left untouched."""
        parser = self._make_parser("", base_url="https://example.com")
        original = "https://example.com/page.md#section"

        self.assertEqual(parser._clean_url(original), original)

    def test_clean_url_no_anchor(self):
        """A URL with no fragment is left untouched."""
        parser = self._make_parser("", base_url="https://example.com")
        original = "https://example.com/docs/guide.md"

        self.assertEqual(parser._clean_url(original), original)

    def test_deduplicate_urls(self):
        """The same URL listed three times is returned once."""
        content = (
            "\n"
            "- [Doc 1](https://example.com/doc.md)\n"
            "- [Doc 2](https://example.com/doc.md)\n"
            "https://example.com/doc.md\n"
        )

        found = self._make_parser(content).extract_urls()

        occurrences = [u for u in found if u == "https://example.com/doc.md"]
        self.assertEqual(len(occurrences), 1)
|
|
|
|
|
|
class TestSavePageContentFiltering(unittest.TestCase):
    """Checks that ``DocToSkillConverter.save_page`` filters by content length.

    The tests look at the ``pages`` directory under ``converter.data_dir``
    after calling ``save_page`` and count the files found there.
    """

    def setUp(self):
        """Create a converter from a minimal configuration."""
        # Imported lazily so a missing package fails the test, not collection.
        from skill_seekers.cli.doc_scraper import DocToSkillConverter

        self.config = {
            'name': 'test_save_filter',
            'base_url': 'https://example.com',
            'selectors': {},
            'url_patterns': {'include': [], 'exclude': []},
            'categories': {},
        }
        self.converter = DocToSkillConverter(self.config)

    def tearDown(self):
        """Delete the per-test output directory if one was created."""
        leftovers = f"output/{self.config['name']}_data"
        if not os.path.exists(leftovers):
            return
        shutil.rmtree(leftovers)

    @staticmethod
    def _page(url, title, content):
        """Return a page record with no headings and no code samples."""
        return {
            'url': url,
            'title': title,
            'content': content,
            'headings': [],
            'code_samples': [],
        }

    def _pages_dir(self):
        """Return the path of the ``pages`` directory under the data dir."""
        return os.path.join(self.converter.data_dir, 'pages')

    def _assert_nothing_saved(self):
        """Assert the pages directory, if present, contains no files."""
        target = self._pages_dir()
        # A missing directory also means nothing was written.
        if os.path.exists(target):
            self.assertEqual(len(os.listdir(target)), 0)

    def test_skip_empty_content(self):
        """A page with empty content is not written."""
        self.converter.save_page(
            self._page('https://example.com/empty', 'Empty Page', '')
        )

        self._assert_nothing_saved()

    def test_skip_short_content_under_50_chars(self):
        """A page whose content is shorter than 50 chars is not written."""
        too_short = 'This is too short.'  # 18 chars
        self.converter.save_page(
            self._page('https://example.com/short', 'Short', too_short)
        )

        self._assert_nothing_saved()

    def test_save_content_over_50_chars(self):
        """A page with 60 chars of content is written as exactly one file."""
        long_enough = 'A' * 60  # 60 chars, should pass
        self.converter.save_page(
            self._page('https://example.com/valid', 'Valid Page', long_enough)
        )

        target = self._pages_dir()
        self.assertTrue(os.path.exists(target))
        self.assertEqual(len(os.listdir(target)), 1)
|
|
|
|
|
|
# Run the suite when this file is executed directly rather than imported.
if __name__ == "__main__":
    unittest.main()
|