test: add unit tests for markdown parsing and multi-source features
- Add test_markdown_parsing.py with 20 tests covering:
  - Markdown content extraction (titles, headings, code blocks, links)
  - HTML fallback when .md URL returns HTML
  - llms.txt URL extraction and cleaning
  - Empty/short content filtering
- Add test_multi_source.py with 12 tests covering:
  - List-based scraped_data structure
  - Per-source subdirectory generation for docs/github/pdf
  - Index file generation for each source type

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
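Both suites rely only on the standard library's unittest. A minimal sketch for running them together, assuming the files sit under tests/ at the repository root:

# Hedged sketch: load and run both new suites with the stdlib runner.
import unittest

loader = unittest.TestLoader()
suite = unittest.TestSuite([
    loader.discover('tests', pattern='test_markdown_parsing.py'),
    loader.discover('tests', pattern='test_multi_source.py'),
])
unittest.TextTestRunner(verbosity=2).run(suite)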
tests/test_markdown_parsing.py (new file, 359 lines)
@@ -0,0 +1,359 @@
"""
Tests for Markdown parsing and BFS URL crawling features.

Tests the following functionality:
1. Markdown file content extraction (_extract_markdown_content)
2. HTML fallback when .md URL returns HTML (_extract_html_as_markdown)
3. URL extraction from llms.txt (extract_urls, _clean_url)
4. Empty/short content filtering in save_page
"""

import unittest
import tempfile
import os
import shutil

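# Note for readers (shapes inferred from the assertions in this file, not from
# the scraper source): both extractors are treated as returning a dict with at
# least 'title' (str), 'headings' (list of {'level', 'text'} dicts),
# 'code_samples' (list of dicts with a 'language' key), 'links' (list of .md
# URLs), and 'content' (filtered paragraph text).
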
class TestMarkdownContentExtraction(unittest.TestCase):
    """Test Markdown file parsing in doc_scraper."""

    def setUp(self):
        """Set up test fixtures."""
        from skill_seekers.cli.doc_scraper import DocToSkillConverter

        self.config = {
            'name': 'test_md_parsing',
            'base_url': 'https://example.com',
            'selectors': {},
            'url_patterns': {'include': [], 'exclude': []},
            'categories': {}
        }
        self.converter = DocToSkillConverter(self.config)

    def tearDown(self):
        """Clean up output directory."""
        output_dir = f"output/{self.config['name']}_data"
        if os.path.exists(output_dir):
            shutil.rmtree(output_dir)

    def test_extract_title_from_h1(self):
        """Test extracting title from first h1."""
        content = "# My Documentation Title\n\nSome content here."
        result = self.converter._extract_markdown_content(content, "https://example.com/test.md")
        self.assertEqual(result['title'], "My Documentation Title")

    def test_extract_headings_h2_to_h6(self):
        """Test extracting h2-h6 headings (not h1)."""
        content = """# Title

## Section One
### Subsection A
#### Deep Section
##### Deeper
###### Deepest

Content here.
"""
        result = self.converter._extract_markdown_content(content, "https://example.com/test.md")
        # Should have 5 headings (h2-h6), not h1
        self.assertEqual(len(result['headings']), 5)
        self.assertEqual(result['headings'][0]['level'], 'h2')
        self.assertEqual(result['headings'][0]['text'], 'Section One')

    def test_extract_code_blocks_with_language(self):
        """Test extracting code blocks with language tags."""
        content = """# API Guide

```python
def hello():
    return "Hello, World!"
```

Some explanation.

```javascript
const greet = () => console.log("Hi");
```

```
plain code without language
```
"""
        result = self.converter._extract_markdown_content(content, "https://example.com/test.md")
        self.assertEqual(len(result['code_samples']), 3)
        self.assertEqual(result['code_samples'][0]['language'], 'python')
        self.assertEqual(result['code_samples'][1]['language'], 'javascript')
        self.assertEqual(result['code_samples'][2]['language'], 'unknown')

    def test_extract_markdown_links_only_md_files(self):
        """Test that only .md links are extracted."""
        content = """# Links

- [Markdown Doc](./guide.md)
- [Another MD](https://example.com/api.md)
- [HTML Page](./page.html)
- [External](https://google.com)
"""
        result = self.converter._extract_markdown_content(content, "https://example.com/docs/test.md")
        # Should only include .md links
        md_links = [l for l in result['links'] if '.md' in l]
        self.assertEqual(len(md_links), len(result['links']))

    def test_extract_content_paragraphs(self):
        """Test extracting paragraph content."""
        content = """# Title

This is a paragraph with enough content to pass the minimum length filter.

Short.

Another paragraph that should be included in the final content output.
"""
        result = self.converter._extract_markdown_content(content, "https://example.com/test.md")
        self.assertIn("paragraph with enough content", result['content'])
        self.assertNotIn("Short.", result['content'])

    def test_detect_html_in_md_url(self):
        """Test that HTML content is detected when .md URL returns HTML."""
        html_content = "<!DOCTYPE html><html><head><title>Page</title></head><body><h1>Hello</h1></body></html>"
        result = self.converter._extract_markdown_content(html_content, "https://example.com/test.md")
        self.assertEqual(result['title'], "Page")

class TestHtmlAsMarkdownExtraction(unittest.TestCase):
    """Test HTML to markdown-like extraction."""

    def setUp(self):
        """Set up test fixtures."""
        from skill_seekers.cli.doc_scraper import DocToSkillConverter

        self.config = {
            'name': 'test_html_fallback',
            'base_url': 'https://example.com',
            'selectors': {},
            'url_patterns': {'include': [], 'exclude': []},
            'categories': {}
        }
        self.converter = DocToSkillConverter(self.config)

    def tearDown(self):
        """Clean up output directory."""
        output_dir = f"output/{self.config['name']}_data"
        if os.path.exists(output_dir):
            shutil.rmtree(output_dir)

    def test_extract_title_from_html(self):
        """Test extracting title from HTML title tag."""
        html = "<html><head><title>My Page Title</title></head><body></body></html>"
        result = self.converter._extract_html_as_markdown(html, "https://example.com/test.md")
        self.assertEqual(result['title'], "My Page Title")

    def test_find_main_content_area(self):
        """Test finding main content from various selectors."""
        html = """
<html><body>
<nav>Navigation</nav>
<main>
<h1>Main Content</h1>
<p>This is the main content area with enough text to pass filters.</p>
</main>
<footer>Footer</footer>
</body></html>
"""
        result = self.converter._extract_html_as_markdown(html, "https://example.com/test.md")
        self.assertIn("main content area", result['content'].lower())

    def test_extract_code_blocks_from_html(self):
        """Test extracting code blocks from HTML pre/code tags."""
        html = """
<html><body>
<main>
<pre><code class="language-python">print("hello")</code></pre>
</main>
</body></html>
"""
        result = self.converter._extract_html_as_markdown(html, "https://example.com/test.md")
        self.assertTrue(len(result['code_samples']) > 0)

    def test_fallback_to_body_when_no_main(self):
        """Test fallback to body when no main/article element."""
        html = """
<html><body>
<div>
<h2>Section</h2>
<p>Content in body without main element, long enough to pass filter.</p>
</div>
</body></html>
"""
        result = self.converter._extract_html_as_markdown(html, "https://example.com/test.md")
        self.assertTrue(len(result['headings']) > 0 or len(result['content']) > 0)

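# Note for readers (behavior inferred from the tests below, not from the
# parser source): _clean_url is expected to strip an anchor only when a
# path-like suffix follows it, e.g. "page#section/index.html.md" -> "page",
# while a plain trailing anchor such as "page.md#section" is preserved.
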
class TestLlmsTxtUrlExtraction(unittest.TestCase):
    """Test URL extraction from llms.txt content."""

    def test_extract_markdown_style_links(self):
        """Test extracting [text](url) style links."""
        from skill_seekers.cli.llms_txt_parser import LlmsTxtParser

        content = """
# Documentation Index

- [Getting Started](https://docs.example.com/start.md)
- [API Reference](https://docs.example.com/api/index.md)
- [Advanced Guide](https://docs.example.com/advanced.md)
"""
        parser = LlmsTxtParser(content, base_url="https://docs.example.com")
        urls = parser.extract_urls()

        self.assertIn("https://docs.example.com/start.md", urls)
        self.assertIn("https://docs.example.com/api/index.md", urls)
        self.assertIn("https://docs.example.com/advanced.md", urls)

    def test_extract_bare_urls(self):
        """Test extracting bare URLs without markdown syntax."""
        from skill_seekers.cli.llms_txt_parser import LlmsTxtParser

        content = """
Documentation: https://example.com/docs/guide.md
API: https://example.com/api/reference.md
"""
        parser = LlmsTxtParser(content)
        urls = parser.extract_urls()

        self.assertIn("https://example.com/docs/guide.md", urls)
        self.assertIn("https://example.com/api/reference.md", urls)

    def test_resolve_relative_urls(self):
        """Test resolving relative URLs with base_url."""
        from skill_seekers.cli.llms_txt_parser import LlmsTxtParser

        content = """
- [Local Doc](./docs/guide.md)
- [Parent](../api/ref.md)
"""
        parser = LlmsTxtParser(content, base_url="https://example.com/learn/")
        urls = parser.extract_urls()

        # Should resolve relative paths
        self.assertTrue(any("docs/guide.md" in url for url in urls))

    def test_clean_url_invalid_anchor_pattern(self):
        """Test cleaning URLs with invalid anchor patterns."""
        from skill_seekers.cli.llms_txt_parser import LlmsTxtParser

        parser = LlmsTxtParser("", base_url="https://example.com")

        # Invalid: path after anchor
        result = parser._clean_url("https://example.com/page#section/index.html.md")
        self.assertEqual(result, "https://example.com/page")

    def test_clean_url_valid_anchor(self):
        """Test that valid anchors are preserved."""
        from skill_seekers.cli.llms_txt_parser import LlmsTxtParser

        parser = LlmsTxtParser("", base_url="https://example.com")

        # Valid anchor should be unchanged
        result = parser._clean_url("https://example.com/page.md#section")
        self.assertEqual(result, "https://example.com/page.md#section")

    def test_clean_url_no_anchor(self):
        """Test that URLs without anchors are unchanged."""
        from skill_seekers.cli.llms_txt_parser import LlmsTxtParser

        parser = LlmsTxtParser("", base_url="https://example.com")

        result = parser._clean_url("https://example.com/docs/guide.md")
        self.assertEqual(result, "https://example.com/docs/guide.md")

    def test_deduplicate_urls(self):
        """Test that duplicate URLs are removed."""
        from skill_seekers.cli.llms_txt_parser import LlmsTxtParser

        content = """
- [Doc 1](https://example.com/doc.md)
- [Doc 2](https://example.com/doc.md)
https://example.com/doc.md
"""
        parser = LlmsTxtParser(content)
        urls = parser.extract_urls()

        # Should only have one instance
        count = sum(1 for u in urls if u == "https://example.com/doc.md")
        self.assertEqual(count, 1)

class TestSavePageContentFiltering(unittest.TestCase):
    """Test content filtering in save_page."""

    def setUp(self):
        """Set up test fixtures."""
        from skill_seekers.cli.doc_scraper import DocToSkillConverter

        self.config = {
            'name': 'test_save_filter',
            'base_url': 'https://example.com',
            'selectors': {},
            'url_patterns': {'include': [], 'exclude': []},
            'categories': {}
        }
        self.converter = DocToSkillConverter(self.config)

    def tearDown(self):
        """Clean up output directory."""
        output_dir = f"output/{self.config['name']}_data"
        if os.path.exists(output_dir):
            shutil.rmtree(output_dir)

    def test_skip_empty_content(self):
        """Test that pages with empty content are skipped."""
        page = {
            'url': 'https://example.com/empty',
            'title': 'Empty Page',
            'content': '',
            'headings': [],
            'code_samples': []
        }

        self.converter.save_page(page)

        pages_dir = os.path.join(self.converter.data_dir, 'pages')
        if os.path.exists(pages_dir):
            self.assertEqual(len(os.listdir(pages_dir)), 0)

    def test_skip_short_content_under_50_chars(self):
        """Test that pages with content < 50 chars are skipped."""
        page = {
            'url': 'https://example.com/short',
            'title': 'Short',
            'content': 'This is too short.',  # 18 chars
            'headings': [],
            'code_samples': []
        }

        self.converter.save_page(page)

        pages_dir = os.path.join(self.converter.data_dir, 'pages')
        if os.path.exists(pages_dir):
            self.assertEqual(len(os.listdir(pages_dir)), 0)

    def test_save_content_over_50_chars(self):
        """Test that pages with content >= 50 chars are saved."""
        page = {
            'url': 'https://example.com/valid',
            'title': 'Valid Page',
            'content': 'A' * 60,  # 60 chars, should pass
            'headings': [],
            'code_samples': []
        }

        self.converter.save_page(page)

        pages_dir = os.path.join(self.converter.data_dir, 'pages')
        self.assertTrue(os.path.exists(pages_dir))
        self.assertEqual(len(os.listdir(pages_dir)), 1)


if __name__ == '__main__':
    unittest.main()
tests/test_multi_source.py (new file, 433 lines)
@@ -0,0 +1,433 @@
"""
Tests for multi-source support in unified scraper and skill builder.

Tests the following functionality:
1. Multiple sources of the same type in unified_scraper (list structure)
2. Source counters and unique naming
3. Per-source reference directory generation in unified_skill_builder
4. Multiple documentation sources handling
5. Multiple GitHub repositories handling
"""

import unittest
import tempfile
import os
import shutil

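# Note for readers (shapes inferred from the fixtures in this file, not from
# the scraper/builder source): scraped_data groups results per source type,
# roughly:
#   {'documentation': [{'source_id', 'base_url', 'total_pages', 'refs_dir'}, ...],
#    'github':        [{'repo', 'repo_id', 'data': {...}}, ...],
#    'pdf':           [{'path'}, ...]}
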
class TestUnifiedScraperDataStructure(unittest.TestCase):
    """Test scraped_data list structure in unified_scraper."""

    def test_scraped_data_uses_list_structure(self):
        """Test that scraped_data uses a list for each source type."""
        from skill_seekers.cli.unified_scraper import UnifiedScraper

        config = {
            'name': 'test_multi',
            'description': 'Test skill',
            'sources': [
                {'type': 'documentation', 'base_url': 'https://example.com'}
            ]
        }

        with tempfile.TemporaryDirectory() as temp_dir:
            original_dir = os.getcwd()
            try:
                os.chdir(temp_dir)
                scraper = UnifiedScraper(config)

                self.assertIsInstance(scraper.scraped_data['documentation'], list)
                self.assertIsInstance(scraper.scraped_data['github'], list)
                self.assertIsInstance(scraper.scraped_data['pdf'], list)
            finally:
                os.chdir(original_dir)

    def test_source_counters_initialized_to_zero(self):
        """Test that source counters start at zero."""
        from skill_seekers.cli.unified_scraper import UnifiedScraper

        config = {
            'name': 'test_counters',
            'description': 'Test skill',
            'sources': [
                {'type': 'documentation', 'base_url': 'https://example.com'}
            ]
        }

        with tempfile.TemporaryDirectory() as temp_dir:
            original_dir = os.getcwd()
            try:
                os.chdir(temp_dir)
                scraper = UnifiedScraper(config)

                self.assertEqual(scraper._source_counters['documentation'], 0)
                self.assertEqual(scraper._source_counters['github'], 0)
                self.assertEqual(scraper._source_counters['pdf'], 0)
            finally:
                os.chdir(original_dir)

    def test_empty_lists_initially(self):
        """Test that source lists are empty initially."""
        from skill_seekers.cli.unified_scraper import UnifiedScraper

        config = {
            'name': 'test_empty',
            'description': 'Test skill',
            'sources': [
                {'type': 'documentation', 'base_url': 'https://example.com'}
            ]
        }

        with tempfile.TemporaryDirectory() as temp_dir:
            original_dir = os.getcwd()
            try:
                os.chdir(temp_dir)
                scraper = UnifiedScraper(config)

                self.assertEqual(len(scraper.scraped_data['documentation']), 0)
                self.assertEqual(len(scraper.scraped_data['github']), 0)
                self.assertEqual(len(scraper.scraped_data['pdf']), 0)
            finally:
                os.chdir(original_dir)

class TestUnifiedSkillBuilderDocsReferences(unittest.TestCase):
    """Test documentation reference generation for multiple sources."""

    def setUp(self):
        """Set up test fixtures."""
        self.temp_dir = tempfile.mkdtemp()
        self.original_dir = os.getcwd()
        os.chdir(self.temp_dir)

    def tearDown(self):
        """Clean up test fixtures."""
        os.chdir(self.original_dir)
        if os.path.exists(self.temp_dir):
            shutil.rmtree(self.temp_dir)

    def test_creates_subdirectory_per_source(self):
        """Test that each doc source gets its own subdirectory."""
        from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder

        # Create mock refs directories
        refs_dir1 = os.path.join(self.temp_dir, 'refs1')
        refs_dir2 = os.path.join(self.temp_dir, 'refs2')
        os.makedirs(refs_dir1)
        os.makedirs(refs_dir2)

        config = {
            'name': 'test_docs_refs',
            'description': 'Test',
            'sources': []
        }

        scraped_data = {
            'documentation': [
                {'source_id': 'source_a', 'base_url': 'https://a.com', 'total_pages': 5, 'refs_dir': refs_dir1},
                {'source_id': 'source_b', 'base_url': 'https://b.com', 'total_pages': 3, 'refs_dir': refs_dir2}
            ],
            'github': [],
            'pdf': []
        }

        builder = UnifiedSkillBuilder(config, scraped_data)
        builder._generate_docs_references(scraped_data['documentation'])

        docs_dir = os.path.join(builder.skill_dir, 'references', 'documentation')
        self.assertTrue(os.path.exists(os.path.join(docs_dir, 'source_a')))
        self.assertTrue(os.path.exists(os.path.join(docs_dir, 'source_b')))

    def test_creates_index_per_source(self):
        """Test that each source subdirectory has its own index.md."""
        from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder

        refs_dir = os.path.join(self.temp_dir, 'refs')
        os.makedirs(refs_dir)

        config = {
            'name': 'test_source_index',
            'description': 'Test',
            'sources': []
        }

        scraped_data = {
            'documentation': [
                {'source_id': 'my_source', 'base_url': 'https://example.com', 'total_pages': 10, 'refs_dir': refs_dir}
            ],
            'github': [],
            'pdf': []
        }

        builder = UnifiedSkillBuilder(config, scraped_data)
        builder._generate_docs_references(scraped_data['documentation'])

        source_index = os.path.join(builder.skill_dir, 'references', 'documentation', 'my_source', 'index.md')
        self.assertTrue(os.path.exists(source_index))

        with open(source_index, 'r') as f:
            content = f.read()
        self.assertIn('my_source', content)
        self.assertIn('https://example.com', content)

    def test_creates_main_index_listing_all_sources(self):
        """Test that main index.md lists all documentation sources."""
        from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder

        refs_dir1 = os.path.join(self.temp_dir, 'refs1')
        refs_dir2 = os.path.join(self.temp_dir, 'refs2')
        os.makedirs(refs_dir1)
        os.makedirs(refs_dir2)

        config = {
            'name': 'test_main_index',
            'description': 'Test',
            'sources': []
        }

        scraped_data = {
            'documentation': [
                {'source_id': 'docs_one', 'base_url': 'https://one.com', 'total_pages': 10, 'refs_dir': refs_dir1},
                {'source_id': 'docs_two', 'base_url': 'https://two.com', 'total_pages': 20, 'refs_dir': refs_dir2}
            ],
            'github': [],
            'pdf': []
        }

        builder = UnifiedSkillBuilder(config, scraped_data)
        builder._generate_docs_references(scraped_data['documentation'])

        main_index = os.path.join(builder.skill_dir, 'references', 'documentation', 'index.md')
        self.assertTrue(os.path.exists(main_index))

        with open(main_index, 'r') as f:
            content = f.read()
        self.assertIn('docs_one', content)
        self.assertIn('docs_two', content)
        self.assertIn('2 documentation sources', content)

    def test_copies_reference_files_to_source_dir(self):
        """Test that reference files are copied to the source subdirectory."""
        from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder

        refs_dir = os.path.join(self.temp_dir, 'refs')
        os.makedirs(refs_dir)

        # Create mock reference files
        with open(os.path.join(refs_dir, 'api.md'), 'w') as f:
            f.write('# API Reference')
        with open(os.path.join(refs_dir, 'guide.md'), 'w') as f:
            f.write('# User Guide')

        config = {
            'name': 'test_copy_refs',
            'description': 'Test',
            'sources': []
        }

        scraped_data = {
            'documentation': [
                {'source_id': 'test_source', 'base_url': 'https://test.com', 'total_pages': 5, 'refs_dir': refs_dir}
            ],
            'github': [],
            'pdf': []
        }

        builder = UnifiedSkillBuilder(config, scraped_data)
        builder._generate_docs_references(scraped_data['documentation'])

        source_dir = os.path.join(builder.skill_dir, 'references', 'documentation', 'test_source')
        self.assertTrue(os.path.exists(os.path.join(source_dir, 'api.md')))
        self.assertTrue(os.path.exists(os.path.join(source_dir, 'guide.md')))

class TestUnifiedSkillBuilderGitHubReferences(unittest.TestCase):
    """Test GitHub reference generation for multiple repositories."""

    def setUp(self):
        """Set up test fixtures."""
        self.temp_dir = tempfile.mkdtemp()
        self.original_dir = os.getcwd()
        os.chdir(self.temp_dir)

    def tearDown(self):
        """Clean up test fixtures."""
        os.chdir(self.original_dir)
        if os.path.exists(self.temp_dir):
            shutil.rmtree(self.temp_dir)

    def test_creates_subdirectory_per_repo(self):
        """Test that each GitHub repo gets its own subdirectory."""
        from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder

        config = {
            'name': 'test_github_refs',
            'description': 'Test',
            'sources': []
        }

        scraped_data = {
            'documentation': [],
            'github': [
                {'repo': 'org/repo1', 'repo_id': 'org_repo1', 'data': {'readme': '# Repo 1', 'issues': [], 'releases': [], 'repo_info': {}}},
                {'repo': 'org/repo2', 'repo_id': 'org_repo2', 'data': {'readme': '# Repo 2', 'issues': [], 'releases': [], 'repo_info': {}}}
            ],
            'pdf': []
        }

        builder = UnifiedSkillBuilder(config, scraped_data)
        builder._generate_github_references(scraped_data['github'])

        github_dir = os.path.join(builder.skill_dir, 'references', 'github')
        self.assertTrue(os.path.exists(os.path.join(github_dir, 'org_repo1')))
        self.assertTrue(os.path.exists(os.path.join(github_dir, 'org_repo2')))

    def test_creates_readme_per_repo(self):
        """Test that README.md is created for each repo."""
        from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder

        config = {
            'name': 'test_readme',
            'description': 'Test',
            'sources': []
        }

        scraped_data = {
            'documentation': [],
            'github': [
                {'repo': 'test/myrepo', 'repo_id': 'test_myrepo', 'data': {'readme': '# My Repository\n\nDescription here.', 'issues': [], 'releases': [], 'repo_info': {}}}
            ],
            'pdf': []
        }

        builder = UnifiedSkillBuilder(config, scraped_data)
        builder._generate_github_references(scraped_data['github'])

        readme_path = os.path.join(builder.skill_dir, 'references', 'github', 'test_myrepo', 'README.md')
        self.assertTrue(os.path.exists(readme_path))

        with open(readme_path, 'r') as f:
            content = f.read()
        self.assertIn('test/myrepo', content)

    def test_creates_issues_file_when_issues_exist(self):
        """Test that issues.md is created when repo has issues."""
        from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder

        config = {
            'name': 'test_issues',
            'description': 'Test',
            'sources': []
        }

        scraped_data = {
            'documentation': [],
            'github': [
                {
                    'repo': 'test/repo',
                    'repo_id': 'test_repo',
                    'data': {
                        'readme': '# Repo',
                        'issues': [
                            {'number': 1, 'title': 'Bug report', 'state': 'open', 'labels': ['bug'], 'url': 'https://github.com/test/repo/issues/1'},
                            {'number': 2, 'title': 'Feature request', 'state': 'closed', 'labels': ['enhancement'], 'url': 'https://github.com/test/repo/issues/2'}
                        ],
                        'releases': [],
                        'repo_info': {}
                    }
                }
            ],
            'pdf': []
        }

        builder = UnifiedSkillBuilder(config, scraped_data)
        builder._generate_github_references(scraped_data['github'])

        issues_path = os.path.join(builder.skill_dir, 'references', 'github', 'test_repo', 'issues.md')
        self.assertTrue(os.path.exists(issues_path))

        with open(issues_path, 'r') as f:
            content = f.read()
        self.assertIn('Bug report', content)
        self.assertIn('Feature request', content)

    def test_creates_main_index_listing_all_repos(self):
        """Test that main index.md lists all GitHub repositories."""
        from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder

        config = {
            'name': 'test_github_index',
            'description': 'Test',
            'sources': []
        }

        scraped_data = {
            'documentation': [],
            'github': [
                {'repo': 'org/first', 'repo_id': 'org_first', 'data': {'readme': '#', 'issues': [], 'releases': [], 'repo_info': {'stars': 100}}},
                {'repo': 'org/second', 'repo_id': 'org_second', 'data': {'readme': '#', 'issues': [], 'releases': [], 'repo_info': {'stars': 50}}}
            ],
            'pdf': []
        }

        builder = UnifiedSkillBuilder(config, scraped_data)
        builder._generate_github_references(scraped_data['github'])

        main_index = os.path.join(builder.skill_dir, 'references', 'github', 'index.md')
        self.assertTrue(os.path.exists(main_index))

        with open(main_index, 'r') as f:
            content = f.read()
        self.assertIn('org/first', content)
        self.assertIn('org/second', content)
        self.assertIn('2 GitHub repositories', content)

class TestUnifiedSkillBuilderPdfReferences(unittest.TestCase):
    """Test PDF reference generation for multiple sources."""

    def setUp(self):
        """Set up test fixtures."""
        self.temp_dir = tempfile.mkdtemp()
        self.original_dir = os.getcwd()
        os.chdir(self.temp_dir)

    def tearDown(self):
        """Clean up test fixtures."""
        os.chdir(self.original_dir)
        if os.path.exists(self.temp_dir):
            shutil.rmtree(self.temp_dir)

    def test_creates_pdf_index_with_count(self):
        """Test that PDF index shows correct document count."""
        from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder

        config = {
            'name': 'test_pdf',
            'description': 'Test',
            'sources': []
        }

        scraped_data = {
            'documentation': [],
            'github': [],
            'pdf': [
                {'path': '/path/to/doc1.pdf'},
                {'path': '/path/to/doc2.pdf'},
                {'path': '/path/to/doc3.pdf'}
            ]
        }

        builder = UnifiedSkillBuilder(config, scraped_data)
        builder._generate_pdf_references(scraped_data['pdf'])

        pdf_index = os.path.join(builder.skill_dir, 'references', 'pdf', 'index.md')
        self.assertTrue(os.path.exists(pdf_index))

        with open(pdf_index, 'r') as f:
            content = f.read()
        self.assertIn('3 PDF document', content)


if __name__ == '__main__':
    unittest.main()
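For orientation, a hedged sketch of the multi-source config shape these suites exercise. Only the documentation entry fields are taken from the fixtures above; the github and pdf entries are assumptions and left as placeholders:

# Sketch of a unified config with repeated source types (the feature under test).
config = {
    'name': 'my_skill',
    'description': 'Example multi-source skill',
    'sources': [
        {'type': 'documentation', 'base_url': 'https://docs.example.com'},
        {'type': 'documentation', 'base_url': 'https://wiki.example.com'},  # same type twice
        # {'type': 'github', ...},  # field names not shown in these tests
        # {'type': 'pdf', ...},
    ]
}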