"""
Tests for Markdown parsing and BFS URL crawling features.

Tests the following functionality:
1. Markdown file content extraction (_extract_markdown_content)
2. HTML fallback when .md URL returns HTML (_extract_html_as_markdown)
3. URL extraction from llms.txt (extract_urls, _clean_url)
4. Empty/short content filtering in save_page
"""

import unittest
import tempfile
import os
import shutil


class TestMarkdownContentExtraction(unittest.TestCase):
    """Test Markdown file parsing in doc_scraper."""

    def setUp(self):
        """Set up a DocToSkillConverter with a minimal config."""
        from skill_seekers.cli.doc_scraper import DocToSkillConverter

        self.config = {
            'name': 'test_md_parsing',
            'base_url': 'https://example.com',
            'selectors': {},
            'url_patterns': {'include': [], 'exclude': []},
            'categories': {}
        }
        self.converter = DocToSkillConverter(self.config)

    def tearDown(self):
        """Clean up the output directory created by the converter."""
        output_dir = f"output/{self.config['name']}_data"
        if os.path.exists(output_dir):
            shutil.rmtree(output_dir)

    def test_extract_title_from_h1(self):
        """Test extracting title from first h1."""
        content = "# My Documentation Title\n\nSome content here."
        result = self.converter._extract_markdown_content(content, "https://example.com/test.md")
        self.assertEqual(result['title'], "My Documentation Title")

    def test_extract_headings_h2_to_h6(self):
        """Test extracting h2-h6 headings (not h1)."""
        content = """# Title

## Section One
### Subsection A
#### Deep Section
##### Deeper
###### Deepest

Content here.
"""
        result = self.converter._extract_markdown_content(content, "https://example.com/test.md")
        # Should have 5 headings (h2-h6); the h1 becomes the title instead.
        self.assertEqual(len(result['headings']), 5)
        self.assertEqual(result['headings'][0]['level'], 'h2')
        self.assertEqual(result['headings'][0]['text'], 'Section One')

    def test_extract_code_blocks_with_language(self):
        """Test extracting code blocks with language tags."""
        content = """# API Guide

```python
def hello():
    return "Hello, World!"
```

Some explanation.

```javascript
const greet = () => console.log("Hi");
```

```
plain code without language
```
"""
        result = self.converter._extract_markdown_content(content, "https://example.com/test.md")
        self.assertEqual(len(result['code_samples']), 3)
        self.assertEqual(result['code_samples'][0]['language'], 'python')
        self.assertEqual(result['code_samples'][1]['language'], 'javascript')
        # Fenced block with no language tag falls back to 'unknown'.
        self.assertEqual(result['code_samples'][2]['language'], 'unknown')

    def test_extract_markdown_links_only_md_files(self):
        """Test that only .md links are extracted."""
        content = """# Links

- [Markdown Doc](./guide.md)
- [Another MD](https://example.com/api.md)
- [HTML Page](./page.html)
- [External](https://google.com)
"""
        result = self.converter._extract_markdown_content(content, "https://example.com/docs/test.md")
        # Every extracted link must be a .md link; non-.md links are dropped.
        md_links = [l for l in result['links'] if '.md' in l]
        self.assertEqual(len(md_links), len(result['links']))

    def test_extract_content_paragraphs(self):
        """Test extracting paragraph content."""
        content = """# Title

This is a paragraph with enough content to pass the minimum length filter.

Short.

Another paragraph that should be included in the final content output.
"""
        result = self.converter._extract_markdown_content(content, "https://example.com/test.md")
        self.assertIn("paragraph with enough content", result['content'])
        # Too-short paragraphs are filtered out of the content.
        self.assertNotIn("Short.", result['content'])

    def test_detect_html_in_md_url(self):
        """Test that HTML content is detected when .md URL returns HTML."""
        # NOTE(review): this HTML literal was reconstructed — the original
        # markup was lost in transit; the assertion only needs a <title>.
        html_content = "<html><head><title>Page</title></head><body><p>Hello</p></body></html>"
        result = self.converter._extract_markdown_content(html_content, "https://example.com/test.md")
        self.assertEqual(result['title'], "Page")


class TestHtmlAsMarkdownExtraction(unittest.TestCase):
    """Test HTML to markdown-like extraction."""

    def setUp(self):
        """Set up a DocToSkillConverter with a minimal config."""
        from skill_seekers.cli.doc_scraper import DocToSkillConverter

        self.config = {
            'name': 'test_html_fallback',
            'base_url': 'https://example.com',
            'selectors': {},
            'url_patterns': {'include': [], 'exclude': []},
            'categories': {}
        }
        self.converter = DocToSkillConverter(self.config)

    def tearDown(self):
        """Clean up the output directory created by the converter."""
        output_dir = f"output/{self.config['name']}_data"
        if os.path.exists(output_dir):
            shutil.rmtree(output_dir)

    def test_extract_title_from_html(self):
        """Test extracting title from HTML title tag."""
        html = "<html><head><title>My Page Title</title></head><body></body></html>"
        result = self.converter._extract_html_as_markdown(html, "https://example.com/test.md")
        self.assertEqual(result['title'], "My Page Title")

    def test_find_main_content_area(self):
        """Test finding main content from various selectors."""
        # NOTE(review): markup reconstructed — nav/footer noise around a
        # <main> element, which the extractor should prefer over the body.
        html = """<html>
<body>
    <nav>Navigation</nav>
    <main>
        <h1>Main Content</h1>
        <p>This is the main content area with enough text to pass filters.</p>
    </main>
    <footer>Footer</footer>
</body>
</html>"""
        result = self.converter._extract_html_as_markdown(html, "https://example.com/test.md")
        self.assertIn("main content area", result['content'].lower())

    def test_extract_code_blocks_from_html(self):
        """Test extracting code blocks from HTML pre/code tags."""
        html = """<html>
<body>
    <pre><code>print("hello")</code></pre>
</body>
</html>"""
        result = self.converter._extract_html_as_markdown(html, "https://example.com/test.md")
        self.assertTrue(len(result['code_samples']) > 0)

    def test_fallback_to_body_when_no_main(self):
        """Test fallback to body when no main/article element."""
        html = """<html>
<body>
    <h2>Section</h2>
    <p>Content in body without main element, long enough to pass filter.</p>
</body>
</html>"""
        result = self.converter._extract_html_as_markdown(html, "https://example.com/test.md")
        self.assertTrue(len(result['headings']) > 0 or len(result['content']) > 0)


class TestLlmsTxtUrlExtraction(unittest.TestCase):
    """Test URL extraction from llms.txt content."""

    def test_extract_markdown_style_links(self):
        """Test extracting [text](url) style links."""
        from skill_seekers.cli.llms_txt_parser import LlmsTxtParser

        content = """
# Documentation Index

- [Getting Started](https://docs.example.com/start.md)
- [API Reference](https://docs.example.com/api/index.md)
- [Advanced Guide](https://docs.example.com/advanced.md)
"""
        parser = LlmsTxtParser(content, base_url="https://docs.example.com")
        urls = parser.extract_urls()

        self.assertIn("https://docs.example.com/start.md", urls)
        self.assertIn("https://docs.example.com/api/index.md", urls)
        self.assertIn("https://docs.example.com/advanced.md", urls)

    def test_extract_bare_urls(self):
        """Test extracting bare URLs without markdown syntax."""
        from skill_seekers.cli.llms_txt_parser import LlmsTxtParser

        content = """
Documentation: https://example.com/docs/guide.md
API: https://example.com/api/reference.md
"""
        parser = LlmsTxtParser(content)
        urls = parser.extract_urls()

        self.assertIn("https://example.com/docs/guide.md", urls)
        self.assertIn("https://example.com/api/reference.md", urls)

    def test_resolve_relative_urls(self):
        """Test resolving relative URLs with base_url."""
        from skill_seekers.cli.llms_txt_parser import LlmsTxtParser

        content = """
- [Local Doc](./docs/guide.md)
- [Parent](../api/ref.md)
"""
        parser = LlmsTxtParser(content, base_url="https://example.com/learn/")
        urls = parser.extract_urls()

        # Relative paths should be resolved against base_url.
        self.assertTrue(any("docs/guide.md" in url for url in urls))

    def test_clean_url_invalid_anchor_pattern(self):
        """Test cleaning URLs with invalid anchor patterns."""
        from skill_seekers.cli.llms_txt_parser import LlmsTxtParser

        parser = LlmsTxtParser("", base_url="https://example.com")

        # Invalid: a path component after the anchor — everything from the
        # fragment onward should be stripped.
        result = parser._clean_url("https://example.com/page#section/index.html.md")
        self.assertEqual(result, "https://example.com/page")

    def test_clean_url_valid_anchor(self):
        """Test that valid anchors are preserved."""
        from skill_seekers.cli.llms_txt_parser import LlmsTxtParser

        parser = LlmsTxtParser("", base_url="https://example.com")

        # A plain fragment is legitimate and must survive cleaning.
        result = parser._clean_url("https://example.com/page.md#section")
        self.assertEqual(result, "https://example.com/page.md#section")

    def test_clean_url_no_anchor(self):
        """Test that URLs without anchors are unchanged."""
        from skill_seekers.cli.llms_txt_parser import LlmsTxtParser

        parser = LlmsTxtParser("", base_url="https://example.com")

        result = parser._clean_url("https://example.com/docs/guide.md")
        self.assertEqual(result, "https://example.com/docs/guide.md")

    def test_deduplicate_urls(self):
        """Test that duplicate URLs are removed."""
        from skill_seekers.cli.llms_txt_parser import LlmsTxtParser

        content = """
- [Doc 1](https://example.com/doc.md)
- [Doc 2](https://example.com/doc.md)
https://example.com/doc.md
"""
        parser = LlmsTxtParser(content)
        urls = parser.extract_urls()

        # The same URL appearing three times should be returned once.
        count = sum(1 for u in urls if u == "https://example.com/doc.md")
        self.assertEqual(count, 1)


class TestSavePageContentFiltering(unittest.TestCase):
    """Test content filtering in save_page."""

    def setUp(self):
        """Set up a DocToSkillConverter with a minimal config."""
        from skill_seekers.cli.doc_scraper import DocToSkillConverter

        self.config = {
            'name': 'test_save_filter',
            'base_url': 'https://example.com',
            'selectors': {},
            'url_patterns': {'include': [], 'exclude': []},
            'categories': {}
        }
        self.converter = DocToSkillConverter(self.config)

    def tearDown(self):
        """Clean up the output directory created by the converter."""
        output_dir = f"output/{self.config['name']}_data"
        if os.path.exists(output_dir):
            shutil.rmtree(output_dir)

    def test_skip_empty_content(self):
        """Test that pages with empty content are skipped."""
        page = {
            'url': 'https://example.com/empty',
            'title': 'Empty Page',
            'content': '',
            'headings': [],
            'code_samples': []
        }

        self.converter.save_page(page)

        pages_dir = os.path.join(self.converter.data_dir, 'pages')
        if os.path.exists(pages_dir):
            self.assertEqual(len(os.listdir(pages_dir)), 0)

    def test_skip_short_content_under_50_chars(self):
        """Test that pages with content < 50 chars are skipped."""
        page = {
            'url': 'https://example.com/short',
            'title': 'Short',
            'content': 'This is too short.',  # 18 chars
            'headings': [],
            'code_samples': []
        }

        self.converter.save_page(page)

        pages_dir = os.path.join(self.converter.data_dir, 'pages')
        if os.path.exists(pages_dir):
            self.assertEqual(len(os.listdir(pages_dir)), 0)

    def test_save_content_over_50_chars(self):
        """Test that pages with content >= 50 chars are saved."""
        page = {
            'url': 'https://example.com/valid',
            'title': 'Valid Page',
            'content': 'A' * 60,  # 60 chars, should pass the 50-char filter
            'headings': [],
            'code_samples': []
        }

        self.converter.save_page(page)

        pages_dir = os.path.join(self.converter.data_dir, 'pages')
        self.assertTrue(os.path.exists(pages_dir))
        self.assertEqual(len(os.listdir(pages_dir)), 1)


if __name__ == '__main__':
    unittest.main()
"""
Tests for multi-source support in unified scraper and skill builder.

Tests the following functionality:
1. Multiple sources of same type in unified_scraper (list structure)
2. Source counters and unique naming
3. Per-source reference directory generation in unified_skill_builder
4. Multiple documentation sources handling
5. Multiple GitHub repositories handling
"""

import unittest
import tempfile
import os
import shutil


class TestUnifiedScraperDataStructure(unittest.TestCase):
    """Test scraped_data list structure in unified_scraper."""

    def test_scraped_data_uses_list_structure(self):
        """Test that scraped_data uses list for each source type."""
        from skill_seekers.cli.unified_scraper import UnifiedScraper

        config = {
            'name': 'test_multi',
            'description': 'Test skill',
            'sources': [
                {'type': 'documentation', 'base_url': 'https://example.com'}
            ]
        }

        # Run inside a temp cwd so the scraper's output/ dir is isolated.
        with tempfile.TemporaryDirectory() as temp_dir:
            original_dir = os.getcwd()
            try:
                os.chdir(temp_dir)
                scraper = UnifiedScraper(config)

                self.assertIsInstance(scraper.scraped_data['documentation'], list)
                self.assertIsInstance(scraper.scraped_data['github'], list)
                self.assertIsInstance(scraper.scraped_data['pdf'], list)
            finally:
                os.chdir(original_dir)

    def test_source_counters_initialized_to_zero(self):
        """Test that source counters start at zero."""
        from skill_seekers.cli.unified_scraper import UnifiedScraper

        config = {
            'name': 'test_counters',
            'description': 'Test skill',
            'sources': [
                {'type': 'documentation', 'base_url': 'https://example.com'}
            ]
        }

        with tempfile.TemporaryDirectory() as temp_dir:
            original_dir = os.getcwd()
            try:
                os.chdir(temp_dir)
                scraper = UnifiedScraper(config)

                self.assertEqual(scraper._source_counters['documentation'], 0)
                self.assertEqual(scraper._source_counters['github'], 0)
                self.assertEqual(scraper._source_counters['pdf'], 0)
            finally:
                os.chdir(original_dir)

    def test_empty_lists_initially(self):
        """Test that source lists are empty initially."""
        from skill_seekers.cli.unified_scraper import UnifiedScraper

        config = {
            'name': 'test_empty',
            'description': 'Test skill',
            'sources': [
                {'type': 'documentation', 'base_url': 'https://example.com'}
            ]
        }

        with tempfile.TemporaryDirectory() as temp_dir:
            original_dir = os.getcwd()
            try:
                os.chdir(temp_dir)
                scraper = UnifiedScraper(config)

                self.assertEqual(len(scraper.scraped_data['documentation']), 0)
                self.assertEqual(len(scraper.scraped_data['github']), 0)
                self.assertEqual(len(scraper.scraped_data['pdf']), 0)
            finally:
                os.chdir(original_dir)


class TestUnifiedSkillBuilderDocsReferences(unittest.TestCase):
    """Test documentation reference generation for multiple sources."""

    def setUp(self):
        """Create a temp dir and chdir into it so builder output is isolated."""
        self.temp_dir = tempfile.mkdtemp()
        self.original_dir = os.getcwd()
        os.chdir(self.temp_dir)

    def tearDown(self):
        """Restore cwd and remove the temp dir."""
        os.chdir(self.original_dir)
        if os.path.exists(self.temp_dir):
            shutil.rmtree(self.temp_dir)

    def test_creates_subdirectory_per_source(self):
        """Test that each doc source gets its own subdirectory."""
        from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder

        # Create mock refs directories
        refs_dir1 = os.path.join(self.temp_dir, 'refs1')
        refs_dir2 = os.path.join(self.temp_dir, 'refs2')
        os.makedirs(refs_dir1)
        os.makedirs(refs_dir2)

        config = {
            'name': 'test_docs_refs',
            'description': 'Test',
            'sources': []
        }

        scraped_data = {
            'documentation': [
                {'source_id': 'source_a', 'base_url': 'https://a.com', 'total_pages': 5, 'refs_dir': refs_dir1},
                {'source_id': 'source_b', 'base_url': 'https://b.com', 'total_pages': 3, 'refs_dir': refs_dir2}
            ],
            'github': [],
            'pdf': []
        }

        builder = UnifiedSkillBuilder(config, scraped_data)
        builder._generate_docs_references(scraped_data['documentation'])

        docs_dir = os.path.join(builder.skill_dir, 'references', 'documentation')
        self.assertTrue(os.path.exists(os.path.join(docs_dir, 'source_a')))
        self.assertTrue(os.path.exists(os.path.join(docs_dir, 'source_b')))

    def test_creates_index_per_source(self):
        """Test that each source subdirectory has its own index.md."""
        from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder

        refs_dir = os.path.join(self.temp_dir, 'refs')
        os.makedirs(refs_dir)

        config = {
            'name': 'test_source_index',
            'description': 'Test',
            'sources': []
        }

        scraped_data = {
            'documentation': [
                {'source_id': 'my_source', 'base_url': 'https://example.com', 'total_pages': 10, 'refs_dir': refs_dir}
            ],
            'github': [],
            'pdf': []
        }

        builder = UnifiedSkillBuilder(config, scraped_data)
        builder._generate_docs_references(scraped_data['documentation'])

        source_index = os.path.join(builder.skill_dir, 'references', 'documentation', 'my_source', 'index.md')
        self.assertTrue(os.path.exists(source_index))

        with open(source_index, 'r') as f:
            content = f.read()
            self.assertIn('my_source', content)
            self.assertIn('https://example.com', content)

    def test_creates_main_index_listing_all_sources(self):
        """Test that main index.md lists all documentation sources."""
        from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder

        refs_dir1 = os.path.join(self.temp_dir, 'refs1')
        refs_dir2 = os.path.join(self.temp_dir, 'refs2')
        os.makedirs(refs_dir1)
        os.makedirs(refs_dir2)

        config = {
            'name': 'test_main_index',
            'description': 'Test',
            'sources': []
        }

        scraped_data = {
            'documentation': [
                {'source_id': 'docs_one', 'base_url': 'https://one.com', 'total_pages': 10, 'refs_dir': refs_dir1},
                {'source_id': 'docs_two', 'base_url': 'https://two.com', 'total_pages': 20, 'refs_dir': refs_dir2}
            ],
            'github': [],
            'pdf': []
        }

        builder = UnifiedSkillBuilder(config, scraped_data)
        builder._generate_docs_references(scraped_data['documentation'])

        main_index = os.path.join(builder.skill_dir, 'references', 'documentation', 'index.md')
        self.assertTrue(os.path.exists(main_index))

        with open(main_index, 'r') as f:
            content = f.read()
            self.assertIn('docs_one', content)
            self.assertIn('docs_two', content)
            self.assertIn('2 documentation sources', content)

    def test_copies_reference_files_to_source_dir(self):
        """Test that reference files are copied to source subdirectory."""
        from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder

        refs_dir = os.path.join(self.temp_dir, 'refs')
        os.makedirs(refs_dir)

        # Create mock reference files
        with open(os.path.join(refs_dir, 'api.md'), 'w') as f:
            f.write('# API Reference')
        with open(os.path.join(refs_dir, 'guide.md'), 'w') as f:
            f.write('# User Guide')

        config = {
            'name': 'test_copy_refs',
            'description': 'Test',
            'sources': []
        }

        scraped_data = {
            'documentation': [
                {'source_id': 'test_source', 'base_url': 'https://test.com', 'total_pages': 5, 'refs_dir': refs_dir}
            ],
            'github': [],
            'pdf': []
        }

        builder = UnifiedSkillBuilder(config, scraped_data)
        builder._generate_docs_references(scraped_data['documentation'])

        source_dir = os.path.join(builder.skill_dir, 'references', 'documentation', 'test_source')
        self.assertTrue(os.path.exists(os.path.join(source_dir, 'api.md')))
        self.assertTrue(os.path.exists(os.path.join(source_dir, 'guide.md')))


class TestUnifiedSkillBuilderGitHubReferences(unittest.TestCase):
    """Test GitHub reference generation for multiple repositories."""

    def setUp(self):
        """Create a temp dir and chdir into it so builder output is isolated."""
        self.temp_dir = tempfile.mkdtemp()
        self.original_dir = os.getcwd()
        os.chdir(self.temp_dir)

    def tearDown(self):
        """Restore cwd and remove the temp dir."""
        os.chdir(self.original_dir)
        if os.path.exists(self.temp_dir):
            shutil.rmtree(self.temp_dir)

    def test_creates_subdirectory_per_repo(self):
        """Test that each GitHub repo gets its own subdirectory."""
        from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder

        config = {
            'name': 'test_github_refs',
            'description': 'Test',
            'sources': []
        }

        scraped_data = {
            'documentation': [],
            'github': [
                {'repo': 'org/repo1', 'repo_id': 'org_repo1', 'data': {'readme': '# Repo 1', 'issues': [], 'releases': [], 'repo_info': {}}},
                {'repo': 'org/repo2', 'repo_id': 'org_repo2', 'data': {'readme': '# Repo 2', 'issues': [], 'releases': [], 'repo_info': {}}}
            ],
            'pdf': []
        }

        builder = UnifiedSkillBuilder(config, scraped_data)
        builder._generate_github_references(scraped_data['github'])

        github_dir = os.path.join(builder.skill_dir, 'references', 'github')
        self.assertTrue(os.path.exists(os.path.join(github_dir, 'org_repo1')))
        self.assertTrue(os.path.exists(os.path.join(github_dir, 'org_repo2')))

    def test_creates_readme_per_repo(self):
        """Test that README.md is created for each repo."""
        from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder

        config = {
            'name': 'test_readme',
            'description': 'Test',
            'sources': []
        }

        scraped_data = {
            'documentation': [],
            'github': [
                {'repo': 'test/myrepo', 'repo_id': 'test_myrepo', 'data': {'readme': '# My Repository\n\nDescription here.', 'issues': [], 'releases': [], 'repo_info': {}}}
            ],
            'pdf': []
        }

        builder = UnifiedSkillBuilder(config, scraped_data)
        builder._generate_github_references(scraped_data['github'])

        readme_path = os.path.join(builder.skill_dir, 'references', 'github', 'test_myrepo', 'README.md')
        self.assertTrue(os.path.exists(readme_path))

        with open(readme_path, 'r') as f:
            content = f.read()
            self.assertIn('test/myrepo', content)

    def test_creates_issues_file_when_issues_exist(self):
        """Test that issues.md is created when repo has issues."""
        from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder

        config = {
            'name': 'test_issues',
            'description': 'Test',
            'sources': []
        }

        scraped_data = {
            'documentation': [],
            'github': [
                {
                    'repo': 'test/repo',
                    'repo_id': 'test_repo',
                    'data': {
                        'readme': '# Repo',
                        'issues': [
                            {'number': 1, 'title': 'Bug report', 'state': 'open', 'labels': ['bug'], 'url': 'https://github.com/test/repo/issues/1'},
                            {'number': 2, 'title': 'Feature request', 'state': 'closed', 'labels': ['enhancement'], 'url': 'https://github.com/test/repo/issues/2'}
                        ],
                        'releases': [],
                        'repo_info': {}
                    }
                }
            ],
            'pdf': []
        }

        builder = UnifiedSkillBuilder(config, scraped_data)
        builder._generate_github_references(scraped_data['github'])

        issues_path = os.path.join(builder.skill_dir, 'references', 'github', 'test_repo', 'issues.md')
        self.assertTrue(os.path.exists(issues_path))

        with open(issues_path, 'r') as f:
            content = f.read()
            self.assertIn('Bug report', content)
            self.assertIn('Feature request', content)

    def test_creates_main_index_listing_all_repos(self):
        """Test that main index.md lists all GitHub repositories."""
        from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder

        config = {
            'name': 'test_github_index',
            'description': 'Test',
            'sources': []
        }

        scraped_data = {
            'documentation': [],
            'github': [
                {'repo': 'org/first', 'repo_id': 'org_first', 'data': {'readme': '#', 'issues': [], 'releases': [], 'repo_info': {'stars': 100}}},
                {'repo': 'org/second', 'repo_id': 'org_second', 'data': {'readme': '#', 'issues': [], 'releases': [], 'repo_info': {'stars': 50}}}
            ],
            'pdf': []
        }

        builder = UnifiedSkillBuilder(config, scraped_data)
        builder._generate_github_references(scraped_data['github'])

        main_index = os.path.join(builder.skill_dir, 'references', 'github', 'index.md')
        self.assertTrue(os.path.exists(main_index))

        with open(main_index, 'r') as f:
            content = f.read()
            self.assertIn('org/first', content)
            self.assertIn('org/second', content)
            self.assertIn('2 GitHub repositories', content)


class TestUnifiedSkillBuilderPdfReferences(unittest.TestCase):
    """Test PDF reference generation for multiple sources."""

    def setUp(self):
        """Create a temp dir and chdir into it so builder output is isolated."""
        self.temp_dir = tempfile.mkdtemp()
        self.original_dir = os.getcwd()
        os.chdir(self.temp_dir)

    def tearDown(self):
        """Restore cwd and remove the temp dir."""
        os.chdir(self.original_dir)
        if os.path.exists(self.temp_dir):
            shutil.rmtree(self.temp_dir)

    def test_creates_pdf_index_with_count(self):
        """Test that PDF index shows correct document count."""
        from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder

        config = {
            'name': 'test_pdf',
            'description': 'Test',
            'sources': []
        }

        scraped_data = {
            'documentation': [],
            'github': [],
            'pdf': [
                {'path': '/path/to/doc1.pdf'},
                {'path': '/path/to/doc2.pdf'},
                {'path': '/path/to/doc3.pdf'}
            ]
        }

        builder = UnifiedSkillBuilder(config, scraped_data)
        builder._generate_pdf_references(scraped_data['pdf'])

        pdf_index = os.path.join(builder.skill_dir, 'references', 'pdf', 'index.md')
        self.assertTrue(os.path.exists(pdf_index))

        with open(pdf_index, 'r') as f:
            content = f.read()
            self.assertIn('3 PDF document', content)


if __name__ == '__main__':
    unittest.main()