test: add unit tests for markdown parsing and multi-source features

- Add test_markdown_parsing.py with 20 tests covering:
  - Markdown content extraction (titles, headings, code blocks, links)
  - HTML fallback when .md URL returns HTML
  - llms.txt URL extraction and cleaning
  - Empty/short content filtering
- Add test_multi_source.py with 12 tests covering:
  - List-based scraped_data structure
  - Per-source subdirectory generation for docs/github/pdf
  - Index file generation for each source type

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2026-01-05 22:13:19 +08:00
parent 8cf43582a4
commit 4b764ed1c5
2 changed files with 792 additions and 0 deletions
--- a/tests/test_markdown_parsing.py
+++ b/tests/test_markdown_parsing.py
@@ -0,0 +1,359 @@
+"""
+Tests for Markdown parsing, llms.txt URL extraction, and save_page content filtering.
+
+Tests the following functionality:
+1. Markdown file content extraction (_extract_markdown_content)
+2. HTML fallback when .md URL returns HTML (_extract_html_as_markdown)
+3. URL extraction from llms.txt (extract_urls, _clean_url)
+4. Empty/short content filtering in save_page
+"""
+
+import unittest
+import tempfile
+import os
+import shutil
+
+
+class TestMarkdownContentExtraction(unittest.TestCase):
+    """Exercise DocToSkillConverter._extract_markdown_content."""
+
+    def setUp(self):
+        """Create a converter from a minimal config with empty selectors and patterns."""
+        from skill_seekers.cli.doc_scraper import DocToSkillConverter
+
+        self.config = {
+            'name': 'test_md_parsing',
+            'base_url': 'https://example.com',
+            'selectors': {},
+            'url_patterns': {'include': [], 'exclude': []},
+            'categories': {}
+        }
+        self.converter = DocToSkillConverter(self.config)
+
+    def tearDown(self):
+        """Delete the output/<name>_data directory when it exists."""
+        output_dir = f"output/{self.config['name']}_data"
+        if os.path.exists(output_dir):
+            shutil.rmtree(output_dir)
+
+    def test_extract_title_from_h1(self):
+        """Test that the text of the first h1 becomes the page title."""
+        content = "# My Documentation Title\n\nSome content here."
+        result = self.converter._extract_markdown_content(content, "https://example.com/test.md")
+        self.assertEqual(result['title'], "My Documentation Title")
+
+    def test_extract_headings_h2_to_h6(self):
+        """Test extracting h2-h6 headings (not h1)."""
+        content = """# Title
+
+## Section One
+### Subsection A
+#### Deep Section
+##### Deeper
+###### Deepest
+
+Content here.
+"""
+        result = self.converter._extract_markdown_content(content, "https://example.com/test.md")
+        # Should have 5 headings (h2-h6), not h1
+        self.assertEqual(len(result['headings']), 5)
+        self.assertEqual(result['headings'][0]['level'], 'h2')
+        self.assertEqual(result['headings'][0]['text'], 'Section One')
+
+    def test_extract_code_blocks_with_language(self):
+        """Test that fenced blocks keep their language tag, or 'unknown' without one."""
+        content = """# API Guide
+
+```python
+def hello():
+    return "Hello, World!"
+```
+
+Some explanation.
+
+```javascript
+const greet = () => console.log("Hi");
+```
+
+```
+plain code without language
+```
+"""
+        result = self.converter._extract_markdown_content(content, "https://example.com/test.md")
+        self.assertEqual(len(result['code_samples']), 3)
+        self.assertEqual(result['code_samples'][0]['language'], 'python')
+        self.assertEqual(result['code_samples'][1]['language'], 'javascript')
+        self.assertEqual(result['code_samples'][2]['language'], 'unknown')
+
+    def test_extract_markdown_links_only_md_files(self):
+        """Test that only .md links are extracted."""
+        content = """# Links
+
+- [Markdown Doc](./guide.md)
+- [Another MD](https://example.com/api.md)
+- [HTML Page](./page.html)
+- [External](https://google.com)
+"""
+        result = self.converter._extract_markdown_content(content, "https://example.com/docs/test.md")
+        # Collect the offenders so a failure names the links that lack '.md'
+        non_md_links = [link for link in result['links'] if '.md' not in link]
+        self.assertEqual(non_md_links, [])
+
+    def test_extract_content_paragraphs(self):
+        """Test that long paragraphs are kept and the very short one is dropped."""
+        content = """# Title
+
+This is a paragraph with enough content to pass the minimum length filter.
+
+Short.
+
+Another paragraph that should be included in the final content output.
+"""
+        result = self.converter._extract_markdown_content(content, "https://example.com/test.md")
+        self.assertIn("paragraph with enough content", result['content'])
+        self.assertNotIn("Short.", result['content'])
+
+    def test_detect_html_in_md_url(self):
+        """Test that an HTML document served at a .md URL yields the <title> text as title."""
+        html_content = "<!DOCTYPE html><html><head><title>Page</title></head><body><h1>Hello</h1></body></html>"
+        result = self.converter._extract_markdown_content(html_content, "https://example.com/test.md")
+        self.assertEqual(result['title'], "Page")
+
+
+class TestHtmlAsMarkdownExtraction(unittest.TestCase):
+    """Exercise DocToSkillConverter._extract_html_as_markdown on small HTML documents."""
+
+    def setUp(self):
+        """Create a converter from a minimal config with empty selectors and patterns."""
+        from skill_seekers.cli.doc_scraper import DocToSkillConverter
+
+        self.config = {
+            'name': 'test_html_fallback',
+            'base_url': 'https://example.com',
+            'selectors': {},
+            'url_patterns': {'include': [], 'exclude': []},
+            'categories': {}
+        }
+        self.converter = DocToSkillConverter(self.config)
+
+    def tearDown(self):
+        """Delete the output/<name>_data directory when it exists."""
+        output_dir = f"output/{self.config['name']}_data"
+        if os.path.exists(output_dir):
+            shutil.rmtree(output_dir)
+
+    def test_extract_title_from_html(self):
+        """Test extracting title from HTML title tag."""
+        html = "<html><head><title>My Page Title</title></head><body></body></html>"
+        result = self.converter._extract_html_as_markdown(html, "https://example.com/test.md")
+        self.assertEqual(result['title'], "My Page Title")
+
+    def test_find_main_content_area(self):
+        """Test that text inside <main> ends up in the extracted content."""
+        html = """
+        <html><body>
+            <nav>Navigation</nav>
+            <main>
+                <h1>Main Content</h1>
+                <p>This is the main content area with enough text to pass filters.</p>
+            </main>
+            <footer>Footer</footer>
+        </body></html>
+        """
+        result = self.converter._extract_html_as_markdown(html, "https://example.com/test.md")
+        self.assertIn("main content area", result['content'].lower())
+
+    def test_extract_code_blocks_from_html(self):
+        """Test extracting code blocks from HTML pre/code tags."""
+        html = """
+        <html><body>
+            <main>
+                <pre><code class="language-python">print("hello")</code></pre>
+            </main>
+        </body></html>
+        """
+        result = self.converter._extract_html_as_markdown(html, "https://example.com/test.md")
+        self.assertGreater(len(result['code_samples']), 0)
+
+    def test_fallback_to_body_when_no_main(self):
+        """Test that headings or content are still extracted without a <main> element."""
+        html = """
+        <html><body>
+            <div>
+                <h2>Section</h2>
+                <p>Content in body without main element, long enough to pass filter.</p>
+            </div>
+        </body></html>
+        """
+        result = self.converter._extract_html_as_markdown(html, "https://example.com/test.md")
+        self.assertGreater(len(result['headings']) + len(result['content']), 0)
+
+
+class TestLlmsTxtUrlExtraction(unittest.TestCase):
+    """Checks for LlmsTxtParser.extract_urls and LlmsTxtParser._clean_url."""
+
+    def test_extract_markdown_style_links(self):
+        """URLs written as [text](url) are returned by extract_urls."""
+        from skill_seekers.cli.llms_txt_parser import LlmsTxtParser
+
+        llms_text = """
+# Documentation Index
+
+- [Getting Started](https://docs.example.com/start.md)
+- [API Reference](https://docs.example.com/api/index.md)
+- [Advanced Guide](https://docs.example.com/advanced.md)
+"""
+        llms_parser = LlmsTxtParser(llms_text, base_url="https://docs.example.com")
+        extracted = llms_parser.extract_urls()
+
+        # All three linked pages must show up with their full URL
+        for page in ("start.md", "api/index.md", "advanced.md"):
+            self.assertIn(f"https://docs.example.com/{page}", extracted)
+
+    def test_extract_bare_urls(self):
+        """Plain URLs with no Markdown link syntax are also picked up."""
+        from skill_seekers.cli.llms_txt_parser import LlmsTxtParser
+
+        llms_text = """
+Documentation: https://example.com/docs/guide.md
+API: https://example.com/api/reference.md
+"""
+        extracted = LlmsTxtParser(llms_text).extract_urls()
+        guide_url = "https://example.com/docs/guide.md"
+
+        self.assertIn(guide_url, extracted)
+        self.assertIn("https://example.com/api/reference.md", extracted)
+
+    def test_resolve_relative_urls(self):
+        """Relative links yield a URL containing their path when base_url is set."""
+        from skill_seekers.cli.llms_txt_parser import LlmsTxtParser
+
+        llms_text = """
+- [Local Doc](./docs/guide.md)
+- [Parent](../api/ref.md)
+"""
+        extracted = LlmsTxtParser(llms_text, base_url="https://example.com/learn/").extract_urls()
+
+        # Only checks that some URL still carries the relative target path
+        resolved = [url for url in extracted if "docs/guide.md" in url]
+        self.assertTrue(resolved)
+
+    def test_clean_url_invalid_anchor_pattern(self):
+        """A fragment that contains a path is stripped by _clean_url."""
+        from skill_seekers.cli.llms_txt_parser import LlmsTxtParser
+
+        llms_parser = LlmsTxtParser("", base_url="https://example.com")
+
+        # Malformed: a path appears after the '#'
+        malformed = "https://example.com/page#section/index.html.md"
+        self.assertEqual(llms_parser._clean_url(malformed), "https://example.com/page")
+
+    def test_clean_url_valid_anchor(self):
+        """A plain '#section' fragment survives _clean_url untouched."""
+        from skill_seekers.cli.llms_txt_parser import LlmsTxtParser
+
+        llms_parser = LlmsTxtParser("", base_url="https://example.com")
+
+        # Well-formed fragment: input and output must match
+        anchored = "https://example.com/page.md#section"
+        self.assertEqual(llms_parser._clean_url(anchored), anchored)
+
+    def test_clean_url_no_anchor(self):
+        """A URL without any fragment passes through _clean_url as-is."""
+        from skill_seekers.cli.llms_txt_parser import LlmsTxtParser
+
+        llms_parser = LlmsTxtParser("", base_url="https://example.com")
+
+        plain = "https://example.com/docs/guide.md"
+        self.assertEqual(llms_parser._clean_url(plain), plain)
+
+    def test_deduplicate_urls(self):
+        """The same URL mentioned several times is returned only once."""
+        from skill_seekers.cli.llms_txt_parser import LlmsTxtParser
+
+        llms_text = """
+- [Doc 1](https://example.com/doc.md)
+- [Doc 2](https://example.com/doc.md)
+https://example.com/doc.md
+"""
+        llms_parser = LlmsTxtParser(llms_text)
+        extracted = llms_parser.extract_urls()
+
+        # Three mentions in the input, exactly one entry in the output
+        repeats = [url for url in extracted if url == "https://example.com/doc.md"]
+        self.assertEqual(len(repeats), 1)
+
+
+class TestSavePageContentFiltering(unittest.TestCase):
+    """Checks which pages DocToSkillConverter.save_page writes to disk."""
+
+    def setUp(self):
+        """Instantiate a DocToSkillConverter from a bare-bones config."""
+        from skill_seekers.cli.doc_scraper import DocToSkillConverter
+
+        self.config = dict(
+            name='test_save_filter',
+            base_url='https://example.com',
+            selectors={},
+            url_patterns={'include': [], 'exclude': []},
+            categories={},
+        )
+        self.converter = DocToSkillConverter(self.config)
+
+    def tearDown(self):
+        """Delete this test's output data directory when it exists."""
+        leftover = "output/{}_data".format(self.config['name'])
+        if os.path.exists(leftover):
+            shutil.rmtree(leftover)
+
+    def test_skip_empty_content(self):
+        """An empty 'content' field must not produce a saved page."""
+        empty_page = dict(
+            url='https://example.com/empty',
+            title='Empty Page',
+            content='',
+            headings=[],
+            code_samples=[],
+        )
+
+        self.converter.save_page(empty_page)
+
+        pages_dir = os.path.join(self.converter.data_dir, 'pages')
+        saved = os.listdir(pages_dir) if os.path.exists(pages_dir) else []
+        self.assertEqual(saved, [])
+
+    def test_skip_short_content_under_50_chars(self):
+        """An 18-character 'content' field must not produce a saved page."""
+        short_page = dict(
+            url='https://example.com/short',
+            title='Short',
+            content='This is too short.',  # 18 chars
+            headings=[],
+            code_samples=[],
+        )
+
+        self.converter.save_page(short_page)
+
+        pages_dir = os.path.join(self.converter.data_dir, 'pages')
+        saved = os.listdir(pages_dir) if os.path.exists(pages_dir) else []
+        self.assertEqual(saved, [])
+
+    def test_save_content_over_50_chars(self):
+        """A 60-character 'content' field results in exactly one saved page."""
+        valid_page = dict(
+            url='https://example.com/valid',
+            title='Valid Page',
+            content='A' * 60,  # 60 chars, should pass
+            headings=[],
+            code_samples=[],
+        )
+
+        self.converter.save_page(valid_page)
+
+        pages_dir = os.path.join(self.converter.data_dir, 'pages')
+        self.assertTrue(os.path.exists(pages_dir))
+        self.assertEqual(len(os.listdir(pages_dir)), 1)
+
+
+if __name__ == '__main__':  # run the suite when this file is executed directly
+    unittest.main()
--- a/tests/test_multi_source.py
+++ b/tests/test_multi_source.py
@@ -0,0 +1,433 @@
+"""
+Tests for multi-source support in unified scraper and skill builder.
+
+Tests the following functionality:
+1. Multiple sources of same type in unified_scraper (list structure)
+2. Source counters and unique naming
+3. Per-source reference directory generation in unified_skill_builder
+4. Multiple documentation sources handling
+5. Multiple GitHub repositories handling
+"""
+
+import unittest
+import tempfile
+import os
+import shutil
+
+
+class TestUnifiedScraperDataStructure(unittest.TestCase):
+    """Checks the initial state of a freshly constructed UnifiedScraper."""
+
+    def test_scraped_data_uses_list_structure(self):
+        """scraped_data holds a list under each of the three source types."""
+        from skill_seekers.cli.unified_scraper import UnifiedScraper
+
+        settings = dict(
+            name='test_multi',
+            description='Test skill',
+            sources=[
+                {'type': 'documentation', 'base_url': 'https://example.com'}
+            ],
+        )
+
+        with tempfile.TemporaryDirectory() as workdir:
+            previous_cwd = os.getcwd()
+            try:
+                os.chdir(workdir)
+                scraper = UnifiedScraper(settings)
+
+                # Same check for each of the three source types
+                for source_type in ('documentation', 'github', 'pdf'):
+                    self.assertIsInstance(scraper.scraped_data[source_type], list)
+            finally:
+                os.chdir(previous_cwd)
+
+    def test_source_counters_initialized_to_zero(self):
+        """_source_counters reads 0 for each of the three source types."""
+        from skill_seekers.cli.unified_scraper import UnifiedScraper
+
+        settings = dict(
+            name='test_counters',
+            description='Test skill',
+            sources=[
+                {'type': 'documentation', 'base_url': 'https://example.com'}
+            ],
+        )
+
+        with tempfile.TemporaryDirectory() as workdir:
+            previous_cwd = os.getcwd()
+            try:
+                os.chdir(workdir)
+                scraper = UnifiedScraper(settings)
+
+                # Same check for each of the three source types
+                for source_type in ('documentation', 'github', 'pdf'):
+                    self.assertEqual(scraper._source_counters[source_type], 0)
+            finally:
+                os.chdir(previous_cwd)
+
+    def test_empty_lists_initially(self):
+        """scraped_data has no entries under any source type right after construction."""
+        from skill_seekers.cli.unified_scraper import UnifiedScraper
+
+        settings = dict(
+            name='test_empty',
+            description='Test skill',
+            sources=[
+                {'type': 'documentation', 'base_url': 'https://example.com'}
+            ],
+        )
+
+        with tempfile.TemporaryDirectory() as workdir:
+            previous_cwd = os.getcwd()
+            try:
+                os.chdir(workdir)
+                scraper = UnifiedScraper(settings)
+
+                # Same check for each of the three source types
+                for source_type in ('documentation', 'github', 'pdf'):
+                    self.assertEqual(len(scraper.scraped_data[source_type]), 0)
+            finally:
+                os.chdir(previous_cwd)
+
+
+class TestUnifiedSkillBuilderDocsReferences(unittest.TestCase):
+    """Exercise UnifiedSkillBuilder._generate_docs_references with one or more sources."""
+
+    def setUp(self):
+        """Create a temporary directory and make it the working directory."""
+        self.temp_dir = tempfile.mkdtemp()
+        self.original_dir = os.getcwd()
+        os.chdir(self.temp_dir)
+
+    def tearDown(self):
+        """Restore the original working directory, then delete the temporary one."""
+        os.chdir(self.original_dir)
+        if os.path.exists(self.temp_dir):
+            shutil.rmtree(self.temp_dir)
+
+    def test_creates_subdirectory_per_source(self):
+        """Test that each doc source gets its own subdirectory."""
+        from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder
+
+        # Create mock refs directories
+        refs_dir1 = os.path.join(self.temp_dir, 'refs1')
+        refs_dir2 = os.path.join(self.temp_dir, 'refs2')
+        os.makedirs(refs_dir1)
+        os.makedirs(refs_dir2)
+
+        config = {
+            'name': 'test_docs_refs',
+            'description': 'Test',
+            'sources': []
+        }
+
+        scraped_data = {
+            'documentation': [
+                {'source_id': 'source_a', 'base_url': 'https://a.com', 'total_pages': 5, 'refs_dir': refs_dir1},
+                {'source_id': 'source_b', 'base_url': 'https://b.com', 'total_pages': 3, 'refs_dir': refs_dir2}
+            ],
+            'github': [],
+            'pdf': []
+        }
+
+        builder = UnifiedSkillBuilder(config, scraped_data)
+        builder._generate_docs_references(scraped_data['documentation'])
+
+        docs_dir = os.path.join(builder.skill_dir, 'references', 'documentation')
+        self.assertTrue(os.path.exists(os.path.join(docs_dir, 'source_a')))
+        self.assertTrue(os.path.exists(os.path.join(docs_dir, 'source_b')))
+
+    def test_creates_index_per_source(self):
+        """Test that each source subdirectory has its own index.md."""
+        from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder
+
+        refs_dir = os.path.join(self.temp_dir, 'refs')
+        os.makedirs(refs_dir)
+
+        config = {
+            'name': 'test_source_index',
+            'description': 'Test',
+            'sources': []
+        }
+
+        scraped_data = {
+            'documentation': [
+                {'source_id': 'my_source', 'base_url': 'https://example.com', 'total_pages': 10, 'refs_dir': refs_dir}
+            ],
+            'github': [],
+            'pdf': []
+        }
+
+        builder = UnifiedSkillBuilder(config, scraped_data)
+        builder._generate_docs_references(scraped_data['documentation'])
+
+        source_index = os.path.join(builder.skill_dir, 'references', 'documentation', 'my_source', 'index.md')
+        self.assertTrue(os.path.exists(source_index))
+
+        with open(source_index, 'r', encoding='utf-8') as f:
+            content = f.read()
+        self.assertIn('my_source', content)
+        self.assertIn('https://example.com', content)
+
+    def test_creates_main_index_listing_all_sources(self):
+        """Test that main index.md lists all documentation sources."""
+        from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder
+
+        refs_dir1 = os.path.join(self.temp_dir, 'refs1')
+        refs_dir2 = os.path.join(self.temp_dir, 'refs2')
+        os.makedirs(refs_dir1)
+        os.makedirs(refs_dir2)
+
+        config = {
+            'name': 'test_main_index',
+            'description': 'Test',
+            'sources': []
+        }
+
+        scraped_data = {
+            'documentation': [
+                {'source_id': 'docs_one', 'base_url': 'https://one.com', 'total_pages': 10, 'refs_dir': refs_dir1},
+                {'source_id': 'docs_two', 'base_url': 'https://two.com', 'total_pages': 20, 'refs_dir': refs_dir2}
+            ],
+            'github': [],
+            'pdf': []
+        }
+
+        builder = UnifiedSkillBuilder(config, scraped_data)
+        builder._generate_docs_references(scraped_data['documentation'])
+
+        main_index = os.path.join(builder.skill_dir, 'references', 'documentation', 'index.md')
+        self.assertTrue(os.path.exists(main_index))
+
+        with open(main_index, 'r', encoding='utf-8') as f:
+            content = f.read()
+        self.assertIn('docs_one', content)
+        self.assertIn('docs_two', content)
+        self.assertIn('2 documentation sources', content)
+
+    def test_copies_reference_files_to_source_dir(self):
+        """Test that reference files are copied to source subdirectory."""
+        from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder
+
+        refs_dir = os.path.join(self.temp_dir, 'refs')
+        os.makedirs(refs_dir)
+
+        # Create mock reference files
+        with open(os.path.join(refs_dir, 'api.md'), 'w', encoding='utf-8') as f:
+            f.write('# API Reference')
+        with open(os.path.join(refs_dir, 'guide.md'), 'w', encoding='utf-8') as f:
+            f.write('# User Guide')
+
+        config = {
+            'name': 'test_copy_refs',
+            'description': 'Test',
+            'sources': []
+        }
+
+        scraped_data = {
+            'documentation': [
+                {'source_id': 'test_source', 'base_url': 'https://test.com', 'total_pages': 5, 'refs_dir': refs_dir}
+            ],
+            'github': [],
+            'pdf': []
+        }
+
+        builder = UnifiedSkillBuilder(config, scraped_data)
+        builder._generate_docs_references(scraped_data['documentation'])
+
+        source_dir = os.path.join(builder.skill_dir, 'references', 'documentation', 'test_source')
+        self.assertTrue(os.path.exists(os.path.join(source_dir, 'api.md')))
+        self.assertTrue(os.path.exists(os.path.join(source_dir, 'guide.md')))
+
+
+class TestUnifiedSkillBuilderGitHubReferences(unittest.TestCase):
+    """Exercise UnifiedSkillBuilder._generate_github_references with one or more repos."""
+
+    def setUp(self):
+        """Create a temporary directory and make it the working directory."""
+        self.temp_dir = tempfile.mkdtemp()
+        self.original_dir = os.getcwd()
+        os.chdir(self.temp_dir)
+
+    def tearDown(self):
+        """Restore the original working directory, then delete the temporary one."""
+        os.chdir(self.original_dir)
+        if os.path.exists(self.temp_dir):
+            shutil.rmtree(self.temp_dir)
+
+    def test_creates_subdirectory_per_repo(self):
+        """Test that each GitHub repo gets its own subdirectory."""
+        from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder
+
+        config = {
+            'name': 'test_github_refs',
+            'description': 'Test',
+            'sources': []
+        }
+
+        scraped_data = {
+            'documentation': [],
+            'github': [
+                {'repo': 'org/repo1', 'repo_id': 'org_repo1', 'data': {'readme': '# Repo 1', 'issues': [], 'releases': [], 'repo_info': {}}},
+                {'repo': 'org/repo2', 'repo_id': 'org_repo2', 'data': {'readme': '# Repo 2', 'issues': [], 'releases': [], 'repo_info': {}}}
+            ],
+            'pdf': []
+        }
+
+        builder = UnifiedSkillBuilder(config, scraped_data)
+        builder._generate_github_references(scraped_data['github'])
+
+        github_dir = os.path.join(builder.skill_dir, 'references', 'github')
+        self.assertTrue(os.path.exists(os.path.join(github_dir, 'org_repo1')))
+        self.assertTrue(os.path.exists(os.path.join(github_dir, 'org_repo2')))
+
+    def test_creates_readme_per_repo(self):
+        """Test that README.md is created for each repo."""
+        from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder
+
+        config = {
+            'name': 'test_readme',
+            'description': 'Test',
+            'sources': []
+        }
+
+        scraped_data = {
+            'documentation': [],
+            'github': [
+                {'repo': 'test/myrepo', 'repo_id': 'test_myrepo', 'data': {'readme': '# My Repository\n\nDescription here.', 'issues': [], 'releases': [], 'repo_info': {}}}
+            ],
+            'pdf': []
+        }
+
+        builder = UnifiedSkillBuilder(config, scraped_data)
+        builder._generate_github_references(scraped_data['github'])
+
+        readme_path = os.path.join(builder.skill_dir, 'references', 'github', 'test_myrepo', 'README.md')
+        self.assertTrue(os.path.exists(readme_path))
+
+        with open(readme_path, 'r', encoding='utf-8') as f:
+            content = f.read()
+        self.assertIn('test/myrepo', content)
+
+    def test_creates_issues_file_when_issues_exist(self):
+        """Test that issues.md is created when repo has issues."""
+        from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder
+
+        config = {
+            'name': 'test_issues',
+            'description': 'Test',
+            'sources': []
+        }
+
+        scraped_data = {
+            'documentation': [],
+            'github': [
+                {
+                    'repo': 'test/repo',
+                    'repo_id': 'test_repo',
+                    'data': {
+                        'readme': '# Repo',
+                        'issues': [
+                            {'number': 1, 'title': 'Bug report', 'state': 'open', 'labels': ['bug'], 'url': 'https://github.com/test/repo/issues/1'},
+                            {'number': 2, 'title': 'Feature request', 'state': 'closed', 'labels': ['enhancement'], 'url': 'https://github.com/test/repo/issues/2'}
+                        ],
+                        'releases': [],
+                        'repo_info': {}
+                    }
+                }
+            ],
+            'pdf': []
+        }
+
+        builder = UnifiedSkillBuilder(config, scraped_data)
+        builder._generate_github_references(scraped_data['github'])
+
+        issues_path = os.path.join(builder.skill_dir, 'references', 'github', 'test_repo', 'issues.md')
+        self.assertTrue(os.path.exists(issues_path))
+
+        with open(issues_path, 'r', encoding='utf-8') as f:
+            content = f.read()
+        self.assertIn('Bug report', content)
+        self.assertIn('Feature request', content)
+
+    def test_creates_main_index_listing_all_repos(self):
+        """Test that main index.md lists all GitHub repositories."""
+        from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder
+
+        config = {
+            'name': 'test_github_index',
+            'description': 'Test',
+            'sources': []
+        }
+
+        scraped_data = {
+            'documentation': [],
+            'github': [
+                {'repo': 'org/first', 'repo_id': 'org_first', 'data': {'readme': '#', 'issues': [], 'releases': [], 'repo_info': {'stars': 100}}},
+                {'repo': 'org/second', 'repo_id': 'org_second', 'data': {'readme': '#', 'issues': [], 'releases': [], 'repo_info': {'stars': 50}}}
+            ],
+            'pdf': []
+        }
+
+        builder = UnifiedSkillBuilder(config, scraped_data)
+        builder._generate_github_references(scraped_data['github'])
+
+        main_index = os.path.join(builder.skill_dir, 'references', 'github', 'index.md')
+        self.assertTrue(os.path.exists(main_index))
+
+        with open(main_index, 'r', encoding='utf-8') as f:
+            content = f.read()
+        self.assertIn('org/first', content)
+        self.assertIn('org/second', content)
+        self.assertIn('2 GitHub repositories', content)
+
+
+class TestUnifiedSkillBuilderPdfReferences(unittest.TestCase):
+    """Exercise UnifiedSkillBuilder._generate_pdf_references with several PDF entries."""
+
+    def setUp(self):
+        """Create a temporary directory and make it the working directory."""
+        self.temp_dir = tempfile.mkdtemp()
+        self.original_dir = os.getcwd()
+        os.chdir(self.temp_dir)
+
+    def tearDown(self):
+        """Restore the original working directory, then delete the temporary one."""
+        os.chdir(self.original_dir)
+        if os.path.exists(self.temp_dir):
+            shutil.rmtree(self.temp_dir)
+
+    def test_creates_pdf_index_with_count(self):
+        """Test that PDF index shows correct document count."""
+        from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder
+
+        config = {
+            'name': 'test_pdf',
+            'description': 'Test',
+            'sources': []
+        }
+
+        scraped_data = {
+            'documentation': [],
+            'github': [],
+            'pdf': [
+                {'path': '/path/to/doc1.pdf'},
+                {'path': '/path/to/doc2.pdf'},
+                {'path': '/path/to/doc3.pdf'}
+            ]
+        }
+
+        builder = UnifiedSkillBuilder(config, scraped_data)
+        builder._generate_pdf_references(scraped_data['pdf'])
+
+        pdf_index = os.path.join(builder.skill_dir, 'references', 'pdf', 'index.md')
+        self.assertTrue(os.path.exists(pdf_index))
+
+        with open(pdf_index, 'r', encoding='utf-8') as f:
+            content = f.read()
+        self.assertIn('3 PDF document', content)
+
+
+if __name__ == '__main__':  # run the suite when this file is executed directly
+    unittest.main()