""" Tests for multi-source support in unified scraper and skill builder. Tests the following functionality: 1. Multiple sources of same type in unified_scraper (list structure) 2. Source counters and unique naming 3. Per-source reference directory generation in unified_skill_builder 4. Multiple documentation sources handling 5. Multiple GitHub repositories handling """ import os import shutil import tempfile import unittest class TestUnifiedScraperDataStructure(unittest.TestCase): """Test scraped_data list structure in unified_scraper.""" def test_scraped_data_uses_list_structure(self): """Test that scraped_data uses list for each source type.""" from skill_seekers.cli.unified_scraper import UnifiedScraper config = { "name": "test_multi", "description": "Test skill", "sources": [{"type": "documentation", "base_url": "https://example.com"}], } with tempfile.TemporaryDirectory() as temp_dir: original_dir = os.getcwd() try: os.chdir(temp_dir) scraper = UnifiedScraper(config) self.assertIsInstance(scraper.scraped_data["documentation"], list) self.assertIsInstance(scraper.scraped_data["github"], list) self.assertIsInstance(scraper.scraped_data["pdf"], list) finally: os.chdir(original_dir) def test_source_counters_initialized_to_zero(self): """Test that source counters start at zero.""" from skill_seekers.cli.unified_scraper import UnifiedScraper config = { "name": "test_counters", "description": "Test skill", "sources": [{"type": "documentation", "base_url": "https://example.com"}], } with tempfile.TemporaryDirectory() as temp_dir: original_dir = os.getcwd() try: os.chdir(temp_dir) scraper = UnifiedScraper(config) self.assertEqual(scraper._source_counters["documentation"], 0) self.assertEqual(scraper._source_counters["github"], 0) self.assertEqual(scraper._source_counters["pdf"], 0) finally: os.chdir(original_dir) def test_empty_lists_initially(self): """Test that source lists are empty initially.""" from skill_seekers.cli.unified_scraper import UnifiedScraper config = { "name": "test_empty", "description": "Test skill", "sources": [{"type": "documentation", "base_url": "https://example.com"}], } with tempfile.TemporaryDirectory() as temp_dir: original_dir = os.getcwd() try: os.chdir(temp_dir) scraper = UnifiedScraper(config) self.assertEqual(len(scraper.scraped_data["documentation"]), 0) self.assertEqual(len(scraper.scraped_data["github"]), 0) self.assertEqual(len(scraper.scraped_data["pdf"]), 0) finally: os.chdir(original_dir) class TestUnifiedSkillBuilderDocsReferences(unittest.TestCase): """Test documentation reference generation for multiple sources.""" def setUp(self): """Set up test fixtures.""" self.temp_dir = tempfile.mkdtemp() self.original_dir = os.getcwd() os.chdir(self.temp_dir) def tearDown(self): """Clean up test fixtures.""" os.chdir(self.original_dir) if os.path.exists(self.temp_dir): shutil.rmtree(self.temp_dir) def test_creates_subdirectory_per_source(self): """Test that each doc source gets its own subdirectory.""" from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder # Create mock refs directories refs_dir1 = os.path.join(self.temp_dir, "refs1") refs_dir2 = os.path.join(self.temp_dir, "refs2") os.makedirs(refs_dir1) os.makedirs(refs_dir2) config = {"name": "test_docs_refs", "description": "Test", "sources": []} scraped_data = { "documentation": [ { "source_id": "source_a", "base_url": "https://a.com", "total_pages": 5, "refs_dir": refs_dir1, }, { "source_id": "source_b", "base_url": "https://b.com", "total_pages": 3, "refs_dir": refs_dir2, }, ], "github": [], "pdf": [], } builder = UnifiedSkillBuilder(config, scraped_data) builder._generate_docs_references(scraped_data["documentation"]) docs_dir = os.path.join(builder.skill_dir, "references", "documentation") self.assertTrue(os.path.exists(os.path.join(docs_dir, "source_a"))) self.assertTrue(os.path.exists(os.path.join(docs_dir, "source_b"))) def test_creates_index_per_source(self): """Test that each source subdirectory has its own index.md.""" from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder refs_dir = os.path.join(self.temp_dir, "refs") os.makedirs(refs_dir) config = {"name": "test_source_index", "description": "Test", "sources": []} scraped_data = { "documentation": [ { "source_id": "my_source", "base_url": "https://example.com", "total_pages": 10, "refs_dir": refs_dir, } ], "github": [], "pdf": [], } builder = UnifiedSkillBuilder(config, scraped_data) builder._generate_docs_references(scraped_data["documentation"]) source_index = os.path.join( builder.skill_dir, "references", "documentation", "my_source", "index.md" ) self.assertTrue(os.path.exists(source_index)) with open(source_index) as f: content = f.read() self.assertIn("my_source", content) self.assertIn("https://example.com", content) def test_creates_main_index_listing_all_sources(self): """Test that main index.md lists all documentation sources.""" from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder refs_dir1 = os.path.join(self.temp_dir, "refs1") refs_dir2 = os.path.join(self.temp_dir, "refs2") os.makedirs(refs_dir1) os.makedirs(refs_dir2) config = {"name": "test_main_index", "description": "Test", "sources": []} scraped_data = { "documentation": [ { "source_id": "docs_one", "base_url": "https://one.com", "total_pages": 10, "refs_dir": refs_dir1, }, { "source_id": "docs_two", "base_url": "https://two.com", "total_pages": 20, "refs_dir": refs_dir2, }, ], "github": [], "pdf": [], } builder = UnifiedSkillBuilder(config, scraped_data) builder._generate_docs_references(scraped_data["documentation"]) main_index = os.path.join(builder.skill_dir, "references", "documentation", "index.md") self.assertTrue(os.path.exists(main_index)) with open(main_index) as f: content = f.read() self.assertIn("docs_one", content) self.assertIn("docs_two", content) self.assertIn("2 documentation sources", content) def test_copies_reference_files_to_source_dir(self): """Test that reference files are copied to source subdirectory.""" from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder refs_dir = os.path.join(self.temp_dir, "refs") os.makedirs(refs_dir) # Create mock reference files with open(os.path.join(refs_dir, "api.md"), "w") as f: f.write("# API Reference") with open(os.path.join(refs_dir, "guide.md"), "w") as f: f.write("# User Guide") config = {"name": "test_copy_refs", "description": "Test", "sources": []} scraped_data = { "documentation": [ { "source_id": "test_source", "base_url": "https://test.com", "total_pages": 5, "refs_dir": refs_dir, } ], "github": [], "pdf": [], } builder = UnifiedSkillBuilder(config, scraped_data) builder._generate_docs_references(scraped_data["documentation"]) source_dir = os.path.join(builder.skill_dir, "references", "documentation", "test_source") self.assertTrue(os.path.exists(os.path.join(source_dir, "api.md"))) self.assertTrue(os.path.exists(os.path.join(source_dir, "guide.md"))) class TestUnifiedSkillBuilderGitHubReferences(unittest.TestCase): """Test GitHub reference generation for multiple repositories.""" def setUp(self): """Set up test fixtures.""" self.temp_dir = tempfile.mkdtemp() self.original_dir = os.getcwd() os.chdir(self.temp_dir) def tearDown(self): """Clean up test fixtures.""" os.chdir(self.original_dir) if os.path.exists(self.temp_dir): shutil.rmtree(self.temp_dir) def test_creates_subdirectory_per_repo(self): """Test that each GitHub repo gets its own subdirectory.""" from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder config = {"name": "test_github_refs", "description": "Test", "sources": []} scraped_data = { "documentation": [], "github": [ { "repo": "org/repo1", "repo_id": "org_repo1", "data": {"readme": "# Repo 1", "issues": [], "releases": [], "repo_info": {}}, }, { "repo": "org/repo2", "repo_id": "org_repo2", "data": {"readme": "# Repo 2", "issues": [], "releases": [], "repo_info": {}}, }, ], "pdf": [], } builder = UnifiedSkillBuilder(config, scraped_data) builder._generate_github_references(scraped_data["github"]) github_dir = os.path.join(builder.skill_dir, "references", "github") self.assertTrue(os.path.exists(os.path.join(github_dir, "org_repo1"))) self.assertTrue(os.path.exists(os.path.join(github_dir, "org_repo2"))) def test_creates_readme_per_repo(self): """Test that README.md is created for each repo.""" from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder config = {"name": "test_readme", "description": "Test", "sources": []} scraped_data = { "documentation": [], "github": [ { "repo": "test/myrepo", "repo_id": "test_myrepo", "data": { "readme": "# My Repository\n\nDescription here.", "issues": [], "releases": [], "repo_info": {}, }, } ], "pdf": [], } builder = UnifiedSkillBuilder(config, scraped_data) builder._generate_github_references(scraped_data["github"]) readme_path = os.path.join( builder.skill_dir, "references", "github", "test_myrepo", "README.md" ) self.assertTrue(os.path.exists(readme_path)) with open(readme_path) as f: content = f.read() self.assertIn("test/myrepo", content) def test_creates_issues_file_when_issues_exist(self): """Test that issues.md is created when repo has issues.""" from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder config = {"name": "test_issues", "description": "Test", "sources": []} scraped_data = { "documentation": [], "github": [ { "repo": "test/repo", "repo_id": "test_repo", "data": { "readme": "# Repo", "issues": [ { "number": 1, "title": "Bug report", "state": "open", "labels": ["bug"], "url": "https://github.com/test/repo/issues/1", }, { "number": 2, "title": "Feature request", "state": "closed", "labels": ["enhancement"], "url": "https://github.com/test/repo/issues/2", }, ], "releases": [], "repo_info": {}, }, } ], "pdf": [], } builder = UnifiedSkillBuilder(config, scraped_data) builder._generate_github_references(scraped_data["github"]) issues_path = os.path.join( builder.skill_dir, "references", "github", "test_repo", "issues.md" ) self.assertTrue(os.path.exists(issues_path)) with open(issues_path) as f: content = f.read() self.assertIn("Bug report", content) self.assertIn("Feature request", content) def test_creates_main_index_listing_all_repos(self): """Test that main index.md lists all GitHub repositories.""" from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder config = {"name": "test_github_index", "description": "Test", "sources": []} scraped_data = { "documentation": [], "github": [ { "repo": "org/first", "repo_id": "org_first", "data": { "readme": "#", "issues": [], "releases": [], "repo_info": {"stars": 100}, }, }, { "repo": "org/second", "repo_id": "org_second", "data": { "readme": "#", "issues": [], "releases": [], "repo_info": {"stars": 50}, }, }, ], "pdf": [], } builder = UnifiedSkillBuilder(config, scraped_data) builder._generate_github_references(scraped_data["github"]) main_index = os.path.join(builder.skill_dir, "references", "github", "index.md") self.assertTrue(os.path.exists(main_index)) with open(main_index) as f: content = f.read() self.assertIn("org/first", content) self.assertIn("org/second", content) self.assertIn("2 GitHub repositories", content) class TestUnifiedSkillBuilderPdfReferences(unittest.TestCase): """Test PDF reference generation for multiple sources.""" def setUp(self): """Set up test fixtures.""" self.temp_dir = tempfile.mkdtemp() self.original_dir = os.getcwd() os.chdir(self.temp_dir) def tearDown(self): """Clean up test fixtures.""" os.chdir(self.original_dir) if os.path.exists(self.temp_dir): shutil.rmtree(self.temp_dir) def test_creates_pdf_index_with_count(self): """Test that PDF index shows correct document count.""" from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder config = {"name": "test_pdf", "description": "Test", "sources": []} scraped_data = { "documentation": [], "github": [], "pdf": [ {"path": "/path/to/doc1.pdf"}, {"path": "/path/to/doc2.pdf"}, {"path": "/path/to/doc3.pdf"}, ], } builder = UnifiedSkillBuilder(config, scraped_data) builder._generate_pdf_references(scraped_data["pdf"]) pdf_index = os.path.join(builder.skill_dir, "references", "pdf", "index.md") self.assertTrue(os.path.exists(pdf_index)) with open(pdf_index) as f: content = f.read() self.assertIn("3 PDF document", content) if __name__ == "__main__": unittest.main()