"""Tests for configurable directory exclusions in GitHub scraper. Tests Issue #203: Make EXCLUDED_DIRS configurable """ import unittest from unittest.mock import patch, Mock from skill_seekers.cli.github_scraper import GitHubScraper, EXCLUDED_DIRS class TestExcludedDirsDefaults(unittest.TestCase): """Test default EXCLUDED_DIRS behavior (backward compatibility).""" @patch('skill_seekers.cli.github_scraper.Github') def test_defaults_when_no_config(self, mock_github): """Test that default exclusions are used when no config provided.""" config = { 'repo': 'owner/repo' } scraper = GitHubScraper(config) # Should use default EXCLUDED_DIRS self.assertEqual(scraper.excluded_dirs, EXCLUDED_DIRS) @patch('skill_seekers.cli.github_scraper.Github') def test_defaults_exclude_common_dirs(self, mock_github): """Test that default exclusions work correctly.""" config = { 'repo': 'owner/repo' } scraper = GitHubScraper(config) # Test common directories are excluded self.assertTrue(scraper.should_exclude_dir('venv')) self.assertTrue(scraper.should_exclude_dir('node_modules')) self.assertTrue(scraper.should_exclude_dir('__pycache__')) self.assertTrue(scraper.should_exclude_dir('.git')) self.assertTrue(scraper.should_exclude_dir('build')) # Test normal directories are not excluded self.assertFalse(scraper.should_exclude_dir('src')) self.assertFalse(scraper.should_exclude_dir('tests')) self.assertFalse(scraper.should_exclude_dir('docs')) @patch('skill_seekers.cli.github_scraper.Github') def test_dot_directories_always_excluded(self, mock_github): """Test that directories starting with '.' are always excluded.""" config = { 'repo': 'owner/repo' } scraper = GitHubScraper(config) # Dot directories should be excluded (even if not in EXCLUDED_DIRS) self.assertTrue(scraper.should_exclude_dir('.hidden')) self.assertTrue(scraper.should_exclude_dir('.cache')) self.assertTrue(scraper.should_exclude_dir('.vscode')) class TestExcludedDirsAdditional(unittest.TestCase): """Test exclude_dirs_additional (extend mode).""" @patch('skill_seekers.cli.github_scraper.Github') def test_extend_with_additional_dirs(self, mock_github): """Test adding custom exclusions to defaults.""" config = { 'repo': 'owner/repo', 'exclude_dirs_additional': ['proprietary', 'vendor', 'third_party'] } scraper = GitHubScraper(config) # Should include both defaults and additional self.assertIn('venv', scraper.excluded_dirs) # Default self.assertIn('node_modules', scraper.excluded_dirs) # Default self.assertIn('proprietary', scraper.excluded_dirs) # Additional self.assertIn('vendor', scraper.excluded_dirs) # Additional self.assertIn('third_party', scraper.excluded_dirs) # Additional # Verify total count self.assertEqual( len(scraper.excluded_dirs), len(EXCLUDED_DIRS) + 3 ) @patch('skill_seekers.cli.github_scraper.Github') def test_extend_excludes_additional_dirs(self, mock_github): """Test that additional directories are actually excluded.""" config = { 'repo': 'owner/repo', 'exclude_dirs_additional': ['legacy', 'deprecated'] } scraper = GitHubScraper(config) # Additional dirs should be excluded self.assertTrue(scraper.should_exclude_dir('legacy')) self.assertTrue(scraper.should_exclude_dir('deprecated')) # Default dirs still excluded self.assertTrue(scraper.should_exclude_dir('venv')) self.assertTrue(scraper.should_exclude_dir('node_modules')) # Normal dirs not excluded self.assertFalse(scraper.should_exclude_dir('src')) @patch('skill_seekers.cli.github_scraper.Github') def test_extend_with_empty_list(self, mock_github): """Test that empty additional list works correctly.""" config = { 'repo': 'owner/repo', 'exclude_dirs_additional': [] } scraper = GitHubScraper(config) # Should just have defaults self.assertEqual(scraper.excluded_dirs, EXCLUDED_DIRS) class TestExcludedDirsReplace(unittest.TestCase): """Test exclude_dirs (replace mode).""" @patch('skill_seekers.cli.github_scraper.Github') def test_replace_with_custom_list(self, mock_github): """Test replacing default exclusions entirely.""" config = { 'repo': 'owner/repo', 'exclude_dirs': ['node_modules', 'custom_vendor'] } scraper = GitHubScraper(config) # Should ONLY have specified dirs self.assertEqual(scraper.excluded_dirs, {'node_modules', 'custom_vendor'}) self.assertEqual(len(scraper.excluded_dirs), 2) @patch('skill_seekers.cli.github_scraper.Github') def test_replace_excludes_only_specified_dirs(self, mock_github): """Test that only specified directories are excluded in replace mode.""" config = { 'repo': 'owner/repo', 'exclude_dirs': ['node_modules', '.git'] } scraper = GitHubScraper(config) # Specified dirs should be excluded self.assertTrue(scraper.should_exclude_dir('node_modules')) # Note: .git would be excluded anyway due to dot prefix self.assertTrue(scraper.should_exclude_dir('.git')) # Default dirs NOT in our list should NOT be excluded self.assertFalse(scraper.should_exclude_dir('venv')) self.assertFalse(scraper.should_exclude_dir('__pycache__')) self.assertFalse(scraper.should_exclude_dir('build')) # Normal dirs still not excluded self.assertFalse(scraper.should_exclude_dir('src')) @patch('skill_seekers.cli.github_scraper.Github') def test_replace_with_empty_list(self, mock_github): """Test that empty replace list allows all directories (except dot-prefixed).""" config = { 'repo': 'owner/repo', 'exclude_dirs': [] } scraper = GitHubScraper(config) # No explicit exclusions self.assertEqual(scraper.excluded_dirs, set()) # Nothing explicitly excluded self.assertFalse(scraper.should_exclude_dir('venv')) self.assertFalse(scraper.should_exclude_dir('node_modules')) self.assertFalse(scraper.should_exclude_dir('build')) # But dot dirs still excluded (different logic) self.assertTrue(scraper.should_exclude_dir('.git')) self.assertTrue(scraper.should_exclude_dir('.hidden')) class TestExcludedDirsPrecedence(unittest.TestCase): """Test precedence when both options provided.""" @patch('skill_seekers.cli.github_scraper.Github') def test_replace_takes_precedence_over_additional(self, mock_github): """Test that exclude_dirs takes precedence over exclude_dirs_additional.""" config = { 'repo': 'owner/repo', 'exclude_dirs': ['only', 'these'], # Replace mode 'exclude_dirs_additional': ['ignored'] # Should be ignored } scraper = GitHubScraper(config) # Should use replace mode (exclude_dirs), ignore additional self.assertEqual(scraper.excluded_dirs, {'only', 'these'}) self.assertNotIn('ignored', scraper.excluded_dirs) self.assertNotIn('venv', scraper.excluded_dirs) # Defaults also ignored class TestExcludedDirsEdgeCases(unittest.TestCase): """Test edge cases and error handling.""" @patch('skill_seekers.cli.github_scraper.Github') def test_duplicate_exclusions_in_additional(self, mock_github): """Test that duplicates in additional list are handled (set deduplication).""" config = { 'repo': 'owner/repo', 'exclude_dirs_additional': ['venv', 'custom', 'venv'] # venv is duplicate (default + listed) } scraper = GitHubScraper(config) # Should deduplicate automatically (using set) self.assertIn('venv', scraper.excluded_dirs) self.assertIn('custom', scraper.excluded_dirs) # Count should account for deduplication self.assertEqual( len(scraper.excluded_dirs), len(EXCLUDED_DIRS) + 1 # Only 'custom' is truly additional ) @patch('skill_seekers.cli.github_scraper.Github') def test_case_sensitive_exclusions(self, mock_github): """Test that exclusions are case-sensitive.""" config = { 'repo': 'owner/repo', 'exclude_dirs': ['Venv', 'NODE_MODULES'] } scraper = GitHubScraper(config) # Case-sensitive matching self.assertTrue(scraper.should_exclude_dir('Venv')) self.assertTrue(scraper.should_exclude_dir('NODE_MODULES')) self.assertFalse(scraper.should_exclude_dir('venv')) # Different case self.assertFalse(scraper.should_exclude_dir('node_modules')) # Different case if __name__ == '__main__': unittest.main()