diff --git a/CLAUDE.md b/CLAUDE.md index 54014f2..577f630 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -437,12 +437,51 @@ Config files (`configs/*.json`) define scraping behavior: - `rate_limit`: Delay between requests (seconds) - `max_pages`: Maximum pages to scrape - `skip_llms_txt`: Skip llms.txt detection, force HTML scraping (default: false) +- `exclude_dirs_additional`: Add custom directories to default exclusions (for local repo analysis) +- `exclude_dirs`: Replace default directory exclusions entirely (advanced, for local repo analysis) ## Key Features & Implementation ### Auto-Detect Existing Data Tool checks for `output/{name}_data/` and prompts to reuse, avoiding re-scraping (check_existing_data() in doc_scraper.py:653-660). +### Configurable Directory Exclusions (Local Repository Analysis) + +When using `local_repo_path` for unlimited local repository analysis, you can customize which directories to exclude from analysis. + +**Smart Defaults:** +Automatically excludes common directories: `venv`, `node_modules`, `__pycache__`, `.git`, `build`, `dist`, `.pytest_cache`, `htmlcov`, `.tox`, `.mypy_cache`, etc. + +**Extend Mode** (`exclude_dirs_additional`): Add custom exclusions to defaults +```json +{ + "sources": [{ + "type": "github", + "local_repo_path": "/path/to/repo", + "exclude_dirs_additional": ["proprietary", "legacy", "third_party"] + }] +} +``` + +**Replace Mode** (`exclude_dirs`): Override defaults entirely (advanced) +```json +{ + "sources": [{ + "type": "github", + "local_repo_path": "/path/to/repo", + "exclude_dirs": ["node_modules", ".git", "custom_vendor"] + }] +} +``` + +**Use Cases:** +- Monorepos with custom directory structures +- Enterprise projects with non-standard naming +- Including unusual directories (e.g., analyzing venv code) +- Minimal exclusions for small/simple projects + +See: `should_exclude_dir()` in github_scraper.py:304-306 + ### Language Detection Detects code languages from: 1. CSS class attributes (`language-*`, `lang-*`) diff --git a/src/skill_seekers/cli/github_scraper.py b/src/skill_seekers/cli/github_scraper.py index 1466834..861f6c6 100644 --- a/src/skill_seekers/cli/github_scraper.py +++ b/src/skill_seekers/cli/github_scraper.py @@ -87,6 +87,28 @@ class GitHubScraper: self.local_repo_path = os.path.expanduser(self.local_repo_path) logger.info(f"Local repository mode enabled: {self.local_repo_path}") + # Configure directory exclusions (smart defaults + optional customization) + self.excluded_dirs = set(EXCLUDED_DIRS) # Start with smart defaults + + # Option 1: Replace mode - Use only specified exclusions + if 'exclude_dirs' in config: + self.excluded_dirs = set(config['exclude_dirs']) + logger.warning( + f"Using custom directory exclusions ({len(self.excluded_dirs)} dirs) - " + "defaults overridden" + ) + logger.debug(f"Custom exclusions: {sorted(self.excluded_dirs)}") + + # Option 2: Extend mode - Add to default exclusions + elif 'exclude_dirs_additional' in config: + additional = set(config['exclude_dirs_additional']) + self.excluded_dirs = self.excluded_dirs.union(additional) + logger.info( + f"Added {len(additional)} custom directory exclusions " + f"(total: {len(self.excluded_dirs)})" + ) + logger.debug(f"Additional exclusions: {sorted(additional)}") + # GitHub client setup (C1.1) token = self._get_token() self.github = Github(token) if token else Github() @@ -281,7 +303,7 @@ class GitHubScraper: def should_exclude_dir(self, dir_name: str) -> bool: """Check if directory should be excluded from analysis.""" - return dir_name in EXCLUDED_DIRS or dir_name.startswith('.') + return dir_name in self.excluded_dirs or dir_name.startswith('.') def _extract_file_tree(self): """Extract repository file tree structure (dual-mode: GitHub API or local filesystem).""" diff --git a/tests/test_excluded_dirs_config.py b/tests/test_excluded_dirs_config.py new file mode 100644 index 0000000..b425f62 --- /dev/null +++ b/tests/test_excluded_dirs_config.py @@ -0,0 +1,246 @@ +"""Tests for configurable directory exclusions in GitHub scraper. + +Tests Issue #203: Make EXCLUDED_DIRS configurable +""" + +import unittest +from unittest.mock import patch, Mock +from skill_seekers.cli.github_scraper import GitHubScraper, EXCLUDED_DIRS + + +class TestExcludedDirsDefaults(unittest.TestCase): + """Test default EXCLUDED_DIRS behavior (backward compatibility).""" + + @patch('skill_seekers.cli.github_scraper.Github') + def test_defaults_when_no_config(self, mock_github): + """Test that default exclusions are used when no config provided.""" + config = { + 'repo': 'owner/repo' + } + + scraper = GitHubScraper(config) + + # Should use default EXCLUDED_DIRS + self.assertEqual(scraper.excluded_dirs, EXCLUDED_DIRS) + + @patch('skill_seekers.cli.github_scraper.Github') + def test_defaults_exclude_common_dirs(self, mock_github): + """Test that default exclusions work correctly.""" + config = { + 'repo': 'owner/repo' + } + + scraper = GitHubScraper(config) + + # Test common directories are excluded + self.assertTrue(scraper.should_exclude_dir('venv')) + self.assertTrue(scraper.should_exclude_dir('node_modules')) + self.assertTrue(scraper.should_exclude_dir('__pycache__')) + self.assertTrue(scraper.should_exclude_dir('.git')) + self.assertTrue(scraper.should_exclude_dir('build')) + + # Test normal directories are not excluded + self.assertFalse(scraper.should_exclude_dir('src')) + self.assertFalse(scraper.should_exclude_dir('tests')) + self.assertFalse(scraper.should_exclude_dir('docs')) + + @patch('skill_seekers.cli.github_scraper.Github') + def test_dot_directories_always_excluded(self, mock_github): + """Test that directories starting with '.' are always excluded.""" + config = { + 'repo': 'owner/repo' + } + + scraper = GitHubScraper(config) + + # Dot directories should be excluded (even if not in EXCLUDED_DIRS) + self.assertTrue(scraper.should_exclude_dir('.hidden')) + self.assertTrue(scraper.should_exclude_dir('.cache')) + self.assertTrue(scraper.should_exclude_dir('.vscode')) + + +class TestExcludedDirsAdditional(unittest.TestCase): + """Test exclude_dirs_additional (extend mode).""" + + @patch('skill_seekers.cli.github_scraper.Github') + def test_extend_with_additional_dirs(self, mock_github): + """Test adding custom exclusions to defaults.""" + config = { + 'repo': 'owner/repo', + 'exclude_dirs_additional': ['proprietary', 'vendor', 'third_party'] + } + + scraper = GitHubScraper(config) + + # Should include both defaults and additional + self.assertIn('venv', scraper.excluded_dirs) # Default + self.assertIn('node_modules', scraper.excluded_dirs) # Default + self.assertIn('proprietary', scraper.excluded_dirs) # Additional + self.assertIn('vendor', scraper.excluded_dirs) # Additional + self.assertIn('third_party', scraper.excluded_dirs) # Additional + + # Verify total count + self.assertEqual( + len(scraper.excluded_dirs), + len(EXCLUDED_DIRS) + 3 + ) + + @patch('skill_seekers.cli.github_scraper.Github') + def test_extend_excludes_additional_dirs(self, mock_github): + """Test that additional directories are actually excluded.""" + config = { + 'repo': 'owner/repo', + 'exclude_dirs_additional': ['legacy', 'deprecated'] + } + + scraper = GitHubScraper(config) + + # Additional dirs should be excluded + self.assertTrue(scraper.should_exclude_dir('legacy')) + self.assertTrue(scraper.should_exclude_dir('deprecated')) + + # Default dirs still excluded + self.assertTrue(scraper.should_exclude_dir('venv')) + self.assertTrue(scraper.should_exclude_dir('node_modules')) + + # Normal dirs not excluded + self.assertFalse(scraper.should_exclude_dir('src')) + + @patch('skill_seekers.cli.github_scraper.Github') + def test_extend_with_empty_list(self, mock_github): + """Test that empty additional list works correctly.""" + config = { + 'repo': 'owner/repo', + 'exclude_dirs_additional': [] + } + + scraper = GitHubScraper(config) + + # Should just have defaults + self.assertEqual(scraper.excluded_dirs, EXCLUDED_DIRS) + + +class TestExcludedDirsReplace(unittest.TestCase): + """Test exclude_dirs (replace mode).""" + + @patch('skill_seekers.cli.github_scraper.Github') + def test_replace_with_custom_list(self, mock_github): + """Test replacing default exclusions entirely.""" + config = { + 'repo': 'owner/repo', + 'exclude_dirs': ['node_modules', 'custom_vendor'] + } + + scraper = GitHubScraper(config) + + # Should ONLY have specified dirs + self.assertEqual(scraper.excluded_dirs, {'node_modules', 'custom_vendor'}) + self.assertEqual(len(scraper.excluded_dirs), 2) + + @patch('skill_seekers.cli.github_scraper.Github') + def test_replace_excludes_only_specified_dirs(self, mock_github): + """Test that only specified directories are excluded in replace mode.""" + config = { + 'repo': 'owner/repo', + 'exclude_dirs': ['node_modules', '.git'] + } + + scraper = GitHubScraper(config) + + # Specified dirs should be excluded + self.assertTrue(scraper.should_exclude_dir('node_modules')) + # Note: .git would be excluded anyway due to dot prefix + self.assertTrue(scraper.should_exclude_dir('.git')) + + # Default dirs NOT in our list should NOT be excluded + self.assertFalse(scraper.should_exclude_dir('venv')) + self.assertFalse(scraper.should_exclude_dir('__pycache__')) + self.assertFalse(scraper.should_exclude_dir('build')) + + # Normal dirs still not excluded + self.assertFalse(scraper.should_exclude_dir('src')) + + @patch('skill_seekers.cli.github_scraper.Github') + def test_replace_with_empty_list(self, mock_github): + """Test that empty replace list allows all directories (except dot-prefixed).""" + config = { + 'repo': 'owner/repo', + 'exclude_dirs': [] + } + + scraper = GitHubScraper(config) + + # No explicit exclusions + self.assertEqual(scraper.excluded_dirs, set()) + + # Nothing explicitly excluded + self.assertFalse(scraper.should_exclude_dir('venv')) + self.assertFalse(scraper.should_exclude_dir('node_modules')) + self.assertFalse(scraper.should_exclude_dir('build')) + + # But dot dirs still excluded (different logic) + self.assertTrue(scraper.should_exclude_dir('.git')) + self.assertTrue(scraper.should_exclude_dir('.hidden')) + + +class TestExcludedDirsPrecedence(unittest.TestCase): + """Test precedence when both options provided.""" + + @patch('skill_seekers.cli.github_scraper.Github') + def test_replace_takes_precedence_over_additional(self, mock_github): + """Test that exclude_dirs takes precedence over exclude_dirs_additional.""" + config = { + 'repo': 'owner/repo', + 'exclude_dirs': ['only', 'these'], # Replace mode + 'exclude_dirs_additional': ['ignored'] # Should be ignored + } + + scraper = GitHubScraper(config) + + # Should use replace mode (exclude_dirs), ignore additional + self.assertEqual(scraper.excluded_dirs, {'only', 'these'}) + self.assertNotIn('ignored', scraper.excluded_dirs) + self.assertNotIn('venv', scraper.excluded_dirs) # Defaults also ignored + + +class TestExcludedDirsEdgeCases(unittest.TestCase): + """Test edge cases and error handling.""" + + @patch('skill_seekers.cli.github_scraper.Github') + def test_duplicate_exclusions_in_additional(self, mock_github): + """Test that duplicates in additional list are handled (set deduplication).""" + config = { + 'repo': 'owner/repo', + 'exclude_dirs_additional': ['venv', 'custom', 'venv'] # venv is duplicate (default + listed) + } + + scraper = GitHubScraper(config) + + # Should deduplicate automatically (using set) + self.assertIn('venv', scraper.excluded_dirs) + self.assertIn('custom', scraper.excluded_dirs) + # Count should account for deduplication + self.assertEqual( + len(scraper.excluded_dirs), + len(EXCLUDED_DIRS) + 1 # Only 'custom' is truly additional + ) + + @patch('skill_seekers.cli.github_scraper.Github') + def test_case_sensitive_exclusions(self, mock_github): + """Test that exclusions are case-sensitive.""" + config = { + 'repo': 'owner/repo', + 'exclude_dirs': ['Venv', 'NODE_MODULES'] + } + + scraper = GitHubScraper(config) + + # Case-sensitive matching + self.assertTrue(scraper.should_exclude_dir('Venv')) + self.assertTrue(scraper.should_exclude_dir('NODE_MODULES')) + self.assertFalse(scraper.should_exclude_dir('venv')) # Different case + self.assertFalse(scraper.should_exclude_dir('node_modules')) # Different case + + +if __name__ == '__main__': + unittest.main()