feat: Make EXCLUDED_DIRS configurable for local repository analysis
Closes #203 Adds configuration options to customize directory exclusions during local repository analysis, while maintaining backward compatibility with smart defaults. **New Config Options:** 1. `exclude_dirs_additional` - Extend defaults (most common) - Adds custom directories to default exclusions - Example: ["proprietary", "legacy", "third_party"] - Total exclusions = defaults + additional 2. `exclude_dirs` - Replace defaults (advanced users) - Completely overrides default exclusions - Example: ["node_modules", ".git", "custom_vendor"] - Gives full control over exclusions **Implementation:** - Modified GitHubScraper.__init__() to parse exclude_dirs config - Changed should_exclude_dir() to use instance variable instead of global - Added logging for custom exclusions (INFO for extend, WARNING for replace) - Maintains backward compatibility (no config = use defaults) **Testing:** - Added 12 comprehensive tests in test_excluded_dirs_config.py - 3 tests for defaults (backward compatibility) - 3 tests for extend mode - 3 tests for replace mode - 1 test for precedence - 2 tests for edge cases - All 12 new tests passing ✅ - All 22 existing github_scraper tests passing ✅ **Documentation:** - Updated CLAUDE.md config parameters section - Added detailed "Configurable Directory Exclusions" feature section - Included examples for both modes - Listed common use cases (monorepos, enterprise, legacy codebases) **Use Cases:** - Monorepos with custom directory structures - Enterprise projects with non-standard naming conventions - Including unusual directories for analysis - Minimal exclusions for small/simple projects **Backward Compatibility:** ✅ Fully backward compatible - existing configs work unchanged ✅ Smart defaults maintained when no config provided ✅ All existing tests pass Co-authored-by: jimmy058910 <jimmy058910@users.noreply.github.com>
This commit is contained in:
246
tests/test_excluded_dirs_config.py
Normal file
246
tests/test_excluded_dirs_config.py
Normal file
@@ -0,0 +1,246 @@
|
||||
"""Tests for configurable directory exclusions in GitHub scraper.
|
||||
|
||||
Tests Issue #203: Make EXCLUDED_DIRS configurable
|
||||
"""
|
||||
|
||||
import unittest
|
||||
from unittest.mock import patch, Mock
|
||||
from skill_seekers.cli.github_scraper import GitHubScraper, EXCLUDED_DIRS
|
||||
|
||||
|
||||
class TestExcludedDirsDefaults(unittest.TestCase):
|
||||
"""Test default EXCLUDED_DIRS behavior (backward compatibility)."""
|
||||
|
||||
@patch('skill_seekers.cli.github_scraper.Github')
|
||||
def test_defaults_when_no_config(self, mock_github):
|
||||
"""Test that default exclusions are used when no config provided."""
|
||||
config = {
|
||||
'repo': 'owner/repo'
|
||||
}
|
||||
|
||||
scraper = GitHubScraper(config)
|
||||
|
||||
# Should use default EXCLUDED_DIRS
|
||||
self.assertEqual(scraper.excluded_dirs, EXCLUDED_DIRS)
|
||||
|
||||
@patch('skill_seekers.cli.github_scraper.Github')
|
||||
def test_defaults_exclude_common_dirs(self, mock_github):
|
||||
"""Test that default exclusions work correctly."""
|
||||
config = {
|
||||
'repo': 'owner/repo'
|
||||
}
|
||||
|
||||
scraper = GitHubScraper(config)
|
||||
|
||||
# Test common directories are excluded
|
||||
self.assertTrue(scraper.should_exclude_dir('venv'))
|
||||
self.assertTrue(scraper.should_exclude_dir('node_modules'))
|
||||
self.assertTrue(scraper.should_exclude_dir('__pycache__'))
|
||||
self.assertTrue(scraper.should_exclude_dir('.git'))
|
||||
self.assertTrue(scraper.should_exclude_dir('build'))
|
||||
|
||||
# Test normal directories are not excluded
|
||||
self.assertFalse(scraper.should_exclude_dir('src'))
|
||||
self.assertFalse(scraper.should_exclude_dir('tests'))
|
||||
self.assertFalse(scraper.should_exclude_dir('docs'))
|
||||
|
||||
@patch('skill_seekers.cli.github_scraper.Github')
|
||||
def test_dot_directories_always_excluded(self, mock_github):
|
||||
"""Test that directories starting with '.' are always excluded."""
|
||||
config = {
|
||||
'repo': 'owner/repo'
|
||||
}
|
||||
|
||||
scraper = GitHubScraper(config)
|
||||
|
||||
# Dot directories should be excluded (even if not in EXCLUDED_DIRS)
|
||||
self.assertTrue(scraper.should_exclude_dir('.hidden'))
|
||||
self.assertTrue(scraper.should_exclude_dir('.cache'))
|
||||
self.assertTrue(scraper.should_exclude_dir('.vscode'))
|
||||
|
||||
|
||||
class TestExcludedDirsAdditional(unittest.TestCase):
|
||||
"""Test exclude_dirs_additional (extend mode)."""
|
||||
|
||||
@patch('skill_seekers.cli.github_scraper.Github')
|
||||
def test_extend_with_additional_dirs(self, mock_github):
|
||||
"""Test adding custom exclusions to defaults."""
|
||||
config = {
|
||||
'repo': 'owner/repo',
|
||||
'exclude_dirs_additional': ['proprietary', 'vendor', 'third_party']
|
||||
}
|
||||
|
||||
scraper = GitHubScraper(config)
|
||||
|
||||
# Should include both defaults and additional
|
||||
self.assertIn('venv', scraper.excluded_dirs) # Default
|
||||
self.assertIn('node_modules', scraper.excluded_dirs) # Default
|
||||
self.assertIn('proprietary', scraper.excluded_dirs) # Additional
|
||||
self.assertIn('vendor', scraper.excluded_dirs) # Additional
|
||||
self.assertIn('third_party', scraper.excluded_dirs) # Additional
|
||||
|
||||
# Verify total count
|
||||
self.assertEqual(
|
||||
len(scraper.excluded_dirs),
|
||||
len(EXCLUDED_DIRS) + 3
|
||||
)
|
||||
|
||||
@patch('skill_seekers.cli.github_scraper.Github')
|
||||
def test_extend_excludes_additional_dirs(self, mock_github):
|
||||
"""Test that additional directories are actually excluded."""
|
||||
config = {
|
||||
'repo': 'owner/repo',
|
||||
'exclude_dirs_additional': ['legacy', 'deprecated']
|
||||
}
|
||||
|
||||
scraper = GitHubScraper(config)
|
||||
|
||||
# Additional dirs should be excluded
|
||||
self.assertTrue(scraper.should_exclude_dir('legacy'))
|
||||
self.assertTrue(scraper.should_exclude_dir('deprecated'))
|
||||
|
||||
# Default dirs still excluded
|
||||
self.assertTrue(scraper.should_exclude_dir('venv'))
|
||||
self.assertTrue(scraper.should_exclude_dir('node_modules'))
|
||||
|
||||
# Normal dirs not excluded
|
||||
self.assertFalse(scraper.should_exclude_dir('src'))
|
||||
|
||||
@patch('skill_seekers.cli.github_scraper.Github')
|
||||
def test_extend_with_empty_list(self, mock_github):
|
||||
"""Test that empty additional list works correctly."""
|
||||
config = {
|
||||
'repo': 'owner/repo',
|
||||
'exclude_dirs_additional': []
|
||||
}
|
||||
|
||||
scraper = GitHubScraper(config)
|
||||
|
||||
# Should just have defaults
|
||||
self.assertEqual(scraper.excluded_dirs, EXCLUDED_DIRS)
|
||||
|
||||
|
||||
class TestExcludedDirsReplace(unittest.TestCase):
|
||||
"""Test exclude_dirs (replace mode)."""
|
||||
|
||||
@patch('skill_seekers.cli.github_scraper.Github')
|
||||
def test_replace_with_custom_list(self, mock_github):
|
||||
"""Test replacing default exclusions entirely."""
|
||||
config = {
|
||||
'repo': 'owner/repo',
|
||||
'exclude_dirs': ['node_modules', 'custom_vendor']
|
||||
}
|
||||
|
||||
scraper = GitHubScraper(config)
|
||||
|
||||
# Should ONLY have specified dirs
|
||||
self.assertEqual(scraper.excluded_dirs, {'node_modules', 'custom_vendor'})
|
||||
self.assertEqual(len(scraper.excluded_dirs), 2)
|
||||
|
||||
@patch('skill_seekers.cli.github_scraper.Github')
|
||||
def test_replace_excludes_only_specified_dirs(self, mock_github):
|
||||
"""Test that only specified directories are excluded in replace mode."""
|
||||
config = {
|
||||
'repo': 'owner/repo',
|
||||
'exclude_dirs': ['node_modules', '.git']
|
||||
}
|
||||
|
||||
scraper = GitHubScraper(config)
|
||||
|
||||
# Specified dirs should be excluded
|
||||
self.assertTrue(scraper.should_exclude_dir('node_modules'))
|
||||
# Note: .git would be excluded anyway due to dot prefix
|
||||
self.assertTrue(scraper.should_exclude_dir('.git'))
|
||||
|
||||
# Default dirs NOT in our list should NOT be excluded
|
||||
self.assertFalse(scraper.should_exclude_dir('venv'))
|
||||
self.assertFalse(scraper.should_exclude_dir('__pycache__'))
|
||||
self.assertFalse(scraper.should_exclude_dir('build'))
|
||||
|
||||
# Normal dirs still not excluded
|
||||
self.assertFalse(scraper.should_exclude_dir('src'))
|
||||
|
||||
@patch('skill_seekers.cli.github_scraper.Github')
|
||||
def test_replace_with_empty_list(self, mock_github):
|
||||
"""Test that empty replace list allows all directories (except dot-prefixed)."""
|
||||
config = {
|
||||
'repo': 'owner/repo',
|
||||
'exclude_dirs': []
|
||||
}
|
||||
|
||||
scraper = GitHubScraper(config)
|
||||
|
||||
# No explicit exclusions
|
||||
self.assertEqual(scraper.excluded_dirs, set())
|
||||
|
||||
# Nothing explicitly excluded
|
||||
self.assertFalse(scraper.should_exclude_dir('venv'))
|
||||
self.assertFalse(scraper.should_exclude_dir('node_modules'))
|
||||
self.assertFalse(scraper.should_exclude_dir('build'))
|
||||
|
||||
# But dot dirs still excluded (different logic)
|
||||
self.assertTrue(scraper.should_exclude_dir('.git'))
|
||||
self.assertTrue(scraper.should_exclude_dir('.hidden'))
|
||||
|
||||
|
||||
class TestExcludedDirsPrecedence(unittest.TestCase):
|
||||
"""Test precedence when both options provided."""
|
||||
|
||||
@patch('skill_seekers.cli.github_scraper.Github')
|
||||
def test_replace_takes_precedence_over_additional(self, mock_github):
|
||||
"""Test that exclude_dirs takes precedence over exclude_dirs_additional."""
|
||||
config = {
|
||||
'repo': 'owner/repo',
|
||||
'exclude_dirs': ['only', 'these'], # Replace mode
|
||||
'exclude_dirs_additional': ['ignored'] # Should be ignored
|
||||
}
|
||||
|
||||
scraper = GitHubScraper(config)
|
||||
|
||||
# Should use replace mode (exclude_dirs), ignore additional
|
||||
self.assertEqual(scraper.excluded_dirs, {'only', 'these'})
|
||||
self.assertNotIn('ignored', scraper.excluded_dirs)
|
||||
self.assertNotIn('venv', scraper.excluded_dirs) # Defaults also ignored
|
||||
|
||||
|
||||
class TestExcludedDirsEdgeCases(unittest.TestCase):
|
||||
"""Test edge cases and error handling."""
|
||||
|
||||
@patch('skill_seekers.cli.github_scraper.Github')
|
||||
def test_duplicate_exclusions_in_additional(self, mock_github):
|
||||
"""Test that duplicates in additional list are handled (set deduplication)."""
|
||||
config = {
|
||||
'repo': 'owner/repo',
|
||||
'exclude_dirs_additional': ['venv', 'custom', 'venv'] # venv is duplicate (default + listed)
|
||||
}
|
||||
|
||||
scraper = GitHubScraper(config)
|
||||
|
||||
# Should deduplicate automatically (using set)
|
||||
self.assertIn('venv', scraper.excluded_dirs)
|
||||
self.assertIn('custom', scraper.excluded_dirs)
|
||||
# Count should account for deduplication
|
||||
self.assertEqual(
|
||||
len(scraper.excluded_dirs),
|
||||
len(EXCLUDED_DIRS) + 1 # Only 'custom' is truly additional
|
||||
)
|
||||
|
||||
@patch('skill_seekers.cli.github_scraper.Github')
|
||||
def test_case_sensitive_exclusions(self, mock_github):
|
||||
"""Test that exclusions are case-sensitive."""
|
||||
config = {
|
||||
'repo': 'owner/repo',
|
||||
'exclude_dirs': ['Venv', 'NODE_MODULES']
|
||||
}
|
||||
|
||||
scraper = GitHubScraper(config)
|
||||
|
||||
# Case-sensitive matching
|
||||
self.assertTrue(scraper.should_exclude_dir('Venv'))
|
||||
self.assertTrue(scraper.should_exclude_dir('NODE_MODULES'))
|
||||
self.assertFalse(scraper.should_exclude_dir('venv')) # Different case
|
||||
self.assertFalse(scraper.should_exclude_dir('node_modules')) # Different case
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
Reference in New Issue
Block a user