feat: Make EXCLUDED_DIRS configurable for local repository analysis

Closes #203

Adds configuration options to customize directory exclusions during local
repository analysis, while maintaining backward compatibility with smart
defaults.

**New Config Options:**

1. `exclude_dirs_additional` - Extend defaults (most common)
   - Adds custom directories to default exclusions
   - Example: ["proprietary", "legacy", "third_party"]
   - Total exclusions = defaults + additional

2. `exclude_dirs` - Replace defaults (advanced users)
   - Completely overrides default exclusions
   - Example: ["node_modules", ".git", "custom_vendor"]
   - Gives full control over exclusions

**Implementation:**

- Modified GitHubScraper.__init__() to parse exclude_dirs config
- Changed should_exclude_dir() to use instance variable instead of global
- Added logging for custom exclusions (INFO for extend, WARNING for replace)
- Maintains backward compatibility (no config = use defaults)

**Testing:**

- Added 12 comprehensive tests in test_excluded_dirs_config.py
  - 3 tests for defaults (backward compatibility)
  - 3 tests for extend mode
  - 3 tests for replace mode
  - 1 test for precedence
  - 2 tests for edge cases
- All 12 new tests passing 
- All 22 existing github_scraper tests passing 

**Documentation:**

- Updated CLAUDE.md config parameters section
- Added detailed "Configurable Directory Exclusions" feature section
- Included examples for both modes
- Listed common use cases (monorepos, enterprise, legacy codebases)

**Use Cases:**

- Monorepos with custom directory structures
- Enterprise projects with non-standard naming conventions
- Including unusual directories for analysis
- Minimal exclusions for small/simple projects

**Backward Compatibility:**

- Fully backward compatible: existing configs work unchanged
- Smart defaults are kept when no exclusion options are provided
- All existing tests pass

Co-authored-by: jimmy058910 <jimmy058910@users.noreply.github.com>
This commit is contained in:
yusyus
2025-11-29 23:53:27 +03:00
parent bd20b32470
commit ea289cebe1
3 changed files with 308 additions and 1 deletions

View File

@@ -0,0 +1,246 @@
"""Tests for configurable directory exclusions in GitHub scraper.
Tests Issue #203: Make EXCLUDED_DIRS configurable
"""
import unittest
from unittest.mock import patch, Mock
from skill_seekers.cli.github_scraper import GitHubScraper, EXCLUDED_DIRS
@patch('skill_seekers.cli.github_scraper.Github')
class TestExcludedDirsDefaults(unittest.TestCase):
    """Exercise the scraper when the config carries no exclusion options.

    The class-level ``patch`` replaces the ``Github`` name inside the scraper
    module for every ``test_*`` method and hands the mock to each one as
    ``mock_github``.
    """

    def test_defaults_when_no_config(self, mock_github):
        """A config holding only 'repo' yields the module's EXCLUDED_DIRS."""
        scraper = GitHubScraper({'repo': 'owner/repo'})

        self.assertEqual(scraper.excluded_dirs, EXCLUDED_DIRS)

    def test_defaults_exclude_common_dirs(self, mock_github):
        """Common tooling directories are skipped; ordinary ones are kept."""
        scraper = GitHubScraper({'repo': 'owner/repo'})

        # Directories the defaults are expected to skip.
        for skipped in ('venv', 'node_modules', '__pycache__', '.git', 'build'):
            self.assertTrue(scraper.should_exclude_dir(skipped), msg=skipped)

        # Ordinary project directories must remain visible to the analysis.
        for kept in ('src', 'tests', 'docs'):
            self.assertFalse(scraper.should_exclude_dir(kept), msg=kept)

    def test_dot_directories_always_excluded(self, mock_github):
        """Names starting with '.' are excluded even when not listed explicitly."""
        scraper = GitHubScraper({'repo': 'owner/repo'})

        for hidden in ('.hidden', '.cache', '.vscode'):
            self.assertTrue(scraper.should_exclude_dir(hidden), msg=hidden)
class TestExcludedDirsAdditional(unittest.TestCase):
    """Test exclude_dirs_additional (extend mode).

    In extend mode the scraper is expected to exclude the default directories
    plus every name listed under 'exclude_dirs_additional'.
    """

    @patch('skill_seekers.cli.github_scraper.Github')
    def test_extend_with_additional_dirs(self, mock_github):
        """Test adding custom exclusions to defaults."""
        additional = ('proprietary', 'vendor', 'third_party')
        config = {
            'repo': 'owner/repo',
            'exclude_dirs_additional': list(additional)
        }

        scraper = GitHubScraper(config)

        # Should include both defaults and additional
        self.assertIn('venv', scraper.excluded_dirs)  # Default
        self.assertIn('node_modules', scraper.excluded_dirs)  # Default
        for name in additional:
            self.assertIn(name, scraper.excluded_dirs)  # Additional

        # Compare against the exact expected union rather than a hard-coded
        # "+ 3" count: the count silently breaks if one of the sample names
        # is (or later becomes) a default, and it cannot detect a default
        # that went missing from the merged set.
        self.assertEqual(
            scraper.excluded_dirs,
            set(EXCLUDED_DIRS) | set(additional)
        )

    @patch('skill_seekers.cli.github_scraper.Github')
    def test_extend_excludes_additional_dirs(self, mock_github):
        """Test that additional directories are actually excluded."""
        config = {
            'repo': 'owner/repo',
            'exclude_dirs_additional': ['legacy', 'deprecated']
        }

        scraper = GitHubScraper(config)

        # Additional dirs should be excluded
        self.assertTrue(scraper.should_exclude_dir('legacy'))
        self.assertTrue(scraper.should_exclude_dir('deprecated'))

        # Default dirs still excluded
        self.assertTrue(scraper.should_exclude_dir('venv'))
        self.assertTrue(scraper.should_exclude_dir('node_modules'))

        # Normal dirs not excluded
        self.assertFalse(scraper.should_exclude_dir('src'))

    @patch('skill_seekers.cli.github_scraper.Github')
    def test_extend_with_empty_list(self, mock_github):
        """Test that empty additional list works correctly."""
        config = {
            'repo': 'owner/repo',
            'exclude_dirs_additional': []
        }

        scraper = GitHubScraper(config)

        # Should just have defaults
        self.assertEqual(scraper.excluded_dirs, EXCLUDED_DIRS)
@patch('skill_seekers.cli.github_scraper.Github')
class TestExcludedDirsReplace(unittest.TestCase):
    """Exercise the 'exclude_dirs' option, which stands in for the defaults.

    The class-level ``patch`` replaces the ``Github`` name inside the scraper
    module for every ``test_*`` method and hands the mock to each one as
    ``mock_github``.
    """

    def test_replace_with_custom_list(self, mock_github):
        """The exclusion set holds exactly the configured names, nothing more."""
        scraper = GitHubScraper({
            'repo': 'owner/repo',
            'exclude_dirs': ['node_modules', 'custom_vendor'],
        })

        self.assertEqual(scraper.excluded_dirs, {'node_modules', 'custom_vendor'})
        self.assertEqual(len(scraper.excluded_dirs), 2)

    def test_replace_excludes_only_specified_dirs(self, mock_github):
        """Only listed names are skipped; unlisted defaults become visible."""
        scraper = GitHubScraper({
            'repo': 'owner/repo',
            'exclude_dirs': ['node_modules', '.git'],
        })

        # '.git' would be skipped regardless because of its dot prefix.
        for listed in ('node_modules', '.git'):
            self.assertTrue(scraper.should_exclude_dir(listed), msg=listed)

        # Former defaults that were left out of the list, plus a plain
        # source directory, must all be kept.
        for unlisted in ('venv', '__pycache__', 'build', 'src'):
            self.assertFalse(scraper.should_exclude_dir(unlisted), msg=unlisted)

    def test_replace_with_empty_list(self, mock_github):
        """An empty list drops every explicit exclusion; dot dirs stay skipped."""
        scraper = GitHubScraper({'repo': 'owner/repo', 'exclude_dirs': []})

        self.assertEqual(scraper.excluded_dirs, set())

        for former_default in ('venv', 'node_modules', 'build'):
            self.assertFalse(
                scraper.should_exclude_dir(former_default),
                msg=former_default,
            )

        # Dot-prefixed names are handled separately from the explicit set.
        for hidden in ('.git', '.hidden'):
            self.assertTrue(scraper.should_exclude_dir(hidden), msg=hidden)
@patch('skill_seekers.cli.github_scraper.Github')
class TestExcludedDirsPrecedence(unittest.TestCase):
    """Check which option wins when a config sets both exclusion keys."""

    def test_replace_takes_precedence_over_additional(self, mock_github):
        """With both keys present, only the 'exclude_dirs' entries are used."""
        both_options = {
            'repo': 'owner/repo',
            'exclude_dirs': ['only', 'these'],        # replace mode
            'exclude_dirs_additional': ['ignored'],   # expected to be dropped
        }

        scraper = GitHubScraper(both_options)

        self.assertEqual(scraper.excluded_dirs, {'only', 'these'})
        # Neither the extend-mode entry nor a stock default may leak in.
        for absent in ('ignored', 'venv'):
            self.assertNotIn(absent, scraper.excluded_dirs)
@patch('skill_seekers.cli.github_scraper.Github')
class TestExcludedDirsEdgeCases(unittest.TestCase):
    """Cover duplicate entries and letter-case handling in exclusion lists."""

    def test_duplicate_exclusions_in_additional(self, mock_github):
        """Repeated or already-default names do not inflate the exclusion set."""
        # 'venv' appears twice here and is also one of the defaults.
        repeated = ['venv', 'custom', 'venv']
        scraper = GitHubScraper({
            'repo': 'owner/repo',
            'exclude_dirs_additional': repeated,
        })

        for expected in ('venv', 'custom'):
            self.assertIn(expected, scraper.excluded_dirs)

        # Only 'custom' is genuinely new, so the size grows by exactly one.
        expected_size = len(EXCLUDED_DIRS) + 1
        self.assertEqual(len(scraper.excluded_dirs), expected_size)

    def test_case_sensitive_exclusions(self, mock_github):
        """Matching honours letter case: only the exact spelling is skipped."""
        scraper = GitHubScraper({
            'repo': 'owner/repo',
            'exclude_dirs': ['Venv', 'NODE_MODULES'],
        })

        for exact in ('Venv', 'NODE_MODULES'):
            self.assertTrue(scraper.should_exclude_dir(exact), msg=exact)

        for other_case in ('venv', 'node_modules'):
            self.assertFalse(scraper.should_exclude_dir(other_case), msg=other_case)
# Executing this file directly (rather than importing it) hands control to
# unittest's command-line runner.
if __name__ == '__main__':
    unittest.main()