feat: Make EXCLUDED_DIRS configurable for local repository analysis
Closes #203

Adds configuration options to customize directory exclusions during local repository analysis, while maintaining backward compatibility with smart defaults.

**New Config Options:**

1. `exclude_dirs_additional` - Extend defaults (most common)
   - Adds custom directories to default exclusions
   - Example: ["proprietary", "legacy", "third_party"]
   - Total exclusions = defaults + additional
2. `exclude_dirs` - Replace defaults (advanced users)
   - Completely overrides default exclusions
   - Example: ["node_modules", ".git", "custom_vendor"]
   - Gives full control over exclusions

**Implementation:**

- Modified GitHubScraper.__init__() to parse exclude_dirs config
- Changed should_exclude_dir() to use instance variable instead of global
- Added logging for custom exclusions (INFO for extend, WARNING for replace)
- Maintains backward compatibility (no config = use defaults)

**Testing:**

- Added 12 comprehensive tests in test_excluded_dirs_config.py
  - 3 tests for defaults (backward compatibility)
  - 3 tests for extend mode
  - 3 tests for replace mode
  - 1 test for precedence
  - 2 tests for edge cases
- All 12 new tests passing ✅
- All 22 existing github_scraper tests passing ✅

**Documentation:**

- Updated CLAUDE.md config parameters section
- Added detailed "Configurable Directory Exclusions" feature section
- Included examples for both modes
- Listed common use cases (monorepos, enterprise, legacy codebases)

**Use Cases:**

- Monorepos with custom directory structures
- Enterprise projects with non-standard naming conventions
- Including unusual directories for analysis
- Minimal exclusions for small/simple projects

**Backward Compatibility:**

✅ Fully backward compatible - existing configs work unchanged
✅ Smart defaults maintained when no config provided
✅ All existing tests pass

Co-authored-by: jimmy058910 <jimmy058910@users.noreply.github.com>
This commit is contained in:
39
CLAUDE.md
39
CLAUDE.md
@@ -437,12 +437,51 @@ Config files (`configs/*.json`) define scraping behavior:
|
||||
- `rate_limit`: Delay between requests (seconds)
|
||||
- `max_pages`: Maximum pages to scrape
|
||||
- `skip_llms_txt`: Skip llms.txt detection, force HTML scraping (default: false)
|
||||
- `exclude_dirs_additional`: Add custom directories to default exclusions (for local repo analysis)
|
||||
- `exclude_dirs`: Replace default directory exclusions entirely (advanced, for local repo analysis)
|
||||
|
||||
## Key Features & Implementation
|
||||
|
||||
### Auto-Detect Existing Data
|
||||
Tool checks for `output/{name}_data/` and prompts to reuse, avoiding re-scraping (check_existing_data() in doc_scraper.py:653-660).
|
||||
|
||||
### Configurable Directory Exclusions (Local Repository Analysis)
|
||||
|
||||
When using `local_repo_path` for unlimited local repository analysis, you can customize which directories to exclude from analysis.
|
||||
|
||||
**Smart Defaults:**
|
||||
Automatically excludes common directories: `venv`, `node_modules`, `__pycache__`, `.git`, `build`, `dist`, `.pytest_cache`, `htmlcov`, `.tox`, `.mypy_cache`, etc.
|
||||
|
||||
**Extend Mode** (`exclude_dirs_additional`): Add custom exclusions to defaults
|
||||
```json
|
||||
{
|
||||
"sources": [{
|
||||
"type": "github",
|
||||
"local_repo_path": "/path/to/repo",
|
||||
"exclude_dirs_additional": ["proprietary", "legacy", "third_party"]
|
||||
}]
|
||||
}
|
||||
```
|
||||
|
||||
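**Replace Mode** (`exclude_dirs`): Override defaults entirely (advanced). Directories whose names start with `.` are still excluded in this mode, and `exclude_dirs_additional` is ignored when `exclude_dirs` is also set.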
|
||||
```json
|
||||
{
|
||||
"sources": [{
|
||||
"type": "github",
|
||||
"local_repo_path": "/path/to/repo",
|
||||
"exclude_dirs": ["node_modules", ".git", "custom_vendor"]
|
||||
}]
|
||||
}
|
||||
```
|
||||
|
||||
**Use Cases:**
|
||||
- Monorepos with custom directory structures
|
||||
- Enterprise projects with non-standard naming
|
||||
- Including unusual directories (e.g., analyzing venv code)
|
||||
- Minimal exclusions for small/simple projects
|
||||
|
||||
See: `should_exclude_dir()` in github_scraper.py:304-306
|
||||
|
||||
### Language Detection
|
||||
Detects code languages from:
|
||||
1. CSS class attributes (`language-*`, `lang-*`)
|
||||
|
||||
@@ -87,6 +87,28 @@ class GitHubScraper:
|
||||
self.local_repo_path = os.path.expanduser(self.local_repo_path)
|
||||
logger.info(f"Local repository mode enabled: {self.local_repo_path}")
|
||||
|
||||
# Configure directory exclusions (smart defaults + optional customization)
|
||||
self.excluded_dirs = set(EXCLUDED_DIRS) # Start with smart defaults
|
||||
|
||||
# Option 1: Replace mode - Use only specified exclusions
|
||||
if 'exclude_dirs' in config:
|
||||
self.excluded_dirs = set(config['exclude_dirs'])
|
||||
logger.warning(
|
||||
f"Using custom directory exclusions ({len(self.excluded_dirs)} dirs) - "
|
||||
"defaults overridden"
|
||||
)
|
||||
logger.debug(f"Custom exclusions: {sorted(self.excluded_dirs)}")
|
||||
|
||||
# Option 2: Extend mode - Add to default exclusions
|
||||
elif 'exclude_dirs_additional' in config:
|
||||
additional = set(config['exclude_dirs_additional'])
|
||||
self.excluded_dirs = self.excluded_dirs.union(additional)
|
||||
logger.info(
|
||||
f"Added {len(additional)} custom directory exclusions "
|
||||
f"(total: {len(self.excluded_dirs)})"
|
||||
)
|
||||
logger.debug(f"Additional exclusions: {sorted(additional)}")
|
||||
|
||||
# GitHub client setup (C1.1)
|
||||
token = self._get_token()
|
||||
self.github = Github(token) if token else Github()
|
||||
@@ -281,7 +303,7 @@ class GitHubScraper:
|
||||
|
||||
def should_exclude_dir(self, dir_name: str) -> bool:
    """Check if directory should be excluded from analysis.

    The decision is based on this instance's ``excluded_dirs`` set rather
    than the module-level ``EXCLUDED_DIRS`` constant, so per-config
    customization (``exclude_dirs`` / ``exclude_dirs_additional``) is honored.

    Args:
        dir_name: Bare directory name (a single path component).

    Returns:
        True if ``dir_name`` is in ``self.excluded_dirs`` or starts with
        a dot; False otherwise.
    """
    # Dot-prefixed directories are excluded regardless of the configured
    # set, so replace mode cannot re-enable them.
    return dir_name in self.excluded_dirs or dir_name.startswith('.')
|
||||
|
||||
def _extract_file_tree(self):
|
||||
"""Extract repository file tree structure (dual-mode: GitHub API or local filesystem)."""
|
||||
|
||||
246
tests/test_excluded_dirs_config.py
Normal file
246
tests/test_excluded_dirs_config.py
Normal file
@@ -0,0 +1,246 @@
|
||||
"""Tests for configurable directory exclusions in GitHub scraper.
|
||||
|
||||
Tests Issue #203: Make EXCLUDED_DIRS configurable
|
||||
"""
|
||||
|
||||
import unittest
|
||||
from unittest.mock import patch, Mock
|
||||
from skill_seekers.cli.github_scraper import GitHubScraper, EXCLUDED_DIRS
|
||||
|
||||
|
||||
class TestExcludedDirsDefaults(unittest.TestCase):
    """Backward-compatibility checks for a config with no exclusion keys."""

    @staticmethod
    def _bare_scraper():
        """Return a scraper built from a config that only names the repo."""
        return GitHubScraper({'repo': 'owner/repo'})

    @patch('skill_seekers.cli.github_scraper.Github')
    def test_defaults_when_no_config(self, mock_github):
        """Without exclusion options the instance set equals EXCLUDED_DIRS."""
        subject = self._bare_scraper()

        self.assertEqual(subject.excluded_dirs, EXCLUDED_DIRS)

    @patch('skill_seekers.cli.github_scraper.Github')
    def test_defaults_exclude_common_dirs(self, mock_github):
        """Common tooling directories are rejected, ordinary ones are kept."""
        subject = self._bare_scraper()

        # Directories expected to be excluded by the defaults
        for skipped in ('venv', 'node_modules', '__pycache__', '.git', 'build'):
            self.assertTrue(subject.should_exclude_dir(skipped))

        # Regular project directories must stay in the analysis
        for kept in ('src', 'tests', 'docs'):
            self.assertFalse(subject.should_exclude_dir(kept))

    @patch('skill_seekers.cli.github_scraper.Github')
    def test_dot_directories_always_excluded(self, mock_github):
        """Any name beginning with '.' is excluded, listed or not."""
        subject = self._bare_scraper()

        for hidden in ('.hidden', '.cache', '.vscode'):
            self.assertTrue(subject.should_exclude_dir(hidden))
|
||||
|
||||
|
||||
class TestExcludedDirsAdditional(unittest.TestCase):
    """Extend mode: `exclude_dirs_additional` is merged into the defaults."""

    @staticmethod
    def _extended_scraper(extra_dirs):
        """Return a scraper whose config adds `extra_dirs` to the defaults."""
        return GitHubScraper({
            'repo': 'owner/repo',
            'exclude_dirs_additional': extra_dirs,
        })

    @patch('skill_seekers.cli.github_scraper.Github')
    def test_extend_with_additional_dirs(self, mock_github):
        """The resulting set holds both default and user-supplied names."""
        subject = self._extended_scraper(['proprietary', 'vendor', 'third_party'])

        # First two names come from the defaults, the rest from the config
        for expected in ('venv', 'node_modules', 'proprietary', 'vendor', 'third_party'):
            self.assertIn(expected, subject.excluded_dirs)

        # Three new names on top of the defaults
        self.assertEqual(len(subject.excluded_dirs), len(EXCLUDED_DIRS) + 3)

    @patch('skill_seekers.cli.github_scraper.Github')
    def test_extend_excludes_additional_dirs(self, mock_github):
        """should_exclude_dir() honors the added names and the defaults."""
        subject = self._extended_scraper(['legacy', 'deprecated'])

        # Added names first, then two defaults that must remain excluded
        for skipped in ('legacy', 'deprecated', 'venv', 'node_modules'):
            self.assertTrue(subject.should_exclude_dir(skipped))

        # An ordinary directory is unaffected
        self.assertFalse(subject.should_exclude_dir('src'))

    @patch('skill_seekers.cli.github_scraper.Github')
    def test_extend_with_empty_list(self, mock_github):
        """An empty additional list leaves the defaults untouched."""
        subject = self._extended_scraper([])

        self.assertEqual(subject.excluded_dirs, EXCLUDED_DIRS)
|
||||
|
||||
|
||||
class TestExcludedDirsReplace(unittest.TestCase):
    """Replace mode: `exclude_dirs` supersedes the default exclusions."""

    @staticmethod
    def _replacing_scraper(only_dirs):
        """Return a scraper whose config replaces the defaults with `only_dirs`."""
        return GitHubScraper({
            'repo': 'owner/repo',
            'exclude_dirs': only_dirs,
        })

    @patch('skill_seekers.cli.github_scraper.Github')
    def test_replace_with_custom_list(self, mock_github):
        """The instance set contains exactly the configured names."""
        subject = self._replacing_scraper(['node_modules', 'custom_vendor'])

        self.assertEqual(subject.excluded_dirs, {'node_modules', 'custom_vendor'})
        self.assertEqual(len(subject.excluded_dirs), 2)

    @patch('skill_seekers.cli.github_scraper.Github')
    def test_replace_excludes_only_specified_dirs(self, mock_github):
        """Defaults missing from the custom list are no longer excluded."""
        subject = self._replacing_scraper(['node_modules', '.git'])

        # Listed names are excluded ('.git' would be caught by the dot rule too)
        for skipped in ('node_modules', '.git'):
            self.assertTrue(subject.should_exclude_dir(skipped))

        # Former defaults that were not listed, plus an ordinary directory
        for kept in ('venv', '__pycache__', 'build', 'src'):
            self.assertFalse(subject.should_exclude_dir(kept))

    @patch('skill_seekers.cli.github_scraper.Github')
    def test_replace_with_empty_list(self, mock_github):
        """An empty list excludes nothing by name; dot-prefixed names still are."""
        subject = self._replacing_scraper([])

        # The configured set is empty
        self.assertEqual(subject.excluded_dirs, set())

        # So none of the usual defaults are excluded by name
        for kept in ('venv', 'node_modules', 'build'):
            self.assertFalse(subject.should_exclude_dir(kept))

        # The dot-prefix rule is separate from the set and still applies
        for hidden in ('.git', '.hidden'):
            self.assertTrue(subject.should_exclude_dir(hidden))
|
||||
|
||||
|
||||
class TestExcludedDirsPrecedence(unittest.TestCase):
    """Behavior when both exclusion options appear in one config."""

    @patch('skill_seekers.cli.github_scraper.Github')
    def test_replace_takes_precedence_over_additional(self, mock_github):
        """`exclude_dirs` wins; `exclude_dirs_additional` is disregarded."""
        both_options = {
            'repo': 'owner/repo',
            'exclude_dirs': ['only', 'these'],        # replace mode
            'exclude_dirs_additional': ['ignored'],   # expected to have no effect
        }
        subject = GitHubScraper(both_options)

        # Only the replace-mode names survive
        self.assertEqual(subject.excluded_dirs, {'only', 'these'})
        # Neither the additional entry nor a default leaks in
        for absent in ('ignored', 'venv'):
            self.assertNotIn(absent, subject.excluded_dirs)
|
||||
|
||||
|
||||
class TestExcludedDirsEdgeCases(unittest.TestCase):
    """Edge cases: duplicate entries and case sensitivity."""

    @staticmethod
    def _scraper(**options):
        """Return a scraper for 'owner/repo' with the given extra config keys."""
        return GitHubScraper({'repo': 'owner/repo', **options})

    @patch('skill_seekers.cli.github_scraper.Github')
    def test_duplicate_exclusions_in_additional(self, mock_github):
        """Repeated or already-default names do not inflate the set."""
        # 'venv' appears twice here and is also one of the defaults
        subject = self._scraper(exclude_dirs_additional=['venv', 'custom', 'venv'])

        for expected in ('venv', 'custom'):
            self.assertIn(expected, subject.excluded_dirs)

        # Only 'custom' is new, so the size grows by exactly one
        self.assertEqual(len(subject.excluded_dirs), len(EXCLUDED_DIRS) + 1)

    @patch('skill_seekers.cli.github_scraper.Github')
    def test_case_sensitive_exclusions(self, mock_github):
        """Names are matched exactly; a different case is not excluded."""
        subject = self._scraper(exclude_dirs=['Venv', 'NODE_MODULES'])

        # Exact spellings from the config are excluded
        for exact in ('Venv', 'NODE_MODULES'):
            self.assertTrue(subject.should_exclude_dir(exact))

        # Lower-case variants are different names and are kept
        for other_case in ('venv', 'node_modules'):
            self.assertFalse(subject.should_exclude_dir(other_case))
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Run the suite when this file is executed directly instead of via a test runner.
    unittest.main()
|
||||
Reference in New Issue
Block a user