feat: Make EXCLUDED_DIRS configurable for local repository analysis
Closes #203 Adds configuration options to customize directory exclusions during local repository analysis, while maintaining backward compatibility with smart defaults. **New Config Options:** 1. `exclude_dirs_additional` - Extend defaults (most common) - Adds custom directories to default exclusions - Example: ["proprietary", "legacy", "third_party"] - Total exclusions = defaults + additional 2. `exclude_dirs` - Replace defaults (advanced users) - Completely overrides default exclusions - Example: ["node_modules", ".git", "custom_vendor"] - Gives full control over exclusions **Implementation:** - Modified GitHubScraper.__init__() to parse exclude_dirs config - Changed should_exclude_dir() to use instance variable instead of global - Added logging for custom exclusions (INFO for extend, WARNING for replace) - Maintains backward compatibility (no config = use defaults) **Testing:** - Added 12 comprehensive tests in test_excluded_dirs_config.py - 3 tests for defaults (backward compatibility) - 3 tests for extend mode - 3 tests for replace mode - 1 test for precedence - 2 tests for edge cases - All 12 new tests passing ✅ - All 22 existing github_scraper tests passing ✅ **Documentation:** - Updated CLAUDE.md config parameters section - Added detailed "Configurable Directory Exclusions" feature section - Included examples for both modes - Listed common use cases (monorepos, enterprise, legacy codebases) **Use Cases:** - Monorepos with custom directory structures - Enterprise projects with non-standard naming conventions - Including unusual directories for analysis - Minimal exclusions for small/simple projects **Backward Compatibility:** ✅ Fully backward compatible - existing configs work unchanged ✅ Smart defaults maintained when no config provided ✅ All existing tests pass Co-authored-by: jimmy058910 <jimmy058910@users.noreply.github.com>
This commit is contained in:
39
CLAUDE.md
39
CLAUDE.md
@@ -437,12 +437,51 @@ Config files (`configs/*.json`) define scraping behavior:
|
|||||||
- `rate_limit`: Delay between requests (seconds)
|
- `rate_limit`: Delay between requests (seconds)
|
||||||
- `max_pages`: Maximum pages to scrape
|
- `max_pages`: Maximum pages to scrape
|
||||||
- `skip_llms_txt`: Skip llms.txt detection, force HTML scraping (default: false)
|
- `skip_llms_txt`: Skip llms.txt detection, force HTML scraping (default: false)
|
||||||
|
- `exclude_dirs_additional`: Add custom directories to default exclusions (for local repo analysis)
|
||||||
|
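- `exclude_dirs`: Replace default directory exclusions entirely (advanced, for local repo analysis); when both options are set, `exclude_dirs` takes precedence and `exclude_dirs_additional` is ignored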
|
||||||
|
|
||||||
## Key Features & Implementation
|
## Key Features & Implementation
|
||||||
|
|
||||||
### Auto-Detect Existing Data
|
### Auto-Detect Existing Data
|
||||||
Tool checks for `output/{name}_data/` and prompts to reuse, avoiding re-scraping (check_existing_data() in doc_scraper.py:653-660).
|
Tool checks for `output/{name}_data/` and prompts to reuse, avoiding re-scraping (check_existing_data() in doc_scraper.py:653-660).
|
||||||
|
|
||||||
|
### Configurable Directory Exclusions (Local Repository Analysis)
|
||||||
|
|
||||||
|
When using `local_repo_path` for unlimited local repository analysis, you can customize which directories to exclude from analysis.
|
||||||
|
|
||||||
|
**Smart Defaults:**
|
||||||
|
Automatically excludes common directories: `venv`, `node_modules`, `__pycache__`, `.git`, `build`, `dist`, `.pytest_cache`, `htmlcov`, `.tox`, `.mypy_cache`, etc.
|
||||||
|
|
||||||
|
**Extend Mode** (`exclude_dirs_additional`): Add custom exclusions to defaults
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"sources": [{
|
||||||
|
"type": "github",
|
||||||
|
"local_repo_path": "/path/to/repo",
|
||||||
|
"exclude_dirs_additional": ["proprietary", "legacy", "third_party"]
|
||||||
|
}]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
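**Replace Mode** (`exclude_dirs`): Override defaults entirely (advanced). Note: directories whose names start with `.` (e.g. `.git`, `.vscode`) are always excluded, even in replace mode or with an empty list.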
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"sources": [{
|
||||||
|
"type": "github",
|
||||||
|
"local_repo_path": "/path/to/repo",
|
||||||
|
"exclude_dirs": ["node_modules", ".git", "custom_vendor"]
|
||||||
|
}]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Use Cases:**
|
||||||
|
- Monorepos with custom directory structures
|
||||||
|
- Enterprise projects with non-standard naming
|
||||||
|
- Including unusual directories (e.g., analyzing venv code)
|
||||||
|
- Minimal exclusions for small/simple projects
|
||||||
|
|
||||||
|
See: `should_exclude_dir()` in github_scraper.py:304-306
|
||||||
|
|
||||||
### Language Detection
|
### Language Detection
|
||||||
Detects code languages from:
|
Detects code languages from:
|
||||||
1. CSS class attributes (`language-*`, `lang-*`)
|
1. CSS class attributes (`language-*`, `lang-*`)
|
||||||
|
|||||||
@@ -87,6 +87,28 @@ class GitHubScraper:
|
|||||||
self.local_repo_path = os.path.expanduser(self.local_repo_path)
|
self.local_repo_path = os.path.expanduser(self.local_repo_path)
|
||||||
logger.info(f"Local repository mode enabled: {self.local_repo_path}")
|
logger.info(f"Local repository mode enabled: {self.local_repo_path}")
|
||||||
|
|
||||||
|
# Configure directory exclusions (smart defaults + optional customization)
|
||||||
|
self.excluded_dirs = set(EXCLUDED_DIRS) # Start with smart defaults
|
||||||
|
|
||||||
|
# Option 1: Replace mode - Use only specified exclusions
|
||||||
|
if 'exclude_dirs' in config:
|
||||||
|
self.excluded_dirs = set(config['exclude_dirs'])
|
||||||
|
logger.warning(
|
||||||
|
f"Using custom directory exclusions ({len(self.excluded_dirs)} dirs) - "
|
||||||
|
"defaults overridden"
|
||||||
|
)
|
||||||
|
logger.debug(f"Custom exclusions: {sorted(self.excluded_dirs)}")
|
||||||
|
|
||||||
|
# Option 2: Extend mode - Add to default exclusions
|
||||||
|
elif 'exclude_dirs_additional' in config:
|
||||||
|
additional = set(config['exclude_dirs_additional'])
|
||||||
|
self.excluded_dirs = self.excluded_dirs.union(additional)
|
||||||
|
logger.info(
|
||||||
|
f"Added {len(additional)} custom directory exclusions "
|
||||||
|
f"(total: {len(self.excluded_dirs)})"
|
||||||
|
)
|
||||||
|
logger.debug(f"Additional exclusions: {sorted(additional)}")
|
||||||
|
|
||||||
# GitHub client setup (C1.1)
|
# GitHub client setup (C1.1)
|
||||||
token = self._get_token()
|
token = self._get_token()
|
||||||
self.github = Github(token) if token else Github()
|
self.github = Github(token) if token else Github()
|
||||||
@@ -281,7 +303,7 @@ class GitHubScraper:
|
|||||||
|
|
||||||
def should_exclude_dir(self, dir_name: str) -> bool:
|
def should_exclude_dir(self, dir_name: str) -> bool:
|
||||||
"""Check if directory should be excluded from analysis."""
|
"""Check if directory should be excluded from analysis."""
|
||||||
return dir_name in EXCLUDED_DIRS or dir_name.startswith('.')
|
return dir_name in self.excluded_dirs or dir_name.startswith('.')
|
||||||
|
|
||||||
def _extract_file_tree(self):
|
def _extract_file_tree(self):
|
||||||
"""Extract repository file tree structure (dual-mode: GitHub API or local filesystem)."""
|
"""Extract repository file tree structure (dual-mode: GitHub API or local filesystem)."""
|
||||||
|
|||||||
246
tests/test_excluded_dirs_config.py
Normal file
246
tests/test_excluded_dirs_config.py
Normal file
@@ -0,0 +1,246 @@
|
|||||||
|
"""Tests for configurable directory exclusions in GitHub scraper.
|
||||||
|
|
||||||
|
Tests Issue #203: Make EXCLUDED_DIRS configurable
|
||||||
|
"""
|
||||||
|
|
||||||
|
import unittest
|
||||||
|
from unittest.mock import patch, Mock
|
||||||
|
from skill_seekers.cli.github_scraper import GitHubScraper, EXCLUDED_DIRS
|
||||||
|
|
||||||
|
|
||||||
|
class TestExcludedDirsDefaults(unittest.TestCase):
|
||||||
|
"""Test default EXCLUDED_DIRS behavior (backward compatibility)."""
|
||||||
|
|
||||||
|
@patch('skill_seekers.cli.github_scraper.Github')
|
||||||
|
def test_defaults_when_no_config(self, mock_github):
|
||||||
|
"""Test that default exclusions are used when no config provided."""
|
||||||
|
config = {
|
||||||
|
'repo': 'owner/repo'
|
||||||
|
}
|
||||||
|
|
||||||
|
scraper = GitHubScraper(config)
|
||||||
|
|
||||||
|
# Should use default EXCLUDED_DIRS
|
||||||
|
self.assertEqual(scraper.excluded_dirs, EXCLUDED_DIRS)
|
||||||
|
|
||||||
|
@patch('skill_seekers.cli.github_scraper.Github')
|
||||||
|
def test_defaults_exclude_common_dirs(self, mock_github):
|
||||||
|
"""Test that default exclusions work correctly."""
|
||||||
|
config = {
|
||||||
|
'repo': 'owner/repo'
|
||||||
|
}
|
||||||
|
|
||||||
|
scraper = GitHubScraper(config)
|
||||||
|
|
||||||
|
# Test common directories are excluded
|
||||||
|
self.assertTrue(scraper.should_exclude_dir('venv'))
|
||||||
|
self.assertTrue(scraper.should_exclude_dir('node_modules'))
|
||||||
|
self.assertTrue(scraper.should_exclude_dir('__pycache__'))
|
||||||
|
self.assertTrue(scraper.should_exclude_dir('.git'))
|
||||||
|
self.assertTrue(scraper.should_exclude_dir('build'))
|
||||||
|
|
||||||
|
# Test normal directories are not excluded
|
||||||
|
self.assertFalse(scraper.should_exclude_dir('src'))
|
||||||
|
self.assertFalse(scraper.should_exclude_dir('tests'))
|
||||||
|
self.assertFalse(scraper.should_exclude_dir('docs'))
|
||||||
|
|
||||||
|
@patch('skill_seekers.cli.github_scraper.Github')
|
||||||
|
def test_dot_directories_always_excluded(self, mock_github):
|
||||||
|
"""Test that directories starting with '.' are always excluded."""
|
||||||
|
config = {
|
||||||
|
'repo': 'owner/repo'
|
||||||
|
}
|
||||||
|
|
||||||
|
scraper = GitHubScraper(config)
|
||||||
|
|
||||||
|
# Dot directories should be excluded (even if not in EXCLUDED_DIRS)
|
||||||
|
self.assertTrue(scraper.should_exclude_dir('.hidden'))
|
||||||
|
self.assertTrue(scraper.should_exclude_dir('.cache'))
|
||||||
|
self.assertTrue(scraper.should_exclude_dir('.vscode'))
|
||||||
|
|
||||||
|
|
||||||
|
class TestExcludedDirsAdditional(unittest.TestCase):
|
||||||
|
"""Test exclude_dirs_additional (extend mode)."""
|
||||||
|
|
||||||
|
@patch('skill_seekers.cli.github_scraper.Github')
|
||||||
|
def test_extend_with_additional_dirs(self, mock_github):
|
||||||
|
"""Test adding custom exclusions to defaults."""
|
||||||
|
config = {
|
||||||
|
'repo': 'owner/repo',
|
||||||
|
'exclude_dirs_additional': ['proprietary', 'vendor', 'third_party']
|
||||||
|
}
|
||||||
|
|
||||||
|
scraper = GitHubScraper(config)
|
||||||
|
|
||||||
|
# Should include both defaults and additional
|
||||||
|
self.assertIn('venv', scraper.excluded_dirs) # Default
|
||||||
|
self.assertIn('node_modules', scraper.excluded_dirs) # Default
|
||||||
|
self.assertIn('proprietary', scraper.excluded_dirs) # Additional
|
||||||
|
self.assertIn('vendor', scraper.excluded_dirs) # Additional
|
||||||
|
self.assertIn('third_party', scraper.excluded_dirs) # Additional
|
||||||
|
|
||||||
|
# Verify total count
|
||||||
|
self.assertEqual(
|
||||||
|
len(scraper.excluded_dirs),
|
||||||
|
len(EXCLUDED_DIRS) + 3
|
||||||
|
)
|
||||||
|
|
||||||
|
@patch('skill_seekers.cli.github_scraper.Github')
|
||||||
|
def test_extend_excludes_additional_dirs(self, mock_github):
|
||||||
|
"""Test that additional directories are actually excluded."""
|
||||||
|
config = {
|
||||||
|
'repo': 'owner/repo',
|
||||||
|
'exclude_dirs_additional': ['legacy', 'deprecated']
|
||||||
|
}
|
||||||
|
|
||||||
|
scraper = GitHubScraper(config)
|
||||||
|
|
||||||
|
# Additional dirs should be excluded
|
||||||
|
self.assertTrue(scraper.should_exclude_dir('legacy'))
|
||||||
|
self.assertTrue(scraper.should_exclude_dir('deprecated'))
|
||||||
|
|
||||||
|
# Default dirs still excluded
|
||||||
|
self.assertTrue(scraper.should_exclude_dir('venv'))
|
||||||
|
self.assertTrue(scraper.should_exclude_dir('node_modules'))
|
||||||
|
|
||||||
|
# Normal dirs not excluded
|
||||||
|
self.assertFalse(scraper.should_exclude_dir('src'))
|
||||||
|
|
||||||
|
@patch('skill_seekers.cli.github_scraper.Github')
|
||||||
|
def test_extend_with_empty_list(self, mock_github):
|
||||||
|
"""Test that empty additional list works correctly."""
|
||||||
|
config = {
|
||||||
|
'repo': 'owner/repo',
|
||||||
|
'exclude_dirs_additional': []
|
||||||
|
}
|
||||||
|
|
||||||
|
scraper = GitHubScraper(config)
|
||||||
|
|
||||||
|
# Should just have defaults
|
||||||
|
self.assertEqual(scraper.excluded_dirs, EXCLUDED_DIRS)
|
||||||
|
|
||||||
|
|
||||||
|
class TestExcludedDirsReplace(unittest.TestCase):
|
||||||
|
"""Test exclude_dirs (replace mode)."""
|
||||||
|
|
||||||
|
@patch('skill_seekers.cli.github_scraper.Github')
|
||||||
|
def test_replace_with_custom_list(self, mock_github):
|
||||||
|
"""Test replacing default exclusions entirely."""
|
||||||
|
config = {
|
||||||
|
'repo': 'owner/repo',
|
||||||
|
'exclude_dirs': ['node_modules', 'custom_vendor']
|
||||||
|
}
|
||||||
|
|
||||||
|
scraper = GitHubScraper(config)
|
||||||
|
|
||||||
|
# Should ONLY have specified dirs
|
||||||
|
self.assertEqual(scraper.excluded_dirs, {'node_modules', 'custom_vendor'})
|
||||||
|
self.assertEqual(len(scraper.excluded_dirs), 2)
|
||||||
|
|
||||||
|
@patch('skill_seekers.cli.github_scraper.Github')
|
||||||
|
def test_replace_excludes_only_specified_dirs(self, mock_github):
|
||||||
|
"""Test that only specified directories are excluded in replace mode."""
|
||||||
|
config = {
|
||||||
|
'repo': 'owner/repo',
|
||||||
|
'exclude_dirs': ['node_modules', '.git']
|
||||||
|
}
|
||||||
|
|
||||||
|
scraper = GitHubScraper(config)
|
||||||
|
|
||||||
|
# Specified dirs should be excluded
|
||||||
|
self.assertTrue(scraper.should_exclude_dir('node_modules'))
|
||||||
|
# Note: .git would be excluded anyway due to dot prefix
|
||||||
|
self.assertTrue(scraper.should_exclude_dir('.git'))
|
||||||
|
|
||||||
|
# Default dirs NOT in our list should NOT be excluded
|
||||||
|
self.assertFalse(scraper.should_exclude_dir('venv'))
|
||||||
|
self.assertFalse(scraper.should_exclude_dir('__pycache__'))
|
||||||
|
self.assertFalse(scraper.should_exclude_dir('build'))
|
||||||
|
|
||||||
|
# Normal dirs still not excluded
|
||||||
|
self.assertFalse(scraper.should_exclude_dir('src'))
|
||||||
|
|
||||||
|
@patch('skill_seekers.cli.github_scraper.Github')
|
||||||
|
def test_replace_with_empty_list(self, mock_github):
|
||||||
|
"""Test that empty replace list allows all directories (except dot-prefixed)."""
|
||||||
|
config = {
|
||||||
|
'repo': 'owner/repo',
|
||||||
|
'exclude_dirs': []
|
||||||
|
}
|
||||||
|
|
||||||
|
scraper = GitHubScraper(config)
|
||||||
|
|
||||||
|
# No explicit exclusions
|
||||||
|
self.assertEqual(scraper.excluded_dirs, set())
|
||||||
|
|
||||||
|
# Nothing explicitly excluded
|
||||||
|
self.assertFalse(scraper.should_exclude_dir('venv'))
|
||||||
|
self.assertFalse(scraper.should_exclude_dir('node_modules'))
|
||||||
|
self.assertFalse(scraper.should_exclude_dir('build'))
|
||||||
|
|
||||||
|
# But dot dirs still excluded (different logic)
|
||||||
|
self.assertTrue(scraper.should_exclude_dir('.git'))
|
||||||
|
self.assertTrue(scraper.should_exclude_dir('.hidden'))
|
||||||
|
|
||||||
|
|
||||||
|
class TestExcludedDirsPrecedence(unittest.TestCase):
|
||||||
|
"""Test precedence when both options provided."""
|
||||||
|
|
||||||
|
@patch('skill_seekers.cli.github_scraper.Github')
|
||||||
|
def test_replace_takes_precedence_over_additional(self, mock_github):
|
||||||
|
"""Test that exclude_dirs takes precedence over exclude_dirs_additional."""
|
||||||
|
config = {
|
||||||
|
'repo': 'owner/repo',
|
||||||
|
'exclude_dirs': ['only', 'these'], # Replace mode
|
||||||
|
'exclude_dirs_additional': ['ignored'] # Should be ignored
|
||||||
|
}
|
||||||
|
|
||||||
|
scraper = GitHubScraper(config)
|
||||||
|
|
||||||
|
# Should use replace mode (exclude_dirs), ignore additional
|
||||||
|
self.assertEqual(scraper.excluded_dirs, {'only', 'these'})
|
||||||
|
self.assertNotIn('ignored', scraper.excluded_dirs)
|
||||||
|
self.assertNotIn('venv', scraper.excluded_dirs) # Defaults also ignored
|
||||||
|
|
||||||
|
|
||||||
|
class TestExcludedDirsEdgeCases(unittest.TestCase):
|
||||||
|
"""Test edge cases and error handling."""
|
||||||
|
|
||||||
|
@patch('skill_seekers.cli.github_scraper.Github')
|
||||||
|
def test_duplicate_exclusions_in_additional(self, mock_github):
|
||||||
|
"""Test that duplicates in additional list are handled (set deduplication)."""
|
||||||
|
config = {
|
||||||
|
'repo': 'owner/repo',
|
||||||
|
'exclude_dirs_additional': ['venv', 'custom', 'venv'] # venv is duplicate (default + listed)
|
||||||
|
}
|
||||||
|
|
||||||
|
scraper = GitHubScraper(config)
|
||||||
|
|
||||||
|
# Should deduplicate automatically (using set)
|
||||||
|
self.assertIn('venv', scraper.excluded_dirs)
|
||||||
|
self.assertIn('custom', scraper.excluded_dirs)
|
||||||
|
# Count should account for deduplication
|
||||||
|
self.assertEqual(
|
||||||
|
len(scraper.excluded_dirs),
|
||||||
|
len(EXCLUDED_DIRS) + 1 # Only 'custom' is truly additional
|
||||||
|
)
|
||||||
|
|
||||||
|
@patch('skill_seekers.cli.github_scraper.Github')
|
||||||
|
def test_case_sensitive_exclusions(self, mock_github):
|
||||||
|
"""Test that exclusions are case-sensitive."""
|
||||||
|
config = {
|
||||||
|
'repo': 'owner/repo',
|
||||||
|
'exclude_dirs': ['Venv', 'NODE_MODULES']
|
||||||
|
}
|
||||||
|
|
||||||
|
scraper = GitHubScraper(config)
|
||||||
|
|
||||||
|
# Case-sensitive matching
|
||||||
|
self.assertTrue(scraper.should_exclude_dir('Venv'))
|
||||||
|
self.assertTrue(scraper.should_exclude_dir('NODE_MODULES'))
|
||||||
|
self.assertFalse(scraper.should_exclude_dir('venv')) # Different case
|
||||||
|
self.assertFalse(scraper.should_exclude_dir('node_modules')) # Different case
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
unittest.main()
|
||||||
Reference in New Issue
Block a user