feat(C2.1): Add .gitignore support to github_scraper for local repos

- Add pathspec import with graceful fallback
- Add gitignore_spec attribute to GitHubScraper class
- Implement _load_gitignore() method to parse .gitignore files
- Update should_exclude_dir() to check .gitignore rules
- Load .gitignore automatically in local repository mode
- Handle directory patterns with and without trailing slash
- Add 4 comprehensive tests for .gitignore functionality

Closes #63 - C2.1 File Tree Walker with .gitignore support complete

Features:
- Loads .gitignore from local repository root
- Respects .gitignore patterns for directory exclusion
- Falls back gracefully when pathspec not installed
- Works alongside existing hard-coded exclusions
- Only active in local_repo_path mode (not GitHub API mode)

Test coverage:
- test_load_gitignore_exists: .gitignore parsing
- test_load_gitignore_missing: Missing .gitignore handling
- test_should_exclude_dir_with_gitignore: .gitignore exclusion
- test_should_exclude_dir_default_exclusions: Existing exclusions still work

Integration:
- github_scraper.py now has same .gitignore support as codebase_scraper.py
- Both tools use pathspec library for consistent behavior
- Enables proper repository analysis respecting project .gitignore rules
This commit is contained in:
yusyus
2026-01-01 23:21:12 +03:00
parent a99f71e714
commit eac1f4ef8e
2 changed files with 144 additions and 0 deletions

View File

@@ -31,6 +31,13 @@ except ImportError:
print("Error: PyGithub not installed. Run: pip install PyGithub")
sys.exit(1)
# Try to import pathspec for .gitignore support
try:
import pathspec
PATHSPEC_AVAILABLE = True
except ImportError:
PATHSPEC_AVAILABLE = False
# Configure logging FIRST (before using logger)
logging.basicConfig(
level=logging.INFO,
@@ -191,6 +198,11 @@ class GitHubScraper:
)
logger.debug(f"Additional exclusions: {sorted(additional)}")
# Load .gitignore for additional exclusions (C2.1)
self.gitignore_spec = None
if self.local_repo_path:
self.gitignore_spec = self._load_gitignore()
# GitHub client setup (C1.1)
token = self._get_token()
self.github = Github(token) if token else Github()
@@ -484,8 +496,46 @@ class GitHubScraper:
if excluded in dir_path or dir_path.startswith(excluded):
return True
# Check .gitignore rules if available (C2.1)
if self.gitignore_spec and dir_path:
# For directories, we need to check both with and without trailing slash
# as .gitignore patterns can match either way
dir_path_with_slash = dir_path if dir_path.endswith('/') else dir_path + '/'
if self.gitignore_spec.match_file(dir_path) or self.gitignore_spec.match_file(dir_path_with_slash):
logger.debug(f"Directory excluded by .gitignore: {dir_path}")
return True
return False
def _load_gitignore(self) -> Optional['pathspec.PathSpec']:
    """
    Build a pathspec matcher from the local repository's .gitignore (C2.1).

    Returns:
        A PathSpec compiled from the ``.gitignore`` located directly under
        ``self.local_repo_path``, or None when pathspec is not installed,
        no local repository path is set, the file does not exist, or
        reading/parsing it raises.
    """
    # Without pathspec there is nothing to parse with; tell the user how to
    # enable the feature and carry on without .gitignore support.
    if not PATHSPEC_AVAILABLE:
        for message in (
            "pathspec not installed - .gitignore support disabled",
            "Install with: pip install pathspec",
        ):
            logger.warning(message)
        return None

    repo_root = self.local_repo_path
    if not repo_root:
        return None

    ignore_file = Path(repo_root) / '.gitignore'
    if not ignore_file.exists():
        logger.debug(f"No .gitignore found in {repo_root}")
        return None

    # Best-effort load: any failure is logged and treated as "no .gitignore"
    # so the caller can continue with its other exclusion rules.
    try:
        with open(ignore_file, 'r', encoding='utf-8') as handle:
            matcher = pathspec.PathSpec.from_lines('gitwildmatch', handle)
        logger.info(f"Loaded .gitignore from {ignore_file}")
        return matcher
    except Exception as e:
        logger.warning(f"Failed to load .gitignore: {e}")
        return None
def _extract_file_tree(self):
"""Extract repository file tree structure (dual-mode: GitHub API or local filesystem)."""
logger.info("Building file tree...")

View File

@@ -962,6 +962,100 @@ class TestSymlinkHandling(unittest.TestCase):
self.assertIn('Major update', scraper.extracted_data['changelog'])
class TestGitignoreSupport(unittest.TestCase):
    """Test .gitignore support in github_scraper (C2.1)"""

    def setUp(self):
        """Create a temporary directory that stands in for a local repository."""
        if not PYGITHUB_AVAILABLE:
            self.skipTest("PyGithub not installed")
        from skill_seekers.cli.github_scraper import (
            PATHSPEC_AVAILABLE,
            GitHubScraper,
        )
        self.GitHubScraper = GitHubScraper
        # The scraper always defines ``gitignore_spec``; whether it can be
        # populated depends on pathspec being importable, so tests branch on
        # the module's own flag rather than on attribute presence.
        self.pathspec_available = PATHSPEC_AVAILABLE
        self.temp_dir = tempfile.mkdtemp()
        self.repo_path = Path(self.temp_dir)

    def tearDown(self):
        """Remove the temporary repository directory."""
        shutil.rmtree(self.temp_dir, ignore_errors=True)

    def _make_scraper(self):
        """Build a scraper for the temp repo with the GitHub client patched out."""
        config = {
            'repo': 'test/repo',
            'local_repo_path': str(self.repo_path)
        }
        with patch('skill_seekers.cli.github_scraper.Github'):
            return self.GitHubScraper(config)

    def test_load_gitignore_exists(self):
        """Test loading existing .gitignore file."""
        # Create .gitignore
        gitignore_path = self.repo_path / '.gitignore'
        gitignore_path.write_text('*.log\ntemp/\n__pycache__/')

        scraper = self._make_scraper()

        if self.pathspec_available:
            # pathspec is installed: the file must have been parsed
            self.assertIsNotNone(scraper.gitignore_spec)
        else:
            # pathspec not installed: support is disabled, spec stays None
            self.assertIsNone(scraper.gitignore_spec)

    def test_load_gitignore_missing(self):
        """Test behavior when no .gitignore exists."""
        scraper = self._make_scraper()

        # Should be None when no .gitignore found
        self.assertIsNone(scraper.gitignore_spec)

    def test_should_exclude_dir_with_gitignore(self):
        """Test directory exclusion with .gitignore rules."""
        if not self.pathspec_available:
            # Skip explicitly instead of passing without asserting anything.
            self.skipTest("pathspec not installed")

        # Create .gitignore
        gitignore_path = self.repo_path / '.gitignore'
        gitignore_path.write_text('temp/\nbuild/\n*.egg-info')

        scraper = self._make_scraper()
        self.assertIsNotNone(scraper.gitignore_spec)

        # Directories named by .gitignore patterns are excluded
        self.assertTrue(scraper.should_exclude_dir('temp', 'temp'))
        self.assertTrue(scraper.should_exclude_dir('build', 'build'))
        self.assertTrue(
            scraper.should_exclude_dir('pkg.egg-info', 'pkg.egg-info')
        )

        # Non-excluded dir should pass
        self.assertFalse(scraper.should_exclude_dir('src', 'src'))

    def test_should_exclude_dir_default_exclusions(self):
        """Test that default exclusions still work."""
        scraper = self._make_scraper()

        # Default exclusions should still work
        self.assertTrue(scraper.should_exclude_dir('node_modules'))
        self.assertTrue(scraper.should_exclude_dir('venv'))
        self.assertTrue(scraper.should_exclude_dir('__pycache__'))

        # Normal directories should not be excluded
        self.assertFalse(scraper.should_exclude_dir('src'))
        self.assertFalse(scraper.should_exclude_dir('tests'))
class TestErrorHandling(unittest.TestCase):
"""Test error handling and edge cases"""