feat(C2.1): Add .gitignore support to github_scraper for local repos

- Add pathspec import with graceful fallback
- Add gitignore_spec attribute to GitHubScraper class
- Implement _load_gitignore() method to parse .gitignore files
- Update should_exclude_dir() to check .gitignore rules
- Load .gitignore automatically in local repository mode
- Handle directory patterns with and without trailing slash
- Add 4 comprehensive tests for .gitignore functionality

Closes #63 - C2.1 File Tree Walker with .gitignore support complete

Features:
- Loads .gitignore from local repository root
- Respects .gitignore patterns for directory exclusion
- Falls back gracefully when pathspec not installed
- Works alongside existing hard-coded exclusions
- Only active in local_repo_path mode (not GitHub API mode)

Test coverage:
- test_load_gitignore_exists: .gitignore parsing
- test_load_gitignore_missing: Missing .gitignore handling
- test_should_exclude_dir_with_gitignore: .gitignore exclusion
- test_should_exclude_dir_default_exclusions: Existing exclusions still work

Integration:
- github_scraper.py now has same .gitignore support as codebase_scraper.py
- Both tools use pathspec library for consistent behavior
- Enables proper repository analysis respecting project .gitignore rules
This commit is contained in:
yusyus
2026-01-01 23:21:12 +03:00
parent a99f71e714
commit eac1f4ef8e
2 changed files with 144 additions and 0 deletions

View File

@@ -31,6 +31,13 @@ except ImportError:
print("Error: PyGithub not installed. Run: pip install PyGithub")
sys.exit(1)
# Optional dependency: pathspec provides .gitignore pattern matching.
# PATHSPEC_AVAILABLE records whether the import succeeded so that callers
# can disable .gitignore handling instead of failing at import time.
try:
    import pathspec
except ImportError:
    PATHSPEC_AVAILABLE = False
else:
    PATHSPEC_AVAILABLE = True
# Configure logging FIRST (before using logger)
logging.basicConfig(
level=logging.INFO,
@@ -191,6 +198,11 @@ class GitHubScraper:
)
logger.debug(f"Additional exclusions: {sorted(additional)}")
# Load .gitignore for additional exclusions (C2.1)
self.gitignore_spec = None
if self.local_repo_path:
self.gitignore_spec = self._load_gitignore()
# GitHub client setup (C1.1)
token = self._get_token()
self.github = Github(token) if token else Github()
@@ -484,8 +496,46 @@ class GitHubScraper:
if excluded in dir_path or dir_path.startswith(excluded):
return True
# Check .gitignore rules if available (C2.1)
if self.gitignore_spec and dir_path:
# For directories, we need to check both with and without trailing slash
# as .gitignore patterns can match either way
dir_path_with_slash = dir_path if dir_path.endswith('/') else dir_path + '/'
if self.gitignore_spec.match_file(dir_path) or self.gitignore_spec.match_file(dir_path_with_slash):
logger.debug(f"Directory excluded by .gitignore: {dir_path}")
return True
return False
def _load_gitignore(self) -> Optional['pathspec.PathSpec']:
    """
    Build a pathspec matcher from the local repository's .gitignore (C2.1).

    Returns:
        PathSpec compiled from the lines of ``<local_repo_path>/.gitignore``,
        or None when pathspec is not installed, no local repository path is
        set, the file does not exist, or reading/parsing it fails.
    """
    # Without pathspec there is nothing to parse with; warn and opt out.
    if not PATHSPEC_AVAILABLE:
        logger.warning("pathspec not installed - .gitignore support disabled")
        logger.warning("Install with: pip install pathspec")
        return None

    # Only a local repository path gives us a .gitignore to read.
    if not self.local_repo_path:
        return None

    ignore_file = Path(self.local_repo_path) / '.gitignore'
    if not ignore_file.exists():
        logger.debug(f"No .gitignore found in {self.local_repo_path}")
        return None

    # Best effort: any failure while reading or compiling the patterns is
    # logged and reported to the caller as "no matcher".
    try:
        with ignore_file.open('r', encoding='utf-8') as handle:
            matcher = pathspec.PathSpec.from_lines('gitwildmatch', handle)
        logger.info(f"Loaded .gitignore from {ignore_file}")
        return matcher
    except Exception as e:
        logger.warning(f"Failed to load .gitignore: {e}")
        return None
def _extract_file_tree(self):
"""Extract repository file tree structure (dual-mode: GitHub API or local filesystem)."""
logger.info("Building file tree...")