feat(C2.1): Add .gitignore support to github_scraper for local repos
- Add pathspec import with graceful fallback
- Add gitignore_spec attribute to GitHubScraper class
- Implement _load_gitignore() method to parse .gitignore files
- Update should_exclude_dir() to check .gitignore rules
- Load .gitignore automatically in local repository mode
- Handle directory patterns with and without trailing slash
- Add 4 comprehensive tests for .gitignore functionality

Closes #63 - C2.1 File Tree Walker with .gitignore support complete

Features:
- Loads .gitignore from local repository root
- Respects .gitignore patterns for directory exclusion
- Falls back gracefully when pathspec not installed
- Works alongside existing hard-coded exclusions
- Only active in local_repo_path mode (not GitHub API mode)

Test coverage:
- test_load_gitignore_exists: .gitignore parsing
- test_load_gitignore_missing: Missing .gitignore handling
- test_should_exclude_dir_with_gitignore: .gitignore exclusion
- test_should_exclude_dir_default_exclusions: Existing exclusions still work

Integration:
- github_scraper.py now has same .gitignore support as codebase_scraper.py
- Both tools use pathspec library for consistent behavior
- Enables proper repository analysis respecting project .gitignore rules
This commit is contained in:
@@ -31,6 +31,13 @@ except ImportError:
|
||||
print("Error: PyGithub not installed. Run: pip install PyGithub")
|
||||
sys.exit(1)
|
||||
|
||||
# Try to import pathspec for .gitignore support
|
||||
try:
|
||||
import pathspec
|
||||
PATHSPEC_AVAILABLE = True
|
||||
except ImportError:
|
||||
PATHSPEC_AVAILABLE = False
|
||||
|
||||
# Configure logging FIRST (before using logger)
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
@@ -191,6 +198,11 @@ class GitHubScraper:
|
||||
)
|
||||
logger.debug(f"Additional exclusions: {sorted(additional)}")
|
||||
|
||||
# Load .gitignore for additional exclusions (C2.1)
|
||||
self.gitignore_spec = None
|
||||
if self.local_repo_path:
|
||||
self.gitignore_spec = self._load_gitignore()
|
||||
|
||||
# GitHub client setup (C1.1)
|
||||
token = self._get_token()
|
||||
self.github = Github(token) if token else Github()
|
||||
@@ -484,8 +496,46 @@ class GitHubScraper:
|
||||
if excluded in dir_path or dir_path.startswith(excluded):
|
||||
return True
|
||||
|
||||
# Check .gitignore rules if available (C2.1)
|
||||
if self.gitignore_spec and dir_path:
|
||||
# For directories, we need to check both with and without trailing slash
|
||||
# as .gitignore patterns can match either way
|
||||
dir_path_with_slash = dir_path if dir_path.endswith('/') else dir_path + '/'
|
||||
if self.gitignore_spec.match_file(dir_path) or self.gitignore_spec.match_file(dir_path_with_slash):
|
||||
logger.debug(f"Directory excluded by .gitignore: {dir_path}")
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def _load_gitignore(self) -> Optional['pathspec.PathSpec']:
    """
    Build a pathspec matcher from the local repository's .gitignore (C2.1).

    Returns:
        A PathSpec compiled from the 'gitwildmatch' patterns found in
        ``<local_repo_path>/.gitignore``, or None when pathspec is not
        installed, no local repository path is set, the file does not
        exist, or reading/parsing it fails.
    """
    # Without pathspec there is nothing to build; tell the user how to enable it.
    if not PATHSPEC_AVAILABLE:
        for message in (
            "pathspec not installed - .gitignore support disabled",
            "Install with: pip install pathspec",
        ):
            logger.warning(message)
        return None

    # .gitignore is only looked up when a local repository path is set.
    repo_root = self.local_repo_path
    if not repo_root:
        return None

    ignore_file = Path(repo_root) / '.gitignore'
    if not ignore_file.exists():
        logger.debug(f"No .gitignore found in {repo_root}")
        return None

    try:
        with ignore_file.open('r', encoding='utf-8') as handle:
            matcher = pathspec.PathSpec.from_lines('gitwildmatch', handle)
        logger.info(f"Loaded .gitignore from {ignore_file}")
        return matcher
    except Exception as exc:
        # Best-effort: an unreadable or unparsable .gitignore only disables
        # the extra exclusions, it does not abort the caller.
        logger.warning(f"Failed to load .gitignore: {exc}")
        return None
|
||||
|
||||
def _extract_file_tree(self):
|
||||
"""Extract repository file tree structure (dual-mode: GitHub API or local filesystem)."""
|
||||
logger.info("Building file tree...")
|
||||
|
||||
Reference in New Issue
Block a user