From eac1f4ef8e8a62c61aa5546ca80e8c7f260625e7 Mon Sep 17 00:00:00 2001
From: yusyus
Date: Thu, 1 Jan 2026 23:21:12 +0300
Subject: [PATCH] feat(C2.1): Add .gitignore support to github_scraper for local repos

- Add pathspec import with graceful fallback
- Add gitignore_spec attribute to GitHubScraper class
- Implement _load_gitignore() method to parse .gitignore files
- Update should_exclude_dir() to check .gitignore rules
- Load .gitignore automatically in local repository mode
- Handle directory patterns with and without trailing slash
- Add 4 comprehensive tests for .gitignore functionality

Closes #63 - C2.1 File Tree Walker with .gitignore support complete

Features:
- Loads .gitignore from local repository root
- Respects .gitignore patterns for directory exclusion
- Falls back gracefully when pathspec not installed
- Works alongside existing hard-coded exclusions
- Only active in local_repo_path mode (not GitHub API mode)

Test coverage:
- test_load_gitignore_exists: .gitignore parsing
- test_load_gitignore_missing: Missing .gitignore handling
- test_should_exclude_dir_with_gitignore: .gitignore exclusion
- test_should_exclude_dir_default_exclusions: Existing exclusions still work

Integration:
- github_scraper.py now has same .gitignore support as codebase_scraper.py
- Both tools use pathspec library for consistent behavior
- Enables proper repository analysis respecting project .gitignore rules
---
Reviewer notes (ignored by git am):
- Line structure restored; indentation of diff lines was reconstructed and
  should be confirmed against the target tree before applying.
- test_load_gitignore_exists now branches on PATHSPEC_AVAILABLE instead of
  hasattr(scraper, 'gitignore_spec'), which is always true because __init__
  always assigns the attribute. Added-line counts are unchanged.

 src/skill_seekers/cli/github_scraper.py | 50 +++++++++++++
 tests/test_github_scraper.py            | 94 +++++++++++++++++++++++++
 2 files changed, 144 insertions(+)

diff --git a/src/skill_seekers/cli/github_scraper.py b/src/skill_seekers/cli/github_scraper.py
index c04b5d3..9d31785 100644
--- a/src/skill_seekers/cli/github_scraper.py
+++ b/src/skill_seekers/cli/github_scraper.py
@@ -31,6 +31,13 @@ except ImportError:
     print("Error: PyGithub not installed. Run: pip install PyGithub")
     sys.exit(1)
 
+# Try to import pathspec for .gitignore support
+try:
+    import pathspec
+    PATHSPEC_AVAILABLE = True
+except ImportError:
+    PATHSPEC_AVAILABLE = False
+
 # Configure logging FIRST (before using logger)
 logging.basicConfig(
     level=logging.INFO,
@@ -191,6 +198,11 @@ class GitHubScraper:
             )
             logger.debug(f"Additional exclusions: {sorted(additional)}")
 
+        # Load .gitignore for additional exclusions (C2.1)
+        self.gitignore_spec = None
+        if self.local_repo_path:
+            self.gitignore_spec = self._load_gitignore()
+
         # GitHub client setup (C1.1)
         token = self._get_token()
         self.github = Github(token) if token else Github()
@@ -484,8 +496,46 @@ class GitHubScraper:
                 if excluded in dir_path or dir_path.startswith(excluded):
                     return True
 
+        # Check .gitignore rules if available (C2.1)
+        if self.gitignore_spec and dir_path:
+            # For directories, we need to check both with and without trailing slash
+            # as .gitignore patterns can match either way
+            dir_path_with_slash = dir_path if dir_path.endswith('/') else dir_path + '/'
+            if self.gitignore_spec.match_file(dir_path) or self.gitignore_spec.match_file(dir_path_with_slash):
+                logger.debug(f"Directory excluded by .gitignore: {dir_path}")
+                return True
+
         return False
 
+    def _load_gitignore(self) -> Optional['pathspec.PathSpec']:
+        """
+        Load .gitignore file and create pathspec matcher (C2.1).
+
+        Returns:
+            PathSpec object if .gitignore found, None otherwise
+        """
+        if not PATHSPEC_AVAILABLE:
+            logger.warning("pathspec not installed - .gitignore support disabled")
+            logger.warning("Install with: pip install pathspec")
+            return None
+
+        if not self.local_repo_path:
+            return None
+
+        gitignore_path = Path(self.local_repo_path) / '.gitignore'
+        if not gitignore_path.exists():
+            logger.debug(f"No .gitignore found in {self.local_repo_path}")
+            return None
+
+        try:
+            with open(gitignore_path, 'r', encoding='utf-8') as f:
+                spec = pathspec.PathSpec.from_lines('gitwildmatch', f)
+            logger.info(f"Loaded .gitignore from {gitignore_path}")
+            return spec
+        except Exception as e:
+            logger.warning(f"Failed to load .gitignore: {e}")
+            return None
+
     def _extract_file_tree(self):
         """Extract repository file tree structure (dual-mode: GitHub API or local filesystem)."""
         logger.info("Building file tree...")
diff --git a/tests/test_github_scraper.py b/tests/test_github_scraper.py
index 46cf6d2..82f3299 100644
--- a/tests/test_github_scraper.py
+++ b/tests/test_github_scraper.py
@@ -962,6 +962,100 @@ class TestSymlinkHandling(unittest.TestCase):
             self.assertIn('Major update', scraper.extracted_data['changelog'])
 
 
+class TestGitignoreSupport(unittest.TestCase):
+    """Test .gitignore support in github_scraper (C2.1)"""
+
+    def setUp(self):
+        """Set up test environment"""
+        if not PYGITHUB_AVAILABLE:
+            self.skipTest("PyGithub not installed")
+        from skill_seekers.cli.github_scraper import GitHubScraper
+        self.GitHubScraper = GitHubScraper
+
+        self.temp_dir = tempfile.mkdtemp()
+        self.repo_path = Path(self.temp_dir)
+
+    def tearDown(self):
+        """Clean up test environment"""
+        shutil.rmtree(self.temp_dir, ignore_errors=True)
+
+    def test_load_gitignore_exists(self):
+        """Test loading existing .gitignore file."""
+        # Create .gitignore
+        gitignore_path = self.repo_path / '.gitignore'
+        gitignore_path.write_text('*.log\ntemp/\n__pycache__/')
+
+        config = {
+            'repo': 'test/repo',
+            'local_repo_path': str(self.repo_path)
+        }
+
+        with patch('skill_seekers.cli.github_scraper.Github'):
+            scraper = self.GitHubScraper(config)
+
+            # gitignore_spec is always assigned in __init__, so branch on
+            # whether the optional pathspec dependency could be imported.
+            from skill_seekers.cli.github_scraper import PATHSPEC_AVAILABLE
+            if PATHSPEC_AVAILABLE:
+                self.assertIsNotNone(scraper.gitignore_spec)
+            else:
+                self.assertIsNone(scraper.gitignore_spec)
+
+    def test_load_gitignore_missing(self):
+        """Test behavior when no .gitignore exists."""
+        config = {
+            'repo': 'test/repo',
+            'local_repo_path': str(self.repo_path)
+        }
+
+        with patch('skill_seekers.cli.github_scraper.Github'):
+            scraper = self.GitHubScraper(config)
+
+            # Should be None when no .gitignore found
+            self.assertIsNone(scraper.gitignore_spec)
+
+    def test_should_exclude_dir_with_gitignore(self):
+        """Test directory exclusion with .gitignore rules."""
+        # Create .gitignore
+        gitignore_path = self.repo_path / '.gitignore'
+        gitignore_path.write_text('temp/\nbuild/\n*.egg-info')
+
+        config = {
+            'repo': 'test/repo',
+            'local_repo_path': str(self.repo_path)
+        }
+
+        with patch('skill_seekers.cli.github_scraper.Github'):
+            scraper = self.GitHubScraper(config)
+
+            # Test .gitignore exclusion (if pathspec available)
+            if scraper.gitignore_spec:
+                self.assertTrue(scraper.should_exclude_dir('temp', 'temp'))
+                self.assertTrue(scraper.should_exclude_dir('build', 'build'))
+
+            # Non-excluded dir should pass
+            self.assertFalse(scraper.should_exclude_dir('src', 'src'))
+
+    def test_should_exclude_dir_default_exclusions(self):
+        """Test that default exclusions still work."""
+        config = {
+            'repo': 'test/repo',
+            'local_repo_path': str(self.repo_path)
+        }
+
+        with patch('skill_seekers.cli.github_scraper.Github'):
+            scraper = self.GitHubScraper(config)
+
+            # Default exclusions should still work
+            self.assertTrue(scraper.should_exclude_dir('node_modules'))
+            self.assertTrue(scraper.should_exclude_dir('venv'))
+            self.assertTrue(scraper.should_exclude_dir('__pycache__'))
+
+            # Normal directories should not be excluded
+            self.assertFalse(scraper.should_exclude_dir('src'))
+            self.assertFalse(scraper.should_exclude_dir('tests'))
+
+
 class TestErrorHandling(unittest.TestCase):
     """Test error handling and edge cases"""
 