Merge PR #195: Unlimited Local Repository Analysis + 10 Bug Fixes
Merges feature/unlimited-local-analysis-bug-fixes by @jimmy058910. This PR adds valuable local repository analysis capabilities that bypass GitHub API rate limits, plus 10 important bug fixes. Key features: - Local repository analysis via filesystem scanning - Bypasses GitHub API rate limits for unlimited analysis - EXCLUDED_DIRS constant for proper venv/cache exclusion - Bug fixes for logger initialization and imports All 22 GitHub scraper tests passing after merge. Co-authored-by: jimmy058910 <jimmy058910@users.noreply.github.com>
This commit is contained in:
20
CHANGELOG.md
20
CHANGELOG.md
@@ -8,7 +8,25 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|||||||
## [Unreleased]
|
## [Unreleased]
|
||||||
|
|
||||||
### Added
|
### Added
|
||||||
- (No unreleased changes yet)
|
- Unlimited local repository analysis via `local_repo_path` configuration parameter
|
||||||
|
- Auto-exclusion of virtual environments, build artifacts, and cache directories
|
||||||
|
- Support for analyzing repositories without GitHub API rate limits (50 → unlimited files)
|
||||||
|
|
||||||
|
### Fixed
|
||||||
|
- Fixed logger initialization error causing `AttributeError: 'NoneType' object has no attribute 'setLevel'` (Issue #190)
|
||||||
|
- Fixed 3 NoneType subscriptable errors in release tag parsing
|
||||||
|
- Fixed relative import paths causing `ModuleNotFoundError`
|
||||||
|
- Fixed hardcoded 50-file analysis limit preventing comprehensive code analysis
|
||||||
|
- Fixed GitHub API file tree limitation (140 → 345 files discovered)
|
||||||
|
- Fixed AST parser "not iterable" errors eliminating 100% of parsing failures (95 → 0 errors)
|
||||||
|
- Fixed virtual environment file pollution reducing file tree noise by 95%
|
||||||
|
- Fixed `force_rescrape` flag not checked before interactive prompt causing EOFError in CI/CD environments
|
||||||
|
|
||||||
|
### Improved
|
||||||
|
- Increased code analysis coverage from 14% to 93.6% (+79.6 percentage points)
|
||||||
|
- Improved file discovery from 140 to 345 files (+146%)
|
||||||
|
- Improved class extraction from 55 to 585 classes (+964%)
|
||||||
|
- Improved function extraction from 512 to 2,784 functions (+444%)
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
|||||||
@@ -117,8 +117,17 @@ class CodeAnalyzer:
|
|||||||
classes.append(asdict(class_sig))
|
classes.append(asdict(class_sig))
|
||||||
elif isinstance(node, ast.FunctionDef) or isinstance(node, ast.AsyncFunctionDef):
|
elif isinstance(node, ast.FunctionDef) or isinstance(node, ast.AsyncFunctionDef):
|
||||||
# Only top-level functions (not methods)
|
# Only top-level functions (not methods)
|
||||||
if not any(isinstance(parent, ast.ClassDef)
|
# Fix AST parser to check isinstance(parent.body, list) before 'in' operator
|
||||||
for parent in ast.walk(tree) if hasattr(parent, 'body') and node in parent.body):
|
is_method = False
|
||||||
|
try:
|
||||||
|
is_method = any(isinstance(parent, ast.ClassDef)
|
||||||
|
for parent in ast.walk(tree)
|
||||||
|
if hasattr(parent, 'body') and isinstance(parent.body, list) and node in parent.body)
|
||||||
|
except (TypeError, AttributeError):
|
||||||
|
# If body is not iterable or check fails, assume it's a top-level function
|
||||||
|
is_method = False
|
||||||
|
|
||||||
|
if not is_method:
|
||||||
func_sig = self._extract_python_function(node)
|
func_sig = self._extract_python_function(node)
|
||||||
functions.append(asdict(func_sig))
|
functions.append(asdict(func_sig))
|
||||||
|
|
||||||
|
|||||||
@@ -1652,10 +1652,22 @@ def execute_scraping_and_building(config: Dict[str, Any], args: argparse.Namespa
|
|||||||
exists, page_count = check_existing_data(config['name'])
|
exists, page_count = check_existing_data(config['name'])
|
||||||
|
|
||||||
if exists and not args.skip_scrape and not args.fresh:
|
if exists and not args.skip_scrape and not args.fresh:
|
||||||
logger.info("\n✓ Found existing data: %d pages", page_count)
|
# Check force_rescrape flag from config
|
||||||
response = input("Use existing data? (y/n): ").strip().lower()
|
if config.get('force_rescrape', False):
|
||||||
if response == 'y':
|
# Auto-delete cached data and rescrape
|
||||||
args.skip_scrape = True
|
logger.info("\n✓ Found existing data: %d pages", page_count)
|
||||||
|
logger.info("  force_rescrape enabled - deleting cached data and rescraping")
|
||||||
|
import shutil
|
||||||
|
data_dir = f"output/{config['name']}_data"
|
||||||
|
if os.path.exists(data_dir):
|
||||||
|
shutil.rmtree(data_dir)
|
||||||
|
logger.info(f" Deleted: {data_dir}")
|
||||||
|
else:
|
||||||
|
# Only prompt if force_rescrape is False
|
||||||
|
logger.info("\n✓ Found existing data: %d pages", page_count)
|
||||||
|
response = input("Use existing data? (y/n): ").strip().lower()
|
||||||
|
if response == 'y':
|
||||||
|
args.skip_scrape = True
|
||||||
elif exists and args.fresh:
|
elif exists and args.fresh:
|
||||||
logger.info("\n✓ Found existing data: %d pages", page_count)
|
logger.info("\n✓ Found existing data: %d pages", page_count)
|
||||||
logger.info(" --fresh flag set, will re-scrape from scratch")
|
logger.info(" --fresh flag set, will re-scrape from scratch")
|
||||||
|
|||||||
@@ -46,6 +46,17 @@ except ImportError:
|
|||||||
CODE_ANALYZER_AVAILABLE = False
|
CODE_ANALYZER_AVAILABLE = False
|
||||||
logger.warning("Code analyzer not available - deep analysis disabled")
|
logger.warning("Code analyzer not available - deep analysis disabled")
|
||||||
|
|
||||||
|
# Directories to exclude from local repository analysis
|
||||||
|
EXCLUDED_DIRS = {
|
||||||
|
'venv', 'env', '.venv', '.env', # Virtual environments
|
||||||
|
'node_modules', '__pycache__', '.pytest_cache', # Dependencies and caches
|
||||||
|
'.git', '.svn', '.hg', # Version control
|
||||||
|
'build', 'dist', '*.egg-info', # Build artifacts
|
||||||
|
'htmlcov', '.coverage', # Coverage reports
|
||||||
|
'.tox', '.nox', # Testing environments
|
||||||
|
'.mypy_cache', '.ruff_cache', # Linter caches
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
class GitHubScraper:
|
class GitHubScraper:
|
||||||
"""
|
"""
|
||||||
@@ -63,13 +74,19 @@ class GitHubScraper:
|
|||||||
- Releases
|
- Releases
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, config: Dict[str, Any]):
|
def __init__(self, config: Dict[str, Any], local_repo_path: Optional[str] = None):
|
||||||
"""Initialize GitHub scraper with configuration."""
|
"""Initialize GitHub scraper with configuration."""
|
||||||
self.config = config
|
self.config = config
|
||||||
self.repo_name = config['repo']
|
self.repo_name = config['repo']
|
||||||
self.name = config.get('name', self.repo_name.split('/')[-1])
|
self.name = config.get('name', self.repo_name.split('/')[-1])
|
||||||
self.description = config.get('description', f'Skill for {self.repo_name}')
|
self.description = config.get('description', f'Skill for {self.repo_name}')
|
||||||
|
|
||||||
|
# Local repository path (optional - enables unlimited analysis)
|
||||||
|
self.local_repo_path = local_repo_path or config.get('local_repo_path')
|
||||||
|
if self.local_repo_path:
|
||||||
|
self.local_repo_path = os.path.expanduser(self.local_repo_path)
|
||||||
|
logger.info(f"Local repository mode enabled: {self.local_repo_path}")
|
||||||
|
|
||||||
# GitHub client setup (C1.1)
|
# GitHub client setup (C1.1)
|
||||||
token = self._get_token()
|
token = self._get_token()
|
||||||
self.github = Github(token) if token else Github()
|
self.github = Github(token) if token else Github()
|
||||||
@@ -262,10 +279,66 @@ class GitHubScraper:
|
|||||||
except GithubException as e:
|
except GithubException as e:
|
||||||
logger.warning(f"Could not fetch languages: {e}")
|
logger.warning(f"Could not fetch languages: {e}")
|
||||||
|
|
||||||
|
def should_exclude_dir(self, dir_name: str) -> bool:
    """Return True if *dir_name* should be skipped during local analysis.

    A directory is excluded when it is listed in ``EXCLUDED_DIRS``, when it
    is hidden (leading dot, e.g. ``.git``, ``.idea``), or when it is Python
    packaging metadata (``<name>.egg-info``).

    NOTE: ``EXCLUDED_DIRS`` contains the literal entry ``'*.egg-info'``,
    which is a glob pattern - a plain set-membership test can never match a
    real directory name like ``mypkg.egg-info``.  The explicit
    ``endswith`` check below implements the intended exclusion.
    """
    return (
        dir_name in EXCLUDED_DIRS
        or dir_name.startswith('.')
        or dir_name.endswith('.egg-info')  # '*.egg-info' entry is a glob, not a name
    )
|
||||||
|
|
||||||
def _extract_file_tree(self):
    """Extract repository file tree structure (dual-mode: GitHub API or local filesystem)."""
    logger.info("Building file tree...")

    if not self.local_repo_path:
        # GitHub API mode - limited by API rate limits
        self._extract_file_tree_github()
        return

    # Local filesystem mode - unlimited files
    self._extract_file_tree_local()
||||||
|
|
||||||
|
def _extract_file_tree_local(self):
    """Extract file tree from the local filesystem (unlimited files).

    Walks ``self.local_repo_path``, pruning excluded directories, and stores
    a list of ``{'path', 'type', 'size'}`` dicts in
    ``self.extracted_data['file_tree']``.  Paths are recorded relative to
    the repository root and normalized to forward slashes, so local-mode
    entries look identical to GitHub-API-mode entries regardless of the
    host OS (os.path.join uses backslashes on Windows).
    """
    if not os.path.exists(self.local_repo_path):
        logger.error(f"Local repository path not found: {self.local_repo_path}")
        return

    file_tree = []

    for root, dirs, files in os.walk(self.local_repo_path):
        # Prune in place so os.walk never descends into excluded directories.
        dirs[:] = [d for d in dirs if not self.should_exclude_dir(d)]

        # Path of this directory relative to the repo root ('' at the root).
        rel_root = os.path.relpath(root, self.local_repo_path)
        if rel_root == '.':
            rel_root = ''

        # Add directories.
        for dir_name in dirs:
            dir_path = os.path.join(rel_root, dir_name) if rel_root else dir_name
            file_tree.append({
                'path': dir_path.replace(os.sep, '/'),  # match GitHub-style '/' paths
                'type': 'dir',
                'size': None,
            })

        # Add files.
        for file_name in files:
            file_path = os.path.join(rel_root, file_name) if rel_root else file_name
            full_path = os.path.join(root, file_name)
            try:
                file_size = os.path.getsize(full_path)
            except OSError:
                # Broken symlink or permission problem - record the file anyway.
                file_size = None

            file_tree.append({
                'path': file_path.replace(os.sep, '/'),  # match GitHub-style '/' paths
                'type': 'file',
                'size': file_size,
            })

    self.extracted_data['file_tree'] = file_tree
    logger.info(f"File tree built (local mode): {len(file_tree)} items")
||||||
|
|
||||||
|
def _extract_file_tree_github(self):
|
||||||
|
"""Extract file tree from GitHub API (rate-limited)."""
|
||||||
try:
|
try:
|
||||||
contents = self.repo.get_contents("")
|
contents = self.repo.get_contents("")
|
||||||
file_tree = []
|
file_tree = []
|
||||||
@@ -284,7 +357,7 @@ class GitHubScraper:
|
|||||||
contents.extend(self.repo.get_contents(file_content.path))
|
contents.extend(self.repo.get_contents(file_content.path))
|
||||||
|
|
||||||
self.extracted_data['file_tree'] = file_tree
|
self.extracted_data['file_tree'] = file_tree
|
||||||
logger.info(f"File tree built: {len(file_tree)} items")
|
logger.info(f"File tree built (GitHub API mode): {len(file_tree)} items")
|
||||||
|
|
||||||
except GithubException as e:
|
except GithubException as e:
|
||||||
logger.warning(f"Could not build file tree: {e}")
|
logger.warning(f"Could not build file tree: {e}")
|
||||||
@@ -351,8 +424,16 @@ class GitHubScraper:
|
|||||||
|
|
||||||
# Analyze this file
|
# Analyze this file
|
||||||
try:
|
try:
|
||||||
file_content = self.repo.get_contents(file_path)
|
# Read file content based on mode
|
||||||
content = file_content.decoded_content.decode('utf-8')
|
if self.local_repo_path:
|
||||||
|
# Local mode - read from filesystem
|
||||||
|
full_path = os.path.join(self.local_repo_path, file_path)
|
||||||
|
with open(full_path, 'r', encoding='utf-8') as f:
|
||||||
|
content = f.read()
|
||||||
|
else:
|
||||||
|
# GitHub API mode - fetch from API
|
||||||
|
file_content = self.repo.get_contents(file_path)
|
||||||
|
content = file_content.decoded_content.decode('utf-8')
|
||||||
|
|
||||||
analysis_result = self.code_analyzer.analyze_file(
|
analysis_result = self.code_analyzer.analyze_file(
|
||||||
file_path,
|
file_path,
|
||||||
@@ -375,9 +456,9 @@ class GitHubScraper:
|
|||||||
logger.debug(f"Could not analyze {file_path}: {e}")
|
logger.debug(f"Could not analyze {file_path}: {e}")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Limit number of files analyzed to avoid rate limits
|
# Limit number of files analyzed to avoid rate limits (GitHub API mode only)
|
||||||
if len(analyzed_files) >= 50:
|
if not self.local_repo_path and len(analyzed_files) >= 50:
|
||||||
logger.info(f"Reached analysis limit (50 files)")
|
logger.info(f"Reached analysis limit (50 files, GitHub API mode)")
|
||||||
break
|
break
|
||||||
|
|
||||||
self.extracted_data['code_analysis'] = {
|
self.extracted_data['code_analysis'] = {
|
||||||
|
|||||||
@@ -187,7 +187,8 @@ class UnifiedScraper:
|
|||||||
'include_releases': source.get('include_releases', True),
|
'include_releases': source.get('include_releases', True),
|
||||||
'include_code': source.get('include_code', True),
|
'include_code': source.get('include_code', True),
|
||||||
'code_analysis_depth': source.get('code_analysis_depth', 'surface'),
|
'code_analysis_depth': source.get('code_analysis_depth', 'surface'),
|
||||||
'file_patterns': source.get('file_patterns', [])
|
'file_patterns': source.get('file_patterns', []),
|
||||||
|
'local_repo_path': source.get('local_repo_path') # Pass local_repo_path from config
|
||||||
}
|
}
|
||||||
|
|
||||||
# Scrape
|
# Scrape
|
||||||
|
|||||||
Reference in New Issue
Block a user