fix: Fix local repo extraction limitations (code analyzer, exclusions, enhancement)

This commit fixes three critical limitations discovered during local repository skill extraction testing:

**Fix 1: Code Analyzer Import Issue**
- Changed unified_scraper.py to use absolute package imports instead of bare module-name imports (the `sys.path.insert` workarounds are removed)
- Fixed: `from github_scraper import` → `from skill_seekers.cli.github_scraper import`
- Fixed: `from pdf_scraper import` → `from skill_seekers.cli.pdf_scraper import`
- Result: CodeAnalyzer is now available during extraction, so deep analysis works

**Fix 2: Unity Library Exclusions**
- Updated should_exclude_dir() to accept and check full directory paths
- Updated _extract_file_tree_local() to pass both the directory name and its full relative path
- Added passing of the exclusion config from unified_scraper to github_scraper
- Result: exclude_dirs_additional now works (297 files excluded in test)

**Fix 3: AI Enhancement for Single Sources**
- Changed read_reference_files() to use rglob() for recursive search
- Now finds reference files in subdirectories (e.g., references/github/README.md)
- Result: AI enhancement works with unified skills that have nested references

**Test Results:**
- Code Analyzer: ✅ Working (deep analysis running)
- Unity Exclusions: ✅ Working (297 files excluded from 679)
- AI Enhancement: ✅ Working (finds and reads nested references)

**Files Changed:**
- src/skill_seekers/cli/unified_scraper.py (Fix 1 & 2)
- src/skill_seekers/cli/github_scraper.py (Fix 2)
- src/skill_seekers/cli/utils.py (Fix 3)

**Test Artifacts:**
- configs/deck_deck_go_local.json (test configuration)
- docs/LOCAL_REPO_TEST_RESULTS.md (comprehensive test report)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2025-12-21 22:24:38 +03:00
parent ae69c507a0
commit 65ded6c07c
5 changed files with 567 additions and 21 deletions
--- a/src/skill_seekers/cli/github_scraper.py
+++ b/src/skill_seekers/cli/github_scraper.py
@@ -301,9 +301,29 @@ class GitHubScraper:
        except GithubException as e:
            logger.warning(f"Could not fetch languages: {e}")

-    def should_exclude_dir(self, dir_name: str) -> bool:
-        """Check if directory should be excluded from analysis."""
-        return dir_name in self.excluded_dirs or dir_name.startswith('.')
+    def should_exclude_dir(self, dir_name: str, dir_path: str = None) -> bool:
+        """
+        Check if directory should be excluded from analysis.
+
+        Args:
+            dir_name: Directory name (e.g., "Examples & Extras")
+            dir_path: Full relative path (e.g., "TextMesh Pro/Examples & Extras")
+
+        Returns:
+            True if directory should be excluded
+        """
+        # Check directory name
+        if dir_name in self.excluded_dirs or dir_name.startswith('.'):
+            return True
+
+        # Check full path if provided (for nested exclusions like "TextMesh Pro/Examples & Extras")
+        if dir_path:
+            for excluded in self.excluded_dirs:
+                # Match if path contains the exclusion pattern
+                if excluded in dir_path or dir_path.startswith(excluded):
+                    return True
+
+        return False

    def _extract_file_tree(self):
        """Extract repository file tree structure (dual-mode: GitHub API or local filesystem)."""
@@ -322,16 +342,29 @@ class GitHubScraper:
            logger.error(f"Local repository path not found: {self.local_repo_path}")
            return

-        file_tree = []
-        for root, dirs, files in os.walk(self.local_repo_path):
-            # Exclude directories in-place to prevent os.walk from descending into them
-            dirs[:] = [d for d in dirs if not self.should_exclude_dir(d)]
+        # Log exclusions for debugging
+        logger.info(f"Directory exclusions ({len(self.excluded_dirs)} total): {sorted(list(self.excluded_dirs)[:10])}")

-            # Calculate relative path from repo root
+        file_tree = []
+        excluded_count = 0
+        for root, dirs, files in os.walk(self.local_repo_path):
+            # Calculate relative path from repo root first (needed for exclusion checks)
            rel_root = os.path.relpath(root, self.local_repo_path)
            if rel_root == '.':
                rel_root = ''

+            # Exclude directories in-place to prevent os.walk from descending into them
+            # Pass both dir name and full path for path-based exclusions
+            filtered_dirs = []
+            for d in dirs:
+                dir_path = os.path.join(rel_root, d) if rel_root else d
+                if self.should_exclude_dir(d, dir_path):
+                    excluded_count += 1
+                    logger.debug(f"Excluding directory: {dir_path}")
+                else:
+                    filtered_dirs.append(d)
+            dirs[:] = filtered_dirs
+
            # Add directories
            for dir_name in dirs:
                dir_path = os.path.join(rel_root, dir_name) if rel_root else dir_name
@@ -357,7 +390,7 @@ class GitHubScraper:
                })

        self.extracted_data['file_tree'] = file_tree
-        logger.info(f"File tree built (local mode): {len(file_tree)} items")
+        logger.info(f"File tree built (local mode): {len(file_tree)} items ({excluded_count} directories excluded)")

    def _extract_file_tree_github(self):
        """Extract file tree from GitHub API (rate-limited)."""
--- a/src/skill_seekers/cli/unified_scraper.py
+++ b/src/skill_seekers/cli/unified_scraper.py
@@ -23,10 +23,10 @@ from typing import Dict, List, Any, Optional

 # Import validators and scrapers
 try:
-    from config_validator import ConfigValidator, validate_config
-    from conflict_detector import ConflictDetector
-    from merge_sources import RuleBasedMerger, ClaudeEnhancedMerger
-    from unified_skill_builder import UnifiedSkillBuilder
+    from skill_seekers.cli.config_validator import ConfigValidator, validate_config
+    from skill_seekers.cli.conflict_detector import ConflictDetector
+    from skill_seekers.cli.merge_sources import RuleBasedMerger, ClaudeEnhancedMerger
+    from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder
 except ImportError as e:
    print(f"Error importing modules: {e}")
    print("Make sure you're running from the project root directory")
@@ -168,10 +168,8 @@ class UnifiedScraper:

    def _scrape_github(self, source: Dict[str, Any]):
        """Scrape GitHub repository."""
-        sys.path.insert(0, str(Path(__file__).parent))
-
        try:
-            from github_scraper import GitHubScraper
+            from skill_seekers.cli.github_scraper import GitHubScraper
        except ImportError:
            logger.error("github_scraper.py not found")
            return
@@ -191,6 +189,12 @@ class UnifiedScraper:
            'local_repo_path': source.get('local_repo_path')  # Pass local_repo_path from config
        }

+        # Pass directory exclusions if specified (optional)
+        if 'exclude_dirs' in source:
+            github_config['exclude_dirs'] = source['exclude_dirs']
+        if 'exclude_dirs_additional' in source:
+            github_config['exclude_dirs_additional'] = source['exclude_dirs_additional']
+
        # Scrape
        logger.info(f"Scraping GitHub repository: {source['repo']}")
        scraper = GitHubScraper(github_config)
@@ -210,10 +214,8 @@ class UnifiedScraper:

    def _scrape_pdf(self, source: Dict[str, Any]):
        """Scrape PDF document."""
-        sys.path.insert(0, str(Path(__file__).parent))
-
        try:
-            from pdf_scraper import PDFToSkillConverter
+            from skill_seekers.cli.pdf_scraper import PDFToSkillConverter
        except ImportError:
            logger.error("pdf_scraper.py not found")
            return
--- a/src/skill_seekers/cli/utils.py
+++ b/src/skill_seekers/cli/utils.py
@@ -203,7 +203,8 @@ def read_reference_files(skill_dir: Union[str, Path], max_chars: int = 100000, p
        return references

    total_chars = 0
-    for ref_file in sorted(references_dir.glob("*.md")):
+    # Search recursively for all .md files (including subdirectories like github/README.md)
+    for ref_file in sorted(references_dir.rglob("*.md")):
        if ref_file.name == "index.md":
            continue

@@ -213,7 +214,9 @@ def read_reference_files(skill_dir: Union[str, Path], max_chars: int = 100000, p
        if len(content) > preview_limit:
            content = content[:preview_limit] + "\n\n[Content truncated...]"

-        references[ref_file.name] = content
+        # Use relative path from references_dir as key for nested files
+        relative_path = ref_file.relative_to(references_dir)
+        references[str(relative_path)] = content
        total_chars += len(content)

        # Stop if we've read enough