fix: Fix local repo extraction limitations (code analyzer, exclusions, enhancement)
This commit fixes three critical limitations discovered during local repository skill extraction testing: **Fix 1: Code Analyzer Import Issue** - Changed unified_scraper.py to use absolute imports instead of relative imports - Fixed: `from github_scraper import` → `from skill_seekers.cli.github_scraper import` - Fixed: `from pdf_scraper import` → `from skill_seekers.cli.pdf_scraper import` - Result: CodeAnalyzer now available during extraction, deep analysis works **Fix 2: Unity Library Exclusions** - Updated should_exclude_dir() to accept and check full directory paths - Updated _extract_file_tree_local() to pass both dir name and full path - Added exclusion config passing from unified_scraper to github_scraper - Result: exclude_dirs_additional now works (297 files excluded in test) **Fix 3: AI Enhancement for Single Sources** - Changed read_reference_files() to use rglob() for recursive search - Now finds reference files in subdirectories (e.g., references/github/README.md) - Result: AI enhancement works with unified skills that have nested references **Test Results:** - Code Analyzer: ✅ Working (deep analysis running) - Unity Exclusions: ✅ Working (297 files excluded from 679) - AI Enhancement: ✅ Working (finds and reads nested references) **Files Changed:** - src/skill_seekers/cli/unified_scraper.py (Fix 1 & 2) - src/skill_seekers/cli/github_scraper.py (Fix 2) - src/skill_seekers/cli/utils.py (Fix 3) **Test Artifacts:** - configs/deck_deck_go_local.json (test configuration) - docs/LOCAL_REPO_TEST_RESULTS.md (comprehensive test report) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -301,9 +301,29 @@ class GitHubScraper:
|
||||
except GithubException as e:
|
||||
logger.warning(f"Could not fetch languages: {e}")
|
||||
|
||||
def should_exclude_dir(self, dir_name: str) -> bool:
|
||||
"""Check if directory should be excluded from analysis."""
|
||||
return dir_name in self.excluded_dirs or dir_name.startswith('.')
|
||||
def should_exclude_dir(self, dir_name: str, dir_path: str = None) -> bool:
    """
    Check if directory should be excluded from analysis.

    A directory is excluded when its name starts with '.', when its name
    is an entry of ``self.excluded_dirs``, or when ``dir_path`` contains an
    entry of ``self.excluded_dirs`` as a run of *whole* path components.

    Matching on component boundaries (rather than raw substrings) means an
    exclusion such as "bin" matches "src/bin" but not "src/cabinet".

    Args:
        dir_name: Directory name (e.g., "Examples & Extras")
        dir_path: Full relative path (e.g., "TextMesh Pro/Examples & Extras").
            Optional; when omitted or empty only the name checks apply.

    Returns:
        True if directory should be excluded
    """
    # Check directory name
    if dir_name in self.excluded_dirs or dir_name.startswith('.'):
        return True

    # Without a path there is nothing more to check.
    if not dir_path:
        return False

    # Normalise to forward slashes so exclusions written as "A/B" also match
    # paths built with a backslash separator, and wrap the path in slashes
    # so that every component is delimited on both sides.
    normalized_path = "/" + dir_path.replace("\\", "/").strip("/") + "/"

    for excluded in self.excluded_dirs:
        pattern = excluded.replace("\\", "/").strip("/")
        # An empty pattern would match every path; ignore it.
        if pattern and f"/{pattern}/" in normalized_path:
            return True

    return False
|
||||
|
||||
def _extract_file_tree(self):
|
||||
"""Extract repository file tree structure (dual-mode: GitHub API or local filesystem)."""
|
||||
@@ -322,16 +342,29 @@ class GitHubScraper:
|
||||
logger.error(f"Local repository path not found: {self.local_repo_path}")
|
||||
return
|
||||
|
||||
file_tree = []
|
||||
for root, dirs, files in os.walk(self.local_repo_path):
|
||||
# Exclude directories in-place to prevent os.walk from descending into them
|
||||
dirs[:] = [d for d in dirs if not self.should_exclude_dir(d)]
|
||||
# Log exclusions for debugging
|
||||
logger.info(f"Directory exclusions ({len(self.excluded_dirs)} total): {sorted(list(self.excluded_dirs)[:10])}")
|
||||
|
||||
# Calculate relative path from repo root
|
||||
file_tree = []
|
||||
excluded_count = 0
|
||||
for root, dirs, files in os.walk(self.local_repo_path):
|
||||
# Calculate relative path from repo root first (needed for exclusion checks)
|
||||
rel_root = os.path.relpath(root, self.local_repo_path)
|
||||
if rel_root == '.':
|
||||
rel_root = ''
|
||||
|
||||
# Exclude directories in-place to prevent os.walk from descending into them
|
||||
# Pass both dir name and full path for path-based exclusions
|
||||
filtered_dirs = []
|
||||
for d in dirs:
|
||||
dir_path = os.path.join(rel_root, d) if rel_root else d
|
||||
if self.should_exclude_dir(d, dir_path):
|
||||
excluded_count += 1
|
||||
logger.debug(f"Excluding directory: {dir_path}")
|
||||
else:
|
||||
filtered_dirs.append(d)
|
||||
dirs[:] = filtered_dirs
|
||||
|
||||
# Add directories
|
||||
for dir_name in dirs:
|
||||
dir_path = os.path.join(rel_root, dir_name) if rel_root else dir_name
|
||||
@@ -357,7 +390,7 @@ class GitHubScraper:
|
||||
})
|
||||
|
||||
self.extracted_data['file_tree'] = file_tree
|
||||
logger.info(f"File tree built (local mode): {len(file_tree)} items")
|
||||
logger.info(f"File tree built (local mode): {len(file_tree)} items ({excluded_count} directories excluded)")
|
||||
|
||||
def _extract_file_tree_github(self):
|
||||
"""Extract file tree from GitHub API (rate-limited)."""
|
||||
|
||||
Reference in New Issue
Block a user