feat: Add unlimited local repository analysis and fix 10 critical bugs

Features:
- Add local_repo_path config parameter for unlimited file analysis
- Auto-exclude virtual environments and build artifacts (95% noise reduction)
- Enable comprehensive codebase analysis (50 → 323 files, 546% increase)
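The virtual-environment/build-artifact exclusion described above can be sketched as a simple path filter. This is an illustrative sketch, not the commit's actual implementation; the directory names in EXCLUDED_DIRS are assumptions, and the real exclusion list may differ:

```python
from pathlib import Path

# Directory names commonly treated as environment/build noise.
# (Illustrative set; the actual list used by this commit may differ.)
EXCLUDED_DIRS = {".venv", "venv", "node_modules", "__pycache__", "build", "dist", ".git"}

def iter_source_files(repo_root: str):
    """Yield .py files under repo_root, skipping any file whose path
    contains an excluded directory component."""
    for path in Path(repo_root).rglob("*.py"):
        if not any(part in EXCLUDED_DIRS for part in path.parts):
            yield path
```

Filtering on path components (rather than substring matching) avoids false positives such as a legitimate package named "building".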

Bug Fixes:
- Fix logger initialization error (Issue #190)
- Fix NoneType subscriptable errors in release tag parsing (3 instances)
- Fix relative import paths causing ModuleNotFoundError
- Fix hardcoded 50-file analysis limit
- Fix GitHub API file tree limitation (140 → 345 files discovered)
- Fix AST parser 'not iterable' errors (95 → 0 parsing failures)
- Fix virtual environment file pollution (23,341 → 1,109 file tree items)
- Fix force_rescrape flag not checked before interactive prompt
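The NoneType-subscriptable fix in release tag parsing presumably comes down to guarding before indexing a possibly-None match result. A hypothetical sketch of the pattern (function name, tag format, and return shape are illustrative, not taken from the repo):

```python
import re
from typing import Optional, Tuple

def parse_release_version(tag: Optional[str]) -> Optional[Tuple[int, int, int]]:
    """Return (major, minor, patch) from a tag like 'v1.2.3', or None.

    Checking both the tag and the match object for None avoids the
    "'NoneType' object is not subscriptable" error that occurs when
    re.match() returns None and the code indexes it anyway.
    """
    if tag is None:
        return None
    m = re.match(r"v?(\d+)\.(\d+)\.(\d+)", tag)
    if m is None:
        return None
    return tuple(int(part) for part in m.groups())
```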

Impact:
- Code coverage: 14% → 93.6% (+79.6pp)
- Files analyzed: 50 → 323 (+546%)
- Classes extracted: 55 → 585 (+964%)
- Functions extracted: 512 → 2,784 (+444%)
- AST errors: 95 → 0 (-100%)

Tested on JMo Security repository with 345 Python files.
Author: Jimmy Moceri
Date:   2025-11-16 22:35:23 -05:00
Parent: 4cbd0a0a3c
Commit: 0b2a0d121e

5 changed files with 137 additions and 16 deletions


@@ -1650,10 +1650,22 @@ def execute_scraping_and_building(config: Dict[str, Any], args: argparse.Namespa
     exists, page_count = check_existing_data(config['name'])
     if exists and not args.skip_scrape and not args.fresh:
-        logger.info("\n✓ Found existing data: %d pages", page_count)
-        response = input("Use existing data? (y/n): ").strip().lower()
-        if response == 'y':
-            args.skip_scrape = True
+        # Check force_rescrape flag from config
+        if config.get('force_rescrape', False):
+            # Auto-delete cached data and rescrape
+            logger.info("\n✓ Found existing data: %d pages", page_count)
+            logger.info("  force_rescrape enabled - deleting cached data and rescraping")
+            import shutil
+            data_dir = f"output/{config['name']}_data"
+            if os.path.exists(data_dir):
+                shutil.rmtree(data_dir)
+                logger.info(f"  Deleted: {data_dir}")
+        else:
+            # Only prompt if force_rescrape is False
+            logger.info("\n✓ Found existing data: %d pages", page_count)
+            response = input("Use existing data? (y/n): ").strip().lower()
+            if response == 'y':
+                args.skip_scrape = True
     elif exists and args.fresh:
         logger.info("\n✓ Found existing data: %d pages", page_count)
         logger.info("  --fresh flag set, will re-scrape from scratch")