This hotfix resolves 4 critical bugs reported by users:

Issue #258: install command fails with unified_scraper
- Added --fresh and --dry-run flags to unified_scraper.py
- Updated main.py to pass both flags to the unified scraper
- Fixed "unrecognized arguments" error

Issue #259 (Original): scrape command doesn't accept positional URL and --max-pages
- Added positional URL argument to scrape command
- Added --max-pages flag with safety warnings (>1000 pages, <10 pages)
- Updated doc_scraper.py and main.py argument parsers

Issue #259 (Comment A): Version shows 2.7.0 instead of actual version
- Fixed hardcoded version in main.py
- Now reads version dynamically from __init__.py

Issue #259 (Comment B): PDF command shows empty "Error: " message
- Improved exception handler in main.py to show the exception type if the message is empty
- Added proper error handling in pdf_scraper.py with context-specific messages
- Added traceback support in verbose mode

All fixes tested and verified with the exact commands from the issue reports.

Resolves: #258, #259

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -870,6 +870,16 @@ Examples:
|
||||
action="store_true",
|
||||
help="Skip C3.x codebase analysis for GitHub sources (default: enabled)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--fresh",
|
||||
action="store_true",
|
||||
help="Clear any existing data and start fresh (ignore checkpoints)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dry-run",
|
||||
action="store_true",
|
||||
help="Preview what will be scraped without actually scraping",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
@@ -885,6 +895,35 @@ Examples:
|
||||
f"⏭️ Skipping codebase analysis for GitHub source: {source.get('repo', 'unknown')}"
|
||||
)
|
||||
|
||||
# Handle --fresh flag (clear cache)
|
||||
if args.fresh:
|
||||
import shutil
|
||||
|
||||
if os.path.exists(scraper.cache_dir):
|
||||
logger.info(f"🧹 Clearing cache: {scraper.cache_dir}")
|
||||
shutil.rmtree(scraper.cache_dir)
|
||||
# Recreate directories
|
||||
os.makedirs(scraper.sources_dir, exist_ok=True)
|
||||
os.makedirs(scraper.data_dir, exist_ok=True)
|
||||
os.makedirs(scraper.repos_dir, exist_ok=True)
|
||||
os.makedirs(scraper.logs_dir, exist_ok=True)
|
||||
|
||||
# Handle --dry-run flag
|
||||
if args.dry_run:
|
||||
logger.info("🔍 DRY RUN MODE - Preview only, no scraping will occur")
|
||||
logger.info(f"\nWould scrape {len(scraper.config.get('sources', []))} sources:")
|
||||
for idx, source in enumerate(scraper.config.get("sources", []), 1):
|
||||
source_type = source.get("type", "unknown")
|
||||
if source_type == "documentation":
|
||||
logger.info(f" {idx}. Documentation: {source.get('base_url', 'N/A')}")
|
||||
elif source_type == "github":
|
||||
logger.info(f" {idx}. GitHub: {source.get('repo', 'N/A')}")
|
||||
elif source_type == "pdf":
|
||||
logger.info(f" {idx}. PDF: {source.get('pdf_path', 'N/A')}")
|
||||
logger.info(f"\nOutput directory: {scraper.output_dir}")
|
||||
logger.info(f"Merge mode: {scraper.merge_mode}")
|
||||
return
|
||||
|
||||
# Run scraper
|
||||
scraper.run()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user