This hotfix resolves 4 critical bugs reported by users:

Issue #258: install command fails with unified_scraper
- Added --fresh and --dry-run flags to unified_scraper.py
- Updated main.py to pass both flags to the unified scraper
- Fixed "unrecognized arguments" error

Issue #259 (Original): scrape command doesn't accept positional URL and --max-pages
- Added positional URL argument to scrape command
- Added --max-pages flag with safety warnings (>1000 pages, <10 pages)
- Updated doc_scraper.py and main.py argument parsers

Issue #259 (Comment A): Version shows 2.7.0 instead of actual version
- Fixed hardcoded version in main.py
- Now reads version dynamically from __init__.py

Issue #259 (Comment B): PDF command shows empty "Error: " message
- Improved exception handler in main.py to show the exception type if the message is empty
- Added proper error handling in pdf_scraper.py with context-specific messages
- Added traceback support in verbose mode

All fixes tested and verified with the exact commands from the issue reports.

Resolves: #258, #259

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -870,6 +870,16 @@ Examples:
|
||||
action="store_true",
|
||||
help="Skip C3.x codebase analysis for GitHub sources (default: enabled)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--fresh",
|
||||
action="store_true",
|
||||
help="Clear any existing data and start fresh (ignore checkpoints)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dry-run",
|
||||
action="store_true",
|
||||
help="Preview what will be scraped without actually scraping",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
@@ -885,6 +895,35 @@ Examples:
|
||||
f"⏭️ Skipping codebase analysis for GitHub source: {source.get('repo', 'unknown')}"
|
||||
)
|
||||
|
||||
# Handle --fresh flag (clear cache)
|
||||
if args.fresh:
|
||||
import shutil
|
||||
|
||||
if os.path.exists(scraper.cache_dir):
|
||||
logger.info(f"🧹 Clearing cache: {scraper.cache_dir}")
|
||||
shutil.rmtree(scraper.cache_dir)
|
||||
# Recreate directories
|
||||
os.makedirs(scraper.sources_dir, exist_ok=True)
|
||||
os.makedirs(scraper.data_dir, exist_ok=True)
|
||||
os.makedirs(scraper.repos_dir, exist_ok=True)
|
||||
os.makedirs(scraper.logs_dir, exist_ok=True)
|
||||
|
||||
# Handle --dry-run flag
|
||||
if args.dry_run:
|
||||
logger.info("🔍 DRY RUN MODE - Preview only, no scraping will occur")
|
||||
logger.info(f"\nWould scrape {len(scraper.config.get('sources', []))} sources:")
|
||||
for idx, source in enumerate(scraper.config.get("sources", []), 1):
|
||||
source_type = source.get("type", "unknown")
|
||||
if source_type == "documentation":
|
||||
logger.info(f" {idx}. Documentation: {source.get('base_url', 'N/A')}")
|
||||
elif source_type == "github":
|
||||
logger.info(f" {idx}. GitHub: {source.get('repo', 'N/A')}")
|
||||
elif source_type == "pdf":
|
||||
logger.info(f" {idx}. PDF: {source.get('pdf_path', 'N/A')}")
|
||||
logger.info(f"\nOutput directory: {scraper.output_dir}")
|
||||
logger.info(f"Merge mode: {scraper.merge_mode}")
|
||||
return
|
||||
|
||||
# Run scraper
|
||||
scraper.run()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user