From cc76efa29af417eb39853bdab78feb016c4d7f55 Mon Sep 17 00:00:00 2001
From: yusyus
Date: Wed, 21 Jan 2026 23:22:03 +0300
Subject: [PATCH] fix: Critical CLI bug fixes for issues #258 and #259

This hotfix resolves 4 critical bugs reported by users:

Issue #258: install command fails with unified_scraper
- Added --fresh and --dry-run flags to unified_scraper.py
- Updated main.py to pass both flags to unified scraper
- Fixed "unrecognized arguments" error

Issue #259 (Original): scrape command doesn't accept positional URL and --max-pages
- Added positional URL argument to scrape command
- Added --max-pages flag with safety warnings (>1000 pages, <10 pages)
- Updated doc_scraper.py and main.py argument parsers

Issue #259 (Comment A): Version shows 2.7.0 instead of actual version
- Fixed hardcoded version in main.py
- Now reads version dynamically from __init__.py

Issue #259 (Comment B): PDF command shows empty "Error: " message
- Improved exception handler in main.py to show exception type if message is empty
- Added proper error handling in pdf_scraper.py with context-specific messages
- Added traceback support in verbose mode

All fixes tested and verified with exact commands from issue reports.
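
For reference, the fixed invocations have the following shape. The
skill-seekers entry point name, the example URL, and the skill name below
are illustrative placeholders (not taken from the issue reports); substitute
your installed console script and real values:

    skill-seekers --version
    skill-seekers scrape https://docs.example.com --name example-docs --max-pages 50
    skill-seekers scrape --config configs/godot.json

The unified subcommand likewise now accepts --fresh and --dry-run and
forwards them to unified_scraper.py.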

Resolves: #258, #259

Co-Authored-By: Claude Sonnet 4.5
---
 src/skill_seekers/cli/doc_scraper.py     | 49 ++++++++++++++++++++++--
 src/skill_seekers/cli/main.py            | 27 ++++++++++---
 src/skill_seekers/cli/pdf_scraper.py     | 24 +++++++++---
 src/skill_seekers/cli/unified_scraper.py | 39 +++++++++++++++++++
 4 files changed, 125 insertions(+), 14 deletions(-)

diff --git a/src/skill_seekers/cli/doc_scraper.py b/src/skill_seekers/cli/doc_scraper.py
index e0cc036..3ac94a0 100755
--- a/src/skill_seekers/cli/doc_scraper.py
+++ b/src/skill_seekers/cli/doc_scraper.py
@@ -1900,6 +1900,14 @@ def setup_argument_parser() -> argparse.ArgumentParser:
         formatter_class=argparse.RawDescriptionHelpFormatter,
     )
 
+    # Positional URL argument (optional, for quick scraping)
+    parser.add_argument(
+        "url",
+        nargs="?",
+        type=str,
+        help="Base documentation URL (alternative to --url)",
+    )
+
     parser.add_argument(
         "--interactive",
         "-i",
@@ -1913,8 +1921,14 @@ def setup_argument_parser() -> argparse.ArgumentParser:
         help="Load configuration from file (e.g., configs/godot.json)",
     )
     parser.add_argument("--name", type=str, help="Skill name")
-    parser.add_argument("--url", type=str, help="Base documentation URL")
+    parser.add_argument("--url", type=str, help="Base documentation URL (alternative to positional URL)")
     parser.add_argument("--description", "-d", type=str, help="Skill description")
+    parser.add_argument(
+        "--max-pages",
+        type=int,
+        metavar="N",
+        help="Maximum pages to scrape (overrides config). Use with caution - for testing/prototyping only.",
+    )
     parser.add_argument(
         "--skip-scrape", action="store_true", help="Skip scraping, use existing data"
     )
@@ -2012,16 +2026,20 @@ def get_configuration(args: argparse.Namespace) -> dict[str, Any]:
         >>> print(config['name'])
         react
     """
+    # Handle URL from either positional argument or --url flag
+    # Positional 'url' takes precedence, then --url flag
+    effective_url = getattr(args, 'url', None)
+
     # Get base configuration
     if args.config:
         config = load_config(args.config)
-    elif args.interactive or not (args.name and args.url):
+    elif args.interactive or not (args.name and effective_url):
         config = interactive_config()
     else:
         config = {
             "name": args.name,
             "description": args.description or f"Use when working with {args.name}",
-            "base_url": args.url,
+            "base_url": effective_url,
             "selectors": {
                 "main_content": "div[role='main']",
                 "title": "title",
@@ -2067,6 +2085,31 @@ def get_configuration(args: argparse.Namespace) -> dict[str, Any]:
             "⚠️ Async mode enabled but workers=1. Consider using --workers 4 for better performance"
         )
 
+    # Apply CLI override for max_pages
+    if args.max_pages is not None:
+        old_max = config.get("max_pages", DEFAULT_MAX_PAGES)
+        config["max_pages"] = args.max_pages
+
+        # Warnings for --max-pages usage
+        if args.max_pages > 1000:
+            logger.warning(
+                "⚠️ --max-pages=%d is very high - scraping may take hours", args.max_pages
+            )
+            logger.warning(
+                " Recommendation: Use configs with reasonable limits for production"
+            )
+        elif args.max_pages < 10:
+            logger.warning(
+                "⚠️ --max-pages=%d is very low - may result in incomplete skill", args.max_pages
+            )
+
+        if old_max and old_max != args.max_pages:
+            logger.info(
+                "📊 Max pages override: %d → %d (from --max-pages flag)", old_max, args.max_pages
+            )
+        else:
+            logger.info("📊 Max pages set to: %d (from --max-pages flag)", args.max_pages)
+
     return config
 
 
diff --git a/src/skill_seekers/cli/main.py b/src/skill_seekers/cli/main.py
index 4b9870d..cc5d64b 100644
--- a/src/skill_seekers/cli/main.py
+++ b/src/skill_seekers/cli/main.py
@@ -34,6 +34,8 @@ Examples:
 import argparse
 import sys
 
+from skill_seekers.cli import __version__
+
 
 def create_parser() -> argparse.ArgumentParser:
     """Create the main argument parser with subcommands."""
@@ -63,7 +65,7 @@ For more information: https://github.com/yusufkaraaslan/Skill_Seekers
 """,
     )
 
-    parser.add_argument("--version", action="version", version="%(prog)s 2.7.0")
+    parser.add_argument("--version", action="version", version=f"%(prog)s {__version__}")
 
     subparsers = parser.add_subparsers(
         dest="command",
@@ -95,10 +97,11 @@ For more information: https://github.com/yusufkaraaslan/Skill_Seekers
         help="Scrape documentation website",
         description="Scrape documentation website and generate skill",
     )
+    scrape_parser.add_argument("url", nargs="?", help="Documentation URL (positional argument)")
     scrape_parser.add_argument("--config", help="Config JSON file")
     scrape_parser.add_argument("--name", help="Skill name")
-    scrape_parser.add_argument("--url", help="Documentation URL")
     scrape_parser.add_argument("--description", help="Skill description")
+    scrape_parser.add_argument("--max-pages", type=int, dest="max_pages", help="Maximum pages to scrape (override config)")
    scrape_parser.add_argument(
         "--skip-scrape", action="store_true", help="Skip scraping, use cached data"
     )
@@ -154,6 +157,7 @@ For more information: https://github.com/yusufkaraaslan/Skill_Seekers
     )
     unified_parser.add_argument("--config", required=True, help="Unified config JSON file")
     unified_parser.add_argument("--merge-mode", help="Merge mode (rule-based, claude-enhanced)")
+    unified_parser.add_argument("--fresh", action="store_true", help="Clear existing data and start fresh")
     unified_parser.add_argument("--dry-run", action="store_true", help="Dry run mode")
 
     # === enhance subcommand ===
@@ -338,14 +342,17 @@ def main(argv: list[str] | None = None) -> int:
 
         # Convert args namespace to sys.argv format for doc_scraper
         sys.argv = ["doc_scraper.py"]
+        # Add positional URL if provided (positional arg has priority)
+        if hasattr(args, 'url') and args.url:
+            sys.argv.append(args.url)
         if args.config:
             sys.argv.extend(["--config", args.config])
         if args.name:
             sys.argv.extend(["--name", args.name])
-        if args.url:
-            sys.argv.extend(["--url", args.url])
         if args.description:
             sys.argv.extend(["--description", args.description])
+        if hasattr(args, 'max_pages') and args.max_pages:
+            sys.argv.extend(["--max-pages", str(args.max_pages)])
         if args.skip_scrape:
             sys.argv.append("--skip-scrape")
         if args.enhance:
@@ -406,6 +413,8 @@ def main(argv: list[str] | None = None) -> int:
         sys.argv = ["unified_scraper.py", "--config", args.config]
         if args.merge_mode:
             sys.argv.extend(["--merge-mode", args.merge_mode])
+        if args.fresh:
+            sys.argv.append("--fresh")
         if args.dry_run:
             sys.argv.append("--dry-run")
         return unified_main() or 0
@@ -533,7 +542,15 @@ def main(argv: list[str] | None = None) -> int:
         print("\n\nInterrupted by user", file=sys.stderr)
         return 130
     except Exception as e:
-        print(f"Error: {e}", file=sys.stderr)
+        # Provide helpful error message
+        error_msg = str(e) if str(e) else f"{type(e).__name__} occurred"
+        print(f"Error: {error_msg}", file=sys.stderr)
+
+        # Show traceback in verbose mode (if -v flag exists in args)
+        import traceback
+        if hasattr(args, 'verbose') and getattr(args, 'verbose', False):
+            traceback.print_exc()
+
         return 1
 
 
diff --git a/src/skill_seekers/cli/pdf_scraper.py b/src/skill_seekers/cli/pdf_scraper.py
index c439276..137c156 100644
--- a/src/skill_seekers/cli/pdf_scraper.py
+++ b/src/skill_seekers/cli/pdf_scraper.py
@@ -591,14 +591,26 @@ def main():
     }
 
     # Create converter
-    converter = PDFToSkillConverter(config)
+    try:
+        converter = PDFToSkillConverter(config)
 
-    # Extract if needed
-    if config.get("pdf_path") and not converter.extract_pdf():
+        # Extract if needed
+        if config.get("pdf_path"):
+            if not converter.extract_pdf():
+                print("\n❌ PDF extraction failed - see error above", file=sys.stderr)
+                sys.exit(1)
+
+        # Build skill
+        converter.build_skill()
+
+    except RuntimeError as e:
+        print(f"\n❌ Error: {e}", file=sys.stderr)
+        sys.exit(1)
+    except Exception as e:
+        print(f"\n❌ Unexpected error during PDF processing: {e}", file=sys.stderr)
+        import traceback
+        traceback.print_exc()
         sys.exit(1)
-
-    # Build skill
-    converter.build_skill()
 
 
 if __name__ == "__main__":
diff --git a/src/skill_seekers/cli/unified_scraper.py b/src/skill_seekers/cli/unified_scraper.py
index cc27811..8772041 100644
--- a/src/skill_seekers/cli/unified_scraper.py
+++ b/src/skill_seekers/cli/unified_scraper.py
@@ -870,6 +870,16 @@ Examples:
         action="store_true",
         help="Skip C3.x codebase analysis for GitHub sources (default: enabled)",
     )
+    parser.add_argument(
+        "--fresh",
+        action="store_true",
+        help="Clear any existing data and start fresh (ignore checkpoints)",
+    )
+    parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="Preview what will be scraped without actually scraping",
+    )
 
     args = parser.parse_args()
 
@@ -885,6 +895,35 @@ Examples:
                 f"⏭️ Skipping codebase analysis for GitHub source: {source.get('repo', 'unknown')}"
             )
 
+    # Handle --fresh flag (clear cache)
+    if args.fresh:
+        import shutil
+
+        if os.path.exists(scraper.cache_dir):
+            logger.info(f"🧹 Clearing cache: {scraper.cache_dir}")
+            shutil.rmtree(scraper.cache_dir)
+            # Recreate directories
+            os.makedirs(scraper.sources_dir, exist_ok=True)
+            os.makedirs(scraper.data_dir, exist_ok=True)
+            os.makedirs(scraper.repos_dir, exist_ok=True)
+            os.makedirs(scraper.logs_dir, exist_ok=True)
+
+    # Handle --dry-run flag
+    if args.dry_run:
+        logger.info("🔍 DRY RUN MODE - Preview only, no scraping will occur")
+        logger.info(f"\nWould scrape {len(scraper.config.get('sources', []))} sources:")
+        for idx, source in enumerate(scraper.config.get("sources", []), 1):
+            source_type = source.get("type", "unknown")
+            if source_type == "documentation":
+                logger.info(f" {idx}. Documentation: {source.get('base_url', 'N/A')}")
+            elif source_type == "github":
+                logger.info(f" {idx}. GitHub: {source.get('repo', 'N/A')}")
+            elif source_type == "pdf":
+                logger.info(f" {idx}. PDF: {source.get('pdf_path', 'N/A')}")
+        logger.info(f"\nOutput directory: {scraper.output_dir}")
+        logger.info(f"Merge mode: {scraper.merge_mode}")
+        return
+
     # Run scraper
     scraper.run()