This hotfix resolves 4 critical bugs reported by users: Issue #258: install command fails with unified_scraper - Added --fresh and --dry-run flags to unified_scraper.py - Updated main.py to pass both flags to unified scraper - Fixed "unrecognized arguments" error Issue #259 (Original): scrape command doesn't accept positional URL and --max-pages - Added positional URL argument to scrape command - Added --max-pages flag with safety warnings (>1000 pages, <10 pages) - Updated doc_scraper.py and main.py argument parsers Issue #259 (Comment A): Version shows 2.7.0 instead of actual version - Fixed hardcoded version in main.py - Now reads version dynamically from __init__.py Issue #259 (Comment B): PDF command shows empty "Error: " message - Improved exception handler in main.py to show exception type if message is empty - Added proper error handling in pdf_scraper.py with context-specific messages - Added traceback support in verbose mode All fixes tested and verified with exact commands from issue reports. Resolves: #258, #259 Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -1900,6 +1900,14 @@ def setup_argument_parser() -> argparse.ArgumentParser:
|
|||||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Positional URL argument (optional, for quick scraping)
|
||||||
|
parser.add_argument(
|
||||||
|
"url",
|
||||||
|
nargs="?",
|
||||||
|
type=str,
|
||||||
|
help="Base documentation URL (alternative to --url)",
|
||||||
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--interactive",
|
"--interactive",
|
||||||
"-i",
|
"-i",
|
||||||
@@ -1913,8 +1921,14 @@ def setup_argument_parser() -> argparse.ArgumentParser:
|
|||||||
help="Load configuration from file (e.g., configs/godot.json)",
|
help="Load configuration from file (e.g., configs/godot.json)",
|
||||||
)
|
)
|
||||||
parser.add_argument("--name", type=str, help="Skill name")
|
parser.add_argument("--name", type=str, help="Skill name")
|
||||||
parser.add_argument("--url", type=str, help="Base documentation URL")
|
parser.add_argument("--url", type=str, help="Base documentation URL (alternative to positional URL)")
|
||||||
parser.add_argument("--description", "-d", type=str, help="Skill description")
|
parser.add_argument("--description", "-d", type=str, help="Skill description")
|
||||||
|
parser.add_argument(
|
||||||
|
"--max-pages",
|
||||||
|
type=int,
|
||||||
|
metavar="N",
|
||||||
|
help="Maximum pages to scrape (overrides config). Use with caution - for testing/prototyping only.",
|
||||||
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--skip-scrape", action="store_true", help="Skip scraping, use existing data"
|
"--skip-scrape", action="store_true", help="Skip scraping, use existing data"
|
||||||
)
|
)
|
||||||
@@ -2012,16 +2026,20 @@ def get_configuration(args: argparse.Namespace) -> dict[str, Any]:
|
|||||||
>>> print(config['name'])
|
>>> print(config['name'])
|
||||||
react
|
react
|
||||||
"""
|
"""
|
||||||
|
# Handle URL from either positional argument or --url flag
|
||||||
|
# Positional 'url' takes precedence, then --url flag
|
||||||
|
effective_url = getattr(args, 'url', None)
|
||||||
|
|
||||||
# Get base configuration
|
# Get base configuration
|
||||||
if args.config:
|
if args.config:
|
||||||
config = load_config(args.config)
|
config = load_config(args.config)
|
||||||
elif args.interactive or not (args.name and args.url):
|
elif args.interactive or not (args.name and effective_url):
|
||||||
config = interactive_config()
|
config = interactive_config()
|
||||||
else:
|
else:
|
||||||
config = {
|
config = {
|
||||||
"name": args.name,
|
"name": args.name,
|
||||||
"description": args.description or f"Use when working with {args.name}",
|
"description": args.description or f"Use when working with {args.name}",
|
||||||
"base_url": args.url,
|
"base_url": effective_url,
|
||||||
"selectors": {
|
"selectors": {
|
||||||
"main_content": "div[role='main']",
|
"main_content": "div[role='main']",
|
||||||
"title": "title",
|
"title": "title",
|
||||||
@@ -2067,6 +2085,31 @@ def get_configuration(args: argparse.Namespace) -> dict[str, Any]:
|
|||||||
"⚠️ Async mode enabled but workers=1. Consider using --workers 4 for better performance"
|
"⚠️ Async mode enabled but workers=1. Consider using --workers 4 for better performance"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Apply CLI override for max_pages
|
||||||
|
if args.max_pages is not None:
|
||||||
|
old_max = config.get("max_pages", DEFAULT_MAX_PAGES)
|
||||||
|
config["max_pages"] = args.max_pages
|
||||||
|
|
||||||
|
# Warnings for --max-pages usage
|
||||||
|
if args.max_pages > 1000:
|
||||||
|
logger.warning(
|
||||||
|
"⚠️ --max-pages=%d is very high - scraping may take hours", args.max_pages
|
||||||
|
)
|
||||||
|
logger.warning(
|
||||||
|
" Recommendation: Use configs with reasonable limits for production"
|
||||||
|
)
|
||||||
|
elif args.max_pages < 10:
|
||||||
|
logger.warning(
|
||||||
|
"⚠️ --max-pages=%d is very low - may result in incomplete skill", args.max_pages
|
||||||
|
)
|
||||||
|
|
||||||
|
if old_max and old_max != args.max_pages:
|
||||||
|
logger.info(
|
||||||
|
"📊 Max pages override: %d → %d (from --max-pages flag)", old_max, args.max_pages
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
logger.info("📊 Max pages set to: %d (from --max-pages flag)", args.max_pages)
|
||||||
|
|
||||||
return config
|
return config
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -34,6 +34,8 @@ Examples:
|
|||||||
import argparse
|
import argparse
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
|
from skill_seekers.cli import __version__
|
||||||
|
|
||||||
|
|
||||||
def create_parser() -> argparse.ArgumentParser:
|
def create_parser() -> argparse.ArgumentParser:
|
||||||
"""Create the main argument parser with subcommands."""
|
"""Create the main argument parser with subcommands."""
|
||||||
@@ -63,7 +65,7 @@ For more information: https://github.com/yusufkaraaslan/Skill_Seekers
|
|||||||
""",
|
""",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument("--version", action="version", version="%(prog)s 2.7.0")
|
parser.add_argument("--version", action="version", version=f"%(prog)s {__version__}")
|
||||||
|
|
||||||
subparsers = parser.add_subparsers(
|
subparsers = parser.add_subparsers(
|
||||||
dest="command",
|
dest="command",
|
||||||
@@ -95,10 +97,11 @@ For more information: https://github.com/yusufkaraaslan/Skill_Seekers
|
|||||||
help="Scrape documentation website",
|
help="Scrape documentation website",
|
||||||
description="Scrape documentation website and generate skill",
|
description="Scrape documentation website and generate skill",
|
||||||
)
|
)
|
||||||
|
scrape_parser.add_argument("url", nargs="?", help="Documentation URL (positional argument)")
|
||||||
scrape_parser.add_argument("--config", help="Config JSON file")
|
scrape_parser.add_argument("--config", help="Config JSON file")
|
||||||
scrape_parser.add_argument("--name", help="Skill name")
|
scrape_parser.add_argument("--name", help="Skill name")
|
||||||
scrape_parser.add_argument("--url", help="Documentation URL")
|
|
||||||
scrape_parser.add_argument("--description", help="Skill description")
|
scrape_parser.add_argument("--description", help="Skill description")
|
||||||
|
scrape_parser.add_argument("--max-pages", type=int, dest="max_pages", help="Maximum pages to scrape (override config)")
|
||||||
scrape_parser.add_argument(
|
scrape_parser.add_argument(
|
||||||
"--skip-scrape", action="store_true", help="Skip scraping, use cached data"
|
"--skip-scrape", action="store_true", help="Skip scraping, use cached data"
|
||||||
)
|
)
|
||||||
@@ -154,6 +157,7 @@ For more information: https://github.com/yusufkaraaslan/Skill_Seekers
|
|||||||
)
|
)
|
||||||
unified_parser.add_argument("--config", required=True, help="Unified config JSON file")
|
unified_parser.add_argument("--config", required=True, help="Unified config JSON file")
|
||||||
unified_parser.add_argument("--merge-mode", help="Merge mode (rule-based, claude-enhanced)")
|
unified_parser.add_argument("--merge-mode", help="Merge mode (rule-based, claude-enhanced)")
|
||||||
|
unified_parser.add_argument("--fresh", action="store_true", help="Clear existing data and start fresh")
|
||||||
unified_parser.add_argument("--dry-run", action="store_true", help="Dry run mode")
|
unified_parser.add_argument("--dry-run", action="store_true", help="Dry run mode")
|
||||||
|
|
||||||
# === enhance subcommand ===
|
# === enhance subcommand ===
|
||||||
@@ -338,14 +342,17 @@ def main(argv: list[str] | None = None) -> int:
|
|||||||
|
|
||||||
# Convert args namespace to sys.argv format for doc_scraper
|
# Convert args namespace to sys.argv format for doc_scraper
|
||||||
sys.argv = ["doc_scraper.py"]
|
sys.argv = ["doc_scraper.py"]
|
||||||
|
# Add positional URL if provided (positional arg has priority)
|
||||||
|
if hasattr(args, 'url') and args.url:
|
||||||
|
sys.argv.append(args.url)
|
||||||
if args.config:
|
if args.config:
|
||||||
sys.argv.extend(["--config", args.config])
|
sys.argv.extend(["--config", args.config])
|
||||||
if args.name:
|
if args.name:
|
||||||
sys.argv.extend(["--name", args.name])
|
sys.argv.extend(["--name", args.name])
|
||||||
if args.url:
|
|
||||||
sys.argv.extend(["--url", args.url])
|
|
||||||
if args.description:
|
if args.description:
|
||||||
sys.argv.extend(["--description", args.description])
|
sys.argv.extend(["--description", args.description])
|
||||||
|
if hasattr(args, 'max_pages') and args.max_pages:
|
||||||
|
sys.argv.extend(["--max-pages", str(args.max_pages)])
|
||||||
if args.skip_scrape:
|
if args.skip_scrape:
|
||||||
sys.argv.append("--skip-scrape")
|
sys.argv.append("--skip-scrape")
|
||||||
if args.enhance:
|
if args.enhance:
|
||||||
@@ -406,6 +413,8 @@ def main(argv: list[str] | None = None) -> int:
|
|||||||
sys.argv = ["unified_scraper.py", "--config", args.config]
|
sys.argv = ["unified_scraper.py", "--config", args.config]
|
||||||
if args.merge_mode:
|
if args.merge_mode:
|
||||||
sys.argv.extend(["--merge-mode", args.merge_mode])
|
sys.argv.extend(["--merge-mode", args.merge_mode])
|
||||||
|
if args.fresh:
|
||||||
|
sys.argv.append("--fresh")
|
||||||
if args.dry_run:
|
if args.dry_run:
|
||||||
sys.argv.append("--dry-run")
|
sys.argv.append("--dry-run")
|
||||||
return unified_main() or 0
|
return unified_main() or 0
|
||||||
@@ -533,7 +542,15 @@ def main(argv: list[str] | None = None) -> int:
|
|||||||
print("\n\nInterrupted by user", file=sys.stderr)
|
print("\n\nInterrupted by user", file=sys.stderr)
|
||||||
return 130
|
return 130
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error: {e}", file=sys.stderr)
|
# Provide helpful error message
|
||||||
|
error_msg = str(e) if str(e) else f"{type(e).__name__} occurred"
|
||||||
|
print(f"Error: {error_msg}", file=sys.stderr)
|
||||||
|
|
||||||
|
# Show traceback in verbose mode (if -v flag exists in args)
|
||||||
|
import traceback
|
||||||
|
if hasattr(args, 'verbose') and getattr(args, 'verbose', False):
|
||||||
|
traceback.print_exc()
|
||||||
|
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -591,14 +591,26 @@ def main():
|
|||||||
}
|
}
|
||||||
|
|
||||||
# Create converter
|
# Create converter
|
||||||
converter = PDFToSkillConverter(config)
|
try:
|
||||||
|
converter = PDFToSkillConverter(config)
|
||||||
|
|
||||||
# Extract if needed
|
# Extract if needed
|
||||||
if config.get("pdf_path") and not converter.extract_pdf():
|
if config.get("pdf_path"):
|
||||||
|
if not converter.extract_pdf():
|
||||||
|
print("\n❌ PDF extraction failed - see error above", file=sys.stderr)
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
# Build skill
|
||||||
|
converter.build_skill()
|
||||||
|
|
||||||
|
except RuntimeError as e:
|
||||||
|
print(f"\n❌ Error: {e}", file=sys.stderr)
|
||||||
|
sys.exit(1)
|
||||||
|
except Exception as e:
|
||||||
|
print(f"\n❌ Unexpected error during PDF processing: {e}", file=sys.stderr)
|
||||||
|
import traceback
|
||||||
|
traceback.print_exc()
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
# Build skill
|
|
||||||
converter.build_skill()
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
@@ -870,6 +870,16 @@ Examples:
|
|||||||
action="store_true",
|
action="store_true",
|
||||||
help="Skip C3.x codebase analysis for GitHub sources (default: enabled)",
|
help="Skip C3.x codebase analysis for GitHub sources (default: enabled)",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--fresh",
|
||||||
|
action="store_true",
|
||||||
|
help="Clear any existing data and start fresh (ignore checkpoints)",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--dry-run",
|
||||||
|
action="store_true",
|
||||||
|
help="Preview what will be scraped without actually scraping",
|
||||||
|
)
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
@@ -885,6 +895,35 @@ Examples:
|
|||||||
f"⏭️ Skipping codebase analysis for GitHub source: {source.get('repo', 'unknown')}"
|
f"⏭️ Skipping codebase analysis for GitHub source: {source.get('repo', 'unknown')}"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Handle --fresh flag (clear cache)
|
||||||
|
if args.fresh:
|
||||||
|
import shutil
|
||||||
|
|
||||||
|
if os.path.exists(scraper.cache_dir):
|
||||||
|
logger.info(f"🧹 Clearing cache: {scraper.cache_dir}")
|
||||||
|
shutil.rmtree(scraper.cache_dir)
|
||||||
|
# Recreate directories
|
||||||
|
os.makedirs(scraper.sources_dir, exist_ok=True)
|
||||||
|
os.makedirs(scraper.data_dir, exist_ok=True)
|
||||||
|
os.makedirs(scraper.repos_dir, exist_ok=True)
|
||||||
|
os.makedirs(scraper.logs_dir, exist_ok=True)
|
||||||
|
|
||||||
|
# Handle --dry-run flag
|
||||||
|
if args.dry_run:
|
||||||
|
logger.info("🔍 DRY RUN MODE - Preview only, no scraping will occur")
|
||||||
|
logger.info(f"\nWould scrape {len(scraper.config.get('sources', []))} sources:")
|
||||||
|
for idx, source in enumerate(scraper.config.get("sources", []), 1):
|
||||||
|
source_type = source.get("type", "unknown")
|
||||||
|
if source_type == "documentation":
|
||||||
|
logger.info(f" {idx}. Documentation: {source.get('base_url', 'N/A')}")
|
||||||
|
elif source_type == "github":
|
||||||
|
logger.info(f" {idx}. GitHub: {source.get('repo', 'N/A')}")
|
||||||
|
elif source_type == "pdf":
|
||||||
|
logger.info(f" {idx}. PDF: {source.get('pdf_path', 'N/A')}")
|
||||||
|
logger.info(f"\nOutput directory: {scraper.output_dir}")
|
||||||
|
logger.info(f"Merge mode: {scraper.merge_mode}")
|
||||||
|
return
|
||||||
|
|
||||||
# Run scraper
|
# Run scraper
|
||||||
scraper.run()
|
scraper.run()
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user