Files
skill-seekers-reference/src/skill_seekers/cli/arguments/scrape.py
yusyus 57061b7daf style: Auto-format 48 files with ruff format
- Fixed formatting to comply with ruff standards
- No functional changes, only formatting/style
- Completes CI/CD pipeline formatting requirements
2026-02-15 20:24:32 +03:00

239 lines
7.0 KiB
Python

"""Scrape command argument definitions.
This module defines ALL arguments for the scrape command in ONE place.
Both doc_scraper.py (standalone) and parsers/scrape_parser.py (unified CLI)
import and use these definitions.
This ensures the parsers NEVER drift out of sync.
"""
import argparse
from typing import Any
from skill_seekers.cli.constants import DEFAULT_RATE_LIMIT
from .common import RAG_ARGUMENTS
# Scrape-specific argument definitions as data structure
# This enables introspection for UI generation and testing
# Single declarative table of every scrape-command argument.
# Each entry maps a definition key -> {"flags": tuple, "kwargs": dict},
# where "flags" and "kwargs" are passed verbatim to parser.add_argument().
# Keeping this as data (rather than imperative add_argument calls) enables
# introspection for UI generation and testing.
SCRAPE_ARGUMENTS: dict[str, dict[str, Any]] = {
    # Positional argument
    # NOTE: the definition key "url_positional" does NOT match the argparse
    # dest, which is "url" (derived from the positional flag name).
    "url_positional": {
        "flags": ("url",),
        "kwargs": {
            "nargs": "?",  # optional positional; --url below is the alternative
            "type": str,
            "help": "Base documentation URL (alternative to --url)",
        },
    },
    # Common arguments (also defined in common.py for other commands)
    "config": {
        "flags": ("--config", "-c"),
        "kwargs": {
            "type": str,
            "help": "Load configuration from JSON file (e.g., configs/react.json)",
            "metavar": "FILE",
        },
    },
    "name": {
        "flags": ("--name",),
        "kwargs": {
            "type": str,
            "help": "Skill name (used for output directory and filenames)",
            "metavar": "NAME",
        },
    },
    "description": {
        "flags": ("--description", "-d"),
        "kwargs": {
            "type": str,
            "help": "Skill description (used in SKILL.md)",
            "metavar": "TEXT",
        },
    },
    # Enhancement arguments
    "enhance_level": {
        "flags": ("--enhance-level",),
        "kwargs": {
            "type": int,
            "choices": [0, 1, 2, 3],  # 0=off .. 3=full, see help text
            "default": 2,
            "help": (
                "AI enhancement level (auto-detects API vs LOCAL mode): "
                "0=disabled, 1=SKILL.md only, 2=+architecture/config (default), 3=full enhancement. "
                "Mode selection: uses API if ANTHROPIC_API_KEY is set, otherwise LOCAL (Claude Code)"
            ),
            "metavar": "LEVEL",
        },
    },
    "api_key": {
        "flags": ("--api-key",),
        "kwargs": {
            "type": str,
            # NOTE(review): help mentions "--enhance", but only --enhance-level
            # is defined in this table — confirm --enhance exists in another parser.
            "help": "Anthropic API key for --enhance (or set ANTHROPIC_API_KEY env var)",
            "metavar": "KEY",
        },
    },
    # Scrape-specific options
    "interactive": {
        "flags": ("--interactive", "-i"),
        "kwargs": {
            "action": "store_true",
            "help": "Interactive configuration mode",
        },
    },
    "url": {
        "flags": ("--url",),
        "kwargs": {
            "type": str,
            "help": "Base documentation URL (alternative to positional URL)",
            "metavar": "URL",
        },
    },
    "max_pages": {
        "flags": ("--max-pages",),
        "kwargs": {
            "type": int,
            "metavar": "N",
            "help": "Maximum pages to scrape (overrides config). Use with caution - for testing/prototyping only.",
        },
    },
    "skip_scrape": {
        "flags": ("--skip-scrape",),
        "kwargs": {
            "action": "store_true",
            "help": "Skip scraping, use existing data",
        },
    },
    "dry_run": {
        "flags": ("--dry-run",),
        "kwargs": {
            "action": "store_true",
            "help": "Preview what will be scraped without actually scraping",
        },
    },
    # Checkpoint controls: --resume and --fresh are opposites; neither is
    # marked mutually exclusive here, so combined use is resolved downstream.
    "resume": {
        "flags": ("--resume",),
        "kwargs": {
            "action": "store_true",
            "help": "Resume from last checkpoint (for interrupted scrapes)",
        },
    },
    "fresh": {
        "flags": ("--fresh",),
        "kwargs": {
            "action": "store_true",
            "help": "Clear checkpoint and start fresh",
        },
    },
    "rate_limit": {
        "flags": ("--rate-limit", "-r"),
        "kwargs": {
            "type": float,
            "metavar": "SECONDS",
            # Default intentionally omitted so downstream code can tell
            # "not given" (None) apart from an explicit value.
            "help": f"Override rate limit in seconds (default: from config or {DEFAULT_RATE_LIMIT}). Use 0 for no delay.",
        },
    },
    "workers": {
        "flags": ("--workers", "-w"),
        "kwargs": {
            "type": int,
            "metavar": "N",
            "help": "Number of parallel workers for faster scraping (default: 1, max: 10)",
        },
    },
    "async_mode": {
        "flags": ("--async",),
        "kwargs": {
            # "async" is a Python keyword, so an explicit dest is required.
            "dest": "async_mode",
            "action": "store_true",
            "help": "Enable async mode for better parallel performance (2-3x faster than threads)",
        },
    },
    "no_rate_limit": {
        "flags": ("--no-rate-limit",),
        "kwargs": {
            "action": "store_true",
            "help": "Disable rate limiting completely (same as --rate-limit 0)",
        },
    },
    "interactive_enhancement": {
        "flags": ("--interactive-enhancement",),
        "kwargs": {
            "action": "store_true",
            # NOTE(review): help references --enhance-local, which is not defined
            # in this table — confirm it exists elsewhere (e.g. common parser).
            "help": "Open terminal window for enhancement (use with --enhance-local)",
        },
    },
    "verbose": {
        "flags": ("--verbose", "-v"),
        "kwargs": {
            "action": "store_true",
            "help": "Enable verbose output (DEBUG level logging)",
        },
    },
    "quiet": {
        "flags": ("--quiet", "-q"),
        "kwargs": {
            "action": "store_true",
            "help": "Minimize output (WARNING level logging only)",
        },
    },
    # RAG chunking options (imported from common.py - see RAG_ARGUMENTS)
    # Note: RAG arguments will be merged at runtime
    "no_preserve_code_blocks": {
        "flags": ("--no-preserve-code-blocks",),
        "kwargs": {
            "action": "store_true",
            "help": "Allow splitting code blocks across chunks (not recommended)",
        },
    },
    "no_preserve_paragraphs": {
        "flags": ("--no-preserve-paragraphs",),
        "kwargs": {
            "action": "store_true",
            "help": "Ignore paragraph boundaries when chunking (not recommended)",
        },
    },
}
# Merge RAG arguments from common.py.
# NOTE: dict.update() means a RAG_ARGUMENTS key that collides with one
# above would silently replace the local definition.
SCRAPE_ARGUMENTS.update(RAG_ARGUMENTS)
def add_scrape_arguments(parser: argparse.ArgumentParser) -> None:
    """Add all scrape command arguments to a parser.

    This is the SINGLE SOURCE OF TRUTH for scrape arguments.

    Used by:
        - doc_scraper.py (standalone scraper)
        - parsers/scrape_parser.py (unified CLI)

    Args:
        parser: The ArgumentParser to add arguments to.

    Example:
        >>> parser = argparse.ArgumentParser()
        >>> add_scrape_arguments(parser)  # Adds every registered scrape arg
    """
    # Iterate values only: the definition key is unused here, and iterating
    # .items() just to discard the key is a lint smell (ruff PERF102).
    # (The docstring no longer hardcodes an argument count — the table grows.)
    for arg_def in SCRAPE_ARGUMENTS.values():
        parser.add_argument(*arg_def["flags"], **arg_def["kwargs"])
def get_scrape_argument_names() -> set[str]:
    """Get the set of scrape argument definition keys.

    Note:
        These are the keys of ``SCRAPE_ARGUMENTS``, which usually — but not
        always — match the argparse ``dest`` names.  For example, the key
        ``"url_positional"`` corresponds to dest ``"url"``.  (The previous
        docstring claimed these were dest names, which was inaccurate.)
        Callers that need true dest names must derive them from each entry's
        flags/kwargs.

    Returns:
        Set of argument definition keys.
    """
    return set(SCRAPE_ARGUMENTS.keys())
def get_scrape_argument_count() -> int:
    """Return how many scrape arguments are currently registered.

    The count reflects ``SCRAPE_ARGUMENTS`` after the RAG arguments from
    ``common.py`` have been merged in at import time.

    Returns:
        Total number of argument definitions.
    """
    total = len(SCRAPE_ARGUMENTS)
    return total