Implements ROADMAP task B2 — full .docx scraping support via mammoth + python-docx, producing SKILL.md + references/ output identical to other source types.

New files:
- src/skill_seekers/cli/word_scraper.py — WordToSkillConverter class + main() entry point (~600 lines); mammoth → BeautifulSoup pipeline; handles headings, code detection (incl. monospace <p><br> blocks), tables, images, metadata extraction
- src/skill_seekers/cli/arguments/word.py — add_word_arguments() + WORD_ARGUMENTS dict
- src/skill_seekers/cli/parsers/word_parser.py — WordParser for unified CLI parser registry
- tests/test_word_scraper.py — comprehensive test suite (~300 lines)

Modified files:
- src/skill_seekers/cli/main.py — registered "word" command module
- src/skill_seekers/cli/source_detector.py — .docx auto-detection + _detect_word() classmethod
- src/skill_seekers/cli/create_command.py — _route_word() + --help-word
- src/skill_seekers/cli/arguments/create.py — WORD_ARGUMENTS + routing
- src/skill_seekers/cli/arguments/__init__.py — export word args
- src/skill_seekers/cli/parsers/__init__.py — register WordParser
- src/skill_seekers/cli/unified_scraper.py — _scrape_word() integration
- src/skill_seekers/cli/pdf_scraper.py — fix: real enhancement instead of stub; remove [:3] reference file limit; capture run_workflows return
- src/skill_seekers/cli/github_scraper.py — fix: remove arbitrary open_issues[:20] / closed_issues[:10] reference file limits
- pyproject.toml — skill-seekers-word entry point + docx optional dep
- tests/test_cli_parsers.py — update parser count 21→22

Bug fixes applied during real-world testing:
- Code detection: detect monospace <p><br> blocks as code (mammoth renders Courier paragraphs this way, not as <pre>/<code>)
- Language detector: fix wrong method name detect_from_text → detect_from_code
- Description inference: pass None from main() so extract_docx() can infer description from Word document subject/title metadata
- Bullet-point guard: exclude prose starting with •/-/* from code scoring
- Enhancement: implement real API/LOCAL enhancement (was stub)
- pip install message: add quotes around skill-seekers[docx]

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
572 lines
17 KiB
Python
572 lines
17 KiB
Python
"""Create command unified argument definitions.

Organizes arguments into three tiers:
1. Universal Arguments - Work for ALL sources (web, github, local, pdf, word, config)
2. Source-Specific Arguments - Only relevant for specific sources
3. Advanced Arguments - Rarely used, hidden from default help

This enables progressive disclosure in help text while maintaining
100% backward compatibility with existing commands.
"""
|
|
|
|
import argparse
|
|
from typing import Any
|
|
|
|
from skill_seekers.cli.constants import DEFAULT_RATE_LIMIT
|
|
from .common import RAG_ARGUMENTS
|
|
|
|
# =============================================================================
|
|
# TIER 1: UNIVERSAL ARGUMENTS (19 flags)
|
|
# =============================================================================
|
|
# These arguments work for ALL source types
|
|
# Includes: 11 core + 4 workflow + 4 RAG (merged from common.py)
|
|
|
|
UNIVERSAL_ARGUMENTS: dict[str, dict[str, Any]] = {
    # Each row below is (argument name, option strings, add_argument keywords);
    # it is expanded into the {"flags": ..., "kwargs": ...} entry shape.
    arg_name: {"flags": option_strings, "kwargs": options}
    for arg_name, option_strings, options in (
        # Identity arguments
        (
            "name",
            ("--name",),
            {
                "type": str,
                "help": "Skill name (default: auto-detected from source)",
                "metavar": "NAME",
            },
        ),
        (
            "description",
            ("--description", "-d"),
            {
                "type": str,
                "help": "Skill description (used in SKILL.md)",
                "metavar": "TEXT",
            },
        ),
        (
            "output",
            ("--output", "-o"),
            {
                "type": str,
                "help": "Output directory (default: auto-generated from name)",
                "metavar": "DIR",
            },
        ),
        # Enhancement arguments
        (
            "enhance_level",
            ("--enhance-level",),
            {
                "type": int,
                "choices": [0, 1, 2, 3],
                "default": 2,
                "help": (
                    "AI enhancement level (auto-detects API vs LOCAL mode): "
                    "0=disabled, 1=SKILL.md only, 2=+architecture/config (default), 3=full enhancement. "
                    "Mode selection: uses API if ANTHROPIC_API_KEY is set, otherwise LOCAL (Claude Code)"
                ),
                "metavar": "LEVEL",
            },
        ),
        (
            "api_key",
            ("--api-key",),
            {
                "type": str,
                "help": "Anthropic API key (or set ANTHROPIC_API_KEY env var)",
                "metavar": "KEY",
            },
        ),
        # Behavior arguments
        (
            "dry_run",
            ("--dry-run",),
            {
                "action": "store_true",
                "help": "Preview what will be created without actually creating it",
            },
        ),
        (
            "verbose",
            ("--verbose", "-v"),
            {
                "action": "store_true",
                "help": "Enable verbose output (DEBUG level logging)",
            },
        ),
        (
            "quiet",
            ("--quiet", "-q"),
            {
                "action": "store_true",
                "help": "Minimize output (WARNING level only)",
            },
        ),
        # RAG features are not listed here: they come from common.py
        # (RAG_ARGUMENTS) and are merged into this table after it is built.
        # Preset system
        (
            "preset",
            ("--preset", "-p"),
            {
                "type": str,
                "choices": ["quick", "standard", "comprehensive"],
                "help": "Analysis preset: quick (1-2 min), standard (5-10 min), comprehensive (20-60 min)",
                "metavar": "PRESET",
            },
        ),
        # Config loading
        (
            "config",
            ("--config", "-c"),
            {
                "type": str,
                "help": "Load additional settings from JSON file",
                "metavar": "FILE",
            },
        ),
        # Enhancement workflow arguments (Phase 2)
        (
            "enhance_workflow",
            ("--enhance-workflow",),
            {
                "action": "append",
                "help": "Apply enhancement workflow (file path or preset: security-focus, minimal, api-documentation, architecture-comprehensive). Can use multiple times to chain workflows.",
                "metavar": "WORKFLOW",
            },
        ),
        (
            "enhance_stage",
            ("--enhance-stage",),
            {
                "action": "append",
                "help": "Add inline enhancement stage (format: 'name:prompt'). Can be used multiple times.",
                "metavar": "STAGE",
            },
        ),
        (
            "var",
            ("--var",),
            {
                "action": "append",
                "help": "Override workflow variable (format: 'key=value'). Can be used multiple times.",
                "metavar": "VAR",
            },
        ),
        (
            "workflow_dry_run",
            ("--workflow-dry-run",),
            {
                "action": "store_true",
                "help": "Preview workflow stages without executing (requires --enhance-workflow)",
            },
        ),
        (
            "local_repo_path",
            ("--local-repo-path",),
            {
                "type": str,
                "help": "Path to local clone of a GitHub repository for unlimited C3.x analysis (bypasses GitHub API file limits)",
                "metavar": "PATH",
            },
        ),
    )
}
|
|
|
|
# Merge RAG arguments from common.py into universal arguments.
# dict.update mutates the table in place; a RAG_ARGUMENTS key that matches an
# existing universal key replaces that entry.
UNIVERSAL_ARGUMENTS.update(RAG_ARGUMENTS)
|
|
|
|
# =============================================================================
|
|
# TIER 2: SOURCE-SPECIFIC ARGUMENTS
|
|
# =============================================================================
|
|
|
|
# Web scraping specific (from scrape.py)
|
|
WEB_ARGUMENTS: dict[str, dict[str, Any]] = {
    # Rows are (argument name, option strings, add_argument keywords).
    arg_name: {"flags": option_strings, "kwargs": options}
    for arg_name, option_strings, options in (
        (
            "url",
            ("--url",),
            {
                "type": str,
                "help": "Base documentation URL (alternative to positional arg)",
                "metavar": "URL",
            },
        ),
        (
            "max_pages",
            ("--max-pages",),
            {
                "type": int,
                "metavar": "N",
                "help": "Maximum pages to scrape (for testing/prototyping)",
            },
        ),
        (
            "skip_scrape",
            ("--skip-scrape",),
            {"action": "store_true", "help": "Skip scraping, use existing data"},
        ),
        (
            "resume",
            ("--resume",),
            {"action": "store_true", "help": "Resume from last checkpoint"},
        ),
        (
            "fresh",
            ("--fresh",),
            {"action": "store_true", "help": "Clear checkpoint and start fresh"},
        ),
        (
            "rate_limit",
            ("--rate-limit", "-r"),
            {
                "type": float,
                "metavar": "SECONDS",
                "help": f"Rate limit in seconds (default: {DEFAULT_RATE_LIMIT})",
            },
        ),
        (
            "workers",
            ("--workers", "-w"),
            {
                "type": int,
                "metavar": "N",
                "help": "Number of parallel workers (default: 1, max: 10)",
            },
        ),
        (
            # "--async" cannot be used as an attribute name (async is a keyword),
            # so the parsed value is stored under dest="async_mode".
            "async_mode",
            ("--async",),
            {
                "dest": "async_mode",
                "action": "store_true",
                "help": "Enable async mode (2-3x faster)",
            },
        ),
    )
}
|
|
|
|
# GitHub repository specific (from github.py)
|
|
GITHUB_ARGUMENTS: dict[str, dict[str, Any]] = {
    # Rows are (argument name, option strings, add_argument keywords).
    arg_name: {"flags": option_strings, "kwargs": options}
    for arg_name, option_strings, options in (
        (
            "repo",
            ("--repo",),
            {
                "type": str,
                "help": "GitHub repository (owner/repo)",
                "metavar": "OWNER/REPO",
            },
        ),
        (
            "token",
            ("--token",),
            {
                "type": str,
                "help": "GitHub personal access token",
                "metavar": "TOKEN",
            },
        ),
        (
            "profile",
            ("--profile",),
            {
                "type": str,
                "help": "GitHub profile name (from config)",
                "metavar": "PROFILE",
            },
        ),
        (
            "non_interactive",
            ("--non-interactive",),
            {
                "action": "store_true",
                "help": "Non-interactive mode (fail on rate limits)",
            },
        ),
        (
            "no_issues",
            ("--no-issues",),
            {"action": "store_true", "help": "Skip GitHub issues"},
        ),
        (
            "no_changelog",
            ("--no-changelog",),
            {"action": "store_true", "help": "Skip CHANGELOG"},
        ),
        (
            "no_releases",
            ("--no-releases",),
            {"action": "store_true", "help": "Skip releases"},
        ),
        (
            "max_issues",
            ("--max-issues",),
            {
                "type": int,
                "default": 100,
                "metavar": "N",
                "help": "Max issues to fetch (default: 100)",
            },
        ),
        (
            "scrape_only",
            ("--scrape-only",),
            {"action": "store_true", "help": "Only scrape, don't build skill"},
        ),
    )
}
|
|
|
|
# Local codebase specific (from analyze.py)
|
|
LOCAL_ARGUMENTS: dict[str, dict[str, Any]] = {
    # Rows are (argument name, option strings, add_argument keywords).
    arg_name: {"flags": option_strings, "kwargs": options}
    for arg_name, option_strings, options in (
        (
            "directory",
            ("--directory",),
            {"type": str, "help": "Directory to analyze", "metavar": "DIR"},
        ),
        (
            "languages",
            ("--languages",),
            {
                "type": str,
                "help": "Comma-separated languages (e.g., Python,JavaScript)",
                "metavar": "LANGS",
            },
        ),
        (
            "file_patterns",
            ("--file-patterns",),
            {
                "type": str,
                "help": "Comma-separated file patterns",
                "metavar": "PATTERNS",
            },
        ),
        # The remaining entries are boolean switches that turn analysis steps off.
        (
            "skip_patterns",
            ("--skip-patterns",),
            {"action": "store_true", "help": "Skip design pattern detection"},
        ),
        (
            "skip_test_examples",
            ("--skip-test-examples",),
            {"action": "store_true", "help": "Skip test example extraction"},
        ),
        (
            "skip_how_to_guides",
            ("--skip-how-to-guides",),
            {"action": "store_true", "help": "Skip how-to guide generation"},
        ),
        (
            "skip_config",
            ("--skip-config",),
            {"action": "store_true", "help": "Skip configuration extraction"},
        ),
        (
            "skip_docs",
            ("--skip-docs",),
            {"action": "store_true", "help": "Skip documentation extraction"},
        ),
    )
}
|
|
|
|
# PDF specific (from pdf.py)
|
|
PDF_ARGUMENTS: dict[str, dict[str, Any]] = {
    # Rows are (argument name, option strings, add_argument keywords).
    arg_name: {"flags": option_strings, "kwargs": options}
    for arg_name, option_strings, options in (
        (
            "pdf",
            ("--pdf",),
            {"type": str, "help": "PDF file path", "metavar": "PATH"},
        ),
        (
            "ocr",
            ("--ocr",),
            {"action": "store_true", "help": "Enable OCR for scanned PDFs"},
        ),
        (
            "pages",
            ("--pages",),
            {
                "type": str,
                "help": "Page range (e.g., '1-10', '5,7,9')",
                "metavar": "RANGE",
            },
        ),
    )
}
|
|
|
|
# Word document specific (from word.py)
|
|
WORD_ARGUMENTS: dict[str, dict[str, Any]] = {
    # Rows are (argument name, option strings, add_argument keywords).
    arg_name: {"flags": option_strings, "kwargs": options}
    for arg_name, option_strings, options in (
        (
            "docx",
            ("--docx",),
            {"type": str, "help": "DOCX file path", "metavar": "PATH"},
        ),
    )
}
|
|
|
|
# Multi-source config specific (from unified_scraper.py)
|
|
CONFIG_ARGUMENTS: dict[str, dict[str, Any]] = {
    # Rows are (argument name, option strings, add_argument keywords).
    arg_name: {"flags": option_strings, "kwargs": options}
    for arg_name, option_strings, options in (
        (
            "merge_mode",
            ("--merge-mode",),
            {
                "type": str,
                "choices": ["rule-based", "claude-enhanced"],
                "help": "Override merge mode from config (rule-based or claude-enhanced)",
                "metavar": "MODE",
            },
        ),
        (
            "skip_codebase_analysis",
            ("--skip-codebase-analysis",),
            {
                "action": "store_true",
                "help": "Skip C3.x codebase analysis for GitHub sources in unified config",
            },
        ),
        # Note: --fresh is intentionally omitted here — it already lives in WEB_ARGUMENTS.
        # For unified config files, use `skill-seekers unified --fresh` directly.
    )
}
|
|
|
|
# =============================================================================
|
|
# TIER 3: ADVANCED/RARE ARGUMENTS
|
|
# =============================================================================
|
|
# Hidden from default help, shown only with --help-advanced
|
|
|
|
ADVANCED_ARGUMENTS: dict[str, dict[str, Any]] = {
    # Rows are (argument name, option strings, add_argument keywords);
    # every advanced argument here is a boolean switch.
    arg_name: {"flags": option_strings, "kwargs": options}
    for arg_name, option_strings, options in (
        (
            "no_rate_limit",
            ("--no-rate-limit",),
            {"action": "store_true", "help": "Disable rate limiting completely"},
        ),
        (
            "no_preserve_code_blocks",
            ("--no-preserve-code-blocks",),
            {
                "action": "store_true",
                "help": "Allow splitting code blocks across chunks (not recommended)",
            },
        ),
        (
            "no_preserve_paragraphs",
            ("--no-preserve-paragraphs",),
            {
                "action": "store_true",
                "help": "Ignore paragraph boundaries when chunking (not recommended)",
            },
        ),
        (
            "interactive_enhancement",
            ("--interactive-enhancement",),
            {
                "action": "store_true",
                "help": "Open terminal window for enhancement (use with --enhance-local)",
            },
        ),
    )
}
|
|
|
|
# =============================================================================
|
|
# HELPER FUNCTIONS
|
|
# =============================================================================
|
|
|
|
|
|
def get_universal_argument_names() -> set[str]:
    """Return the keys of UNIVERSAL_ARGUMENTS as a new set."""
    # Iterating a dict yields its keys, so no explicit .keys() view is needed.
    return set(UNIVERSAL_ARGUMENTS)
|
|
|
|
|
|
def get_source_specific_arguments(source_type: str) -> dict[str, dict[str, Any]]:
    """Look up the tier-2 argument table for one source type.

    Args:
        source_type: One of 'web', 'github', 'local', 'pdf', 'word', 'config'

    Returns:
        The module-level definitions dict for that source (the table itself,
        not a copy), or a new empty dict when the source type is not listed.
    """
    tables_by_source = {
        "web": WEB_ARGUMENTS,
        "github": GITHUB_ARGUMENTS,
        "local": LOCAL_ARGUMENTS,
        "pdf": PDF_ARGUMENTS,
        "word": WORD_ARGUMENTS,
        "config": CONFIG_ARGUMENTS,
    }
    if source_type in tables_by_source:
        return tables_by_source[source_type]
    # Unknown source types contribute no source-specific arguments.
    return {}
|
|
|
|
|
|
def get_compatible_arguments(source_type: str) -> list[str]:
    """List every argument name usable with a source type.

    Args:
        source_type: Source type ('web', 'github', 'local', 'pdf', 'word', 'config')

    Returns:
        Argument names in tier order: universal names first, then the names
        specific to ``source_type`` (none if it is not a listed type), then
        the advanced names, which are always technically available.
    """
    return [
        *UNIVERSAL_ARGUMENTS,
        *get_source_specific_arguments(source_type),
        *ADVANCED_ARGUMENTS,
    ]
|
|
|
|
|
|
def add_create_arguments(parser: argparse.ArgumentParser, mode: str = "default") -> None:
    """Add create command arguments to parser.

    Supports multiple help modes for progressive disclosure:
    - 'default': Universal arguments only
    - 'web': Universal + web-specific
    - 'github': Universal + github-specific
    - 'local': Universal + local-specific
    - 'pdf': Universal + pdf-specific
    - 'word': Universal + word-specific
    - 'config': Universal + multi-source config-specific
    - 'advanced': Universal + advanced/rare arguments
    - 'all': Universal + every source-specific table + advanced

    Any other mode value behaves like 'default'.

    Args:
        parser: ArgumentParser to add arguments to (mutated in place)
        mode: Help mode (default, web, github, local, pdf, word, config,
            advanced, all)
    """
    # Positional argument for source; nargs="?" makes it optional.
    parser.add_argument(
        "source",
        nargs="?",
        type=str,
        help="Source to create skill from (URL, GitHub repo, directory, PDF, or config file)",
    )

    # Universal arguments are registered in every mode.
    for arg_def in UNIVERSAL_ARGUMENTS.values():
        parser.add_argument(*arg_def["flags"], **arg_def["kwargs"])

    # (mode that enables the table, table). The order is the registration
    # order, which determines the order options appear in --help output.
    mode_tables = (
        ("web", WEB_ARGUMENTS),
        ("github", GITHUB_ARGUMENTS),
        ("local", LOCAL_ARGUMENTS),
        ("pdf", PDF_ARGUMENTS),
        ("word", WORD_ARGUMENTS),
        ("config", CONFIG_ARGUMENTS),
        ("advanced", ADVANCED_ARGUMENTS),
    )
    for enabling_mode, definitions in mode_tables:
        # "all" switches on every table; otherwise only the matching one.
        if mode not in (enabling_mode, "all"):
            continue
        for arg_def in definitions.values():
            parser.add_argument(*arg_def["flags"], **arg_def["kwargs"])