Files
skill-seekers-reference/src/skill_seekers/cli/arguments/create.py
yusyus b81d55fda0 feat(B2): add Microsoft Word (.docx) support
Implements ROADMAP task B2 — full .docx scraping support via mammoth +
python-docx, producing SKILL.md + references/ output identical to other
source types.

New files:
- src/skill_seekers/cli/word_scraper.py — WordToSkillConverter class +
  main() entry point (~600 lines); mammoth → BeautifulSoup pipeline;
  handles headings, code detection (incl. monospace <p><br> blocks),
  tables, images, metadata extraction
- src/skill_seekers/cli/arguments/word.py — add_word_arguments() +
  WORD_ARGUMENTS dict
- src/skill_seekers/cli/parsers/word_parser.py — WordParser for unified
  CLI parser registry
- tests/test_word_scraper.py — comprehensive test suite (~300 lines)

Modified files:
- src/skill_seekers/cli/main.py — registered "word" command module
- src/skill_seekers/cli/source_detector.py — .docx auto-detection +
  _detect_word() classmethod
- src/skill_seekers/cli/create_command.py — _route_word() + --help-word
- src/skill_seekers/cli/arguments/create.py — WORD_ARGUMENTS + routing
- src/skill_seekers/cli/arguments/__init__.py — export word args
- src/skill_seekers/cli/parsers/__init__.py — register WordParser
- src/skill_seekers/cli/unified_scraper.py — _scrape_word() integration
- src/skill_seekers/cli/pdf_scraper.py — fix: real enhancement instead
  of stub; remove [:3] reference file limit; capture run_workflows return
- src/skill_seekers/cli/github_scraper.py — fix: remove arbitrary
  open_issues[:20] / closed_issues[:10] reference file limits
- pyproject.toml — skill-seekers-word entry point + docx optional dep
- tests/test_cli_parsers.py — update parser count 21→22

Bug fixes applied during real-world testing:
- Code detection: detect monospace <p><br> blocks as code (mammoth
  renders Courier paragraphs this way, not as <pre>/<code>)
- Language detector: fix wrong method name detect_from_text →
  detect_from_code
- Description inference: pass None from main() so extract_docx() can
  infer description from Word document subject/title metadata
- Bullet-point guard: exclude prose starting with •/-/* from code scoring
- Enhancement: implement real API/LOCAL enhancement (was stub)
- pip install message: add quotes around skill-seekers[docx]

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-25 21:47:30 +03:00

572 lines
17 KiB
Python

"""Create command unified argument definitions.
Organizes arguments into three tiers:
1. Universal Arguments - Work for ALL sources (web, github, local, pdf, word, config)
2. Source-Specific Arguments - Only relevant for specific sources
3. Advanced Arguments - Rarely used, hidden from default help
This enables progressive disclosure in help text while maintaining
100% backward compatibility with existing commands.
"""
import argparse
from typing import Any
from skill_seekers.cli.constants import DEFAULT_RATE_LIMIT
from .common import RAG_ARGUMENTS
# =============================================================================
# TIER 1: UNIVERSAL ARGUMENTS (19 flags)
# =============================================================================
# These arguments work for ALL source types
# Includes: 11 core + 4 workflow + 4 RAG (merged from common.py)
UNIVERSAL_ARGUMENTS: dict[str, dict[str, Any]] = {
    # Rows below are (argument name, option flags, add_argument kwargs); each row is
    # expanded into the {"flags": ..., "kwargs": ...} record stored under its name.
    arg_key: {"flags": option_flags, "kwargs": option_kwargs}
    for arg_key, option_flags, option_kwargs in (
        # Identity arguments
        (
            "name",
            ("--name",),
            {
                "type": str,
                "help": "Skill name (default: auto-detected from source)",
                "metavar": "NAME",
            },
        ),
        (
            "description",
            ("--description", "-d"),
            {"type": str, "help": "Skill description (used in SKILL.md)", "metavar": "TEXT"},
        ),
        (
            "output",
            ("--output", "-o"),
            {
                "type": str,
                "help": "Output directory (default: auto-generated from name)",
                "metavar": "DIR",
            },
        ),
        # Enhancement arguments
        (
            "enhance_level",
            ("--enhance-level",),
            {
                "type": int,
                "choices": [0, 1, 2, 3],
                "default": 2,
                "help": (
                    "AI enhancement level (auto-detects API vs LOCAL mode): "
                    "0=disabled, 1=SKILL.md only, 2=+architecture/config (default), 3=full enhancement. "
                    "Mode selection: uses API if ANTHROPIC_API_KEY is set, otherwise LOCAL (Claude Code)"
                ),
                "metavar": "LEVEL",
            },
        ),
        (
            "api_key",
            ("--api-key",),
            {
                "type": str,
                "help": "Anthropic API key (or set ANTHROPIC_API_KEY env var)",
                "metavar": "KEY",
            },
        ),
        # Behavior arguments
        (
            "dry_run",
            ("--dry-run",),
            {
                "action": "store_true",
                "help": "Preview what will be created without actually creating it",
            },
        ),
        (
            "verbose",
            ("--verbose", "-v"),
            {"action": "store_true", "help": "Enable verbose output (DEBUG level logging)"},
        ),
        (
            "quiet",
            ("--quiet", "-q"),
            {"action": "store_true", "help": "Minimize output (WARNING level only)"},
        ),
        # RAG features (imported from common.py - see RAG_ARGUMENTS)
        # Note: RAG arguments are merged into UNIVERSAL_ARGUMENTS at runtime
        # Preset system
        (
            "preset",
            ("--preset", "-p"),
            {
                "type": str,
                "choices": ["quick", "standard", "comprehensive"],
                "help": "Analysis preset: quick (1-2 min), standard (5-10 min), comprehensive (20-60 min)",
                "metavar": "PRESET",
            },
        ),
        # Config loading
        (
            "config",
            ("--config", "-c"),
            {"type": str, "help": "Load additional settings from JSON file", "metavar": "FILE"},
        ),
        # Enhancement Workflow arguments (NEW - Phase 2)
        (
            "enhance_workflow",
            ("--enhance-workflow",),
            {
                "action": "append",
                "help": "Apply enhancement workflow (file path or preset: security-focus, minimal, api-documentation, architecture-comprehensive). Can use multiple times to chain workflows.",
                "metavar": "WORKFLOW",
            },
        ),
        (
            "enhance_stage",
            ("--enhance-stage",),
            {
                "action": "append",
                "help": "Add inline enhancement stage (format: 'name:prompt'). Can be used multiple times.",
                "metavar": "STAGE",
            },
        ),
        (
            "var",
            ("--var",),
            {
                "action": "append",
                "help": "Override workflow variable (format: 'key=value'). Can be used multiple times.",
                "metavar": "VAR",
            },
        ),
        (
            "workflow_dry_run",
            ("--workflow-dry-run",),
            {
                "action": "store_true",
                "help": "Preview workflow stages without executing (requires --enhance-workflow)",
            },
        ),
        (
            "local_repo_path",
            ("--local-repo-path",),
            {
                "type": str,
                "help": "Path to local clone of a GitHub repository for unlimited C3.x analysis (bypasses GitHub API file limits)",
                "metavar": "PATH",
            },
        ),
    )
}
# Merge RAG arguments from common.py into universal arguments.
# dict.update() mutates UNIVERSAL_ARGUMENTS in place when this module is imported;
# a RAG entry whose key matches one defined above would replace that entry.
UNIVERSAL_ARGUMENTS.update(RAG_ARGUMENTS)
# =============================================================================
# TIER 2: SOURCE-SPECIFIC ARGUMENTS
# =============================================================================
# Web scraping specific (from scrape.py)
WEB_ARGUMENTS: dict[str, dict[str, Any]] = {
    # Rows below are (argument name, option flags, add_argument kwargs); each row is
    # expanded into the {"flags": ..., "kwargs": ...} record stored under its name.
    arg_key: {"flags": option_flags, "kwargs": option_kwargs}
    for arg_key, option_flags, option_kwargs in (
        (
            "url",
            ("--url",),
            {
                "type": str,
                "help": "Base documentation URL (alternative to positional arg)",
                "metavar": "URL",
            },
        ),
        (
            "max_pages",
            ("--max-pages",),
            {
                "type": int,
                "metavar": "N",
                "help": "Maximum pages to scrape (for testing/prototyping)",
            },
        ),
        (
            "skip_scrape",
            ("--skip-scrape",),
            {"action": "store_true", "help": "Skip scraping, use existing data"},
        ),
        (
            "resume",
            ("--resume",),
            {"action": "store_true", "help": "Resume from last checkpoint"},
        ),
        (
            "fresh",
            ("--fresh",),
            {"action": "store_true", "help": "Clear checkpoint and start fresh"},
        ),
        (
            "rate_limit",
            ("--rate-limit", "-r"),
            {
                "type": float,
                "metavar": "SECONDS",
                # The default shown in the help text is interpolated once, at import time.
                "help": f"Rate limit in seconds (default: {DEFAULT_RATE_LIMIT})",
            },
        ),
        (
            "workers",
            ("--workers", "-w"),
            {
                "type": int,
                "metavar": "N",
                "help": "Number of parallel workers (default: 1, max: 10)",
            },
        ),
        (
            "async_mode",
            ("--async",),
            {
                # Explicit dest: the flag is spelled --async but is stored as "async_mode".
                "dest": "async_mode",
                "action": "store_true",
                "help": "Enable async mode (2-3x faster)",
            },
        ),
    )
}
# GitHub repository specific (from github.py)
GITHUB_ARGUMENTS: dict[str, dict[str, Any]] = {
    # Rows below are (argument name, option flags, add_argument kwargs); each row is
    # expanded into the {"flags": ..., "kwargs": ...} record stored under its name.
    arg_key: {"flags": option_flags, "kwargs": option_kwargs}
    for arg_key, option_flags, option_kwargs in (
        (
            "repo",
            ("--repo",),
            {"type": str, "help": "GitHub repository (owner/repo)", "metavar": "OWNER/REPO"},
        ),
        (
            "token",
            ("--token",),
            {"type": str, "help": "GitHub personal access token", "metavar": "TOKEN"},
        ),
        (
            "profile",
            ("--profile",),
            {"type": str, "help": "GitHub profile name (from config)", "metavar": "PROFILE"},
        ),
        (
            "non_interactive",
            ("--non-interactive",),
            {"action": "store_true", "help": "Non-interactive mode (fail on rate limits)"},
        ),
        (
            "no_issues",
            ("--no-issues",),
            {"action": "store_true", "help": "Skip GitHub issues"},
        ),
        (
            "no_changelog",
            ("--no-changelog",),
            {"action": "store_true", "help": "Skip CHANGELOG"},
        ),
        (
            "no_releases",
            ("--no-releases",),
            {"action": "store_true", "help": "Skip releases"},
        ),
        (
            "max_issues",
            ("--max-issues",),
            {
                "type": int,
                "default": 100,
                "metavar": "N",
                "help": "Max issues to fetch (default: 100)",
            },
        ),
        (
            "scrape_only",
            ("--scrape-only",),
            {"action": "store_true", "help": "Only scrape, don't build skill"},
        ),
    )
}
# Local codebase specific (from analyze.py)
LOCAL_ARGUMENTS: dict[str, dict[str, Any]] = {
    # Rows below are (argument name, option flags, add_argument kwargs); each row is
    # expanded into the {"flags": ..., "kwargs": ...} record stored under its name.
    arg_key: {"flags": option_flags, "kwargs": option_kwargs}
    for arg_key, option_flags, option_kwargs in (
        (
            "directory",
            ("--directory",),
            {"type": str, "help": "Directory to analyze", "metavar": "DIR"},
        ),
        (
            "languages",
            ("--languages",),
            {
                "type": str,
                "help": "Comma-separated languages (e.g., Python,JavaScript)",
                "metavar": "LANGS",
            },
        ),
        (
            "file_patterns",
            ("--file-patterns",),
            {"type": str, "help": "Comma-separated file patterns", "metavar": "PATTERNS"},
        ),
        # Note: --skip-patterns is a boolean switch for design pattern detection,
        # unrelated to the --file-patterns value above.
        (
            "skip_patterns",
            ("--skip-patterns",),
            {"action": "store_true", "help": "Skip design pattern detection"},
        ),
        (
            "skip_test_examples",
            ("--skip-test-examples",),
            {"action": "store_true", "help": "Skip test example extraction"},
        ),
        (
            "skip_how_to_guides",
            ("--skip-how-to-guides",),
            {"action": "store_true", "help": "Skip how-to guide generation"},
        ),
        (
            "skip_config",
            ("--skip-config",),
            {"action": "store_true", "help": "Skip configuration extraction"},
        ),
        (
            "skip_docs",
            ("--skip-docs",),
            {"action": "store_true", "help": "Skip documentation extraction"},
        ),
    )
}
# PDF specific (from pdf.py)
PDF_ARGUMENTS: dict[str, dict[str, Any]] = {
    # Rows below are (argument name, option flags, add_argument kwargs); each row is
    # expanded into the {"flags": ..., "kwargs": ...} record stored under its name.
    arg_key: {"flags": option_flags, "kwargs": option_kwargs}
    for arg_key, option_flags, option_kwargs in (
        (
            "pdf",
            ("--pdf",),
            {"type": str, "help": "PDF file path", "metavar": "PATH"},
        ),
        (
            "ocr",
            ("--ocr",),
            {"action": "store_true", "help": "Enable OCR for scanned PDFs"},
        ),
        (
            "pages",
            ("--pages",),
            {"type": str, "help": "Page range (e.g., '1-10', '5,7,9')", "metavar": "RANGE"},
        ),
    )
}
# Word document specific (from word.py)
WORD_ARGUMENTS: dict[str, dict[str, Any]] = {
    # Rows below are (argument name, option flags, add_argument kwargs); each row is
    # expanded into the {"flags": ..., "kwargs": ...} record stored under its name.
    arg_key: {"flags": option_flags, "kwargs": option_kwargs}
    for arg_key, option_flags, option_kwargs in (
        (
            "docx",
            ("--docx",),
            {"type": str, "help": "DOCX file path", "metavar": "PATH"},
        ),
    )
}
# Multi-source config specific (from unified_scraper.py)
CONFIG_ARGUMENTS: dict[str, dict[str, Any]] = {
    # Rows below are (argument name, option flags, add_argument kwargs); each row is
    # expanded into the {"flags": ..., "kwargs": ...} record stored under its name.
    arg_key: {"flags": option_flags, "kwargs": option_kwargs}
    for arg_key, option_flags, option_kwargs in (
        (
            "merge_mode",
            ("--merge-mode",),
            {
                "type": str,
                "choices": ["rule-based", "claude-enhanced"],
                "help": "Override merge mode from config (rule-based or claude-enhanced)",
                "metavar": "MODE",
            },
        ),
        (
            "skip_codebase_analysis",
            ("--skip-codebase-analysis",),
            {
                "action": "store_true",
                "help": "Skip C3.x codebase analysis for GitHub sources in unified config",
            },
        ),
        # Note: --fresh is intentionally omitted here — it already lives in WEB_ARGUMENTS.
        # For unified config files, use `skill-seekers unified --fresh` directly.
    )
}
# =============================================================================
# TIER 3: ADVANCED/RARE ARGUMENTS
# =============================================================================
# Hidden from default help, shown only with --help-advanced
ADVANCED_ARGUMENTS: dict[str, dict[str, Any]] = {
    # Rows below are (argument name, option flags, add_argument kwargs); each row is
    # expanded into the {"flags": ..., "kwargs": ...} record stored under its name.
    # Every advanced option is a plain store_true switch.
    arg_key: {"flags": option_flags, "kwargs": option_kwargs}
    for arg_key, option_flags, option_kwargs in (
        (
            "no_rate_limit",
            ("--no-rate-limit",),
            {"action": "store_true", "help": "Disable rate limiting completely"},
        ),
        (
            "no_preserve_code_blocks",
            ("--no-preserve-code-blocks",),
            {
                "action": "store_true",
                "help": "Allow splitting code blocks across chunks (not recommended)",
            },
        ),
        (
            "no_preserve_paragraphs",
            ("--no-preserve-paragraphs",),
            {
                "action": "store_true",
                "help": "Ignore paragraph boundaries when chunking (not recommended)",
            },
        ),
        (
            "interactive_enhancement",
            ("--interactive-enhancement",),
            {
                "action": "store_true",
                "help": "Open terminal window for enhancement (use with --enhance-local)",
            },
        ),
    )
}
# =============================================================================
# HELPER FUNCTIONS
# =============================================================================
def get_universal_argument_names() -> set[str]:
    """Return the names of all universal arguments as a new set.

    The names are the keys of ``UNIVERSAL_ARGUMENTS`` at the time of the call.
    """
    # Iterating a dict yields its keys, so no explicit .keys() call is needed.
    return set(UNIVERSAL_ARGUMENTS)
def get_source_specific_arguments(source_type: str) -> dict[str, dict[str, Any]]:
    """Get source-specific arguments for a given source type.

    Args:
        source_type: One of 'web', 'github', 'local', 'pdf', 'word', 'config'.

    Returns:
        Dict of argument definitions for that source type. For a known type
        this is the module-level table itself (not a copy). Any other value
        of ``source_type`` yields a new empty dict rather than raising.
    """
    source_args = {
        "web": WEB_ARGUMENTS,
        "github": GITHUB_ARGUMENTS,
        "local": LOCAL_ARGUMENTS,
        "pdf": PDF_ARGUMENTS,
        "word": WORD_ARGUMENTS,
        "config": CONFIG_ARGUMENTS,
    }
    return source_args.get(source_type, {})
def get_compatible_arguments(source_type: str) -> list[str]:
    """Get list of compatible argument names for a source type.

    Args:
        source_type: Source type ('web', 'github', 'local', 'pdf', 'word', 'config').
            An unrecognised type contributes no source-specific names.

    Returns:
        List of argument names that are compatible with this source, in
        this order: universal names, then the source-specific names, then
        the advanced names.
    """
    return [
        # Universal arguments are always compatible
        *UNIVERSAL_ARGUMENTS,
        # Add source-specific arguments
        *get_source_specific_arguments(source_type),
        # Advanced arguments are always technically available
        *ADVANCED_ARGUMENTS,
    ]
def add_create_arguments(parser: argparse.ArgumentParser, mode: str = "default") -> None:
    """Add create command arguments to parser.

    Supports multiple help modes for progressive disclosure. The positional
    ``source`` argument and the universal arguments are registered in every mode:
    - 'default': Universal arguments only
    - 'web': Universal + web-specific
    - 'github': Universal + github-specific
    - 'local': Universal + local-specific
    - 'pdf': Universal + pdf-specific
    - 'word': Universal + word-specific
    - 'config': Universal + multi-source config-specific
    - 'advanced': Universal + advanced/rare arguments
    - 'all': Universal + every source-specific group + advanced arguments

    Any other ``mode`` value registers the same arguments as 'default'.

    Args:
        parser: ArgumentParser to add arguments to
        mode: Help mode (default, web, github, local, pdf, word, config, advanced, all)
    """
    # Positional argument for source
    parser.add_argument(
        "source",
        nargs="?",
        type=str,
        help=(
            "Source to create skill from "
            "(URL, GitHub repo, directory, PDF, Word document, or config file)"
        ),
    )

    # Optional groups, listed in the order they must be registered.
    mode_groups = (
        ("web", WEB_ARGUMENTS),
        ("github", GITHUB_ARGUMENTS),
        ("local", LOCAL_ARGUMENTS),
        ("pdf", PDF_ARGUMENTS),
        ("word", WORD_ARGUMENTS),
        ("config", CONFIG_ARGUMENTS),
        ("advanced", ADVANCED_ARGUMENTS),
    )

    # Universal arguments always come first; 'all' selects every optional group.
    selected_groups = [UNIVERSAL_ARGUMENTS]
    selected_groups.extend(
        group for group_mode, group in mode_groups if mode in (group_mode, "all")
    )

    for group in selected_groups:
        # Only the definitions are needed here; the dict keys are not used.
        for arg_def in group.values():
            parser.add_argument(*arg_def["flags"], **arg_def["kwargs"])