diff --git a/.github/workflows/scheduled-updates.yml b/.github/workflows/scheduled-updates.yml index 279f0c6..078fe61 100644 --- a/.github/workflows/scheduled-updates.yml +++ b/.github/workflows/scheduled-updates.yml @@ -122,7 +122,7 @@ jobs: fi # Use streaming ingestion for large docs - skill-seekers scrape --config "$CONFIG_FILE" --streaming --max-pages 200 + skill-seekers create "$CONFIG_FILE" --max-pages 200 - name: Generate quality report if: steps.should_update.outputs.update == 'true' diff --git a/.github/workflows/vector-db-export.yml b/.github/workflows/vector-db-export.yml index 12fcaf6..1958bd3 100644 --- a/.github/workflows/vector-db-export.yml +++ b/.github/workflows/vector-db-export.yml @@ -74,7 +74,7 @@ jobs: if: steps.check_config.outputs.exists == 'true' run: | echo "šŸ“„ Scraping documentation for $SKILL_NAME..." - skill-seekers scrape --config "${{ steps.config.outputs.path }}" --max-pages 100 + skill-seekers create "${{ steps.config.outputs.path }}" --max-pages 100 continue-on-error: true - name: Determine export targets diff --git a/CLAUDE.md b/CLAUDE.md index 79a21cf..e97fe89 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -52,17 +52,26 @@ Runs on push/PR to `main` or `development`. Lint job (Python 3.12, Ubuntu) + Tes ## Architecture -### CLI: Git-style dispatcher +### CLI: Unified create command -Entry point `src/skill_seekers/cli/main.py` maps subcommands to modules. The `create` command auto-detects source type and is the recommended entry point for users. +Entry point `src/skill_seekers/cli/main.py`. The `create` command is the **only** entry point for skill creation — it auto-detects source type and routes to the appropriate `SkillConverter`. ``` skill-seekers create # Auto-detect: URL, owner/repo, ./path, file.pdf, etc. 
-skill-seekers [options] # Direct: scrape, github, pdf, word, epub, video, jupyter, html, openapi, asciidoc, pptx, rss, manpage, confluence, notion, chat -skill-seekers analyze # Analyze local codebase (C3.x pipeline) skill-seekers package # Package for platform (--target claude/gemini/openai/markdown/minimax/opencode/kimi/deepseek/qwen/openrouter/together/fireworks, --format langchain/llama-index/haystack/chroma/faiss/weaviate/qdrant/pinecone) ``` +### SkillConverter Pattern (Template Method + Factory) + +All 18 source types implement the `SkillConverter` base class (`skill_converter.py`): + +```python +converter = get_converter("web", config) # Factory lookup +converter.run() # Template: extract() → build_skill() +``` + +Registry in `CONVERTER_REGISTRY` maps source type → (module, class). `create_command.py` builds config from `ExecutionContext`, calls `get_converter()`, then runs centralized enhancement. + ### Data Flow (5 phases) 1. **Scrape** - Source-specific scraper extracts content to `output/{name}_data/pages/*.json` @@ -105,9 +114,9 @@ src/skill_seekers/cli/adaptors/ `--target` = LLM platforms, `--format` = RAG/vector DBs. All adaptors are imported with `try/except ImportError` so missing optional deps don't break the registry. -### 17 Source Type Scrapers +### 18 Source Type Converters -Each in `src/skill_seekers/cli/{type}_scraper.py` with a `main()` entry point. The `create_command.py` uses `source_detector.py` to auto-route. New scrapers added in v3.2.0+: jupyter, html, openapi, asciidoc, pptx, rss, manpage, confluence, notion, chat. +Each in `src/skill_seekers/cli/{type}_scraper.py` as a `SkillConverter` subclass (no `main()`). The `create_command.py` uses `source_detector.py` to auto-detect, then calls `get_converter()`. Converters: web (doc_scraper), github, pdf, word, epub, video, local (codebase_scraper), jupyter, html, openapi, asciidoc, pptx, rss, manpage, confluence, notion, chat, config (unified_scraper). 
### CLI Argument System @@ -228,13 +237,14 @@ GITHUB_TOKEN=ghp_... # Higher GitHub rate limits 3. Add optional dep to `pyproject.toml` 4. Add tests in `tests/` -### New source type scraper -1. Create `src/skill_seekers/cli/{type}_scraper.py` with `main()` -2. Add to `COMMAND_MODULES` in `cli/main.py` -3. Add entry point in `pyproject.toml` `[project.scripts]` -4. Add auto-detection in `source_detector.py` -5. Add optional dep if needed -6. Add tests +### New source type converter +1. Create `src/skill_seekers/cli/{type}_scraper.py` with a class inheriting `SkillConverter` +2. Implement `extract()` and `build_skill()` methods, set `SOURCE_TYPE` +3. Register in `CONVERTER_REGISTRY` in `skill_converter.py` +4. Add source type config building in `create_command.py:_build_config()` +5. Add auto-detection in `source_detector.py` +6. Add optional dep if needed +7. Add tests ### New CLI argument - Universal: `UNIVERSAL_ARGUMENTS` in `arguments/create.py` diff --git a/pyproject.toml b/pyproject.toml index 43e46ba..1cc4d96 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -299,32 +299,33 @@ Documentation = "https://skillseekersweb.com/" "Homebrew Tap" = "https://github.com/yusufkaraaslan/homebrew-skill-seekers" [project.scripts] -# Main unified CLI +# Main CLI entry point skill-seekers = "skill_seekers.cli.main:main" -# Individual tool entry points -skill-seekers-create = "skill_seekers.cli.create_command:main" # NEW: Unified create command -skill-seekers-doctor = "skill_seekers.cli.doctor:main" -skill-seekers-config = "skill_seekers.cli.config_command:main" -skill-seekers-resume = "skill_seekers.cli.resume_command:main" -skill-seekers-scrape = "skill_seekers.cli.doc_scraper:main" -skill-seekers-github = "skill_seekers.cli.github_scraper:main" -skill-seekers-pdf = "skill_seekers.cli.pdf_scraper:main" -skill-seekers-word = "skill_seekers.cli.word_scraper:main" -skill-seekers-epub = "skill_seekers.cli.epub_scraper:main" -skill-seekers-video = 
"skill_seekers.cli.video_scraper:main" -skill-seekers-unified = "skill_seekers.cli.unified_scraper:main" +# Core commands +skill-seekers-create = "skill_seekers.cli.create_command:main" skill-seekers-enhance = "skill_seekers.cli.enhance_command:main" skill-seekers-enhance-status = "skill_seekers.cli.enhance_status:main" skill-seekers-package = "skill_seekers.cli.package_skill:main" skill-seekers-upload = "skill_seekers.cli.upload_skill:main" -skill-seekers-estimate = "skill_seekers.cli.estimate_pages:main" skill-seekers-install = "skill_seekers.cli.install_skill:main" skill-seekers-install-agent = "skill_seekers.cli.install_agent:main" -skill-seekers-codebase = "skill_seekers.cli.codebase_scraper:main" + +# Analysis & utilities +skill-seekers-estimate = "skill_seekers.cli.estimate_pages:main" skill-seekers-patterns = "skill_seekers.cli.pattern_recognizer:main" skill-seekers-how-to-guides = "skill_seekers.cli.how_to_guide_builder:main" +skill-seekers-quality = "skill_seekers.cli.quality_metrics:main" +skill-seekers-workflows = "skill_seekers.cli.workflows_command:main" + +# Configuration & setup +skill-seekers-config = "skill_seekers.cli.config_command:main" +skill-seekers-doctor = "skill_seekers.cli.doctor:main" skill-seekers-setup = "skill_seekers.cli.setup_wizard:main" +skill-seekers-resume = "skill_seekers.cli.resume_command:main" +skill-seekers-sync-config = "skill_seekers.cli.sync_config:main" + +# Advanced skill-seekers-cloud = "skill_seekers.cli.cloud_storage_cli:main" skill-seekers-embed = "skill_seekers.embedding.server:main" skill-seekers-sync = "skill_seekers.cli.sync_cli:main" @@ -332,22 +333,6 @@ skill-seekers-benchmark = "skill_seekers.cli.benchmark_cli:main" skill-seekers-stream = "skill_seekers.cli.streaming_ingest:main" skill-seekers-update = "skill_seekers.cli.incremental_updater:main" skill-seekers-multilang = "skill_seekers.cli.multilang_support:main" -skill-seekers-quality = "skill_seekers.cli.quality_metrics:main" -skill-seekers-workflows = 
"skill_seekers.cli.workflows_command:main" -skill-seekers-sync-config = "skill_seekers.cli.sync_config:main" - -# New source type entry points (v3.2.0+) -skill-seekers-jupyter = "skill_seekers.cli.jupyter_scraper:main" -skill-seekers-html = "skill_seekers.cli.html_scraper:main" -skill-seekers-openapi = "skill_seekers.cli.openapi_scraper:main" -skill-seekers-asciidoc = "skill_seekers.cli.asciidoc_scraper:main" -skill-seekers-pptx = "skill_seekers.cli.pptx_scraper:main" -skill-seekers-rss = "skill_seekers.cli.rss_scraper:main" -skill-seekers-manpage = "skill_seekers.cli.man_scraper:main" -skill-seekers-confluence = "skill_seekers.cli.confluence_scraper:main" -skill-seekers-notion = "skill_seekers.cli.notion_scraper:main" -skill-seekers-chat = "skill_seekers.cli.chat_scraper:main" -skill-seekers-opencode-split = "skill_seekers.cli.opencode_skill_splitter:main" [tool.setuptools] package-dir = {"" = "src"} diff --git a/scripts/bootstrap_skill.sh b/scripts/bootstrap_skill.sh index 0496b79..cce8b78 100755 --- a/scripts/bootstrap_skill.sh +++ b/scripts/bootstrap_skill.sh @@ -37,8 +37,8 @@ echo "āœ“ Done" # Step 2: Run codebase analysis echo "Step 2: Analyzing codebase..." 
rm -rf "$OUTPUT_DIR" 2>/dev/null || true -uv run skill-seekers analyze \ - --directory "$PROJECT_ROOT" \ +uv run skill-seekers create "$PROJECT_ROOT" \ + --name "$SKILL_NAME" \ --output "$OUTPUT_DIR" 2>&1 | grep -E "^(INFO|āœ…)" || true echo "āœ“ Done" diff --git a/scripts/skill_header.md b/scripts/skill_header.md index d2d8d02..6007531 100644 --- a/scripts/skill_header.md +++ b/scripts/skill_header.md @@ -16,16 +16,16 @@ pip install skill-seekers | Source | Command | |--------|---------| -| Local code | `skill-seekers analyze --directory ./path` | -| Docs URL | `skill-seekers scrape --url https://...` | -| GitHub | `skill-seekers github --repo owner/repo` | -| PDF | `skill-seekers pdf --file doc.pdf` | +| Local code | `skill-seekers create ./path` | +| Docs URL | `skill-seekers create https://docs.example.com` | +| GitHub | `skill-seekers create owner/repo` | +| PDF | `skill-seekers create document.pdf` | ## Quick Start ```bash # Analyze local codebase -skill-seekers analyze --directory /path/to/project --output output/my-skill/ +skill-seekers create /path/to/project --name my-skill # Package for Claude yes | skill-seekers package output/my-skill/ --no-open diff --git a/src/skill_seekers/cli/__init__.py b/src/skill_seekers/cli/__init__.py index 7def83e..373451e 100644 --- a/src/skill_seekers/cli/__init__.py +++ b/src/skill_seekers/cli/__init__.py @@ -21,6 +21,9 @@ from .llms_txt_detector import LlmsTxtDetector from .llms_txt_downloader import LlmsTxtDownloader from .llms_txt_parser import LlmsTxtParser +# ExecutionContext - single source of truth for all configuration +from .execution_context import ExecutionContext, get_context + try: from .utils import open_folder, read_reference_files except ImportError: @@ -35,6 +38,8 @@ __all__ = [ "LlmsTxtDetector", "LlmsTxtDownloader", "LlmsTxtParser", + "ExecutionContext", + "get_context", "open_folder", "read_reference_files", "__version__", diff --git a/src/skill_seekers/cli/agent_client.py 
b/src/skill_seekers/cli/agent_client.py index 0a842d9..5da3c86 100644 --- a/src/skill_seekers/cli/agent_client.py +++ b/src/skill_seekers/cli/agent_client.py @@ -164,9 +164,16 @@ class AgentClient: Resolved from: arg → env SKILL_SEEKER_AGENT → "claude" api_key: API key override. If None, auto-detected from env vars. """ - # Resolve agent name + # Resolve agent name: param > ExecutionContext > env var > default + try: + from skill_seekers.cli.execution_context import ExecutionContext + + ctx = ExecutionContext.get() + ctx_agent = ctx.enhancement.agent or "" + except Exception: + ctx_agent = "" env_agent = os.environ.get("SKILL_SEEKER_AGENT", "").strip() - self.agent = normalize_agent_name(agent or env_agent or "claude") + self.agent = normalize_agent_name(agent or ctx_agent or env_agent or "claude") self.agent_display = AGENT_PRESETS.get(self.agent, {}).get("display_name", self.agent) # Detect API key and provider diff --git a/src/skill_seekers/cli/architectural_pattern_detector.py b/src/skill_seekers/cli/architectural_pattern_detector.py index 78e40d4..fa35cc2 100644 --- a/src/skill_seekers/cli/architectural_pattern_detector.py +++ b/src/skill_seekers/cli/architectural_pattern_detector.py @@ -139,6 +139,10 @@ class ArchitecturalPatternDetector: "Laravel": ["laravel", "illuminate", "artisan", "app/Http/Controllers", "app/Models"], } + # Web frameworks should only match for web-language projects + _WEB_FRAMEWORKS = {"React", "Vue.js", "Express", "Angular"} + _WEB_LANGUAGES = {"JavaScript", "TypeScript", "Python", "PHP", "Ruby"} + def __init__(self, enhance_with_ai: bool = True, agent: str | None = None): """ Initialize detector. 
@@ -268,11 +272,26 @@ class ArchitecturalPatternDetector: # Return early to prevent web framework false positives return detected + # Determine primary language to filter out impossible framework matches + # e.g., C#/C++ projects should not match React/Vue.js/Express + lang_counts: dict[str, int] = {} + for file_data in files: + lang = file_data.get("language", "") + if lang: + lang_counts[lang] = lang_counts.get(lang, 0) + 1 + primary_lang = max(lang_counts, key=lang_counts.get) if lang_counts else "" + + skip_web = primary_lang and primary_lang not in self._WEB_LANGUAGES + # Check other frameworks (including imports - fixes #239) for framework, markers in self.FRAMEWORK_MARKERS.items(): if framework in ["Unity", "Unreal", "Godot"]: continue # Already checked + # Skip web frameworks for non-web language projects + if skip_web and framework in self._WEB_FRAMEWORKS: + continue + # Check in file paths, directory structure, AND imports path_matches = sum(1 for marker in markers if marker.lower() in all_content.lower()) dir_matches = sum(1 for marker in markers if marker.lower() in dir_content.lower()) diff --git a/src/skill_seekers/cli/arguments/create.py b/src/skill_seekers/cli/arguments/create.py index b610a41..4ce98f9 100644 --- a/src/skill_seekers/cli/arguments/create.py +++ b/src/skill_seekers/cli/arguments/create.py @@ -938,3 +938,14 @@ def add_create_arguments(parser: argparse.ArgumentParser, mode: str = "default") action="store_true", help=argparse.SUPPRESS, ) + + +def get_create_defaults() -> dict[str, Any]: + """Build a defaults dict from a throwaway parser with all create arguments. + + Used by CreateCommand._is_explicitly_set() to compare argument values + against their registered defaults instead of hardcoded values. 
+ """ + temp = argparse.ArgumentParser(add_help=False) + add_create_arguments(temp, mode="all") + return {action.dest: action.default for action in temp._actions if action.dest != "help"} diff --git a/src/skill_seekers/cli/asciidoc_scraper.py b/src/skill_seekers/cli/asciidoc_scraper.py index cb24851..ee88f9c 100644 --- a/src/skill_seekers/cli/asciidoc_scraper.py +++ b/src/skill_seekers/cli/asciidoc_scraper.py @@ -15,12 +15,10 @@ Usage: skill-seekers asciidoc --from-json doc_extracted.json """ -import argparse import json import logging import os import re -import sys from pathlib import Path # Optional dependency guard — asciidoc library for HTML conversion @@ -31,6 +29,8 @@ try: except ImportError: ASCIIDOC_AVAILABLE = False +from skill_seekers.cli.skill_converter import SkillConverter + logger = logging.getLogger(__name__) ASCIIDOC_EXTENSIONS = {".adoc", ".asciidoc", ".asc", ".ad"} @@ -112,7 +112,7 @@ def _score_code_quality(code: str) -> float: return min(10.0, max(0.0, score)) -class AsciiDocToSkillConverter: +class AsciiDocToSkillConverter(SkillConverter): """Convert AsciiDoc documentation to an AI-ready skill. Handles single ``.adoc`` files and directories. Content is parsed into @@ -120,7 +120,10 @@ class AsciiDocToSkillConverter: directory layout (SKILL.md, references/, etc.). 
""" + SOURCE_TYPE = "asciidoc" + def __init__(self, config: dict) -> None: + super().__init__(config) self.config = config self.name: str = config["name"] self.asciidoc_path: str = config.get("asciidoc_path", "") @@ -132,6 +135,10 @@ class AsciiDocToSkillConverter: self.categories: dict = config.get("categories", {}) self.extracted_data: dict | None = None + def extract(self): + """Extract content from AsciiDoc files (SkillConverter interface).""" + self.extract_asciidoc() + # ------------------------------------------------------------------ # Extraction # ------------------------------------------------------------------ @@ -943,147 +950,3 @@ class AsciiDocToSkillConverter: def _in_range(pos: int, ranges: list[tuple[int, int]]) -> bool: """Check whether pos falls within any consumed range.""" return any(s <= pos < e for s, e in ranges) - - -# ============================================================================ -# CLI entry point -# ============================================================================ - - -def main() -> int: - """CLI entry point for AsciiDoc scraper.""" - from skill_seekers.cli.arguments.asciidoc import add_asciidoc_arguments - - parser = argparse.ArgumentParser( - description="Convert AsciiDoc documentation to skill", - formatter_class=argparse.RawDescriptionHelpFormatter, - ) - - add_asciidoc_arguments(parser) - - args = parser.parse_args() - - # Set logging level - if getattr(args, "quiet", False): - logging.getLogger().setLevel(logging.WARNING) - elif getattr(args, "verbose", False): - logging.getLogger().setLevel(logging.DEBUG) - - # Handle --dry-run - if getattr(args, "dry_run", False): - source = ( - getattr(args, "asciidoc_path", None) or getattr(args, "from_json", None) or "(none)" - ) - print(f"\n{'=' * 60}") - print("DRY RUN: AsciiDoc Extraction") - print(f"{'=' * 60}") - print(f"Source: {source}") - print(f"Name: {getattr(args, 'name', None) or '(auto-detect)'}") - print(f"Enhance level: {getattr(args, 'enhance_level', 
0)}") - print(f"\nāœ… Dry run complete") - return 0 - - # Validate inputs - if not (getattr(args, "asciidoc_path", None) or getattr(args, "from_json", None)): - parser.error("Must specify --asciidoc-path or --from-json") - - # Build from JSON workflow - if getattr(args, "from_json", None): - name = Path(args.from_json).stem.replace("_extracted", "") - config = { - "name": getattr(args, "name", None) or name, - "description": getattr(args, "description", None) - or f"Use when referencing {name} documentation", - } - try: - converter = AsciiDocToSkillConverter(config) - converter.load_extracted_data(args.from_json) - converter.build_skill() - except Exception as e: - print(f"\nāŒ Error: {e}", file=sys.stderr) - sys.exit(1) - return 0 - - # Direct AsciiDoc mode - if not getattr(args, "name", None): - p = Path(args.asciidoc_path) - args.name = p.stem if p.is_file() else p.name - - config = { - "name": args.name, - "asciidoc_path": args.asciidoc_path, - "description": getattr(args, "description", None), - } - - try: - converter = AsciiDocToSkillConverter(config) - - # Extract - if not converter.extract_asciidoc(): - print("\nāŒ AsciiDoc extraction failed - see error above", file=sys.stderr) - sys.exit(1) - - # Build skill - converter.build_skill() - - # Enhancement Workflow Integration - from skill_seekers.cli.workflow_runner import run_workflows - - workflow_executed, workflow_names = run_workflows(args) - workflow_name = ", ".join(workflow_names) if workflow_names else None - - # Traditional enhancement (complements workflow system) - if getattr(args, "enhance_level", 0) > 0: - api_key = getattr(args, "api_key", None) or os.environ.get("ANTHROPIC_API_KEY") - mode = "API" if api_key else "LOCAL" - - print("\n" + "=" * 80) - print(f"šŸ¤– Traditional AI Enhancement ({mode} mode, level {args.enhance_level})") - print("=" * 80) - if workflow_executed: - print(f" Running after workflow: {workflow_name}") - print( - " (Workflow provides specialized analysis," - " 
enhancement provides general improvements)" - ) - print("") - - skill_dir = converter.skill_dir - if api_key: - try: - from skill_seekers.cli.enhance_skill import enhance_skill_md - - enhance_skill_md(skill_dir, api_key) - print("āœ… API enhancement complete!") - except ImportError: - print("āŒ API enhancement not available. Falling back to LOCAL mode...") - from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer - - agent = getattr(args, "agent", None) if args else None - agent_cmd = getattr(args, "agent_cmd", None) if args else None - enhancer = LocalSkillEnhancer(Path(skill_dir), agent=agent, agent_cmd=agent_cmd) - enhancer.run(headless=True) - print("āœ… Local enhancement complete!") - else: - from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer - - agent = getattr(args, "agent", None) if args else None - agent_cmd = getattr(args, "agent_cmd", None) if args else None - enhancer = LocalSkillEnhancer(Path(skill_dir), agent=agent, agent_cmd=agent_cmd) - enhancer.run(headless=True) - print("āœ… Local enhancement complete!") - - except (FileNotFoundError, ValueError, RuntimeError) as e: - print(f"\nāŒ Error: {e}", file=sys.stderr) - sys.exit(1) - except Exception as e: - print(f"\nāŒ Unexpected error during AsciiDoc processing: {e}", file=sys.stderr) - import traceback - - traceback.print_exc() - sys.exit(1) - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/src/skill_seekers/cli/chat_scraper.py b/src/skill_seekers/cli/chat_scraper.py index 7d98a2f..0ec736f 100644 --- a/src/skill_seekers/cli/chat_scraper.py +++ b/src/skill_seekers/cli/chat_scraper.py @@ -34,16 +34,16 @@ Usage: skill-seekers chat --from-json myteam_extracted.json --name myteam """ -import argparse import json import logging import os import re -import sys from collections import defaultdict from datetime import datetime, timezone from pathlib import Path +from skill_seekers.cli.skill_converter import SkillConverter + # Optional dependency guard 
— Slack SDK try: from slack_sdk import WebClient @@ -243,7 +243,7 @@ def _score_code_quality(code: str) -> float: # --------------------------------------------------------------------------- -class ChatToSkillConverter: +class ChatToSkillConverter(SkillConverter): """Convert Slack or Discord chat history into an AI-ready skill. Follows the same pipeline pattern as the EPUB, Jupyter, and PPTX scrapers: @@ -261,6 +261,8 @@ class ChatToSkillConverter: channel, date range, and detected topic. """ + SOURCE_TYPE = "chat" + def __init__(self, config: dict) -> None: """Initialize the converter with a configuration dictionary. @@ -276,6 +278,7 @@ class ChatToSkillConverter: - description (str): Skill description (optional, inferred if absent). """ + super().__init__(config) self.config = config self.name: str = config["name"] self.export_path: str = config.get("export_path", "") @@ -294,6 +297,10 @@ class ChatToSkillConverter: # Extracted data (populated by extract_chat or load_extracted_data) self.extracted_data: dict | None = None + def extract(self): + """Extract content from chat history (SkillConverter interface).""" + self.extract_chat() + # ------------------------------------------------------------------ # Extraction — public entry point # ------------------------------------------------------------------ @@ -1730,195 +1737,3 @@ class ChatToSkillConverter: """ safe = re.sub(r"[^\w\s-]", "", name.lower()) return re.sub(r"[-\s]+", "_", safe) - - -# --------------------------------------------------------------------------- -# CLI entry point -# --------------------------------------------------------------------------- - - -def main() -> int: - """CLI entry point for the Slack/Discord chat scraper. - - Parses command-line arguments and runs the extraction and - skill-building pipeline. Supports export import, API fetch, - and loading from previously extracted JSON. - - Returns: - Exit code (0 for success, non-zero for errors). 
- """ - from .arguments.chat import add_chat_arguments - - parser = argparse.ArgumentParser( - description="Convert Slack/Discord chat history to AI-ready skill", - formatter_class=argparse.RawDescriptionHelpFormatter, - epilog=""" -Examples: - # Slack workspace export - %(prog)s --export-path ./slack-export/ --platform slack --name myteam - - # Slack API - %(prog)s --platform slack --token xoxb-... --channel C01234 --name myteam - - # Discord export (DiscordChatExporter) - %(prog)s --export-path ./discord-export.json --platform discord --name myserver - - # Discord API - %(prog)s --platform discord --token Bot-token --channel 12345 --name myserver - - # From previously extracted JSON - %(prog)s --from-json myteam_extracted.json --name myteam - """, - ) - - add_chat_arguments(parser) - - args = parser.parse_args() - - # Set logging level - if getattr(args, "quiet", False): - logging.getLogger().setLevel(logging.WARNING) - elif getattr(args, "verbose", False): - logging.getLogger().setLevel(logging.DEBUG) - - # Handle --dry-run - if args.dry_run: - source = args.export_path or args.from_json or f"{args.platform}-api" - print(f"\n{'=' * 60}") - print("DRY RUN: Chat Extraction") - print(f"{'=' * 60}") - print(f"Platform: {args.platform}") - print(f"Source: {source}") - print(f"Name: {args.name or '(auto-detect)'}") - print(f"Channel: {args.channel or '(all)'}") - print(f"Max messages: {args.max_messages}") - print(f"Enhance level: {args.enhance_level}") - print(f"\nāœ… Dry run complete") - return 0 - - # Validate inputs - if args.from_json: - # Build from previously extracted JSON - name = args.name or Path(args.from_json).stem.replace("_extracted", "") - config = { - "name": name, - "description": (args.description or f"Use when referencing {name} chat knowledge base"), - } - try: - converter = ChatToSkillConverter(config) - converter.load_extracted_data(args.from_json) - converter.build_skill() - except Exception as e: - print(f"\nāŒ Error: {e}", file=sys.stderr) - 
sys.exit(1) - return 0 - - # Require either --export-path or --token for extraction - if not args.export_path and not args.token: - parser.error( - "Must specify --export-path (export mode), --token (API mode), " - "or --from-json (build from extracted data)" - ) - - if not args.name: - if args.export_path: - args.name = Path(args.export_path).stem - else: - args.name = f"{args.platform}_chat" - - config = { - "name": args.name, - "export_path": args.export_path or "", - "platform": args.platform, - "token": args.token or "", - "channel": args.channel or "", - "max_messages": args.max_messages, - "description": args.description, - } - - try: - converter = ChatToSkillConverter(config) - - # Extract - if not converter.extract_chat(): - print( - "\nāŒ Chat extraction failed - see error above", - file=sys.stderr, - ) - sys.exit(1) - - # Build skill - converter.build_skill() - - # Enhancement Workflow Integration - from skill_seekers.cli.workflow_runner import run_workflows - - workflow_executed, workflow_names = run_workflows(args) - workflow_name = ", ".join(workflow_names) if workflow_names else None - - # Traditional enhancement (complements workflow system) - if getattr(args, "enhance_level", 0) > 0: - api_key = getattr(args, "api_key", None) or os.environ.get("ANTHROPIC_API_KEY") - mode = "API" if api_key else "LOCAL" - - print("\n" + "=" * 80) - print(f"šŸ¤– Traditional AI Enhancement ({mode} mode, level {args.enhance_level})") - print("=" * 80) - if workflow_executed: - print(f" Running after workflow: {workflow_name}") - print( - " (Workflow provides specialized analysis, " - "enhancement provides general improvements)" - ) - print("") - - skill_dir = converter.skill_dir - if api_key: - try: - from skill_seekers.cli.enhance_skill import enhance_skill_md - - enhance_skill_md(skill_dir, api_key) - print("āœ… API enhancement complete!") - except ImportError: - print("āŒ API enhancement not available. 
Falling back to LOCAL mode...") - from skill_seekers.cli.enhance_skill_local import ( - LocalSkillEnhancer, - ) - - agent = getattr(args, "agent", None) if args else None - agent_cmd = getattr(args, "agent_cmd", None) if args else None - enhancer = LocalSkillEnhancer(Path(skill_dir), agent=agent, agent_cmd=agent_cmd) - enhancer.run(headless=True) - print("āœ… Local enhancement complete!") - else: - from skill_seekers.cli.enhance_skill_local import ( - LocalSkillEnhancer, - ) - - agent = getattr(args, "agent", None) if args else None - agent_cmd = getattr(args, "agent_cmd", None) if args else None - enhancer = LocalSkillEnhancer(Path(skill_dir), agent=agent, agent_cmd=agent_cmd) - enhancer.run(headless=True) - print("āœ… Local enhancement complete!") - - except (FileNotFoundError, ValueError) as e: - print(f"\nāŒ Input error: {e}", file=sys.stderr) - sys.exit(1) - except RuntimeError as e: - print(f"\nāŒ Error: {e}", file=sys.stderr) - sys.exit(1) - except Exception as e: - print( - f"\nāŒ Unexpected error during chat processing: {e}", - file=sys.stderr, - ) - import traceback - - traceback.print_exc() - sys.exit(1) - - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/src/skill_seekers/cli/codebase_scraper.py b/src/skill_seekers/cli/codebase_scraper.py index 5eba262..677df50 100644 --- a/src/skill_seekers/cli/codebase_scraper.py +++ b/src/skill_seekers/cli/codebase_scraper.py @@ -24,12 +24,10 @@ Credits: - pathspec for .gitignore support: https://pypi.org/project/pathspec/ """ -import argparse import json import logging import os import re -import sys from pathlib import Path from typing import Any @@ -38,7 +36,7 @@ from skill_seekers.cli.code_analyzer import CodeAnalyzer from skill_seekers.cli.config_extractor import ConfigExtractor from skill_seekers.cli.dependency_analyzer import DependencyAnalyzer from skill_seekers.cli.signal_flow_analyzer import SignalFlowAnalyzer -from skill_seekers.cli.utils import setup_logging +from 
skill_seekers.cli.skill_converter import SkillConverter # Try to import pathspec for .gitignore support try: @@ -2147,278 +2145,56 @@ def _generate_references(output_dir: Path): logger.info(f"āœ… Generated references directory: {references_dir}") -def _check_deprecated_flags(args): - """Check for deprecated flags and show migration warnings.""" - warnings = [] +class CodebaseAnalyzer(SkillConverter): + """SkillConverter wrapper around the analyze_codebase / _generate_skill_md functions.""" - # Deprecated: --depth - if hasattr(args, "depth") and args.depth: - preset_map = { - "surface": "quick", - "deep": "standard", - "full": "comprehensive", - } - suggested_preset = preset_map.get(args.depth, "standard") - warnings.append( - f"āš ļø DEPRECATED: --depth {args.depth} → use --preset {suggested_preset} instead" + SOURCE_TYPE = "local" + + def __init__(self, config: dict[str, Any]): + super().__init__(config) + self.directory = Path(config.get("directory", ".")).resolve() + self.output_dir = Path(config.get("output_dir", f"output/{self.name}")) + self.depth = config.get("depth", "deep") + self.languages = config.get("languages") + self.file_patterns = config.get("file_patterns") + self.build_api_reference = config.get("build_api_reference", True) + self.extract_comments = config.get("extract_comments", True) + self.build_dependency_graph = config.get("build_dependency_graph", True) + self.detect_patterns = config.get("detect_patterns", True) + self.extract_test_examples = config.get("extract_test_examples", True) + self.build_how_to_guides = config.get("build_how_to_guides", True) + self.extract_config_patterns = config.get("extract_config_patterns", True) + self.extract_docs = config.get("extract_docs", True) + self.enhance_level = config.get("enhance_level", 0) + self.skill_name = config.get("skill_name") or self.name + self.skill_description = config.get("skill_description") + self.doc_version = config.get("doc_version", "") + self._results: dict[str, Any] | None = 
None + + def extract(self): + """SkillConverter interface — delegates to analyze_codebase().""" + self._results = analyze_codebase( + directory=self.directory, + output_dir=self.output_dir, + depth=self.depth, + languages=self.languages, + file_patterns=self.file_patterns, + build_api_reference=self.build_api_reference, + extract_comments=self.extract_comments, + build_dependency_graph=self.build_dependency_graph, + detect_patterns=self.detect_patterns, + extract_test_examples=self.extract_test_examples, + build_how_to_guides=self.build_how_to_guides, + extract_config_patterns=self.extract_config_patterns, + extract_docs=self.extract_docs, + enhance_level=self.enhance_level, + skill_name=self.skill_name, + skill_description=self.skill_description, + doc_version=self.doc_version, ) - # Deprecated: --ai-mode - if hasattr(args, "ai_mode") and args.ai_mode and args.ai_mode != "auto": - if args.ai_mode == "api": - warnings.append( - "āš ļø DEPRECATED: --ai-mode api → use --enhance-level with ANTHROPIC_API_KEY set instead" - ) - elif args.ai_mode == "local": - warnings.append( - "āš ļø DEPRECATED: --ai-mode local → use --enhance-level without API key instead" - ) - elif args.ai_mode == "none": - warnings.append("āš ļø DEPRECATED: --ai-mode none → use --enhance-level 0 instead") - - # Deprecated: --quick flag - if hasattr(args, "quick") and args.quick: - warnings.append("āš ļø DEPRECATED: --quick → use --preset quick instead") - - # Deprecated: --comprehensive flag - if hasattr(args, "comprehensive") and args.comprehensive: - warnings.append("āš ļø DEPRECATED: --comprehensive → use --preset comprehensive instead") - - # Show warnings if any found - if warnings: - print("\n" + "=" * 70) - for warning in warnings: - print(warning) - print("\nšŸ’” MIGRATION TIP:") - print(" --preset quick (1-2 min, basic features)") - print(" --preset standard (5-10 min, core features, DEFAULT)") - print(" --preset comprehensive (20-60 min, all features + AI)") - print(" --enhance-level 
0-3 (granular AI enhancement control)") - print("\nāš ļø Deprecated flags will be removed in v4.0.0") - print("=" * 70 + "\n") - - -def main(): - """Command-line interface for codebase analysis.""" - from skill_seekers.cli.arguments.analyze import add_analyze_arguments - - parser = argparse.ArgumentParser( - description="Analyze local codebases and extract code knowledge", - formatter_class=argparse.RawDescriptionHelpFormatter, - epilog=""" -Examples: - # Analyze current directory - codebase-scraper --directory . --output output/codebase/ - - # Deep analysis with API reference and dependency graph - codebase-scraper --directory /path/to/repo --depth deep --build-api-reference --build-dependency-graph - - # Analyze only Python and JavaScript - codebase-scraper --directory . --languages Python,JavaScript - - # Use file patterns - codebase-scraper --directory . --file-patterns "*.py,src/**/*.js" - - # Full analysis with all features (default) - codebase-scraper --directory . --depth deep - - # Surface analysis (fast, skip all analysis features) - codebase-scraper --directory . --depth surface --skip-api-reference --skip-dependency-graph --skip-patterns --skip-test-examples - - # Skip specific features - codebase-scraper --directory . --skip-patterns --skip-test-examples -""", - ) - - # Register all args from the shared definitions module - add_analyze_arguments(parser) - - # Extra legacy arg only used by standalone CLI (not in arguments/analyze.py) - parser.add_argument( - "--ai-mode", - choices=["auto", "api", "local", "none"], - default="auto", - help=( - "AI enhancement mode for how-to guides: " - "auto (auto-detect: API if ANTHROPIC_API_KEY set, else LOCAL), " - "api (Anthropic API, requires ANTHROPIC_API_KEY), " - "local (coding agent CLI, FREE, no API key), " - "none (disable AI enhancement). " - "šŸ’” TIP: Use --enhance flag instead for simpler UX!" 
- ), - ) - - # Check for deprecated flags - deprecated_flags = { - "--build-api-reference": "--skip-api-reference", - "--build-dependency-graph": "--skip-dependency-graph", - "--detect-patterns": "--skip-patterns", - "--extract-test-examples": "--skip-test-examples", - "--build-how-to-guides": "--skip-how-to-guides", - "--extract-config-patterns": "--skip-config-patterns", - } - - for old_flag, new_flag in deprecated_flags.items(): - if old_flag in sys.argv: - logger.warning( - f"āš ļø DEPRECATED: {old_flag} is deprecated. " - f"All features are now enabled by default. " - f"Use {new_flag} to disable this feature." - ) - - # Handle --preset-list flag BEFORE parse_args() to avoid required --directory validation - if "--preset-list" in sys.argv: - from skill_seekers.cli.presets import PresetManager - - print(PresetManager.format_preset_help()) - return 0 - - args = parser.parse_args() - - # Check for deprecated flags and show warnings - _check_deprecated_flags(args) - - # Handle presets using formal preset system - preset_name = None - if hasattr(args, "preset") and args.preset: - # New --preset flag (recommended) - preset_name = args.preset - elif hasattr(args, "quick") and args.quick: - # Legacy --quick flag (backward compatibility) - preset_name = "quick" - elif hasattr(args, "comprehensive") and args.comprehensive: - # Legacy --comprehensive flag (backward compatibility) - preset_name = "comprehensive" - else: - # Default preset if none specified - preset_name = "standard" - - # Apply preset using PresetManager - if preset_name: - from skill_seekers.cli.presets import PresetManager - - try: - preset_args = PresetManager.apply_preset(preset_name, vars(args)) - # Update args with preset values - for key, value in preset_args.items(): - setattr(args, key, value) - - preset = PresetManager.get_preset(preset_name) - logger.info(f"{preset.icon} {preset.name} analysis mode: {preset.description}") - except ValueError as e: - logger.error(f"āŒ {e}") - return 1 - - # 
Apply default depth if not set by preset or CLI - if args.depth is None: - args.depth = "deep" # Default depth - - setup_logging(verbose=args.verbose, quiet=getattr(args, "quiet", False)) - - # Handle --dry-run - if getattr(args, "dry_run", False): - directory = Path(args.directory) - print(f"\n{'=' * 60}") - print(f"DRY RUN: Codebase Analysis") - print(f"{'=' * 60}") - print(f"Directory: {directory.resolve()}") - print(f"Output: {args.output}") - print(f"Preset: {preset_name}") - print(f"Depth: {args.depth or 'deep (default)'}") - print(f"Name: {getattr(args, 'name', None) or directory.name}") - print(f"Enhance: level {args.enhance_level}") - print(f"Skip flags: ", end="") - skips = [] - for flag in [ - "skip_api_reference", - "skip_dependency_graph", - "skip_patterns", - "skip_test_examples", - "skip_how_to_guides", - "skip_config_patterns", - "skip_docs", - ]: - if getattr(args, flag, False): - skips.append(f"--{flag.replace('_', '-')}") - print(", ".join(skips) if skips else "(none)") - print(f"\nāœ… Dry run complete") - return 0 - - # Validate directory - directory = Path(args.directory) - if not directory.exists(): - logger.error(f"Directory not found: {directory}") - return 1 - - if not directory.is_dir(): - logger.error(f"Not a directory: {directory}") - return 1 - - # Parse languages - languages = None - if args.languages: - languages = [lang.strip() for lang in args.languages.split(",")] - - # Parse file patterns - file_patterns = None - if args.file_patterns: - file_patterns = [p.strip() for p in args.file_patterns.split(",")] - - # Analyze codebase - try: - results = analyze_codebase( - directory=directory, - output_dir=Path(args.output), - depth=args.depth, - languages=languages, - file_patterns=file_patterns, - build_api_reference=not args.skip_api_reference, - extract_comments=not args.no_comments, - build_dependency_graph=not args.skip_dependency_graph, - detect_patterns=not args.skip_patterns, - extract_test_examples=not args.skip_test_examples, - 
build_how_to_guides=not args.skip_how_to_guides, - extract_config_patterns=not args.skip_config_patterns, - extract_docs=not args.skip_docs, - enhance_level=args.enhance_level, # AI enhancement level (0-3) - skill_name=getattr(args, "name", None), - skill_description=getattr(args, "description", None), - doc_version=getattr(args, "doc_version", ""), - ) - - # ============================================================ - # WORKFLOW SYSTEM INTEGRATION (Phase 2) - # ============================================================ - from skill_seekers.cli.workflow_runner import run_workflows - - workflow_executed, workflow_names = run_workflows(args) - - # Print summary - print(f"\n{'=' * 60}") - print("CODEBASE ANALYSIS COMPLETE") - if workflow_executed: - print(f" + {len(workflow_names)} ENHANCEMENT WORKFLOW(S) EXECUTED") - print(f"{'=' * 60}") - print(f"Files analyzed: {len(results['files'])}") - print(f"Output directory: {args.output}") - if not args.skip_api_reference: - print(f"API reference: {Path(args.output) / 'api_reference'}") - if workflow_executed: - print(f"Workflows applied: {', '.join(workflow_names)}") - print(f"{'=' * 60}\n") - - return 0 - - except KeyboardInterrupt: - logger.error("\nAnalysis interrupted by user") - return 130 - except Exception as e: - logger.error(f"Analysis failed: {e}") - import traceback - - traceback.print_exc() - return 1 - - -if __name__ == "__main__": - sys.exit(main()) + def build_skill(self): + """SkillConverter interface — no-op because analyze_codebase() already calls _generate_skill_md().""" + # analyze_codebase() generates SKILL.md internally via _generate_skill_md(), + # so there is nothing additional to do here. 
+ pass diff --git a/src/skill_seekers/cli/config_extractor.py b/src/skill_seekers/cli/config_extractor.py index 1950766..9afa55b 100644 --- a/src/skill_seekers/cli/config_extractor.py +++ b/src/skill_seekers/cli/config_extractor.py @@ -627,15 +627,17 @@ class ConfigParser: parent_path = [] for key, value in data.items(): + # YAML parses 'on:' as boolean True; convert non-string keys + str_key = str(key) if not isinstance(key, str) else key if isinstance(value, dict): # Recurse into nested dicts - self._extract_settings_from_dict(value, config_file, parent_path + [key]) + self._extract_settings_from_dict(value, config_file, parent_path + [str_key]) else: setting = ConfigSetting( - key=".".join(parent_path + [key]) if parent_path else key, + key=".".join(parent_path + [str_key]) if parent_path else str_key, value=value, value_type=self._infer_type(value), - nested_path=parent_path + [key], + nested_path=parent_path + [str_key], ) config_file.settings.append(setting) diff --git a/src/skill_seekers/cli/config_validator.py b/src/skill_seekers/cli/config_validator.py index d85faba..d663102 100644 --- a/src/skill_seekers/cli/config_validator.py +++ b/src/skill_seekers/cli/config_validator.py @@ -1,8 +1,8 @@ #!/usr/bin/env python3 """ -Unified Config Validator +UniSkillConfig Validator -Validates unified config format that supports multiple sources: +Validates uni_skill_config format that supports multiple sources: - documentation (website scraping) - github (repository scraping) - pdf (PDF document scraping) @@ -34,9 +34,9 @@ logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) -class ConfigValidator: +class UniSkillConfigValidator: """ - Validates unified config format (legacy support removed in v2.11.0). + Validates uni_skill_config format (legacy support removed in v2.11.0). """ # Valid source types @@ -100,7 +100,7 @@ class ConfigValidator: def validate(self) -> bool: """ - Validate unified config format. + Validate uni_skill_config format. 
Returns: True if valid @@ -136,8 +136,8 @@ class ConfigValidator: return self._validate_unified() def _validate_unified(self) -> bool: - """Validate unified config format.""" - logger.info("Validating unified config format...") + """Validate uni_skill_config format.""" + logger.info("Validating uni_skill_config format...") # Required top-level fields if "name" not in self.config: @@ -483,7 +483,11 @@ class ConfigValidator: return has_docs_api and has_github_code -def validate_config(config_path: str) -> ConfigValidator: +# Backward-compat alias +ConfigValidator = UniSkillConfigValidator + + +def validate_config(config_path: str) -> UniSkillConfigValidator: """ Validate config file and return validator instance. @@ -491,12 +495,12 @@ def validate_config(config_path: str) -> ConfigValidator: config_path: Path to config JSON file Returns: - ConfigValidator instance + UniSkillConfigValidator instance Raises: ValueError if config is invalid """ - validator = ConfigValidator(config_path) + validator = UniSkillConfigValidator(config_path) validator.validate() return validator diff --git a/src/skill_seekers/cli/confluence_scraper.py b/src/skill_seekers/cli/confluence_scraper.py index 853500a..4c7567c 100644 --- a/src/skill_seekers/cli/confluence_scraper.py +++ b/src/skill_seekers/cli/confluence_scraper.py @@ -31,15 +31,15 @@ Usage: --space-key DEV --name dev-wiki --max-pages 200 """ -import argparse import json import logging import os import re -import sys from pathlib import Path from typing import Any +from skill_seekers.cli.skill_converter import SkillConverter + # Optional dependency guard for atlassian-python-api try: from atlassian import Confluence @@ -177,7 +177,7 @@ def infer_description_from_confluence( ) -class ConfluenceToSkillConverter: +class ConfluenceToSkillConverter(SkillConverter): """Convert Confluence space documentation to an AI-ready skill. 
Supports two extraction modes: @@ -209,6 +209,8 @@ class ConfluenceToSkillConverter: extracted_data: Structured extraction results dict. """ + SOURCE_TYPE = "confluence" + def __init__(self, config: dict) -> None: """Initialize the Confluence to skill converter. @@ -223,6 +225,7 @@ class ConfluenceToSkillConverter: - description (str): Skill description (optional). - max_pages (int): Maximum pages to fetch, default 500. """ + super().__init__(config) self.config = config self.name: str = config["name"] self.base_url: str = config.get("base_url", "") @@ -242,6 +245,10 @@ class ConfluenceToSkillConverter: # Extracted data storage self.extracted_data: dict[str, Any] | None = None + def extract(self): + """Extract content from Confluence (SkillConverter interface).""" + self.extract_confluence() + # ────────────────────────────────────────────────────────────────────── # Extraction dispatcher # ────────────────────────────────────────────────────────────────────── @@ -1916,255 +1923,3 @@ def _score_code_quality(code: str) -> float: score -= 2.0 return min(10.0, max(0.0, score)) - - -# ────────────────────────────────────────────────────────────────────────────── -# CLI entry point -# ────────────────────────────────────────────────────────────────────────────── - - -def main() -> int: - """CLI entry point for the Confluence scraper. - - Parses command-line arguments and runs the extraction/build pipeline. - Supports three workflows: - - 1. **API mode**: ``--base-url URL --space-key KEY --name my-skill`` - 2. **Export mode**: ``--export-path ./export-dir/ --name my-skill`` - 3. **Build from JSON**: ``--from-json my-skill_extracted.json`` - - Returns: - Exit code (0 for success, non-zero for failure). 
- """ - parser = argparse.ArgumentParser( - description="Convert Confluence documentation to AI-ready skills", - formatter_class=argparse.RawDescriptionHelpFormatter, - epilog=( - "Examples:\n" - " %(prog)s --base-url https://wiki.example.com " - "--space-key PROJ --name my-wiki\n" - " %(prog)s --export-path ./confluence-export/ --name my-wiki\n" - " %(prog)s --from-json my-wiki_extracted.json\n" - ), - ) - - # Standard shared arguments - from .arguments.common import add_all_standard_arguments - - add_all_standard_arguments(parser) - - # Override enhance-level default to 0 for Confluence - for action in parser._actions: - if hasattr(action, "dest") and action.dest == "enhance_level": - action.default = 0 - action.help = ( - "AI enhancement level (auto-detects API vs LOCAL mode): " - "0=disabled (default for Confluence), 1=SKILL.md only, " - "2=+architecture/config, 3=full enhancement. " - "Mode selection: uses API if ANTHROPIC_API_KEY is set, " - "otherwise LOCAL (Claude Code, Kimi, etc.)" - ) - - # Confluence-specific arguments - parser.add_argument( - "--base-url", - type=str, - help="Confluence instance base URL (e.g., https://wiki.example.com)", - metavar="URL", - ) - parser.add_argument( - "--space-key", - type=str, - help="Confluence space key to extract (e.g., PROJ, DEV)", - metavar="KEY", - ) - parser.add_argument( - "--export-path", - type=str, - help="Path to Confluence HTML/XML export directory", - metavar="PATH", - ) - parser.add_argument( - "--username", - type=str, - help=("Confluence username / email for API auth (or set CONFLUENCE_USERNAME env var)"), - metavar="USER", - ) - parser.add_argument( - "--token", - type=str, - help=("Confluence API token for API auth (or set CONFLUENCE_TOKEN env var)"), - metavar="TOKEN", - ) - parser.add_argument( - "--max-pages", - type=int, - default=500, - help="Maximum number of pages to fetch (default: 500)", - metavar="N", - ) - parser.add_argument( - "--from-json", - type=str, - help="Build skill from previously 
extracted JSON data", - metavar="FILE", - ) - - args = parser.parse_args() - - # Setup logging - if getattr(args, "quiet", False): - logging.basicConfig(level=logging.WARNING, format="%(message)s") - elif getattr(args, "verbose", False): - logging.basicConfig(level=logging.DEBUG, format="%(levelname)s: %(message)s") - else: - logging.basicConfig(level=logging.INFO, format="%(message)s") - - # Handle --dry-run - if getattr(args, "dry_run", False): - source = ( - getattr(args, "base_url", None) - or getattr(args, "export_path", None) - or getattr(args, "from_json", None) - or "(none)" - ) - print(f"\n{'=' * 60}") - print("DRY RUN: Confluence Extraction") - print(f"{'=' * 60}") - print(f"Source: {source}") - print(f"Space key: {getattr(args, 'space_key', None) or '(N/A)'}") - print(f"Name: {getattr(args, 'name', None) or '(auto-detect)'}") - print(f"Max pages: {getattr(args, 'max_pages', 500)}") - print(f"Enhance level: {getattr(args, 'enhance_level', 0)}") - print(f"\n Dry run complete") - return 0 - - # Validate inputs - has_api = getattr(args, "base_url", None) and getattr(args, "space_key", None) - has_export = getattr(args, "export_path", None) - has_json = getattr(args, "from_json", None) - - if not (has_api or has_export or has_json): - parser.error( - "Must specify one of:\n" - " --base-url URL --space-key KEY (API mode)\n" - " --export-path PATH (export mode)\n" - " --from-json FILE (build from JSON)" - ) - - # Build from pre-extracted JSON - if has_json: - name = getattr(args, "name", None) or Path(args.from_json).stem.replace("_extracted", "") - config: dict[str, Any] = { - "name": name, - "description": ( - getattr(args, "description", None) or f"Use when referencing {name} documentation" - ), - } - try: - converter = ConfluenceToSkillConverter(config) - converter.load_extracted_data(args.from_json) - converter.build_skill() - except Exception as e: - print(f"\n Error: {e}", file=sys.stderr) - sys.exit(1) - return 0 - - # Determine name - if not 
getattr(args, "name", None): - if has_api: - args.name = args.space_key.lower() - elif has_export: - args.name = Path(args.export_path).name - else: - args.name = "confluence-skill" - - # Build config - config = { - "name": args.name, - "base_url": getattr(args, "base_url", "") or "", - "space_key": getattr(args, "space_key", "") or "", - "export_path": getattr(args, "export_path", "") or "", - "username": getattr(args, "username", "") or "", - "token": getattr(args, "token", "") or "", - "max_pages": getattr(args, "max_pages", 500), - } - if getattr(args, "description", None): - config["description"] = args.description - - # Create converter and run - try: - converter = ConfluenceToSkillConverter(config) - - if not converter.extract_confluence(): - print("\n Confluence extraction failed", file=sys.stderr) - sys.exit(1) - - converter.build_skill() - - # Enhancement workflow integration - from skill_seekers.cli.workflow_runner import run_workflows - - workflow_executed, workflow_names = run_workflows(args) - workflow_name = ", ".join(workflow_names) if workflow_names else None - - # Traditional enhancement (complements workflow system) - if getattr(args, "enhance_level", 0) > 0: - api_key = getattr(args, "api_key", None) or os.environ.get("ANTHROPIC_API_KEY") - mode = "API" if api_key else "LOCAL" - - print("\n" + "=" * 80) - print(f" AI Enhancement ({mode} mode, level {args.enhance_level})") - print("=" * 80) - if workflow_executed: - print(f" Running after workflow: {workflow_name}") - print( - " (Workflow provides specialised analysis," - " enhancement provides general improvements)" - ) - print("") - - skill_dir = converter.skill_dir - if api_key: - try: - from skill_seekers.cli.enhance_skill import enhance_skill_md - - enhance_skill_md(skill_dir, api_key) - print(" API enhancement complete!") - except ImportError: - print(" API enhancement not available. 
Falling back to LOCAL mode...") - from skill_seekers.cli.enhance_skill_local import ( - LocalSkillEnhancer, - ) - - agent = getattr(args, "agent", None) if args else None - agent_cmd = getattr(args, "agent_cmd", None) if args else None - enhancer = LocalSkillEnhancer(Path(skill_dir), agent=agent, agent_cmd=agent_cmd) - enhancer.run(headless=True) - print(" Local enhancement complete!") - else: - from skill_seekers.cli.enhance_skill_local import ( - LocalSkillEnhancer, - ) - - agent = getattr(args, "agent", None) if args else None - agent_cmd = getattr(args, "agent_cmd", None) if args else None - enhancer = LocalSkillEnhancer(Path(skill_dir), agent=agent, agent_cmd=agent_cmd) - enhancer.run(headless=True) - print(" Local enhancement complete!") - - except (ValueError, RuntimeError, FileNotFoundError) as e: - print(f"\n Error: {e}", file=sys.stderr) - sys.exit(1) - except Exception as e: - print(f"\n Unexpected error during Confluence processing: {e}", file=sys.stderr) - import traceback - - traceback.print_exc() - sys.exit(1) - - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/src/skill_seekers/cli/create_command.py b/src/skill_seekers/cli/create_command.py index bca790d..a3341b4 100644 --- a/src/skill_seekers/cli/create_command.py +++ b/src/skill_seekers/cli/create_command.py @@ -1,19 +1,22 @@ """Unified create command - single entry point for skill creation. Auto-detects source type (web, GitHub, local, PDF, config) and routes -to appropriate scraper while maintaining full backward compatibility. +to appropriate converter via get_converter(). 
""" import sys import logging import argparse +from typing import Any from skill_seekers.cli.source_detector import SourceDetector, SourceInfo +from skill_seekers.cli.execution_context import ExecutionContext +from skill_seekers.cli.skill_converter import get_converter from skill_seekers.cli.arguments.create import ( get_compatible_arguments, + get_create_defaults, get_universal_argument_names, ) -from skill_seekers.cli.arguments.common import DEFAULT_CHUNK_TOKENS, DEFAULT_CHUNK_OVERLAP_TOKENS logger = logging.getLogger(__name__) @@ -21,14 +24,20 @@ logger = logging.getLogger(__name__) class CreateCommand: """Unified create command implementation.""" - def __init__(self, args: argparse.Namespace): + def __init__(self, args: argparse.Namespace, parser_defaults: dict[str, Any] | None = None): """Initialize create command. Args: args: Parsed command-line arguments + parser_defaults: Default values from the argument parser. Used by + _is_explicitly_set() to detect which args the user actually + provided on the command line vs. which are just defaults. """ self.args = args self.source_info: SourceInfo | None = None + self._parser_defaults = ( + parser_defaults if parser_defaults is not None else get_create_defaults() + ) def execute(self) -> int: """Execute the create command. @@ -52,12 +61,36 @@ class CreateCommand: logger.error(f"Source validation failed: {e}") return 1 - # 3. Validate and warn about incompatible arguments + # 3. Initialize ExecutionContext with source info + # This provides a single source of truth for all configuration + # Resolve config path from args or source detection + config_path = getattr(self.args, "config", None) or ( + self.source_info.parsed.get("config_path") if self.source_info else None + ) + ExecutionContext.initialize( + args=self.args, + config_path=config_path, + source_info=self.source_info, + ) + + # 4. Validate and warn about incompatible arguments self._validate_arguments() - # 4. 
Route to appropriate scraper - logger.info(f"Routing to {self.source_info.type} scraper...") - return self._route_to_scraper() + # 5. Route to appropriate converter + logger.info(f"Routing to {self.source_info.type} converter...") + result = self._route_to_scraper() + if result != 0: + return result + + # 6. Centralized enhancement (runs after converter, not inside each scraper) + ctx = ExecutionContext.get() + if ctx.enhancement.enabled and ctx.enhancement.level > 0: + self._run_enhancement(ctx) + + # 7. Centralized workflows + self._run_workflows() + + return 0 def _validate_arguments(self) -> None: """Validate arguments and warn about incompatible ones.""" @@ -86,313 +119,338 @@ class CreateCommand: f"{self.source_info.type} sources and will be ignored" ) - def _is_explicitly_set(self, arg_name: str, arg_value: any) -> bool: + def _is_explicitly_set(self, arg_name: str, arg_value: Any) -> bool: """Check if an argument was explicitly set by the user. + Compares the current value against the parser's registered default. + This avoids hardcoding default values that can drift out of sync. + Args: - arg_name: Argument name - arg_value: Argument value + arg_name: Argument destination name + arg_value: Current argument value Returns: True if user explicitly set this argument """ - # Boolean flags - True means it was set - if isinstance(arg_value, bool): - return arg_value - - # None means not set if arg_value is None: return False - # Check against common defaults — args with these values were NOT - # explicitly set by the user and should not be forwarded. 
- defaults = { - "max_issues": 100, - "chunk_tokens": DEFAULT_CHUNK_TOKENS, - "chunk_overlap_tokens": DEFAULT_CHUNK_OVERLAP_TOKENS, - "output": None, - "doc_version": "", - "video_languages": "en", - "whisper_model": "base", - "platform": "slack", - "visual_interval": 0.7, - "visual_min_gap": 0.5, - "visual_similarity": 3.0, - } + # Boolean flags: True means explicitly set (store_true defaults to False) + if isinstance(arg_value, bool): + return arg_value - if arg_name in defaults: - return arg_value != defaults[arg_name] + # Compare against parser default if available + if arg_name in self._parser_defaults: + return arg_value != self._parser_defaults[arg_name] - # Any other non-None value means it was set + # No registered default and non-None → user must have set it return True def _route_to_scraper(self) -> int: - """Route to appropriate scraper based on source type. + """Route to appropriate converter based on source type. + + Builds a config dict from ExecutionContext + source_info, then + calls converter.run() directly — no sys.argv swap needed. 
Returns: - Exit code from scraper + Exit code from converter """ - if self.source_info.type == "web": - return self._route_web() - elif self.source_info.type == "github": - return self._route_github() - elif self.source_info.type == "local": - return self._route_local() - elif self.source_info.type == "pdf": - return self._route_pdf() - elif self.source_info.type == "word": - return self._route_word() - elif self.source_info.type == "epub": - return self._route_epub() - elif self.source_info.type == "video": - return self._route_video() - elif self.source_info.type == "config": - return self._route_config() - elif self.source_info.type == "jupyter": - return self._route_generic("jupyter_scraper", "--notebook") - elif self.source_info.type == "html": - return self._route_generic("html_scraper", "--html-path") - elif self.source_info.type == "openapi": - return self._route_generic("openapi_scraper", "--spec") - elif self.source_info.type == "asciidoc": - return self._route_generic("asciidoc_scraper", "--asciidoc-path") - elif self.source_info.type == "pptx": - return self._route_generic("pptx_scraper", "--pptx") - elif self.source_info.type == "rss": - return self._route_generic("rss_scraper", "--feed-path") - elif self.source_info.type == "manpage": - return self._route_generic("man_scraper", "--man-path") - elif self.source_info.type == "confluence": - return self._route_generic("confluence_scraper", "--export-path") - elif self.source_info.type == "notion": - return self._route_generic("notion_scraper", "--export-path") - elif self.source_info.type == "chat": - return self._route_generic("chat_scraper", "--export-path") - else: - logger.error(f"Unknown source type: {self.source_info.type}") - return 1 + source_type = self.source_info.type + ctx = ExecutionContext.get() - # ── Dynamic argument forwarding ────────────────────────────────────── - # - # Instead of manually checking each flag in every _route_*() method, - # _build_argv() dynamically iterates 
vars(self.args) and forwards all - # explicitly-set arguments. This is the same pattern used by - # main.py::_reconstruct_argv() and eliminates ~40 missing-flag gaps. + # UnifiedScraper is special — it takes config_path, not a config dict + if source_type == "config": + from skill_seekers.cli.unified_scraper import UnifiedScraper - # Dest names that differ from their CLI flag (dest → flag) - _DEST_TO_FLAG = { - "async_mode": "--async", - "video_url": "--url", - "video_playlist": "--playlist", - "video_languages": "--languages", - "skip_config": "--skip-config-patterns", - } + config_path = self.source_info.parsed["config_path"] + merge_mode = getattr(self.args, "merge_mode", None) + converter = UnifiedScraper(config_path, merge_mode=merge_mode) + return converter.run() - # Internal args that should never be forwarded to sub-scrapers. - # video_url/video_playlist/video_file are handled as positionals by _route_video(). - # config is forwarded manually only by routes that need it (web, github). - _SKIP_ARGS = frozenset( - { - "source", - "func", - "subcommand", - "command", - "config", - "video_url", - "video_playlist", - "video_file", - } - ) + config = self._build_config(source_type, ctx) + converter = get_converter(source_type, config) + return converter.run() - def _build_argv( - self, - module_name: str, - positional_args: list[str], - allowlist: frozenset[str] | None = None, - ) -> list[str]: - """Build argv dynamically by forwarding all explicitly-set arguments. + def _build_config(self, source_type: str, ctx: ExecutionContext) -> dict[str, Any]: + """Build a config dict for the converter from ExecutionContext. - Uses the same pattern as main.py::_reconstruct_argv(). - Replaces manual per-flag checking in _route_*() and _add_common_args(). + Each converter reads specific keys from the config dict passed to + its __init__. This method constructs that dict from the centralized + ExecutionContext, which already holds all CLI args + config file values. 
Args: - module_name: Scraper module name (e.g., "doc_scraper") - positional_args: Positional arguments to prepend (e.g., [url] or ["--repo", repo]) - allowlist: If provided, ONLY forward args in this set (overrides _SKIP_ARGS). - Used for targets with strict arg sets like unified_scraper. + source_type: Detected source type (web, github, pdf, etc.) + ctx: Initialized ExecutionContext Returns: - Complete argv list for the scraper + Config dict suitable for the converter's __init__. """ - argv = [module_name] + positional_args - - # Auto-add suggested name if user didn't provide one (skip for allowlisted targets) - if not allowlist and not self.args.name and self.source_info: - argv.extend(["--name", self.source_info.suggested_name]) - - for key, value in vars(self.args).items(): - # If allowlist provided, only forward args in the allowlist - if allowlist is not None: - if key not in allowlist: - continue - elif key in self._SKIP_ARGS or key.startswith("_help_"): - continue - if not self._is_explicitly_set(key, value): - continue - - # Use translation map for mismatched dest→flag names, else derive from key - if key in self._DEST_TO_FLAG: - arg_flag = self._DEST_TO_FLAG[key] - else: - arg_flag = f"--{key.replace('_', '-')}" - - if isinstance(value, bool): - if value: - argv.append(arg_flag) - elif isinstance(value, list): - for item in value: - argv.extend([arg_flag, str(item)]) - elif value is not None: - argv.extend([arg_flag, str(value)]) - - return argv - - def _call_module(self, module, argv: list[str]) -> int: - """Call a scraper module with the given argv. - - Swaps sys.argv, calls module.main(), restores sys.argv. 
- """ - logger.debug(f"Calling {argv[0]} with argv: {argv}") - original_argv = sys.argv - try: - sys.argv = argv - result = module.main() - if result is None: - logger.warning(f"Module returned None exit code, treating as success") - return 0 - return result - finally: - sys.argv = original_argv - - def _route_web(self) -> int: - """Route to web documentation scraper (doc_scraper.py).""" - from skill_seekers.cli import doc_scraper - - url = self.source_info.parsed.get("url", self.source_info.raw_source) - argv = self._build_argv("doc_scraper", [url]) - - # Forward config if set (not in _build_argv since it's in SKIP_ARGS - # to avoid double-forwarding for config-type sources) - if self.args.config: - argv.extend(["--config", self.args.config]) - - return self._call_module(doc_scraper, argv) - - def _route_github(self) -> int: - """Route to GitHub repository scraper (github_scraper.py).""" - from skill_seekers.cli import github_scraper - - repo = self.source_info.parsed.get("repo", self.source_info.raw_source) - argv = self._build_argv("github_scraper", ["--repo", repo]) - - if self.args.config: - argv.extend(["--config", self.args.config]) - - return self._call_module(github_scraper, argv) - - def _route_local(self) -> int: - """Route to local codebase analyzer (codebase_scraper.py).""" - from skill_seekers.cli import codebase_scraper - - directory = self.source_info.parsed.get("directory", self.source_info.raw_source) - argv = self._build_argv("codebase_scraper", ["--directory", directory]) - return self._call_module(codebase_scraper, argv) - - def _route_pdf(self) -> int: - """Route to PDF scraper (pdf_scraper.py).""" - from skill_seekers.cli import pdf_scraper - - file_path = self.source_info.parsed.get("file_path", self.source_info.raw_source) - argv = self._build_argv("pdf_scraper", ["--pdf", file_path]) - return self._call_module(pdf_scraper, argv) - - def _route_word(self) -> int: - """Route to Word document scraper (word_scraper.py).""" - from 
skill_seekers.cli import word_scraper - - file_path = self.source_info.parsed.get("file_path", self.source_info.raw_source) - argv = self._build_argv("word_scraper", ["--docx", file_path]) - return self._call_module(word_scraper, argv) - - def _route_epub(self) -> int: - """Route to EPUB scraper (epub_scraper.py).""" - from skill_seekers.cli import epub_scraper - - file_path = self.source_info.parsed.get("file_path", self.source_info.raw_source) - argv = self._build_argv("epub_scraper", ["--epub", file_path]) - return self._call_module(epub_scraper, argv) - - def _route_video(self) -> int: - """Route to video scraper (video_scraper.py).""" - from skill_seekers.cli import video_scraper - parsed = self.source_info.parsed - if parsed.get("source_kind") == "file": - positional = ["--video-file", parsed["file_path"]] - elif parsed.get("url"): - url = parsed["url"] - flag = "--playlist" if "playlist" in url.lower() else "--url" - positional = [flag, url] - else: - positional = [] + name = ctx.output.name or self.source_info.suggested_name - argv = self._build_argv("video_scraper", positional) - return self._call_module(video_scraper, argv) - - # Args accepted by unified_scraper (allowlist for config route) - _UNIFIED_SCRAPER_ARGS = frozenset( - { - "merge_mode", - "skip_codebase_analysis", - "fresh", - "dry_run", - "enhance_workflow", - "enhance_stage", - "var", - "workflow_dry_run", - "api_key", - "enhance_level", - "agent", - "agent_cmd", + # Common keys shared by all converters + config: dict[str, Any] = { + "name": name, + "description": getattr(self.args, "description", None) + or f"Use when working with {name}", } - ) - def _route_config(self) -> int: - """Route to unified scraper for config files (unified_scraper.py).""" - from skill_seekers.cli import unified_scraper + if source_type == "web": + url = parsed.get("url", parsed.get("base_url", self.source_info.raw_input)) + config.update( + { + "base_url": url, + "doc_version": ctx.output.doc_version, + 
"max_pages": ctx.scraping.max_pages, + "rate_limit": ctx.scraping.rate_limit, + "browser": ctx.scraping.browser, + "browser_wait_until": ctx.scraping.browser_wait_until, + "browser_extra_wait": ctx.scraping.browser_extra_wait, + "workers": ctx.scraping.workers, + "async_mode": ctx.scraping.async_mode, + "resume": ctx.scraping.resume, + "fresh": ctx.scraping.fresh, + "skip_scrape": ctx.scraping.skip_scrape, + "selectors": {"title": "title", "code_blocks": "pre code"}, + "url_patterns": {"include": [], "exclude": []}, + } + ) + # Load from config file if provided + config_path = getattr(self.args, "config", None) + if config_path: + self._merge_json_config(config, config_path) - config_path = self.source_info.parsed["config_path"] - argv = self._build_argv( - "unified_scraper", - ["--config", config_path], - allowlist=self._UNIFIED_SCRAPER_ARGS, - ) - return self._call_module(unified_scraper, argv) + elif source_type == "github": + repo = parsed.get("repo", self.source_info.raw_input) + config.update( + { + "repo": repo, + "local_repo_path": getattr(self.args, "local_repo_path", None), + "include_issues": getattr(self.args, "include_issues", True), + "max_issues": getattr(self.args, "max_issues", 100), + "include_changelog": getattr(self.args, "include_changelog", True), + "include_releases": getattr(self.args, "include_releases", True), + "include_code": getattr(self.args, "include_code", False), + } + ) + config_path = getattr(self.args, "config", None) + if config_path: + self._merge_json_config(config, config_path) - def _route_generic(self, module_name: str, file_flag: str) -> int: - """Generic routing for new source types. 
+ elif source_type == "local": + directory = parsed.get("directory", self.source_info.raw_input) + config.update( + { + "directory": directory, + "depth": ctx.analysis.depth, + "output_dir": ctx.output.output_dir or f"output/{name}", + "languages": getattr(self.args, "languages", None), + "file_patterns": ctx.analysis.file_patterns, + "detect_patterns": not ctx.analysis.skip_patterns, + "extract_test_examples": not ctx.analysis.skip_test_examples, + "build_how_to_guides": not ctx.analysis.skip_how_to_guides, + "extract_config_patterns": not ctx.analysis.skip_config_patterns, + "build_api_reference": not ctx.analysis.skip_api_reference, + "build_dependency_graph": not ctx.analysis.skip_dependency_graph, + "extract_docs": not ctx.analysis.skip_docs, + "extract_comments": not ctx.analysis.no_comments, + "enhance_level": ctx.enhancement.level if ctx.enhancement.enabled else 0, + "skill_name": name, + "doc_version": ctx.output.doc_version, + } + ) - All new source types (jupyter, html, openapi, asciidoc, pptx, rss, - manpage, confluence, notion, chat) use dynamic argument forwarding. 
+ elif source_type == "pdf": + config.update( + { + "pdf_path": parsed.get("file_path", self.source_info.raw_input), + "extract_options": { + "chunk_size": 10, + "min_quality": 5.0, + "extract_images": True, + "min_image_size": 100, + }, + } + ) + + elif source_type == "word": + config["docx_path"] = parsed.get("file_path", self.source_info.raw_input) + + elif source_type == "epub": + config["epub_path"] = parsed.get("file_path", self.source_info.raw_input) + + elif source_type == "video": + config.update( + { + "languages": getattr(self.args, "video_languages", "en"), + "visual": getattr(self.args, "visual", False), + "whisper_model": getattr(self.args, "whisper_model", "base"), + "visual_interval": getattr(self.args, "visual_interval", 0.7), + "visual_min_gap": getattr(self.args, "visual_min_gap", 0.5), + "visual_similarity": getattr(self.args, "visual_similarity", 3.0), + } + ) + # Video source can be URL, playlist, or file + if parsed.get("source_kind") == "file": + config["video_file"] = parsed["file_path"] + elif parsed.get("url"): + url = parsed["url"] + if "playlist" in url.lower(): + config["playlist"] = url + else: + config["url"] = url + else: + # Fallback: treat raw input as URL + config["url"] = self.source_info.raw_input + + elif source_type == "jupyter": + config["notebook_path"] = parsed.get("file_path", self.source_info.raw_input) + + elif source_type == "html": + config["html_path"] = parsed.get("file_path", self.source_info.raw_input) + + elif source_type == "openapi": + file_path = parsed.get("file_path", self.source_info.raw_input) + if file_path.startswith(("http://", "https://")): + config["spec_url"] = file_path + else: + config["spec_path"] = file_path + + elif source_type == "asciidoc": + config["asciidoc_path"] = parsed.get("file_path", self.source_info.raw_input) + + elif source_type == "pptx": + config["pptx_path"] = parsed.get("file_path", self.source_info.raw_input) + + elif source_type == "rss": + file_path = parsed.get("file_path", 
self.source_info.raw_input) + if file_path.startswith(("http://", "https://")): + config["feed_url"] = file_path + else: + config["feed_path"] = file_path + config["follow_links"] = getattr(self.args, "follow_links", True) + config["max_articles"] = getattr(self.args, "max_articles", 50) + + elif source_type == "manpage": + file_path = parsed.get("file_path", "") + if file_path: + config["man_path"] = file_path + man_names = parsed.get("man_names", []) + if man_names: + config["man_names"] = man_names + + elif source_type == "confluence": + config.update( + { + "export_path": parsed.get("file_path", ""), + "base_url": getattr(self.args, "confluence_url", ""), + "space_key": getattr(self.args, "space_key", ""), + "username": getattr(self.args, "username", ""), + "token": getattr(self.args, "token", ""), + "max_pages": getattr(self.args, "max_pages", 500), + } + ) + + elif source_type == "notion": + config.update( + { + "export_path": parsed.get("file_path"), + "database_id": getattr(self.args, "database_id", None), + "page_id": getattr(self.args, "page_id", None), + "token": getattr(self.args, "notion_token", None), + "max_pages": getattr(self.args, "max_pages", 100), + } + ) + + elif source_type == "chat": + config.update( + { + "export_path": parsed.get("file_path", ""), + "platform": getattr(self.args, "platform", "slack"), + "token": getattr(self.args, "token", ""), + "channel": getattr(self.args, "channel", ""), + "max_messages": getattr(self.args, "max_messages", 1000), + } + ) + + return config + + @staticmethod + def _merge_json_config(config: dict[str, Any], config_path: str) -> None: + """Merge a JSON config file into the config dict. + + Config file values are used as defaults — CLI args (already in config) take precedence. 
""" - import importlib + import json - module = importlib.import_module(f"skill_seekers.cli.{module_name}") + try: + with open(config_path, encoding="utf-8") as f: + file_config = json.load(f) + # Only set keys that aren't already in config + for key, value in file_config.items(): + if key not in config: + config[key] = value + except (FileNotFoundError, json.JSONDecodeError) as e: + logger.warning(f"Could not load config file {config_path}: {e}") - file_path = self.source_info.parsed.get("file_path", "") - positional = [file_flag, file_path] if file_path else [] - argv = self._build_argv(module_name, positional) - return self._call_module(module, argv) + def _run_enhancement(self, ctx: ExecutionContext) -> None: + """Run centralized AI enhancement after converter completes.""" + from pathlib import Path + + name = ctx.output.name or ( + self.source_info.suggested_name if self.source_info else "unnamed" + ) + skill_dir = ctx.output.output_dir or f"output/{name}" + + logger.info("\n" + "=" * 60) + logger.info(f"Enhancing SKILL.md (level {ctx.enhancement.level})") + logger.info("=" * 60) + + try: + from skill_seekers.cli.agent_client import AgentClient + + client = AgentClient( + mode=ctx.enhancement.mode, + agent=ctx.enhancement.agent, + api_key=ctx.enhancement.api_key, + ) + + if client.mode == "api" and client.client: + from skill_seekers.cli.enhance_skill import enhance_skill_md + + api_key = ctx.enhancement.api_key or client.api_key + if api_key: + enhance_skill_md(skill_dir, api_key) + logger.info("API enhancement complete!") + else: + logger.warning("No API key available for enhancement") + else: + from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer + + enhancer = LocalSkillEnhancer( + Path(skill_dir), + agent=ctx.enhancement.agent, + agent_cmd=ctx.enhancement.agent_cmd, + ) + success = enhancer.run(headless=True, timeout=ctx.enhancement.timeout) + if success: + agent_name = ctx.enhancement.agent or "claude" + logger.info(f"Local enhancement 
complete! (via {agent_name})") + else: + logger.warning("Local enhancement did not complete") + except Exception as e: + logger.warning(f"Enhancement failed: {e}") + + def _run_workflows(self) -> None: + """Run enhancement workflows if configured.""" + try: + from skill_seekers.cli.workflow_runner import run_workflows + + run_workflows(self.args) + except ImportError: + pass + except Exception as e: + logger.warning(f"Workflow execution failed: {e}") def main() -> int: @@ -492,97 +550,28 @@ Common Workflows: args = parser.parse_args() # Handle source-specific help modes - if args._help_web: - # Recreate parser with web-specific arguments - parser_web = argparse.ArgumentParser( - prog="skill-seekers create", - description="Create skill from web documentation", - formatter_class=argparse.RawDescriptionHelpFormatter, - ) - add_create_arguments(parser_web, mode="web") - parser_web.print_help() - return 0 - elif args._help_github: - parser_github = argparse.ArgumentParser( - prog="skill-seekers create", - description="Create skill from GitHub repository", - formatter_class=argparse.RawDescriptionHelpFormatter, - ) - add_create_arguments(parser_github, mode="github") - parser_github.print_help() - return 0 - elif args._help_local: - parser_local = argparse.ArgumentParser( - prog="skill-seekers create", - description="Create skill from local codebase", - formatter_class=argparse.RawDescriptionHelpFormatter, - ) - add_create_arguments(parser_local, mode="local") - parser_local.print_help() - return 0 - elif args._help_pdf: - parser_pdf = argparse.ArgumentParser( - prog="skill-seekers create", - description="Create skill from PDF file", - formatter_class=argparse.RawDescriptionHelpFormatter, - ) - add_create_arguments(parser_pdf, mode="pdf") - parser_pdf.print_help() - return 0 - elif args._help_word: - parser_word = argparse.ArgumentParser( - prog="skill-seekers create", - description="Create skill from Word document (.docx)", - 
formatter_class=argparse.RawDescriptionHelpFormatter, - ) - add_create_arguments(parser_word, mode="word") - parser_word.print_help() - return 0 - elif args._help_epub: - parser_epub = argparse.ArgumentParser( - prog="skill-seekers create", - description="Create skill from EPUB e-book (.epub)", - formatter_class=argparse.RawDescriptionHelpFormatter, - ) - add_create_arguments(parser_epub, mode="epub") - parser_epub.print_help() - return 0 - elif args._help_video: - parser_video = argparse.ArgumentParser( - prog="skill-seekers create", - description="Create skill from video (YouTube, Vimeo, local files)", - formatter_class=argparse.RawDescriptionHelpFormatter, - ) - add_create_arguments(parser_video, mode="video") - parser_video.print_help() - return 0 - elif args._help_config: - parser_config = argparse.ArgumentParser( - prog="skill-seekers create", - description="Create skill from multi-source config file (unified scraper)", - formatter_class=argparse.RawDescriptionHelpFormatter, - ) - add_create_arguments(parser_config, mode="config") - parser_config.print_help() - return 0 - elif args._help_advanced: - parser_advanced = argparse.ArgumentParser( - prog="skill-seekers create", - description="Create skill - advanced options", - formatter_class=argparse.RawDescriptionHelpFormatter, - ) - add_create_arguments(parser_advanced, mode="advanced") - parser_advanced.print_help() - return 0 - elif args._help_all: - parser_all = argparse.ArgumentParser( - prog="skill-seekers create", - description="Create skill - all options", - formatter_class=argparse.RawDescriptionHelpFormatter, - ) - add_create_arguments(parser_all, mode="all") - parser_all.print_help() - return 0 + _HELP_MODES = { + "_help_web": ("web", "Create skill from web documentation"), + "_help_github": ("github", "Create skill from GitHub repository"), + "_help_local": ("local", "Create skill from local codebase"), + "_help_pdf": ("pdf", "Create skill from PDF file"), + "_help_word": ("word", "Create skill from 
Word document (.docx)"), + "_help_epub": ("epub", "Create skill from EPUB e-book (.epub)"), + "_help_video": ("video", "Create skill from video (YouTube, Vimeo, local files)"), + "_help_config": ("config", "Create skill from multi-source config file (unified scraper)"), + "_help_advanced": ("advanced", "Create skill - advanced options"), + "_help_all": ("all", "Create skill - all options"), + } + for attr, (mode, description) in _HELP_MODES.items(): + if getattr(args, attr, False): + help_parser = argparse.ArgumentParser( + prog="skill-seekers create", + description=description, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + add_create_arguments(help_parser, mode=mode) + help_parser.print_help() + return 0 # Setup logging log_level = logging.DEBUG if args.verbose else (logging.WARNING if args.quiet else logging.INFO) diff --git a/src/skill_seekers/cli/doc_scraper.py b/src/skill_seekers/cli/doc_scraper.py old mode 100755 new mode 100644 index f41cb2b..0f94d97 --- a/src/skill_seekers/cli/doc_scraper.py +++ b/src/skill_seekers/cli/doc_scraper.py @@ -46,7 +46,7 @@ from skill_seekers.cli.language_detector import LanguageDetector from skill_seekers.cli.llms_txt_detector import LlmsTxtDetector from skill_seekers.cli.llms_txt_downloader import LlmsTxtDownloader from skill_seekers.cli.llms_txt_parser import LlmsTxtParser -from skill_seekers.cli.arguments.scrape import add_scrape_arguments +from skill_seekers.cli.skill_converter import SkillConverter from skill_seekers.cli.utils import sanitize_url, setup_logging # Configure logging @@ -152,8 +152,11 @@ def infer_description_from_docs( ) -class DocToSkillConverter: +class DocToSkillConverter(SkillConverter): + SOURCE_TYPE = "web" + def __init__(self, config: dict[str, Any], dry_run: bool = False, resume: bool = False) -> None: + super().__init__(config) self.config = config self.name = config["name"] self.base_url = config["base_url"] @@ -1943,6 +1946,10 @@ To refresh this skill with updated documentation: 
logger.info(" āœ“ index.md") + def extract(self): + """SkillConverter interface — delegates to scrape_all().""" + self.scrape_all() + def build_skill(self) -> bool: """Build the skill from scraped data. @@ -2209,495 +2216,130 @@ def load_config(config_path: str) -> dict[str, Any]: return config -def interactive_config() -> dict[str, Any]: - """Interactive configuration wizard for creating new configs. +def scrape_documentation( + config: dict[str, Any], + ctx: Any | None = None, + verbose: bool = False, + quiet: bool = False, +) -> int: + """Scrape documentation using config and optional context. - Prompts user for all required configuration fields step-by-step - and returns a complete configuration dictionary. - - Returns: - dict: Complete configuration dictionary with user-provided values - - Example: - >>> config = interactive_config() - # User enters: name=react, url=https://react.dev, etc. - >>> config['name'] - 'react' - """ - logger.info("\n" + "=" * 60) - logger.info("Documentation to Skill Converter") - logger.info("=" * 60 + "\n") - - config: dict[str, Any] = {} - - # Basic info - config["name"] = input("Skill name (e.g., 'react', 'godot'): ").strip() - config["description"] = input("Skill description: ").strip() - config["base_url"] = input("Base URL (e.g., https://docs.example.com/): ").strip() - - if not config["base_url"].endswith("/"): - config["base_url"] += "/" - - # Selectors - logger.info("\nCSS Selectors (press Enter for defaults):") - selectors = {} - selectors["main_content"] = ( - input(" Main content [div[role='main']]: ").strip() or "div[role='main']" - ) - selectors["title"] = input(" Title [title]: ").strip() or "title" - selectors["code_blocks"] = input(" Code blocks [pre code]: ").strip() or "pre code" - config["selectors"] = selectors - - # URL patterns - logger.info("\nURL Patterns (comma-separated, optional):") - include = input(" Include: ").strip() - exclude = input(" Exclude: ").strip() - config["url_patterns"] = { - "include": 
[p.strip() for p in include.split(",") if p.strip()], - "exclude": [p.strip() for p in exclude.split(",") if p.strip()], - } - - # Settings - rate = input(f"\nRate limit (seconds) [{DEFAULT_RATE_LIMIT}]: ").strip() - config["rate_limit"] = float(rate) if rate else DEFAULT_RATE_LIMIT - - max_p = input(f"Max pages [{DEFAULT_MAX_PAGES}]: ").strip() - config["max_pages"] = int(max_p) if max_p else DEFAULT_MAX_PAGES - - return config - - -def check_existing_data(name: str) -> tuple[bool, int]: - """Check if scraped data already exists for a skill. + This is the main entry point for programmatic use. CLI main() is a thin + wrapper around this function. Args: - name (str): Skill name to check + config: Configuration dictionary with required fields (name, base_url, etc.) + ctx: Optional ExecutionContext for shared configuration + verbose: Enable verbose logging + quiet: Minimize logging output Returns: - tuple: (exists, page_count) where exists is bool and page_count is int - - Example: - >>> exists, count = check_existing_data('react') - >>> if exists: - ... print(f"Found {count} existing pages") + Exit code (0 for success, non-zero for error) """ - data_dir = f"output/{name}_data" - if os.path.exists(data_dir) and os.path.exists(f"{data_dir}/summary.json"): - with open(f"{data_dir}/summary.json", encoding="utf-8") as f: - summary = json.load(f) - return True, summary.get("total_pages", 0) - return False, 0 + from skill_seekers.cli.execution_context import ExecutionContext + # Setup logging + setup_logging(verbose=verbose, quiet=quiet) -def setup_argument_parser() -> argparse.ArgumentParser: - """Setup and configure command-line argument parser. - - Creates an ArgumentParser with all CLI options for the doc scraper tool, - including configuration, scraping, enhancement, and performance options. - - All arguments are defined in skill_seekers.cli.arguments.scrape to ensure - consistency between the standalone scraper and unified CLI. 
- - Returns: - argparse.ArgumentParser: Configured argument parser - - Example: - >>> parser = setup_argument_parser() - >>> args = parser.parse_args(['--config', 'configs/react.json']) - >>> print(args.config) - configs/react.json - """ - parser = argparse.ArgumentParser( - description="Convert documentation websites to AI skills", - formatter_class=argparse.RawDescriptionHelpFormatter, - ) - - # Add all scrape arguments from shared definitions - # This ensures the standalone scraper and unified CLI stay in sync - add_scrape_arguments(parser) - - return parser - - -def get_configuration(args: argparse.Namespace) -> dict[str, Any]: - """Load or create configuration from command-line arguments. - - Handles three configuration modes: - 1. Load from JSON file (--config) - 2. Interactive configuration wizard (--interactive or missing args) - 3. Quick mode from command-line arguments (--name, --url) - - Also applies CLI overrides for rate limiting and worker count. - - Args: - args: Parsed command-line arguments from argparse - - Returns: - dict: Configuration dictionary with all required fields - - Example: - >>> args = parser.parse_args(['--name', 'react', '--url', 'https://react.dev']) - >>> config = get_configuration(args) - >>> print(config['name']) - react - """ - # Handle URL from either positional argument or --url flag - # Positional 'url' takes precedence, then --url flag - effective_url = getattr(args, "url", None) - - # Get base configuration - if args.config: - config = load_config(args.config) - elif args.interactive or not (args.name and effective_url): - config = interactive_config() - else: - config = { - "name": args.name, - "description": args.description or f"Use when working with {args.name}", - "base_url": effective_url, - "selectors": { - "title": "title", - "code_blocks": "pre code", - }, - "url_patterns": {"include": [], "exclude": []}, - "rate_limit": DEFAULT_RATE_LIMIT, - "max_pages": DEFAULT_MAX_PAGES, - } - - # Apply CLI override for 
doc_version (works for all config modes) - cli_doc_version = getattr(args, "doc_version", "") - if cli_doc_version: - config["doc_version"] = cli_doc_version - - # Apply CLI overrides for rate limiting - if args.no_rate_limit: - config["rate_limit"] = 0 - logger.info("⚔ Rate limiting disabled") - elif args.rate_limit is not None: - config["rate_limit"] = args.rate_limit - if args.rate_limit == 0: - logger.info("⚔ Rate limiting disabled") + # Use existing context if already initialized, otherwise create one + if ctx is None: + if ExecutionContext._initialized: + ctx = ExecutionContext.get() else: - logger.info("⚔ Rate limit override: %ss per page", args.rate_limit) + ctx = ExecutionContext.initialize(args=argparse.Namespace(**config)) - # Apply CLI overrides for worker count - if args.workers: - # Validate workers count - if args.workers < 1: - logger.error("āŒ Error: --workers must be at least 1 (got %d)", args.workers) - logger.error(" Suggestion: Use --workers 1 (default) or omit the flag") - sys.exit(1) - if args.workers > 10: - logger.warning("āš ļø Warning: --workers capped at 10 (requested %d)", args.workers) - args.workers = 10 - config["workers"] = args.workers - if args.workers > 1: - logger.info("šŸš€ Parallel scraping enabled: %d workers", args.workers) + # Build converter and execute + try: + converter = _run_scraping(config) + if converter is None: + return 1 - # Apply CLI override for async mode - if args.async_mode: - config["async_mode"] = True - if config.get("workers", 1) > 1: - logger.info("⚔ Async mode enabled (2-3x faster than threads)") - else: - logger.warning( - "āš ļø Async mode enabled but workers=1. 
Consider using --workers 4 for better performance" - ) + # Handle enhancement if enabled + if ctx.enhancement.enabled and ctx.enhancement.level > 0: + _run_enhancement(config, ctx, converter) - # Apply CLI override for browser mode - if getattr(args, "browser", False): - config["browser"] = True - logger.info("🌐 Browser mode enabled (Playwright headless Chromium)") - - # Apply CLI override for max_pages - if args.max_pages is not None: - old_max = config.get("max_pages", DEFAULT_MAX_PAGES) - config["max_pages"] = args.max_pages - - # Warnings for --max-pages usage - if args.max_pages > 1000: - logger.warning( - "āš ļø --max-pages=%d is very high - scraping may take hours", args.max_pages - ) - logger.warning(" Recommendation: Use configs with reasonable limits for production") - elif args.max_pages < 10: - logger.warning( - "āš ļø --max-pages=%d is very low - may result in incomplete skill", args.max_pages - ) - - if old_max and old_max != args.max_pages: - logger.info( - "šŸ“Š Max pages override: %d → %d (from --max-pages flag)", old_max, args.max_pages - ) - else: - logger.info("šŸ“Š Max pages set to: %d (from --max-pages flag)", args.max_pages) - - return config + return 0 + except Exception as e: + logger.error(f"Scraping failed: {e}") + return 1 -def execute_scraping_and_building( - config: dict[str, Any], args: argparse.Namespace -) -> Optional["DocToSkillConverter"]: - """Execute the scraping and skill building process. - - Handles dry run mode, existing data checks, scraping with checkpoints, - keyboard interrupts, and skill building. This is the core workflow - orchestration for the scraping phase. 
- - Args: - config (dict): Configuration dictionary with scraping parameters - args: Parsed command-line arguments - - Returns: - DocToSkillConverter: The converter instance after scraping/building, - or None if process was aborted - - Example: - >>> config = {'name': 'react', 'base_url': 'https://react.dev'} - >>> converter = execute_scraping_and_building(config, args) - >>> if converter: - ... print("Scraping complete!") - """ - # Dry run mode - preview only - if args.dry_run: - logger.info("\n" + "=" * 60) - logger.info("DRY RUN MODE") - logger.info("=" * 60) - logger.info("This will show what would be scraped without saving anything.\n") - - converter = DocToSkillConverter(config, dry_run=True) - converter.scrape_all() - - logger.info("\nšŸ“‹ Configuration Summary:") - logger.info(" Name: %s", config["name"]) - logger.info(" Base URL: %s", config["base_url"]) - logger.info(" Max pages: %d", config.get("max_pages", DEFAULT_MAX_PAGES)) - logger.info(" Rate limit: %ss", config.get("rate_limit", DEFAULT_RATE_LIMIT)) - logger.info(" Categories: %d", len(config.get("categories", {}))) - return None - - # Check for existing data - exists, page_count = check_existing_data(config["name"]) - - if exists and not args.skip_scrape and not args.fresh: - # Check force_rescrape flag from config - if config.get("force_rescrape", False): - # Auto-delete cached data and rescrape - logger.info("\nāœ“ Found existing data: %d pages", page_count) - logger.info(" force_rescrape enabled - deleting cached data and rescaping") - import shutil - - data_dir = f"output/{config['name']}_data" - if os.path.exists(data_dir): - shutil.rmtree(data_dir) - logger.info(f" Deleted: {data_dir}") - else: - # Only prompt if force_rescrape is False - logger.info("\nāœ“ Found existing data: %d pages", page_count) - response = input("Use existing data? 
(y/n): ").strip().lower() - if response == "y": - args.skip_scrape = True - elif exists and args.fresh: - logger.info("\nāœ“ Found existing data: %d pages", page_count) - logger.info(" --fresh flag set, will re-scrape from scratch") - +def _run_scraping(config: dict[str, Any]) -> Optional["DocToSkillConverter"]: + """Run the scraping process.""" # Create converter - converter = DocToSkillConverter(config, resume=args.resume) + converter = DocToSkillConverter(config) - # Initialize workflow tracking (will be updated if workflow runs) - converter.workflow_executed = False - converter.workflow_name = None - - # Handle fresh start (clear checkpoint) - if args.fresh: - converter.clear_checkpoint() - - # Scrape or skip - if not args.skip_scrape: - try: - converter.scrape_all() - # Save final checkpoint - if converter.checkpoint_enabled: - converter.save_checkpoint() - logger.info("\nšŸ’¾ Final checkpoint saved") - # Clear checkpoint after successful completion - converter.clear_checkpoint() - logger.info("āœ… Scraping complete - checkpoint cleared") - except KeyboardInterrupt: - logger.warning("\n\nScraping interrupted.") - if converter.checkpoint_enabled: - converter.save_checkpoint() - logger.info("šŸ’¾ Progress saved to checkpoint") - logger.info( - " Resume with: --config %s --resume", - args.config if args.config else "config.json", - ) - response = input("Continue with skill building? 
(y/n): ").strip().lower() - if response != "y": - return None + # Check for resume + if config.get("resume") and converter.checkpoint_exists(): + logger.info("šŸ“‚ Resuming from checkpoint...") + converter.load_checkpoint() else: - logger.info("\nā­ļø Skipping scrape, using existing data") + # Clear checkpoint if fresh start + if config.get("fresh"): + converter.clear_checkpoint() + + # Scrape + if not config.get("skip_scrape"): + logger.info("\nšŸ” Starting scrape...") + try: + asyncio.run(converter.scrape()) + except KeyboardInterrupt: + logger.info("\n\nāš ļø Interrupted by user") + converter.save_checkpoint() + logger.info("šŸ’¾ Checkpoint saved. Resume with --resume") + return None # Build skill - success = converter.build_skill() - - if not success: - sys.exit(1) - - # RAG chunking (optional - NEW v2.10.0) - if args.chunk_for_rag: - logger.info("\n" + "=" * 60) - logger.info("šŸ”Ŗ Generating RAG chunks...") - logger.info("=" * 60) - - from skill_seekers.cli.rag_chunker import RAGChunker - - chunker = RAGChunker( - chunk_size=args.chunk_tokens, - chunk_overlap=args.chunk_overlap_tokens, - preserve_code_blocks=not args.no_preserve_code_blocks, - preserve_paragraphs=not args.no_preserve_paragraphs, - ) - - # Chunk the skill - skill_dir = Path(converter.skill_dir) - chunks = chunker.chunk_skill(skill_dir) - - # Save chunks - chunks_path = skill_dir / "rag_chunks.json" - chunker.save_chunks(chunks, chunks_path) - - logger.info(f"āœ… Generated {len(chunks)} RAG chunks") - logger.info(f"šŸ“„ Saved to: {chunks_path}") - logger.info(f"šŸ’” Use with LangChain: --target langchain") - logger.info(f"šŸ’” Use with LlamaIndex: --target llama-index") - - # ============================================================ - # WORKFLOW SYSTEM INTEGRATION (Phase 2 - doc_scraper) - # ============================================================ - from skill_seekers.cli.workflow_runner import run_workflows - - # Pass doc-scraper-specific context to workflows - doc_context = { - 
"name": config["name"], - "base_url": config.get("base_url", ""), - "description": config.get("description", ""), - } - - workflow_executed, workflow_names = run_workflows(args, context=doc_context) - - # Store workflow execution status on converter for execute_enhancement() to access - converter.workflow_executed = workflow_executed - converter.workflow_name = ", ".join(workflow_names) if workflow_names else None + logger.info("\nšŸ“¦ Building skill...") + converter.build_skill() return converter -def execute_enhancement(config: dict[str, Any], args: argparse.Namespace, converter=None) -> None: - """Execute optional SKILL.md enhancement with AI. +def _run_enhancement( + config: dict[str, Any], + ctx: Any, + _converter: Any, +) -> None: + """Run enhancement using context settings.""" + from pathlib import Path - Supports two enhancement modes: - 1. API-based enhancement (requires ANTHROPIC_API_KEY) - 2. Local enhancement using a coding agent CLI (no API key needed) + skill_dir = f"output/{config['name']}" - Prints appropriate messages and suggestions based on whether - enhancement was requested and whether it succeeded. 
+ logger.info("\n" + "=" * 60) + logger.info(f"šŸ¤– Enhancing SKILL.md (level {ctx.enhancement.level})") + logger.info("=" * 60) - Args: - config (dict): Configuration dictionary with skill name - args: Parsed command-line arguments with enhancement flags - converter: Optional DocToSkillConverter instance (to check workflow status) + # Use AgentClient from context + try: + agent_client = ctx.get_agent_client() - Example: - >>> execute_enhancement(config, args) - # Runs enhancement if --enhance or --enhance-local flag is set - """ - import subprocess + # Run enhancement based on mode + if agent_client.mode == "api" and agent_client.client: + # API mode enhancement + from skill_seekers.cli.enhance_skill import enhance_skill_md - # Check if workflow was already executed (for logging context) - workflow_executed = ( - converter and hasattr(converter, "workflow_executed") and converter.workflow_executed - ) - workflow_name = converter.workflow_name if workflow_executed else None + # Use AgentClient's API key detection (respects priority: CLI > config > env) + api_key = ctx.enhancement.api_key or agent_client.api_key + if api_key: + enhance_skill_md(skill_dir, api_key) + logger.info("āœ… API enhancement complete!") + else: + logger.warning("āš ļø No API key available for enhancement") + else: + # Local mode enhancement + from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer - # Optional enhancement with auto-detected mode (API or LOCAL) - # Note: Runs independently of workflow system (they complement each other) - if getattr(args, "enhance_level", 0) > 0: - import os - - has_api_key = bool(os.environ.get("ANTHROPIC_API_KEY") or args.api_key) - mode = "API" if has_api_key else "LOCAL" - - logger.info("\n" + "=" * 80) - logger.info(f"šŸ¤– Traditional AI Enhancement ({mode} mode, level {args.enhance_level})") - logger.info("=" * 80) - if workflow_executed: - logger.info(f" Running after workflow: {workflow_name}") - logger.info( - " (Workflow provides 
specialized analysis, enhancement provides general improvements)" + enhancer = LocalSkillEnhancer( + Path(skill_dir), + agent=ctx.enhancement.agent, + agent_cmd=ctx.enhancement.agent_cmd, ) - logger.info("") - - try: - enhance_cmd = ["skill-seekers-enhance", f"output/{config['name']}/"] - - if args.api_key: - enhance_cmd.extend(["--api-key", args.api_key]) - if getattr(args, "agent", None): - enhance_cmd.extend(["--agent", args.agent]) - if getattr(args, "interactive_enhancement", False): - enhance_cmd.append("--interactive-enhancement") - - result = subprocess.run(enhance_cmd, check=True) - if result.returncode == 0: - logger.info("\nāœ… Enhancement complete!") - except subprocess.CalledProcessError: - logger.warning("\n⚠ Enhancement failed, but skill was still built") - except FileNotFoundError: - logger.warning("\n⚠ skill-seekers-enhance command not found. Run manually:") - logger.info( - " skill-seekers enhance output/%s/", - config["name"], - ) - - # Print packaging instructions - logger.info("\nšŸ“¦ Package your skill:") - logger.info(" skill-seekers-package output/%s/", config["name"]) - - # Suggest enhancement if not done - if getattr(args, "enhance_level", 0) == 0: - logger.info("\nšŸ’” Optional: Enhance SKILL.md with AI:") - logger.info(" skill-seekers enhance output/%s/", config["name"]) - logger.info(" or re-run with: --enhance-level 2 (auto-detects API vs LOCAL mode)") - logger.info( - " API-based: skill-seekers-enhance-api output/%s/", - config["name"], - ) - logger.info(" or re-run with: --enhance") - logger.info( - "\nšŸ’” Tip: Use --interactive-enhancement with --enhance-local to open terminal window" - ) - - -def main() -> None: - parser = setup_argument_parser() - args = parser.parse_args() - - # Setup logging based on verbosity flags - setup_logging(verbose=args.verbose, quiet=args.quiet) - - config = get_configuration(args) - - # Execute scraping and building - converter = execute_scraping_and_building(config, args) - - # Exit if dry run or 
aborted - if converter is None: - return - - # Execute enhancement and print instructions (pass converter for workflow status check) - execute_enhancement(config, args, converter) - - -if __name__ == "__main__": - main() + success = enhancer.run(headless=True, timeout=ctx.enhancement.timeout) + if success: + agent_name = ctx.enhancement.agent or "claude" + logger.info(f"āœ… Local enhancement complete! (via {agent_name})") + else: + logger.warning("āš ļø Local enhancement did not complete") + except Exception as e: + logger.warning(f"āš ļø Enhancement failed: {e}") diff --git a/src/skill_seekers/cli/enhance_skill_local.py b/src/skill_seekers/cli/enhance_skill_local.py index 020fcb5..7d2f1ce 100644 --- a/src/skill_seekers/cli/enhance_skill_local.py +++ b/src/skill_seekers/cli/enhance_skill_local.py @@ -200,11 +200,22 @@ class LocalSkillEnhancer: raise ValueError(f"Executable '{executable}' not found in PATH") def _resolve_agent(self, agent, agent_cmd): + # Priority: explicit param > ExecutionContext > env var > default + try: + from skill_seekers.cli.execution_context import ExecutionContext + + ctx = ExecutionContext.get() + ctx_agent = ctx.enhancement.agent or "" + ctx_cmd = ctx.enhancement.agent_cmd or "" + except Exception: + ctx_agent = "" + ctx_cmd = "" + env_agent = os.environ.get("SKILL_SEEKER_AGENT", "").strip() env_cmd = os.environ.get("SKILL_SEEKER_AGENT_CMD", "").strip() - agent_name = _normalize_agent_name(agent or env_agent or "claude") - cmd_override = agent_cmd or env_cmd or None + agent_name = _normalize_agent_name(agent or ctx_agent or env_agent or "claude") + cmd_override = agent_cmd or ctx_cmd or env_cmd or None if agent_name == "custom": if not cmd_override: diff --git a/src/skill_seekers/cli/epub_scraper.py b/src/skill_seekers/cli/epub_scraper.py index 70adaa6..17fa187 100644 --- a/src/skill_seekers/cli/epub_scraper.py +++ b/src/skill_seekers/cli/epub_scraper.py @@ -10,12 +10,10 @@ Usage: skill-seekers epub --from-json book_extracted.json """ 
-import argparse import json import logging import os import re -import sys from pathlib import Path # Optional dependency guard @@ -30,6 +28,8 @@ except ImportError: # BeautifulSoup is a core dependency (always available) from bs4 import BeautifulSoup, Comment +from .skill_converter import SkillConverter + logger = logging.getLogger(__name__) @@ -68,10 +68,13 @@ def infer_description_from_epub(metadata: dict | None = None, name: str = "") -> ) -class EpubToSkillConverter: +class EpubToSkillConverter(SkillConverter): """Convert EPUB e-book to AI skill.""" + SOURCE_TYPE = "epub" + def __init__(self, config): + super().__init__(config) self.config = config self.name = config["name"] self.epub_path = config.get("epub_path", "") @@ -89,6 +92,10 @@ class EpubToSkillConverter: # Extracted data self.extracted_data = None + def extract(self): + """SkillConverter interface — delegates to extract_epub().""" + return self.extract_epub() + def extract_epub(self): """Extract content from EPUB file. @@ -1068,143 +1075,3 @@ def _score_code_quality(code: str) -> float: score -= 2.0 return min(10.0, max(0.0, score)) - - -def main(): - from .arguments.epub import add_epub_arguments - - parser = argparse.ArgumentParser( - description="Convert EPUB e-book to skill", - formatter_class=argparse.RawDescriptionHelpFormatter, - ) - - add_epub_arguments(parser) - - args = parser.parse_args() - - # Set logging level - if getattr(args, "quiet", False): - logging.getLogger().setLevel(logging.WARNING) - elif getattr(args, "verbose", False): - logging.getLogger().setLevel(logging.DEBUG) - - # Handle --dry-run - if getattr(args, "dry_run", False): - source = getattr(args, "epub", None) or getattr(args, "from_json", None) or "(none)" - print(f"\n{'=' * 60}") - print("DRY RUN: EPUB Extraction") - print(f"{'=' * 60}") - print(f"Source: {source}") - print(f"Name: {getattr(args, 'name', None) or '(auto-detect)'}") - print(f"Enhance level: {getattr(args, 'enhance_level', 0)}") - print(f"\nāœ… Dry run 
complete") - return 0 - - # Validate inputs - if not (getattr(args, "epub", None) or getattr(args, "from_json", None)): - parser.error("Must specify --epub or --from-json") - - # Build from JSON workflow - if getattr(args, "from_json", None): - name = Path(args.from_json).stem.replace("_extracted", "") - config = { - "name": getattr(args, "name", None) or name, - "description": getattr(args, "description", None) - or f"Use when referencing {name} documentation", - } - try: - converter = EpubToSkillConverter(config) - converter.load_extracted_data(args.from_json) - converter.build_skill() - except Exception as e: - print(f"\nāŒ Error: {e}", file=sys.stderr) - sys.exit(1) - return 0 - - # Direct EPUB mode - if not getattr(args, "name", None): - # Auto-detect name from filename - args.name = Path(args.epub).stem - - config = { - "name": args.name, - "epub_path": args.epub, - # Pass None so extract_epub() can infer from EPUB metadata - "description": getattr(args, "description", None), - } - - try: - converter = EpubToSkillConverter(config) - - # Extract - if not converter.extract_epub(): - print("\nāŒ EPUB extraction failed - see error above", file=sys.stderr) - sys.exit(1) - - # Build skill - converter.build_skill() - - # Enhancement Workflow Integration - from skill_seekers.cli.workflow_runner import run_workflows - - workflow_executed, workflow_names = run_workflows(args) - workflow_name = ", ".join(workflow_names) if workflow_names else None - - # Traditional enhancement (complements workflow system) - if getattr(args, "enhance_level", 0) > 0: - import os - - api_key = getattr(args, "api_key", None) or os.environ.get("ANTHROPIC_API_KEY") - mode = "API" if api_key else "LOCAL" - - print("\n" + "=" * 80) - print(f"šŸ¤– Traditional AI Enhancement ({mode} mode, level {args.enhance_level})") - print("=" * 80) - if workflow_executed: - print(f" Running after workflow: {workflow_name}") - print( - " (Workflow provides specialized analysis," - " enhancement provides 
general improvements)" - ) - print("") - - skill_dir = converter.skill_dir - if api_key: - try: - from skill_seekers.cli.enhance_skill import enhance_skill_md - - enhance_skill_md(skill_dir, api_key) - print("āœ… API enhancement complete!") - except ImportError: - print("āŒ API enhancement not available. Falling back to LOCAL mode...") - from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer - - agent = getattr(args, "agent", None) if args else None - agent_cmd = getattr(args, "agent_cmd", None) if args else None - enhancer = LocalSkillEnhancer(Path(skill_dir), agent=agent, agent_cmd=agent_cmd) - enhancer.run(headless=True) - print("āœ… Local enhancement complete!") - else: - from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer - - agent = getattr(args, "agent", None) if args else None - agent_cmd = getattr(args, "agent_cmd", None) if args else None - enhancer = LocalSkillEnhancer(Path(skill_dir), agent=agent, agent_cmd=agent_cmd) - enhancer.run(headless=True) - print("āœ… Local enhancement complete!") - - except RuntimeError as e: - print(f"\nāŒ Error: {e}", file=sys.stderr) - sys.exit(1) - except Exception as e: - print(f"\nāŒ Unexpected error during EPUB processing: {e}", file=sys.stderr) - import traceback - - traceback.print_exc() - sys.exit(1) - - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/src/skill_seekers/cli/execution_context.py b/src/skill_seekers/cli/execution_context.py new file mode 100644 index 0000000..2a6b4f6 --- /dev/null +++ b/src/skill_seekers/cli/execution_context.py @@ -0,0 +1,549 @@ +"""ExecutionContext - Single source of truth for all configuration. + +This module provides a singleton context object that holds all resolved +configuration from CLI args, config files, and environment variables. +All components read from this context instead of parsing their own argv. 
+ +Example: + >>> from skill_seekers.cli.execution_context import ExecutionContext + >>> ctx = ExecutionContext.initialize(args=parsed_args) + >>> ctx = ExecutionContext.get() # Get initialized instance + >>> print(ctx.output.name) + >>> print(ctx.enhancement.agent) +""" + +from __future__ import annotations + +import contextlib +import json +import logging +import os +import threading +from pathlib import Path +from typing import Any, ClassVar, Literal +from collections.abc import Generator + +from pydantic import BaseModel, Field, PrivateAttr + +logger = logging.getLogger(__name__) + + +class SourceInfoConfig(BaseModel): + """Source detection results.""" + + type: str = Field(..., description="Source type (web, github, pdf, etc.)") + raw_source: str = Field(..., description="Original user input") + parsed: dict[str, Any] = Field(default_factory=dict, description="Parsed source details") + suggested_name: str = Field(default="", description="Auto-generated skill name") + + +class EnhancementSettings(BaseModel): + """AI enhancement configuration.""" + + model_config = { + "json_schema_extra": { + "example": { + "enabled": True, + "level": 2, + "mode": "auto", + "agent": "kimi", + "timeout": 2700, + } + } + } + + enabled: bool = Field(default=True, description="Whether enhancement is enabled") + level: int = Field(default=2, ge=0, le=3, description="Enhancement level (0-3)") + mode: str = Field(default="auto", description="Mode: api, local, or auto") + agent: str | None = Field(default=None, description="Local agent name (claude, kimi, etc.)") + agent_cmd: str | None = Field(default=None, description="Custom agent command override") + api_key: str | None = Field(default=None, description="API key for enhancement") + timeout: int = Field(default=2700, description="Timeout in seconds (default: 45min)") + workflows: list[str] = Field(default_factory=list, description="Enhancement workflow names") + stages: list[str] = Field(default_factory=list, description="Inline 
enhancement stages") + workflow_vars: dict[str, str] = Field(default_factory=dict, description="Workflow variables") + + +class OutputSettings(BaseModel): + """Output configuration.""" + + model_config = { + "json_schema_extra": { + "example": { + "name": "react-docs", + "doc_version": "18.2", + "dry_run": False, + } + } + } + + name: str | None = Field(default=None, description="Skill name") + output_dir: str | None = Field(default=None, description="Output directory override") + doc_version: str = Field(default="", description="Documentation version tag") + dry_run: bool = Field(default=False, description="Preview mode without execution") + + +class ScrapingSettings(BaseModel): + """Web scraping configuration.""" + + max_pages: int | None = Field(default=None, description="Maximum pages to scrape") + rate_limit: float | None = Field(default=None, description="Rate limit in seconds") + browser: bool = Field(default=False, description="Use headless browser for JS sites") + browser_wait_until: str = Field( + default="domcontentloaded", description="Browser wait condition" + ) + browser_extra_wait: int = Field(default=0, description="Extra wait time in ms after page load") + workers: int = Field(default=1, description="Number of parallel workers") + async_mode: bool = Field(default=False, description="Enable async mode") + resume: bool = Field(default=False, description="Resume from checkpoint") + fresh: bool = Field(default=False, description="Clear checkpoint and start fresh") + skip_scrape: bool = Field(default=False, description="Skip scraping, use existing data") + languages: list[str] = Field(default_factory=lambda: ["en"], description="Language preferences") + + +class AnalysisSettings(BaseModel): + """Code analysis configuration.""" + + depth: Literal["surface", "deep", "full"] = Field( + default="surface", description="Analysis depth: surface, deep, full" + ) + skip_patterns: bool = Field(default=False, description="Skip design pattern detection") + 
skip_test_examples: bool = Field(default=False, description="Skip test example extraction") + skip_how_to_guides: bool = Field(default=False, description="Skip how-to guide generation") + skip_config_patterns: bool = Field(default=False, description="Skip config pattern extraction") + skip_api_reference: bool = Field(default=False, description="Skip API reference generation") + skip_dependency_graph: bool = Field(default=False, description="Skip dependency graph") + skip_docs: bool = Field(default=False, description="Skip documentation extraction") + no_comments: bool = Field(default=False, description="Skip comment extraction") + file_patterns: list[str] | None = Field(default=None, description="File patterns to analyze") + + +class RAGSettings(BaseModel): + """RAG (Retrieval-Augmented Generation) configuration.""" + + chunk_for_rag: bool = Field(default=False, description="Enable semantic chunking") + chunk_tokens: int = Field(default=512, description="Chunk size in tokens") + chunk_overlap_tokens: int = Field(default=50, description="Overlap between chunks") + preserve_code_blocks: bool = Field(default=True, description="Don't split code blocks") + preserve_paragraphs: bool = Field(default=True, description="Respect paragraph boundaries") + + +class ExecutionContext(BaseModel): + """Single source of truth for all execution configuration. + + This is a singleton - use ExecutionContext.get() to access the instance. + Initialize once at entry point with ExecutionContext.initialize(). 
+ + Example: + >>> ctx = ExecutionContext.initialize(args=parsed_args) + >>> ctx = ExecutionContext.get() # Get initialized instance + >>> print(ctx.output.name) + """ + + model_config = { + "json_schema_extra": { + "example": { + "source": {"type": "web", "raw_source": "https://react.dev/"}, + "enhancement": {"level": 2, "agent": "kimi"}, + "output": {"name": "react-docs"}, + } + } + } + + # Configuration sections + source: SourceInfoConfig | None = Field(default=None, description="Source information") + enhancement: EnhancementSettings = Field(default_factory=EnhancementSettings) + output: OutputSettings = Field(default_factory=OutputSettings) + scraping: ScrapingSettings = Field(default_factory=ScrapingSettings) + analysis: AnalysisSettings = Field(default_factory=AnalysisSettings) + rag: RAGSettings = Field(default_factory=RAGSettings) + + # Private attributes + _raw_args: dict[str, Any] = PrivateAttr(default_factory=dict) + _config_path: str | None = PrivateAttr(default=None) + + # Singleton storage (class-level) + _instance: ClassVar[ExecutionContext | None] = None + _lock: ClassVar[threading.Lock] = threading.Lock() + _initialized: ClassVar[bool] = False + + @classmethod + def get(cls) -> ExecutionContext: + """Get the singleton instance (thread-safe). + + Returns a default context if not explicitly initialized. + This ensures components can always read from the context + without try/except blocks. + """ + with cls._lock: + if cls._instance is None: + cls._instance = cls() + logger.debug("ExecutionContext auto-initialized with defaults") + return cls._instance + + @classmethod + def initialize( + cls, + args: Any | None = None, + config_path: str | None = None, + source_info: Any | None = None, + ) -> ExecutionContext: + """Initialize the singleton context. + + Priority (highest to lowest): + 1. CLI args (explicit user input) + 2. Config file (JSON config) + 3. Environment variables + 4. 
Defaults + + Args: + args: Parsed argparse.Namespace + config_path: Path to config JSON file + source_info: SourceInfo from source_detector + + Returns: + Initialized ExecutionContext instance + """ + with cls._lock: + if cls._initialized: + logger.info( + "ExecutionContext.initialize() called again — returning existing instance. " + "Use ExecutionContext.reset() first if re-initialization is intended." + ) + return cls._instance + + context_data = cls._build_from_sources(args, config_path, source_info) + cls._instance = cls.model_validate(context_data) + if args: + cls._instance._raw_args = vars(args) + cls._instance._config_path = config_path + cls._initialized = True + return cls._instance + + @classmethod + def reset(cls) -> None: + """Reset the singleton (mainly for testing).""" + with cls._lock: + cls._instance = None + cls._initialized = False + + @classmethod + def _build_from_sources( + cls, + args: Any | None, + config_path: str | None, + source_info: Any | None, + ) -> dict[str, Any]: + """Build context dict from all configuration sources.""" + # Start with defaults + data = cls._default_data() + + # Layer 1: Config file + if config_path: + file_config = cls._load_config_file(config_path) + data = cls._deep_merge(data, file_config) + + # Layer 2: CLI args (override config file) + if args: + arg_config = cls._args_to_data(args) + data = cls._deep_merge(data, arg_config) + + # Layer 3: Source info + if source_info: + data["source"] = { + "type": source_info.type, + "raw_source": getattr(source_info, "raw_source", None) + or getattr(source_info, "raw_input", ""), + "parsed": source_info.parsed, + "suggested_name": source_info.suggested_name, + } + + return data + + @classmethod + def _default_data(cls) -> dict[str, Any]: + """Get default configuration.""" + from skill_seekers.cli.agent_client import get_default_timeout + + return { + "enhancement": { + "enabled": True, + "level": 2, + # Env-var-based mode detection (lowest priority — CLI and config override 
this) + "mode": "api" + if any( + os.environ.get(k) + for k in ( + "ANTHROPIC_API_KEY", + "OPENAI_API_KEY", + "MOONSHOT_API_KEY", + "GOOGLE_API_KEY", + ) + ) + else "auto", + "agent": os.environ.get("SKILL_SEEKER_AGENT"), + "agent_cmd": None, + "api_key": None, + "timeout": get_default_timeout(), + "workflows": [], + "stages": [], + "workflow_vars": {}, + }, + "output": { + "name": None, + "output_dir": None, + "doc_version": "", + "dry_run": False, + }, + "scraping": { + "max_pages": None, + "rate_limit": None, + "browser": False, + "browser_wait_until": "domcontentloaded", + "browser_extra_wait": 0, + "workers": 1, + "async_mode": False, + "resume": False, + "fresh": False, + "skip_scrape": False, + "languages": ["en"], + }, + "analysis": { + "depth": "surface", + "skip_patterns": False, + "skip_test_examples": False, + "skip_how_to_guides": False, + "skip_config_patterns": False, + "skip_api_reference": False, + "skip_dependency_graph": False, + "skip_docs": False, + "no_comments": False, + "file_patterns": None, + }, + "rag": { + "chunk_for_rag": False, + "chunk_tokens": 512, + "chunk_overlap_tokens": 50, + "preserve_code_blocks": True, + "preserve_paragraphs": True, + }, + } + + @classmethod + def _load_config_file(cls, config_path: str) -> dict[str, Any]: + """Load and normalize config file.""" + path = Path(config_path) + try: + with open(path, encoding="utf-8") as f: + file_data = json.load(f) + except FileNotFoundError: + raise ValueError(f"Config file not found: {config_path}") from None + except json.JSONDecodeError as e: + raise ValueError(f"Invalid JSON in config file {config_path}: {e}") from None + + config: dict[str, Any] = {} + + # Unified config format (sources array) + if "sources" in file_data: + enhancement = file_data.get("enhancement", {}) + + # Handle timeout field (can be "unlimited" or integer) + timeout_val = enhancement.get("timeout", 2700) + if isinstance(timeout_val, str) and timeout_val.lower() in ("unlimited", "none"): + from 
skill_seekers.cli.agent_client import UNLIMITED_TIMEOUT + + timeout_val = UNLIMITED_TIMEOUT + + config["output"] = { + "name": file_data.get("name"), + "doc_version": file_data.get("version", ""), + } + config["enhancement"] = { + "enabled": enhancement.get("enabled", True), + "level": enhancement.get("level", 2), + "mode": enhancement.get("mode", "auto").lower(), + "agent": enhancement.get("agent"), + "timeout": timeout_val, + "workflows": file_data.get("workflows", []), + "stages": file_data.get("workflow_stages", []), + "workflow_vars": file_data.get("workflow_vars", {}), + } + + # Simple web config format + elif "base_url" in file_data: + config["output"] = { + "name": file_data.get("name"), + "doc_version": file_data.get("version", ""), + } + config["scraping"] = { + "max_pages": file_data.get("max_pages"), + "rate_limit": file_data.get("rate_limit"), + "browser": file_data.get("browser", False), + } + + return config + + @classmethod + def _args_to_data(cls, args: Any) -> dict[str, Any]: + """Convert argparse.Namespace to config dict.""" + config: dict[str, Any] = {} + + # Output + if hasattr(args, "name") and args.name is not None: + config.setdefault("output", {})["name"] = args.name + if hasattr(args, "output") and args.output is not None: + config.setdefault("output", {})["output_dir"] = args.output + if hasattr(args, "doc_version") and args.doc_version: + config.setdefault("output", {})["doc_version"] = args.doc_version + if getattr(args, "dry_run", False): + config.setdefault("output", {})["dry_run"] = True + + # Enhancement + if hasattr(args, "enhance_level") and args.enhance_level is not None: + config.setdefault("enhancement", {})["level"] = args.enhance_level + if getattr(args, "agent", None): + config.setdefault("enhancement", {})["agent"] = args.agent + if getattr(args, "agent_cmd", None): + config.setdefault("enhancement", {})["agent_cmd"] = args.agent_cmd + if getattr(args, "api_key", None): + config.setdefault("enhancement", {})["api_key"] = 
args.api_key + + # Resolve mode from explicit CLI flags: + # --api-key → "api", --agent (without --api-key) → "local". + # Env-var-based mode detection belongs in _default_data(), not here, + # to preserve the priority: CLI args > Config file > Env vars > Defaults. + if getattr(args, "api_key", None): + config.setdefault("enhancement", {})["mode"] = "api" + elif getattr(args, "agent", None): + config.setdefault("enhancement", {})["mode"] = "local" + + # Workflows + if getattr(args, "enhance_workflow", None): + config.setdefault("enhancement", {})["workflows"] = list(args.enhance_workflow) + if getattr(args, "enhance_stage", None): + config.setdefault("enhancement", {})["stages"] = list(args.enhance_stage) + if getattr(args, "var", None): + config.setdefault("enhancement", {})["workflow_vars"] = cls._parse_vars(args.var) + + # Scraping + if hasattr(args, "max_pages") and args.max_pages is not None: + config.setdefault("scraping", {})["max_pages"] = args.max_pages + if hasattr(args, "rate_limit") and args.rate_limit is not None: + config.setdefault("scraping", {})["rate_limit"] = args.rate_limit + if getattr(args, "browser", False): + config.setdefault("scraping", {})["browser"] = True + if hasattr(args, "workers") and args.workers: + config.setdefault("scraping", {})["workers"] = args.workers + if getattr(args, "async_mode", False): + config.setdefault("scraping", {})["async_mode"] = True + if getattr(args, "resume", False): + config.setdefault("scraping", {})["resume"] = True + if getattr(args, "fresh", False): + config.setdefault("scraping", {})["fresh"] = True + if getattr(args, "skip_scrape", False): + config.setdefault("scraping", {})["skip_scrape"] = True + + # Analysis + if getattr(args, "depth", None): + config.setdefault("analysis", {})["depth"] = args.depth + if getattr(args, "skip_patterns", False): + config.setdefault("analysis", {})["skip_patterns"] = True + if getattr(args, "skip_test_examples", False): + config.setdefault("analysis", 
{})["skip_test_examples"] = True + if getattr(args, "skip_how_to_guides", False): + config.setdefault("analysis", {})["skip_how_to_guides"] = True + if getattr(args, "file_patterns", None): + config.setdefault("analysis", {})["file_patterns"] = [ + p.strip() for p in args.file_patterns.split(",") + ] + + # RAG + if getattr(args, "chunk_for_rag", False): + config.setdefault("rag", {})["chunk_for_rag"] = True + if hasattr(args, "chunk_tokens") and args.chunk_tokens is not None: + config.setdefault("rag", {})["chunk_tokens"] = args.chunk_tokens + + return config + + @staticmethod + def _deep_merge(base: dict[str, Any], override: dict[str, Any]) -> dict[str, Any]: + """Deep merge override into base.""" + result = base.copy() + for key, value in override.items(): + if isinstance(value, dict) and key in result and isinstance(result[key], dict): + result[key] = ExecutionContext._deep_merge(result[key], value) + else: + result[key] = value + return result + + @staticmethod + def _parse_vars(var_list: list[str]) -> dict[str, str]: + """Parse --var key=value into dict.""" + result = {} + for var in var_list: + if "=" in var: + key, value = var.split("=", 1) + result[key] = value + return result + + @property + def config_path(self) -> str | None: + """Path to the config file used for initialization, if any.""" + return self._config_path + + def get_raw(self, name: str, default: Any = None) -> Any: + """Get raw argument value (backward compatibility).""" + return self._raw_args.get(name, default) + + def get_agent_client(self) -> Any: + """Get configured AgentClient from context.""" + from skill_seekers.cli.agent_client import AgentClient + + return AgentClient(mode=self.enhancement.mode, agent=self.enhancement.agent) + + @contextlib.contextmanager + def override(self, **kwargs: Any) -> Generator[ExecutionContext, None, None]: + """Temporarily override context values. 
+ + Thread-safe: uses an override stack so nested/concurrent overrides + restore correctly regardless of ordering. + + Usage: + with ctx.override(enhancement__level=3): + run_workflow() # Uses level 3 + # Original values restored + """ + # Create new data with overrides + current_data = self.model_dump(exclude={"_raw_args"}) + + for key, value in kwargs.items(): + if "__" in key: + parts = key.split("__") + target = current_data + for part in parts[:-1]: + target = target.setdefault(part, {}) + target[parts[-1]] = value + else: + current_data[key] = value + + # Create temporary instance and preserve _raw_args + temp_ctx = self.__class__.model_validate(current_data) + temp_ctx._raw_args = dict(self._raw_args) # Copy raw args to temp context + + # Swap singleton atomically and save previous state on a stack + # so nested/concurrent overrides restore in the correct order. + with self.__class__._lock: + saved = (self.__class__._instance, self.__class__._initialized) + self.__class__._instance = temp_ctx + self.__class__._initialized = True + try: + yield temp_ctx + finally: + with self.__class__._lock: + self.__class__._instance = saved[0] + self.__class__._initialized = saved[1] + + +def get_context() -> ExecutionContext: + """Shortcut for ExecutionContext.get().""" + return ExecutionContext.get() diff --git a/src/skill_seekers/cli/github_scraper.py b/src/skill_seekers/cli/github_scraper.py index 6afe6ca..87e722f 100644 --- a/src/skill_seekers/cli/github_scraper.py +++ b/src/skill_seekers/cli/github_scraper.py @@ -14,7 +14,6 @@ Usage: skill-seekers github --repo owner/repo --token $GITHUB_TOKEN """ -import argparse import fnmatch import itertools import json @@ -32,8 +31,7 @@ except ImportError: print("Error: PyGithub not installed. 
Run: pip install PyGithub") sys.exit(1) -from skill_seekers.cli.arguments.github import add_github_arguments -from skill_seekers.cli.utils import setup_logging +from skill_seekers.cli.skill_converter import SkillConverter # Try to import pathspec for .gitignore support try: @@ -183,7 +181,7 @@ def extract_description_from_readme(readme_content: str, repo_name: str) -> str: return f"Use when working with {project_name}" -class GitHubScraper: +class GitHubScraper(SkillConverter): """ GitHub Repository Scraper (C1.1-C1.9) @@ -199,8 +197,11 @@ class GitHubScraper: - Releases """ + SOURCE_TYPE = "github" + def __init__(self, config: dict[str, Any], local_repo_path: str | None = None): """Initialize GitHub scraper with configuration.""" + super().__init__(config) self.config = config self.repo_name = config["repo"] self.name = config.get("name", self.repo_name.split("/")[-1]) @@ -353,6 +354,15 @@ class GitHubScraper: logger.error(f"Unexpected error during scraping: {e}") raise + def extract(self): + """SkillConverter interface — delegates to scrape().""" + self.scrape() + + def build_skill(self): + """SkillConverter interface — delegates to GitHubToSkillConverter.""" + converter = GitHubToSkillConverter(self.config) + converter.build_skill() + def _fetch_repository(self): """C1.1: Fetch repository structure using GitHub API.""" logger.info(f"Fetching repository: {self.repo_name}") @@ -1379,186 +1389,3 @@ Use this skill when you need to: with open(structure_path, "w", encoding="utf-8") as f: f.write(content) logger.info(f"Generated: {structure_path}") - - -def setup_argument_parser() -> argparse.ArgumentParser: - """Setup and configure command-line argument parser. - - Creates an ArgumentParser with all CLI options for the github scraper. - All arguments are defined in skill_seekers.cli.arguments.github to ensure - consistency between the standalone scraper and unified CLI. 
- - Returns: - argparse.ArgumentParser: Configured argument parser - """ - parser = argparse.ArgumentParser( - description="GitHub Repository to AI Skill Converter", - formatter_class=argparse.RawDescriptionHelpFormatter, - epilog=""" -Examples: - skill-seekers github --repo facebook/react - skill-seekers github --config configs/react_github.json - skill-seekers github --repo owner/repo --token $GITHUB_TOKEN - """, - ) - - # Add all github arguments from shared definitions - # This ensures the standalone scraper and unified CLI stay in sync - add_github_arguments(parser) - - return parser - - -def main(): - """C1.10: CLI tool entry point.""" - parser = setup_argument_parser() - args = parser.parse_args() - - setup_logging(verbose=getattr(args, "verbose", False), quiet=getattr(args, "quiet", False)) - - # Handle --dry-run - if getattr(args, "dry_run", False): - repo = args.repo or (args.config and "(from config)") - print(f"\n{'=' * 60}") - print(f"DRY RUN: GitHub Repository Analysis") - print(f"{'=' * 60}") - print(f"Repository: {repo}") - print(f"Name: {getattr(args, 'name', None) or '(auto-detect)'}") - print(f"Include issues: {not getattr(args, 'no_issues', False)}") - print(f"Include releases: {not getattr(args, 'no_releases', False)}") - print(f"Include changelog: {not getattr(args, 'no_changelog', False)}") - print(f"Max issues: {getattr(args, 'max_issues', 100)}") - print(f"Enhance level: {getattr(args, 'enhance_level', 0)}") - print(f"Profile: {getattr(args, 'profile', None) or '(default)'}") - print(f"\nāœ… Dry run complete") - return 0 - - # Build config from args or file - if args.config: - with open(args.config, encoding="utf-8") as f: - config = json.load(f) - # Override with CLI args if provided - if args.non_interactive: - config["interactive"] = False - if args.profile: - config["github_profile"] = args.profile - elif args.repo: - config = { - "repo": args.repo, - "name": args.name or args.repo.split("/")[-1], - "description": args.description or 
f"Use when working with {args.repo.split('/')[-1]}", - "github_token": args.token, - "include_issues": not args.no_issues, - "include_changelog": not args.no_changelog, - "include_releases": not args.no_releases, - "max_issues": args.max_issues, - "interactive": not args.non_interactive, - "github_profile": args.profile, - "local_repo_path": getattr(args, "local_repo_path", None), - } - else: - parser.error("Either --repo or --config is required") - - try: - # Phase 1: Scrape GitHub repository - scraper = GitHubScraper(config) - scraper.scrape() - - if args.scrape_only: - logger.info("Scrape complete (--scrape-only mode)") - return - - # Phase 2: Build skill - converter = GitHubToSkillConverter(config) - converter.build_skill() - - skill_name = config.get("name", config["repo"].split("/")[-1]) - skill_dir = f"output/{skill_name}" - - # ============================================================ - # WORKFLOW SYSTEM INTEGRATION (Phase 2 - github_scraper) - # ============================================================ - from skill_seekers.cli.workflow_runner import run_workflows - - # Pass GitHub-specific context to workflows - github_context = { - "repo": config.get("repo", ""), - "name": skill_name, - "description": config.get("description", ""), - } - - workflow_executed, workflow_names = run_workflows(args, context=github_context) - workflow_name = ", ".join(workflow_names) if workflow_names else None - - # Phase 3: Optional enhancement with auto-detected mode - # Note: Runs independently of workflow system (they complement each other) - if getattr(args, "enhance_level", 0) > 0: - import os - - # Auto-detect mode based on API key availability - api_key = args.api_key or os.environ.get("ANTHROPIC_API_KEY") - mode = "API" if api_key else "LOCAL" - - logger.info("\n" + "=" * 80) - logger.info(f"šŸ¤– Traditional AI Enhancement ({mode} mode, level {args.enhance_level})") - logger.info("=" * 80) - if workflow_executed: - logger.info(f" Running after workflow: 
{workflow_name}") - logger.info( - " (Workflow provides specialized analysis, enhancement provides general improvements)" - ) - logger.info("") - - if api_key: - # API-based enhancement - try: - from skill_seekers.cli.enhance_skill import enhance_skill_md - - enhance_skill_md(skill_dir, api_key) - logger.info("āœ… API enhancement complete!") - except ImportError: - logger.error("āŒ API enhancement not available. Install: pip install anthropic") - logger.info("šŸ’” Falling back to LOCAL mode...") - # Fall back to LOCAL mode - from pathlib import Path - from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer - - agent = getattr(args, "agent", None) if args else None - agent_cmd = getattr(args, "agent_cmd", None) if args else None - enhancer = LocalSkillEnhancer(Path(skill_dir), agent=agent, agent_cmd=agent_cmd) - enhancer.run(headless=True) - agent_name = agent or "claude" - logger.info(f"āœ… Local enhancement complete! (via {agent_name})") - else: - # LOCAL enhancement (no API key) - from pathlib import Path - from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer - - agent = getattr(args, "agent", None) if args else None - agent_cmd = getattr(args, "agent_cmd", None) if args else None - enhancer = LocalSkillEnhancer(Path(skill_dir), agent=agent, agent_cmd=agent_cmd) - enhancer.run(headless=True) - agent_name = agent or "claude" - logger.info(f"āœ… Local enhancement complete! (via {agent_name})") - - logger.info(f"\nāœ… Success! 
Skill created at: {skill_dir}/") - - # Only suggest enhancement if neither workflow nor traditional enhancement was done - if not workflow_executed and getattr(args, "enhance_level", 0) == 0: - logger.info("\nšŸ’” Optional: Enhance SKILL.md with AI:") - logger.info(f" skill-seekers enhance {skill_dir}/ --enhance-level 2") - logger.info(" (auto-detects API vs LOCAL mode based on ANTHROPIC_API_KEY)") - logger.info("\nšŸ’” Or use a workflow:") - logger.info( - f" skill-seekers github --repo {config['repo']} --enhance-workflow architecture-comprehensive" - ) - - logger.info(f"\nNext step: skill-seekers package {skill_dir}/") - - except Exception as e: - logger.error(f"Error: {e}") - sys.exit(1) - - -if __name__ == "__main__": - main() diff --git a/src/skill_seekers/cli/how_to_guide_builder.py b/src/skill_seekers/cli/how_to_guide_builder.py index df79b61..0becf87 100644 --- a/src/skill_seekers/cli/how_to_guide_builder.py +++ b/src/skill_seekers/cli/how_to_guide_builder.py @@ -908,8 +908,29 @@ class HowToGuideBuilder: return collection def _extract_workflow_examples(self, examples: list[dict]) -> list[dict]: - """Filter to workflow category only""" - return [ex for ex in examples if isinstance(ex, dict) and ex.get("category") == "workflow"] + """Filter to examples suitable for guide generation. 
+ + Includes: + - All workflow-category examples + - Setup/config examples with sufficient complexity (4+ steps or high confidence) + - Instantiation examples with high confidence and multiple dependencies + """ + guide_worthy = [] + for ex in examples: + if not isinstance(ex, dict): + continue + category = ex.get("category", "") + complexity = ex.get("complexity_score", 0) + confidence = ex.get("confidence", 0) + + if category == "workflow": + guide_worthy.append(ex) + elif category in ("setup", "config") and (complexity >= 0.4 or confidence >= 0.7): + guide_worthy.append(ex) + elif category == "instantiation" and complexity >= 0.6 and confidence >= 0.7: + guide_worthy.append(ex) + + return guide_worthy def _create_guide(self, title: str, workflows: list[dict], enhancer=None) -> HowToGuide: """ diff --git a/src/skill_seekers/cli/html_scraper.py b/src/skill_seekers/cli/html_scraper.py index 5d1a5fd..320b957 100644 --- a/src/skill_seekers/cli/html_scraper.py +++ b/src/skill_seekers/cli/html_scraper.py @@ -16,17 +16,17 @@ Usage: skill-seekers html --from-json page_extracted.json """ -import argparse import json import logging import os import re -import sys from pathlib import Path # BeautifulSoup is a core dependency (always available) from bs4 import BeautifulSoup, Comment, Tag +from .skill_converter import SkillConverter + logger = logging.getLogger(__name__) # File extensions treated as HTML @@ -95,7 +95,7 @@ def _collect_html_files(html_path: str) -> list[Path]: raise ValueError(f"Path is neither a file nor a directory: {html_path}") -class HtmlToSkillConverter: +class HtmlToSkillConverter(SkillConverter): """Convert local HTML files to a skill. Supports single HTML files and directories of HTML files. Parses document @@ -112,6 +112,8 @@ class HtmlToSkillConverter: extracted_data: Parsed extraction results dict. """ + SOURCE_TYPE = "html" + def __init__(self, config: dict) -> None: """Initialize the HTML to skill converter. 
@@ -122,6 +124,7 @@ class HtmlToSkillConverter: - description (str): Skill description (optional). - categories (dict): Category definitions for content grouping. """ + super().__init__(config) self.config = config self.name: str = config["name"] self.html_path: str = config.get("html_path", "") @@ -139,6 +142,10 @@ class HtmlToSkillConverter: # Extracted data self.extracted_data: dict | None = None + def extract(self): + """SkillConverter interface — delegates to extract_html().""" + return self.extract_html() + # ------------------------------------------------------------------ # Extraction # ------------------------------------------------------------------ @@ -1742,205 +1749,3 @@ def _score_code_quality(code: str) -> float: score -= 2.0 return min(10.0, max(0.0, score)) - - -# --------------------------------------------------------------------------- -# CLI entry point -# --------------------------------------------------------------------------- - - -def main() -> int: - """CLI entry point for the HTML scraper. - - Parses command-line arguments and runs the extraction/build pipeline. - Supports two workflows: - 1. Direct HTML extraction: ``--html-path page.html --name myskill`` - 2. Build from JSON: ``--from-json page_extracted.json`` - - Returns: - Exit code (0 for success, non-zero for failure). 
- """ - parser = argparse.ArgumentParser( - description="Convert local HTML files to skill", - formatter_class=argparse.RawDescriptionHelpFormatter, - epilog=( - "Examples:\n" - " %(prog)s --html-path page.html --name myskill\n" - " %(prog)s --html-path ./docs/ --name myskill\n" - " %(prog)s --from-json page_extracted.json\n" - ), - ) - - # Shared universal args - from .arguments.common import add_all_standard_arguments - - add_all_standard_arguments(parser) - - # Override enhance-level default to 0 for HTML - for action in parser._actions: - if hasattr(action, "dest") and action.dest == "enhance_level": - action.default = 0 - action.help = ( - "AI enhancement level (auto-detects API vs LOCAL mode): " - "0=disabled (default for HTML), 1=SKILL.md only, " - "2=+architecture/config, 3=full enhancement. " - "Mode selection: uses API if ANTHROPIC_API_KEY is set, " - "otherwise LOCAL (Claude Code, Kimi, etc.)" - ) - - # HTML-specific args - parser.add_argument( - "--html-path", - type=str, - help="Path to HTML file or directory of HTML files", - metavar="PATH", - ) - parser.add_argument( - "--from-json", - type=str, - help="Build skill from previously extracted JSON", - metavar="FILE", - ) - - args = parser.parse_args() - - # Set logging level - if getattr(args, "quiet", False): - logging.getLogger().setLevel(logging.WARNING) - elif getattr(args, "verbose", False): - logging.getLogger().setLevel(logging.DEBUG) - - # Handle --dry-run - if getattr(args, "dry_run", False): - source = getattr(args, "html_path", None) or getattr(args, "from_json", None) or "(none)" - print(f"\n{'=' * 60}") - print("DRY RUN: HTML Extraction") - print(f"{'=' * 60}") - print(f"Source: {source}") - print(f"Name: {getattr(args, 'name', None) or '(auto-detect)'}") - print(f"Enhance level: {getattr(args, 'enhance_level', 0)}") - print(f"\nāœ… Dry run complete") - return 0 - - # Validate inputs - if not (getattr(args, "html_path", None) or getattr(args, "from_json", None)): - parser.error("Must 
specify --html-path or --from-json") - - # Build from JSON workflow - if getattr(args, "from_json", None): - name = Path(args.from_json).stem.replace("_extracted", "") - config = { - "name": getattr(args, "name", None) or name, - "description": getattr(args, "description", None) - or f"Use when referencing {name} documentation", - } - try: - converter = HtmlToSkillConverter(config) - converter.load_extracted_data(args.from_json) - converter.build_skill() - except Exception as e: - print(f"\nāŒ Error: {e}", file=sys.stderr) - sys.exit(1) - return 0 - - # Direct HTML mode - if not getattr(args, "name", None): - # Auto-detect name from path - path = Path(args.html_path) - args.name = path.stem if path.is_file() else path.name - - config = { - "name": args.name, - "html_path": args.html_path, - # Pass None so extract_html() can infer from HTML metadata - "description": getattr(args, "description", None), - } - - try: - converter = HtmlToSkillConverter(config) - - # Extract - if not converter.extract_html(): - print( - "\nāŒ HTML extraction failed - see error above", - file=sys.stderr, - ) - sys.exit(1) - - # Build skill - converter.build_skill() - - # Enhancement Workflow Integration - from skill_seekers.cli.workflow_runner import run_workflows - - workflow_executed, workflow_names = run_workflows(args) - workflow_name = ", ".join(workflow_names) if workflow_names else None - - # Traditional enhancement (complements workflow system) - if getattr(args, "enhance_level", 0) > 0: - api_key = getattr(args, "api_key", None) or os.environ.get("ANTHROPIC_API_KEY") - mode = "API" if api_key else "LOCAL" - - print("\n" + "=" * 80) - print(f"šŸ¤– Traditional AI Enhancement ({mode} mode, level {args.enhance_level})") - print("=" * 80) - if workflow_executed: - print(f" Running after workflow: {workflow_name}") - print( - " (Workflow provides specialized analysis," - " enhancement provides general improvements)" - ) - print("") - - skill_dir = converter.skill_dir - if api_key: - 
try: - from skill_seekers.cli.enhance_skill import ( - enhance_skill_md, - ) - - enhance_skill_md(skill_dir, api_key) - print("āœ… API enhancement complete!") - except ImportError: - print("āŒ API enhancement not available. Falling back to LOCAL mode...") - from skill_seekers.cli.enhance_skill_local import ( - LocalSkillEnhancer, - ) - - agent = getattr(args, "agent", None) if args else None - agent_cmd = getattr(args, "agent_cmd", None) if args else None - enhancer = LocalSkillEnhancer(Path(skill_dir), agent=agent, agent_cmd=agent_cmd) - enhancer.run(headless=True) - print("āœ… Local enhancement complete!") - else: - from skill_seekers.cli.enhance_skill_local import ( - LocalSkillEnhancer, - ) - - agent = getattr(args, "agent", None) if args else None - agent_cmd = getattr(args, "agent_cmd", None) if args else None - enhancer = LocalSkillEnhancer(Path(skill_dir), agent=agent, agent_cmd=agent_cmd) - enhancer.run(headless=True) - print("āœ… Local enhancement complete!") - - except (FileNotFoundError, ValueError) as e: - print(f"\nāŒ Error: {e}", file=sys.stderr) - sys.exit(1) - except RuntimeError as e: - print(f"\nāŒ Error: {e}", file=sys.stderr) - sys.exit(1) - except Exception as e: - print( - f"\nāŒ Unexpected error during HTML processing: {e}", - file=sys.stderr, - ) - import traceback - - traceback.print_exc() - sys.exit(1) - - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/src/skill_seekers/cli/jupyter_scraper.py b/src/skill_seekers/cli/jupyter_scraper.py index 663dea5..e9352d7 100644 --- a/src/skill_seekers/cli/jupyter_scraper.py +++ b/src/skill_seekers/cli/jupyter_scraper.py @@ -14,12 +14,10 @@ Usage: skill-seekers jupyter --from-json notebook_extracted.json """ -import argparse import json import logging import os import re -import sys from pathlib import Path # Optional dependency guard @@ -30,6 +28,8 @@ try: except ImportError: JUPYTER_AVAILABLE = False +from .skill_converter import SkillConverter + logger = 
logging.getLogger(__name__) # Import pattern categories for code analysis @@ -199,10 +199,13 @@ def infer_description_from_notebook(metadata: dict | None = None, name: str = "" ) -class JupyterToSkillConverter: +class JupyterToSkillConverter(SkillConverter): """Convert Jupyter Notebook (.ipynb) to skill.""" + SOURCE_TYPE = "jupyter" + def __init__(self, config: dict): + super().__init__(config) self.config = config self.name = config["name"] self.notebook_path = config.get("notebook_path", "") @@ -214,6 +217,10 @@ class JupyterToSkillConverter: self.categories = config.get("categories", {}) self.extracted_data: dict | None = None + def extract(self): + """SkillConverter interface — delegates to extract_notebook().""" + return self.extract_notebook() + # ------------------------------------------------------------------ # Extraction # ------------------------------------------------------------------ @@ -1082,132 +1089,3 @@ def _score_code_quality(code: str) -> float: if line_count > 0 and not non_magic: score -= 1.0 return min(10.0, max(0.0, score)) - - -# --------------------------------------------------------------------------- -# CLI entry point -# --------------------------------------------------------------------------- - - -def main() -> int: - """Standalone CLI entry point for the Jupyter Notebook scraper.""" - from .arguments.jupyter import add_jupyter_arguments - - parser = argparse.ArgumentParser( - description="Convert Jupyter Notebook (.ipynb) to skill", - formatter_class=argparse.RawDescriptionHelpFormatter, - ) - add_jupyter_arguments(parser) - args = parser.parse_args() - - if getattr(args, "quiet", False): - logging.getLogger().setLevel(logging.WARNING) - elif getattr(args, "verbose", False): - logging.getLogger().setLevel(logging.DEBUG) - - if getattr(args, "dry_run", False): - source = getattr(args, "notebook", None) or getattr(args, "from_json", None) or "(none)" - print(f"\n{'=' * 60}") - print("DRY RUN: Jupyter Notebook Extraction") - 
print(f"{'=' * 60}") - print(f"Source: {source}") - print(f"Name: {getattr(args, 'name', None) or '(auto-detect)'}") - print(f"Enhance level: {getattr(args, 'enhance_level', 0)}") - print(f"\nāœ… Dry run complete") - return 0 - - if not (getattr(args, "notebook", None) or getattr(args, "from_json", None)): - parser.error("Must specify --notebook or --from-json") - - if getattr(args, "from_json", None): - name = Path(args.from_json).stem.replace("_extracted", "") - config = { - "name": getattr(args, "name", None) or name, - "description": getattr(args, "description", None) - or f"Use when referencing {name} notebook documentation", - } - try: - converter = JupyterToSkillConverter(config) - converter.load_extracted_data(args.from_json) - converter.build_skill() - except Exception as e: - print(f"\nāŒ Error: {e}", file=sys.stderr) - sys.exit(1) - return 0 - - # Direct notebook mode - if not getattr(args, "name", None): - nb_path = Path(args.notebook) - args.name = nb_path.stem if nb_path.is_file() else (nb_path.name or "notebooks") - - config = { - "name": args.name, - "notebook_path": args.notebook, - "description": getattr(args, "description", None), - } - - try: - converter = JupyterToSkillConverter(config) - if not converter.extract_notebook(): - print("\nāŒ Notebook extraction failed - see error above", file=sys.stderr) - sys.exit(1) - converter.build_skill() - - from skill_seekers.cli.workflow_runner import run_workflows - - workflow_executed, workflow_names = run_workflows(args) - workflow_name = ", ".join(workflow_names) if workflow_names else None - - if getattr(args, "enhance_level", 0) > 0: - api_key = getattr(args, "api_key", None) or os.environ.get("ANTHROPIC_API_KEY") - mode = "API" if api_key else "LOCAL" - print("\n" + "=" * 80) - print(f"šŸ¤– Traditional AI Enhancement ({mode} mode, level {args.enhance_level})") - print("=" * 80) - if workflow_executed: - print(f" Running after workflow: {workflow_name}") - print( - " (Workflow provides specialized 
analysis, " - "enhancement provides general improvements)" - ) - print("") - skill_dir = converter.skill_dir - if api_key: - try: - from skill_seekers.cli.enhance_skill import enhance_skill_md - - enhance_skill_md(skill_dir, api_key) - print("āœ… API enhancement complete!") - except ImportError: - print("āŒ API enhancement not available. Falling back to LOCAL mode...") - from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer - - agent = getattr(args, "agent", None) if args else None - agent_cmd = getattr(args, "agent_cmd", None) if args else None - enhancer = LocalSkillEnhancer(Path(skill_dir), agent=agent, agent_cmd=agent_cmd) - enhancer.run(headless=True) - print("āœ… Local enhancement complete!") - else: - from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer - - agent = getattr(args, "agent", None) if args else None - agent_cmd = getattr(args, "agent_cmd", None) if args else None - enhancer = LocalSkillEnhancer(Path(skill_dir), agent=agent, agent_cmd=agent_cmd) - enhancer.run(headless=True) - print("āœ… Local enhancement complete!") - - except RuntimeError as e: - print(f"\nāŒ Error: {e}", file=sys.stderr) - sys.exit(1) - except Exception as e: - print(f"\nāŒ Unexpected error during Jupyter processing: {e}", file=sys.stderr) - import traceback - - traceback.print_exc() - sys.exit(1) - - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/src/skill_seekers/cli/main.py b/src/skill_seekers/cli/main.py index f0d41d4..a80568d 100644 --- a/src/skill_seekers/cli/main.py +++ b/src/skill_seekers/cli/main.py @@ -2,97 +2,67 @@ """ Skill Seekers - Unified CLI Entry Point -Provides a git-style unified command-line interface for all Skill Seekers tools. +Convert documentation, codebases, and repositories into AI skills. 
Usage: skill-seekers [options] Commands: - config Configure GitHub tokens, API keys, and settings - scrape Scrape documentation website - github Scrape GitHub repository - pdf Extract from PDF file - word Extract from Word (.docx) file - epub Extract from EPUB e-book (.epub) - video Extract from video (YouTube or local) - jupyter Extract from Jupyter Notebook (.ipynb) - html Extract from local HTML files - openapi Extract from OpenAPI/Swagger spec - asciidoc Extract from AsciiDoc documents (.adoc) - pptx Extract from PowerPoint (.pptx) - rss Extract from RSS/Atom feeds - manpage Extract from man pages - confluence Extract from Confluence wiki - notion Extract from Notion pages - chat Extract from Slack/Discord chat exports - unified Multi-source scraping (docs + GitHub + PDF + more) - analyze Analyze local codebase and extract code knowledge + create Create skill from any source (auto-detects type) enhance AI-powered enhancement (auto: API or LOCAL mode) enhance-status Check enhancement status (for background/daemon modes) package Package skill into .zip file upload Upload skill to target platform + install One-command workflow (scrape + enhance + package + upload) + install-agent Install skill to AI agent directories estimate Estimate page count before scraping extract-test-examples Extract usage examples from test files - install-agent Install skill to AI agent directories resume Resume interrupted scraping job + config Configure GitHub tokens, API keys, and settings + doctor Health check for dependencies and configuration Examples: - skill-seekers scrape --config configs/react.json - skill-seekers github --repo microsoft/TypeScript - skill-seekers unified --config configs/react_unified.json - skill-seekers extract-test-examples tests/ --language python + skill-seekers create https://react.dev + skill-seekers create owner/repo + skill-seekers create ./document.pdf + skill-seekers create configs/unity-spine.json + skill-seekers create configs/unity-spine.json 
--enhance-workflow unity-game-dev + skill-seekers enhance output/react/ skill-seekers package output/react/ - skill-seekers install-agent output/react/ --agent cursor """ import argparse import importlib -import os import sys -from pathlib import Path from skill_seekers.cli import __version__ # Command module mapping (command name -> module path) COMMAND_MODULES = { - "create": "skill_seekers.cli.create_command", # NEW: Unified create command - "doctor": "skill_seekers.cli.doctor", - "config": "skill_seekers.cli.config_command", - "scrape": "skill_seekers.cli.doc_scraper", - "github": "skill_seekers.cli.github_scraper", - "pdf": "skill_seekers.cli.pdf_scraper", - "word": "skill_seekers.cli.word_scraper", - "epub": "skill_seekers.cli.epub_scraper", - "video": "skill_seekers.cli.video_scraper", - "unified": "skill_seekers.cli.unified_scraper", + # Skill creation — unified entry point for all 18 source types + "create": "skill_seekers.cli.create_command", + # Enhancement & packaging "enhance": "skill_seekers.cli.enhance_command", "enhance-status": "skill_seekers.cli.enhance_status", "package": "skill_seekers.cli.package_skill", "upload": "skill_seekers.cli.upload_skill", + "install": "skill_seekers.cli.install_skill", + "install-agent": "skill_seekers.cli.install_agent", + # Utilities "estimate": "skill_seekers.cli.estimate_pages", "extract-test-examples": "skill_seekers.cli.test_example_extractor", - "install-agent": "skill_seekers.cli.install_agent", - "analyze": "skill_seekers.cli.codebase_scraper", - "install": "skill_seekers.cli.install_skill", "resume": "skill_seekers.cli.resume_command", + "quality": "skill_seekers.cli.quality_metrics", + # Configuration & workflows + "config": "skill_seekers.cli.config_command", + "doctor": "skill_seekers.cli.doctor", + "workflows": "skill_seekers.cli.workflows_command", + "sync-config": "skill_seekers.cli.sync_config", + # Advanced (less common) "stream": "skill_seekers.cli.streaming_ingest", "update": 
"skill_seekers.cli.incremental_updater", "multilang": "skill_seekers.cli.multilang_support", - "quality": "skill_seekers.cli.quality_metrics", - "workflows": "skill_seekers.cli.workflows_command", - "sync-config": "skill_seekers.cli.sync_config", - # New source types (v3.2.0+) - "jupyter": "skill_seekers.cli.jupyter_scraper", - "html": "skill_seekers.cli.html_scraper", - "openapi": "skill_seekers.cli.openapi_scraper", - "asciidoc": "skill_seekers.cli.asciidoc_scraper", - "pptx": "skill_seekers.cli.pptx_scraper", - "rss": "skill_seekers.cli.rss_scraper", - "manpage": "skill_seekers.cli.man_scraper", - "confluence": "skill_seekers.cli.confluence_scraper", - "notion": "skill_seekers.cli.notion_scraper", - "chat": "skill_seekers.cli.chat_scraper", } @@ -106,14 +76,14 @@ def create_parser() -> argparse.ArgumentParser: formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: - # Scrape documentation - skill-seekers scrape --config configs/react.json + # Create skill from documentation (auto-detects source type) + skill-seekers create https://docs.react.dev --name react - # Scrape GitHub repository - skill-seekers github --repo microsoft/TypeScript --name typescript + # Create skill from GitHub repository + skill-seekers create microsoft/TypeScript --name typescript - # Multi-source scraping (unified) - skill-seekers unified --config configs/react_unified.json + # Create skill from PDF file + skill-seekers create ./documentation.pdf --name mydocs # AI-powered enhancement skill-seekers enhance output/react/ @@ -145,6 +115,9 @@ For more information: https://github.com/yusufkaraaslan/Skill_Seekers def _reconstruct_argv(command: str, args: argparse.Namespace) -> list[str]: """Reconstruct sys.argv from args namespace for command module. + DEPRECATED: Use ExecutionContext instead. This function is kept for + backward compatibility and will be removed in a future version. 
+ Args: command: Command name args: Parsed arguments namespace @@ -206,18 +179,8 @@ def main(argv: list[str] | None = None) -> int: Returns: Exit code (0 for success, non-zero for error) """ - # Special handling for analyze --preset-list (no directory required) if argv is None: argv = sys.argv[1:] - if len(argv) >= 2 and argv[0] == "analyze" and "--preset-list" in argv: - from skill_seekers.cli.codebase_scraper import main as analyze_main - - original_argv = sys.argv.copy() - sys.argv = ["codebase_scraper.py", "--preset-list"] - try: - return analyze_main() or 0 - finally: - sys.argv = original_argv parser = create_parser() args = parser.parse_args(argv) @@ -226,6 +189,10 @@ def main(argv: list[str] | None = None) -> int: parser.print_help() return 1 + # Note: ExecutionContext is initialized by individual commands (e.g., create_command, + # enhance_command) with the correct config_path and source_info. Do NOT initialize + # it here — commands need to set config_path which requires source detection first. 
+ # Get command module module_name = COMMAND_MODULES.get(args.command) if not module_name: @@ -233,9 +200,38 @@ def main(argv: list[str] | None = None) -> int: parser.print_help() return 1 - # Special handling for 'analyze' command (has post-processing) - if args.command == "analyze": - return _handle_analyze_command(args) + # create command: call directly with parsed args (no argv reconstruction) + if args.command == "create": + # Handle --help-* flags before execute (no source needed for help) + from skill_seekers.cli.arguments.create import add_create_arguments + + help_modes = { + "_help_web": "web", + "_help_github": "github", + "_help_local": "local", + "_help_pdf": "pdf", + "_help_word": "word", + "_help_epub": "epub", + "_help_video": "video", + "_help_config": "config", + "_help_advanced": "advanced", + "_help_all": "all", + } + for attr, mode in help_modes.items(): + if getattr(args, attr, False): + help_parser = argparse.ArgumentParser( + prog="skill-seekers create", + description=f"Create skill — {mode} options", + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + add_create_arguments(help_parser, mode=mode) + help_parser.print_help() + return 0 + + from skill_seekers.cli.create_command import CreateCommand + + command = CreateCommand(args) + return command.execute() # Standard delegation for all other commands try: @@ -269,165 +265,5 @@ def main(argv: list[str] | None = None) -> int: return 1 -def _handle_analyze_command(args: argparse.Namespace) -> int: - """Handle analyze command with special post-processing logic. 
- - Args: - args: Parsed arguments - - Returns: - Exit code - """ - from skill_seekers.cli.codebase_scraper import main as analyze_main - - # Reconstruct sys.argv for analyze command - original_argv = sys.argv.copy() - sys.argv = ["codebase_scraper.py", "--directory", args.directory] - - if args.output: - sys.argv.extend(["--output", args.output]) - - # Handle preset flags (depth and features) - if args.quick: - sys.argv.extend( - [ - "--depth", - "surface", - "--skip-patterns", - "--skip-test-examples", - "--skip-how-to-guides", - "--skip-config-patterns", - ] - ) - elif args.comprehensive: - sys.argv.extend(["--depth", "full"]) - elif args.depth: - sys.argv.extend(["--depth", args.depth]) - - # Determine enhance_level (simplified - use default or override) - enhance_level = getattr(args, "enhance_level", 2) # Default is 2 - if getattr(args, "quick", False): - enhance_level = 0 # Quick mode disables enhancement - - sys.argv.extend(["--enhance-level", str(enhance_level)]) - - # Pass through remaining arguments - if args.languages: - sys.argv.extend(["--languages", args.languages]) - if args.file_patterns: - sys.argv.extend(["--file-patterns", args.file_patterns]) - if args.skip_api_reference: - sys.argv.append("--skip-api-reference") - if args.skip_dependency_graph: - sys.argv.append("--skip-dependency-graph") - if args.skip_patterns: - sys.argv.append("--skip-patterns") - if args.skip_test_examples: - sys.argv.append("--skip-test-examples") - if args.skip_how_to_guides: - sys.argv.append("--skip-how-to-guides") - if args.skip_config_patterns: - sys.argv.append("--skip-config-patterns") - if args.skip_docs: - sys.argv.append("--skip-docs") - if args.no_comments: - sys.argv.append("--no-comments") - if args.verbose: - sys.argv.append("--verbose") - if getattr(args, "quiet", False): - sys.argv.append("--quiet") - if getattr(args, "dry_run", False): - sys.argv.append("--dry-run") - if getattr(args, "preset", None): - sys.argv.extend(["--preset", args.preset]) - if 
getattr(args, "name", None): - sys.argv.extend(["--name", args.name]) - if getattr(args, "description", None): - sys.argv.extend(["--description", args.description]) - if getattr(args, "api_key", None): - sys.argv.extend(["--api-key", args.api_key]) - # Enhancement Workflow arguments - if getattr(args, "enhance_workflow", None): - for wf in args.enhance_workflow: - sys.argv.extend(["--enhance-workflow", wf]) - if getattr(args, "enhance_stage", None): - for stage in args.enhance_stage: - sys.argv.extend(["--enhance-stage", stage]) - if getattr(args, "var", None): - for var in args.var: - sys.argv.extend(["--var", var]) - if getattr(args, "workflow_dry_run", False): - sys.argv.append("--workflow-dry-run") - - try: - result = analyze_main() or 0 - - # Enhance SKILL.md if enhance_level >= 1 - if result == 0 and enhance_level >= 1: - skill_dir = Path(args.output) - skill_md = skill_dir / "SKILL.md" - - if skill_md.exists(): - print("\n" + "=" * 60) - print(f"ENHANCING SKILL.MD WITH AI (Level {enhance_level})") - print("=" * 60 + "\n") - - try: - from skill_seekers.cli.enhance_command import ( - _is_root, - _pick_mode, - _run_api_mode, - _run_local_mode, - ) - import argparse as _ap - - _fake_args = _ap.Namespace( - skill_directory=str(skill_dir), - target=None, - api_key=None, - dry_run=False, - agent=None, - agent_cmd=None, - interactive_enhancement=False, - background=False, - daemon=False, - no_force=False, - timeout=2700, - ) - _mode, _target = _pick_mode(_fake_args) - - if _mode == "api": - print(f"\nšŸ¤– Enhancement mode: API ({_target})") - success = _run_api_mode(_fake_args, _target) == 0 - elif _is_root(): - print("\nāš ļø Skipping SKILL.md enhancement: running as root") - print(" Set ANTHROPIC_API_KEY / GOOGLE_API_KEY to enable API mode") - success = False - else: - agent_name = ( - os.environ.get("SKILL_SEEKER_AGENT", "claude").strip() or "claude" - ) - print(f"\nšŸ¤– Enhancement mode: LOCAL ({agent_name})") - success = _run_local_mode(_fake_args) == 0 - - 
if success: - print("\nāœ… SKILL.md enhancement complete!") - with open(skill_md) as f: - lines = len(f.readlines()) - print(f" Enhanced SKILL.md: {lines} lines") - else: - print("\nāš ļø SKILL.md enhancement did not complete") - print(" You can retry with: skill-seekers enhance " + str(skill_dir)) - except Exception as e: - print(f"\nāš ļø SKILL.md enhancement failed: {e}") - print(" You can retry with: skill-seekers enhance " + str(skill_dir)) - else: - print(f"\nāš ļø SKILL.md not found at {skill_md}, skipping enhancement") - - return result - finally: - sys.argv = original_argv - - if __name__ == "__main__": sys.exit(main()) diff --git a/src/skill_seekers/cli/man_scraper.py b/src/skill_seekers/cli/man_scraper.py index 42b77b1..1f908ac 100644 --- a/src/skill_seekers/cli/man_scraper.py +++ b/src/skill_seekers/cli/man_scraper.py @@ -20,15 +20,15 @@ Usage: skill-seekers man --from-json unix-tools_extracted.json """ -import argparse import json import logging import os import re import subprocess -import sys from pathlib import Path +from skill_seekers.cli.skill_converter import SkillConverter + logger = logging.getLogger(__name__) # --------------------------------------------------------------------------- @@ -116,7 +116,7 @@ def infer_description_from_manpages( ) -class ManPageToSkillConverter: +class ManPageToSkillConverter(SkillConverter): """Convert Unix man pages into a skill directory structure. Supports extraction via the ``man`` command or by reading raw man-page @@ -125,6 +125,8 @@ class ManPageToSkillConverter: from skill generation. """ + SOURCE_TYPE = "manpage" + def __init__(self, config: dict) -> None: """Initialise the converter from a configuration dictionary. 
@@ -137,6 +139,7 @@ class ManPageToSkillConverter: - ``description``-- explicit description (optional) - ``categories`` -- keyword-based categorisation map (optional) """ + super().__init__(config) self.config = config self.name: str = config["name"] self.man_names: list[str] = config.get("man_names", []) @@ -156,6 +159,10 @@ class ManPageToSkillConverter: # Extracted data placeholder self.extracted_data: dict | None = None + def extract(self): + """Extract content from man pages (SkillConverter interface).""" + self.extract_manpages() + # ------------------------------------------------------------------ # Extraction # ------------------------------------------------------------------ @@ -1285,233 +1292,3 @@ class ManPageToSkillConverter: safe = re.sub(r"[^\w\s-]", "", name.lower()) safe = re.sub(r"[-\s]+", "_", safe) return safe - - -# --------------------------------------------------------------------------- -# CLI entry point -# --------------------------------------------------------------------------- - - -def main() -> int: - """CLI entry point for the man page scraper. - - Supports three workflows: - - 1. ``--man-names git,curl`` -- extract named man pages via the ``man`` - command. - 2. ``--man-path /usr/share/man/man1`` -- read man page files from a - directory. - 3. ``--from-json data.json`` -- reload previously extracted data and - rebuild the skill. - - Returns: - Exit code (0 on success, non-zero on error). - """ - parser = argparse.ArgumentParser( - description="Convert Unix man pages to a skill", - formatter_class=argparse.RawDescriptionHelpFormatter, - epilog=( - "Examples:\n" - " %(prog)s --man-names git,curl --name unix-tools\n" - " %(prog)s --man-path /usr/share/man/man1 --name coreutils\n" - " %(prog)s --from-json unix-tools_extracted.json\n" - ), - ) - - # Standard arguments (name, description, output, enhance-level, etc.) 
- from .arguments.common import add_all_standard_arguments - - add_all_standard_arguments(parser) - - # Override enhance-level default to 0 for man pages - for action in parser._actions: - if hasattr(action, "dest") and action.dest == "enhance_level": - action.default = 0 - action.help = ( - "AI enhancement level (auto-detects API vs LOCAL mode): " - "0=disabled (default for man), 1=SKILL.md only, " - "2=+architecture/config, 3=full enhancement. " - "Mode selection: uses API if ANTHROPIC_API_KEY is set, " - "otherwise LOCAL (Claude Code, Kimi, etc.)" - ) - - # Man-specific arguments - parser.add_argument( - "--man-names", - type=str, - help="Comma-separated list of man page names (e.g. git,curl,grep)", - metavar="NAMES", - ) - parser.add_argument( - "--man-path", - type=str, - help="Directory containing man page files (.1-.8, .man, .gz)", - metavar="DIR", - ) - parser.add_argument( - "--sections", - type=str, - help="Comma-separated list of man section numbers to extract (e.g. 1,3,8)", - metavar="NUMS", - ) - parser.add_argument( - "--from-json", - type=str, - help="Build skill from previously extracted JSON", - metavar="FILE", - ) - - args = parser.parse_args() - - # Logging level - if getattr(args, "quiet", False): - logging.getLogger().setLevel(logging.WARNING) - elif getattr(args, "verbose", False): - logging.getLogger().setLevel(logging.DEBUG) - - # Dry run - if getattr(args, "dry_run", False): - source = ( - getattr(args, "man_names", None) - or getattr(args, "man_path", None) - or getattr(args, "from_json", None) - or "(none)" - ) - print(f"\n{'=' * 60}") - print("DRY RUN: Man Page Extraction") - print(f"{'=' * 60}") - print(f"Source: {source}") - print(f"Name: {getattr(args, 'name', None) or '(auto-detect)'}") - print(f"Sections: {getattr(args, 'sections', None) or 'all'}") - print(f"Enhance level: {getattr(args, 'enhance_level', 0)}") - print(f"\nāœ… Dry run complete") - return 0 - - # Validate: must have at least one source - if not ( - getattr(args, 
"man_names", None) - or getattr(args, "man_path", None) - or getattr(args, "from_json", None) - ): - parser.error("Must specify --man-names, --man-path, or --from-json") - - # Parse section numbers - section_list: list[int] = [] - if getattr(args, "sections", None): - try: - section_list = [int(s.strip()) for s in args.sections.split(",") if s.strip()] - except ValueError: - parser.error("--sections must be comma-separated integers (e.g. 1,3,8)") - - # Parse man names - man_name_list: list[str] = [] - if getattr(args, "man_names", None): - man_name_list = [n.strip() for n in args.man_names.split(",") if n.strip()] - - # Build from JSON workflow - if getattr(args, "from_json", None): - name = Path(args.from_json).stem.replace("_extracted", "") - config = { - "name": getattr(args, "name", None) or name, - "description": getattr(args, "description", None) - or f"Use when referencing {name} documentation", - } - try: - converter = ManPageToSkillConverter(config) - converter.load_extracted_data(args.from_json) - converter.build_skill() - except Exception as e: - print(f"\nāŒ Error: {e}", file=sys.stderr) - sys.exit(1) - return 0 - - # Auto-detect name from man names or path - if not getattr(args, "name", None): - if man_name_list: - args.name = man_name_list[0] if len(man_name_list) == 1 else "man-pages" - elif getattr(args, "man_path", None): - args.name = Path(args.man_path).name - else: - args.name = "man-pages" - - config = { - "name": args.name, - "man_names": man_name_list, - "man_path": getattr(args, "man_path", ""), - "sections": section_list, - "description": getattr(args, "description", None), - } - - try: - converter = ManPageToSkillConverter(config) - - # Extract - if not converter.extract_manpages(): - print("\nāŒ Man page extraction failed -- see error above", file=sys.stderr) - sys.exit(1) - - # Build skill - converter.build_skill() - - # Enhancement Workflow Integration - from skill_seekers.cli.workflow_runner import run_workflows - - 
workflow_executed, workflow_names = run_workflows(args) - workflow_name = ", ".join(workflow_names) if workflow_names else None - - # Traditional enhancement (complements workflow system) - if getattr(args, "enhance_level", 0) > 0: - api_key = getattr(args, "api_key", None) or os.environ.get("ANTHROPIC_API_KEY") - mode = "API" if api_key else "LOCAL" - - print("\n" + "=" * 80) - print(f"šŸ¤– Traditional AI Enhancement ({mode} mode, level {args.enhance_level})") - print("=" * 80) - if workflow_executed: - print(f" Running after workflow: {workflow_name}") - print( - " (Workflow provides specialized analysis," - " enhancement provides general improvements)" - ) - print("") - - skill_dir = converter.skill_dir - if api_key: - try: - from skill_seekers.cli.enhance_skill import enhance_skill_md - - enhance_skill_md(skill_dir, api_key) - print("āœ… API enhancement complete!") - except ImportError: - print("āŒ API enhancement not available. Falling back to LOCAL mode...") - from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer - - agent = getattr(args, "agent", None) if args else None - agent_cmd = getattr(args, "agent_cmd", None) if args else None - enhancer = LocalSkillEnhancer(Path(skill_dir), agent=agent, agent_cmd=agent_cmd) - enhancer.run(headless=True) - print("āœ… Local enhancement complete!") - else: - from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer - - agent = getattr(args, "agent", None) if args else None - agent_cmd = getattr(args, "agent_cmd", None) if args else None - enhancer = LocalSkillEnhancer(Path(skill_dir), agent=agent, agent_cmd=agent_cmd) - enhancer.run(headless=True) - print("āœ… Local enhancement complete!") - - except RuntimeError as e: - print(f"\nāŒ Error: {e}", file=sys.stderr) - sys.exit(1) - except Exception as e: - print(f"\nāŒ Unexpected error during man page processing: {e}", file=sys.stderr) - import traceback - - traceback.print_exc() - sys.exit(1) - - return 0 - - -if __name__ == "__main__": - 
sys.exit(main()) diff --git a/src/skill_seekers/cli/notion_scraper.py b/src/skill_seekers/cli/notion_scraper.py index d951c07..787e8d1 100644 --- a/src/skill_seekers/cli/notion_scraper.py +++ b/src/skill_seekers/cli/notion_scraper.py @@ -16,17 +16,17 @@ Usage: skill-seekers notion --from-json output/myskill_notion_data.json --name myskill """ -import argparse import csv import json import logging import os import re -import sys import time from pathlib import Path from typing import Any +from skill_seekers.cli.skill_converter import SkillConverter + # Optional dependency guard — notion-client is not a core dependency try: from notion_client import Client as NotionClient @@ -71,7 +71,7 @@ def infer_description_from_notion(metadata: dict | None = None, name: str = "") ) -class NotionToSkillConverter: +class NotionToSkillConverter(SkillConverter): """Convert Notion workspace content (database or page tree) to a skill. Args: @@ -79,7 +79,10 @@ class NotionToSkillConverter: token, description, max_pages. 
""" + SOURCE_TYPE = "notion" + def __init__(self, config: dict) -> None: + super().__init__(config) self.config = config self.name: str = config["name"] self.database_id: str | None = config.get("database_id") @@ -109,6 +112,10 @@ class NotionToSkillConverter: logger.info("Notion API client initialised") return self._client + def extract(self): + """Extract content from Notion (SkillConverter interface).""" + self.extract_notion() + # -- Public extraction ----------------------------------------------- def extract_notion(self) -> bool: @@ -857,173 +864,3 @@ class NotionToSkillConverter: """Strip trailing Notion hex IDs from export filenames.""" cleaned = re.sub(r"\s+[0-9a-f]{16,}$", "", stem) return cleaned.strip() or stem - - -# --------------------------------------------------------------------------- -# CLI entry point -# --------------------------------------------------------------------------- - - -def main() -> int: - """CLI entry point for the Notion scraper.""" - from .arguments.common import add_all_standard_arguments - - parser = argparse.ArgumentParser( - description="Convert Notion workspace content to AI-ready skill", - formatter_class=argparse.RawDescriptionHelpFormatter, - epilog=( - "Examples:\n" - " skill-seekers notion --database-id ID --token $NOTION_TOKEN --name myskill\n" - " skill-seekers notion --page-id ID --token $NOTION_TOKEN --name myskill\n" - " skill-seekers notion --export-path ./export/ --name myskill\n" - " skill-seekers notion --from-json output/myskill_notion_data.json --name myskill" - ), - ) - add_all_standard_arguments(parser) - - # Override enhance-level default to 0 for Notion - for action in parser._actions: - if hasattr(action, "dest") and action.dest == "enhance_level": - action.default = 0 - - # Notion-specific arguments - parser.add_argument( - "--database-id", type=str, help="Notion database ID (API mode)", metavar="ID" - ) - parser.add_argument( - "--page-id", type=str, help="Notion page ID (API mode, recursive)", 
metavar="ID" - ) - parser.add_argument( - "--export-path", type=str, help="Notion export directory (export mode)", metavar="PATH" - ) - parser.add_argument( - "--token", type=str, help="Notion integration token (or NOTION_TOKEN env)", metavar="TOKEN" - ) - parser.add_argument( - "--max-pages", - type=int, - default=DEFAULT_MAX_PAGES, - help=f"Maximum pages to extract (default: {DEFAULT_MAX_PAGES})", - metavar="N", - ) - parser.add_argument( - "--from-json", type=str, help="Build from previously extracted JSON", metavar="FILE" - ) - - args = parser.parse_args() - - # Logging - level = ( - logging.WARNING - if getattr(args, "quiet", False) - else (logging.DEBUG if getattr(args, "verbose", False) else logging.INFO) - ) - logging.basicConfig(level=level, format="%(message)s", force=True) - - # Dry run - if getattr(args, "dry_run", False): - source = ( - getattr(args, "database_id", None) - or getattr(args, "page_id", None) - or getattr(args, "export_path", None) - or getattr(args, "from_json", None) - or "(none)" - ) - print(f"\n{'=' * 60}\nDRY RUN: Notion Extraction\n{'=' * 60}") - print( - f"Source: {source}\nName: {getattr(args, 'name', None) or '(auto)'}\nMax pages: {args.max_pages}" - ) - return 0 - - # Validate - has_source = any( - getattr(args, a, None) for a in ("database_id", "page_id", "export_path", "from_json") - ) - if not has_source: - parser.error("Must specify --database-id, --page-id, --export-path, or --from-json") - if not getattr(args, "name", None): - if getattr(args, "from_json", None): - args.name = Path(args.from_json).stem.replace("_notion_data", "") - elif getattr(args, "export_path", None): - args.name = Path(args.export_path).stem - else: - parser.error("--name is required when using --database-id or --page-id") - - # --from-json: build only - if getattr(args, "from_json", None): - config = { - "name": args.name, - "description": getattr(args, "description", None), - "max_pages": args.max_pages, - } - try: - conv = 
NotionToSkillConverter(config) - conv.load_extracted_data(args.from_json) - conv.build_skill() - except Exception as e: - print(f"\n Error: {e}", file=sys.stderr) - sys.exit(1) # noqa: E702 - return 0 - - # Full extract + build - config: dict[str, Any] = { - "name": args.name, - "database_id": getattr(args, "database_id", None), - "page_id": getattr(args, "page_id", None), - "export_path": getattr(args, "export_path", None), - "token": getattr(args, "token", None), - "description": getattr(args, "description", None), - "max_pages": args.max_pages, - } - try: - conv = NotionToSkillConverter(config) - if not conv.extract_notion(): - print("\n Notion extraction failed", file=sys.stderr) - sys.exit(1) # noqa: E702 - conv.build_skill() - - # Run enhancement workflows if specified - try: - from skill_seekers.cli.workflow_runner import run_workflows - - run_workflows(args) - except (ImportError, AttributeError): - pass - - # Traditional AI enhancement - if getattr(args, "enhance_level", 0) > 0: - api_key = getattr(args, "api_key", None) or os.environ.get("ANTHROPIC_API_KEY") - skill_dir = conv.skill_dir - if api_key: - try: - from skill_seekers.cli.enhance_skill import enhance_skill_md - - enhance_skill_md(skill_dir, api_key) - except ImportError: - from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer - - agent = getattr(args, "agent", None) if args else None - agent_cmd = getattr(args, "agent_cmd", None) if args else None - enhancer = LocalSkillEnhancer(Path(skill_dir), agent=agent, agent_cmd=agent_cmd) - enhancer.run(headless=True) - else: - from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer - - agent = getattr(args, "agent", None) if args else None - agent_cmd = getattr(args, "agent_cmd", None) if args else None - enhancer = LocalSkillEnhancer(Path(skill_dir), agent=agent, agent_cmd=agent_cmd) - enhancer.run(headless=True) - except RuntimeError as e: - print(f"\n Error: {e}", file=sys.stderr) - sys.exit(1) # noqa: E702 - except 
Exception as e: - print(f"\n Unexpected error: {e}", file=sys.stderr) - import traceback - - traceback.print_exc() - sys.exit(1) # noqa: E702 - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/src/skill_seekers/cli/openapi_scraper.py b/src/skill_seekers/cli/openapi_scraper.py index bb00191..cdd5559 100644 --- a/src/skill_seekers/cli/openapi_scraper.py +++ b/src/skill_seekers/cli/openapi_scraper.py @@ -22,14 +22,11 @@ Usage: python3 -m skill_seekers.cli.openapi_scraper --spec spec.yaml --name my-api """ -import argparse import copy import json import logging import os import re -import sys -from pathlib import Path from typing import Any # Optional dependency guard @@ -40,6 +37,8 @@ try: except ImportError: YAML_AVAILABLE = False +from skill_seekers.cli.skill_converter import SkillConverter + logger = logging.getLogger(__name__) # HTTP methods recognized in OpenAPI path items @@ -90,7 +89,7 @@ def infer_description_from_spec(info: dict | None = None, name: str = "") -> str return f"Use when working with the {name} API" if name else "Use when working with this API" -class OpenAPIToSkillConverter: +class OpenAPIToSkillConverter(SkillConverter): """Convert OpenAPI/Swagger specifications to AI-ready skills. Supports OpenAPI 2.0 (Swagger), 3.0, and 3.1 specifications in both @@ -111,6 +110,8 @@ class OpenAPIToSkillConverter: extracted_data: Structured extraction result with endpoints, schemas, etc. """ + SOURCE_TYPE = "openapi" + def __init__(self, config: dict) -> None: """Initialize the converter with configuration. @@ -125,6 +126,7 @@ class OpenAPIToSkillConverter: ValueError: If neither spec_path nor spec_url is provided and no from_json workflow is intended. 
""" + super().__init__(config) self.config = config self.name = config["name"] self.spec_path: str = config.get("spec_path", "") @@ -142,6 +144,10 @@ class OpenAPIToSkillConverter: self.extracted_data: dict[str, Any] = {} self.openapi_version: str = "" + def extract(self): + """Extract content from OpenAPI spec (SkillConverter interface).""" + self.extract_spec() + # ────────────────────────────────────────────────────────────────────── # Spec loading # ────────────────────────────────────────────────────────────────────── @@ -1772,192 +1778,3 @@ class OpenAPIToSkillConverter: safe = re.sub(r"[^\w\s-]", "", name.lower()) safe = re.sub(r"[-\s]+", "_", safe) return safe - - -# ────────────────────────────────────────────────────────────────────────────── -# CLI entry point -# ────────────────────────────────────────────────────────────────────────────── - - -def main() -> int: - """CLI entry point for the OpenAPI scraper. - - Supports three input modes: - 1. Local spec file: --spec path/to/spec.yaml - 2. Remote spec URL: --spec-url https://example.com/openapi.json - 3. Pre-extracted JSON: --from-json extracted.json - - Standard arguments (--name, --description, --verbose, --quiet, --dry-run) - are provided by the shared argument system. 
- """ - _check_yaml_deps() - - parser = argparse.ArgumentParser( - description="Convert OpenAPI/Swagger specifications to AI-ready skills", - formatter_class=argparse.RawDescriptionHelpFormatter, - epilog=""" -Examples: - %(prog)s --spec petstore.yaml --name petstore-api - %(prog)s --spec-url https://petstore3.swagger.io/api/v3/openapi.json --name petstore - %(prog)s --from-json petstore_extracted.json - """, - ) - - # Standard shared arguments - from .arguments.common import add_all_standard_arguments - - add_all_standard_arguments(parser) - - # Override enhance-level default to 0 for OpenAPI - for action in parser._actions: - if hasattr(action, "dest") and action.dest == "enhance_level": - action.default = 0 - action.help = ( - "AI enhancement level (auto-detects API vs LOCAL mode): " - "0=disabled (default for OpenAPI), 1=SKILL.md only, " - "2=+architecture/config, 3=full enhancement. " - "Mode selection: uses API if ANTHROPIC_API_KEY is set, " - "otherwise LOCAL (Claude Code, Kimi, etc.)" - ) - - # OpenAPI-specific arguments - parser.add_argument( - "--spec", - type=str, - help="Local path to OpenAPI/Swagger spec file (YAML or JSON)", - metavar="PATH", - ) - parser.add_argument( - "--spec-url", - type=str, - help="Remote URL to fetch OpenAPI/Swagger spec from", - metavar="URL", - ) - parser.add_argument( - "--from-json", - type=str, - help="Build skill from previously extracted JSON data", - metavar="FILE", - ) - - args = parser.parse_args() - - # Setup logging - if getattr(args, "quiet", False): - logging.basicConfig(level=logging.WARNING, format="%(message)s") - elif getattr(args, "verbose", False): - logging.basicConfig(level=logging.DEBUG, format="%(levelname)s: %(message)s") - else: - logging.basicConfig(level=logging.INFO, format="%(message)s") - - # Handle --dry-run - if getattr(args, "dry_run", False): - source = args.spec or args.spec_url or args.from_json or "(none)" - print(f"\n{'=' * 60}") - print("DRY RUN: OpenAPI Specification Extraction") - 
print(f"{'=' * 60}") - print(f"Source: {source}") - print(f"Name: {getattr(args, 'name', None) or '(auto-detect)'}") - print(f"Enhance level: {getattr(args, 'enhance_level', 0)}") - print(f"\n Dry run complete") - return 0 - - # Validate inputs - if not (args.spec or args.spec_url or args.from_json): - parser.error("Must specify --spec (file path), --spec-url (URL), or --from-json") - - # Build from pre-extracted JSON - if args.from_json: - name = args.name or Path(args.from_json).stem.replace("_extracted", "") - config: dict[str, Any] = { - "name": name, - "description": (args.description or f"Use when working with the {name} API"), - } - converter = OpenAPIToSkillConverter(config) - converter.load_extracted_data(args.from_json) - converter.build_skill() - return 0 - - # Determine name - if not args.name: - if args.spec: - name = Path(args.spec).stem - elif args.spec_url: - # Derive name from URL - from urllib.parse import urlparse - - url_path = urlparse(args.spec_url).path - name = Path(url_path).stem if url_path else "api" - else: - name = "api" - else: - name = args.name - - # Build config - config = { - "name": name, - "spec_path": args.spec or "", - "spec_url": args.spec_url or "", - } - if args.description: - config["description"] = args.description - - # Create converter and run - try: - converter = OpenAPIToSkillConverter(config) - - if not converter.extract_spec(): - print("\n OpenAPI extraction failed", file=sys.stderr) - sys.exit(1) - - converter.build_skill() - - # Enhancement workflow integration - if getattr(args, "enhance_level", 0) > 0: - api_key = getattr(args, "api_key", None) or os.environ.get("ANTHROPIC_API_KEY") - mode = "API" if api_key else "LOCAL" - - print(f"\n{'=' * 80}") - print(f" AI Enhancement ({mode} mode, level {args.enhance_level})") - print("=" * 80) - - skill_dir = converter.skill_dir - if api_key: - try: - from skill_seekers.cli.enhance_skill import enhance_skill_md - - enhance_skill_md(skill_dir, api_key) - print(" API 
enhancement complete!") - except ImportError: - print(" API enhancement not available. Falling back to LOCAL mode...") - from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer - - agent = getattr(args, "agent", None) if args else None - agent_cmd = getattr(args, "agent_cmd", None) if args else None - enhancer = LocalSkillEnhancer(Path(skill_dir), agent=agent, agent_cmd=agent_cmd) - enhancer.run(headless=True) - print(" Local enhancement complete!") - else: - from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer - - agent = getattr(args, "agent", None) if args else None - agent_cmd = getattr(args, "agent_cmd", None) if args else None - enhancer = LocalSkillEnhancer(Path(skill_dir), agent=agent, agent_cmd=agent_cmd) - enhancer.run(headless=True) - print(" Local enhancement complete!") - - except (ValueError, RuntimeError) as e: - print(f"\n Error: {e}", file=sys.stderr) - sys.exit(1) - except Exception as e: - print(f"\n Unexpected error during OpenAPI processing: {e}", file=sys.stderr) - import traceback - - traceback.print_exc() - sys.exit(1) - - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/src/skill_seekers/cli/parsers/__init__.py b/src/skill_seekers/cli/parsers/__init__.py index d5faaef..475d867 100644 --- a/src/skill_seekers/cli/parsers/__init__.py +++ b/src/skill_seekers/cli/parsers/__init__.py @@ -1,21 +1,15 @@ """Parser registry and factory. This module registers all subcommand parsers and provides a factory -function to create them. +function to create them. Individual scraper commands have been removed — +use `skill-seekers create ` for all source types. 
""" from .base import SubcommandParser -# Import all parser classes -from .create_parser import CreateParser # NEW: Unified create command +# Import parser classes (scrapers removed — use create command) +from .create_parser import CreateParser from .config_parser import ConfigParser -from .scrape_parser import ScrapeParser -from .github_parser import GitHubParser -from .pdf_parser import PDFParser -from .word_parser import WordParser -from .epub_parser import EpubParser -from .video_parser import VideoParser -from .unified_parser import UnifiedParser from .enhance_parser import EnhanceParser from .enhance_status_parser import EnhanceStatusParser from .package_parser import PackageParser @@ -23,7 +17,6 @@ from .upload_parser import UploadParser from .estimate_parser import EstimateParser from .test_examples_parser import TestExamplesParser from .install_agent_parser import InstallAgentParser -from .analyze_parser import AnalyzeParser from .install_parser import InstallParser from .resume_parser import ResumeParser from .stream_parser import StreamParser @@ -34,57 +27,26 @@ from .workflows_parser import WorkflowsParser from .sync_config_parser import SyncConfigParser from .doctor_parser import DoctorParser -# New source type parsers (v3.2.0+) -from .jupyter_parser import JupyterParser -from .html_parser import HtmlParser -from .openapi_parser import OpenAPIParser -from .asciidoc_parser import AsciiDocParser -from .pptx_parser import PptxParser -from .rss_parser import RssParser -from .manpage_parser import ManPageParser -from .confluence_parser import ConfluenceParser -from .notion_parser import NotionParser -from .chat_parser import ChatParser - -# Registry of all parsers (in order of usage frequency) +# Registry of all parsers PARSERS = [ - CreateParser(), # NEW: Unified create command (placed first for prominence) + CreateParser(), DoctorParser(), ConfigParser(), - ScrapeParser(), - GitHubParser(), - PackageParser(), - UploadParser(), - AnalyzeParser(), 
EnhanceParser(), EnhanceStatusParser(), - PDFParser(), - WordParser(), - EpubParser(), - VideoParser(), - UnifiedParser(), + PackageParser(), + UploadParser(), EstimateParser(), InstallParser(), InstallAgentParser(), TestExamplesParser(), ResumeParser(), - StreamParser(), - UpdateParser(), - MultilangParser(), QualityParser(), WorkflowsParser(), SyncConfigParser(), - # New source types (v3.2.0+) - JupyterParser(), - HtmlParser(), - OpenAPIParser(), - AsciiDocParser(), - PptxParser(), - RssParser(), - ManPageParser(), - ConfluenceParser(), - NotionParser(), - ChatParser(), + StreamParser(), + UpdateParser(), + MultilangParser(), ] diff --git a/src/skill_seekers/cli/parsers/create_parser.py b/src/skill_seekers/cli/parsers/create_parser.py index 4f83d6d..59f57dc 100644 --- a/src/skill_seekers/cli/parsers/create_parser.py +++ b/src/skill_seekers/cli/parsers/create_parser.py @@ -48,14 +48,13 @@ Presets: -p quick (1-2min) | -p standard (5-10min) | -p comprehensive (20-60min) def add_arguments(self, parser): """Add create-specific arguments. - Uses shared argument definitions with progressive disclosure. - Default mode shows only universal arguments (15 flags). - - Multi-mode help handled via custom flags detected in argument parsing. + Registers ALL arguments (120+ flags) so the top-level parser + accepts source-specific flags like --browser, --max-pages, etc. + Default help still shows only universal args; use --help-all for full list. 
""" - # Add all arguments in 'default' mode (universal only) - # This keeps help text clean and focused - add_create_arguments(parser, mode="default") + # Register all arguments so source-specific flags are accepted + # by the top-level parser (create is the only entry point now) + add_create_arguments(parser, mode="all") # Add hidden help mode flags # These won't show in default help but can be used to get source-specific help diff --git a/src/skill_seekers/cli/pdf_scraper.py b/src/skill_seekers/cli/pdf_scraper.py index 867de5c..b13e0f2 100644 --- a/src/skill_seekers/cli/pdf_scraper.py +++ b/src/skill_seekers/cli/pdf_scraper.py @@ -11,16 +11,14 @@ Usage: python3 pdf_scraper.py --from-json manual_extracted.json """ -import argparse import json -import logging import os import re -import sys from pathlib import Path # Import the PDF extractor from .pdf_extractor_poc import PDFExtractor +from .skill_converter import SkillConverter def infer_description_from_pdf(pdf_metadata: dict = None, name: str = "") -> str: @@ -62,10 +60,13 @@ def infer_description_from_pdf(pdf_metadata: dict = None, name: str = "") -> str ) -class PDFToSkillConverter: +class PDFToSkillConverter(SkillConverter): """Convert PDF documentation to AI skill""" + SOURCE_TYPE = "pdf" + def __init__(self, config): + super().__init__(config) self.config = config self.name = config["name"] self.pdf_path = config.get("pdf_path", "") @@ -87,6 +88,10 @@ class PDFToSkillConverter: # Extracted data self.extracted_data = None + def extract(self): + """SkillConverter interface — delegates to extract_pdf().""" + return self.extract_pdf() + def extract_pdf(self): """Extract content from PDF using pdf_extractor_poc.py""" print(f"\nšŸ” Extracting from PDF: {self.pdf_path}") @@ -631,151 +636,3 @@ class PDFToSkillConverter: safe = re.sub(r"[^\w\s-]", "", name.lower()) safe = re.sub(r"[-\s]+", "_", safe) return safe - - -def main(): - from .arguments.pdf import add_pdf_arguments - - parser = argparse.ArgumentParser( - 
description="Convert PDF documentation to AI skill", - formatter_class=argparse.RawDescriptionHelpFormatter, - ) - - add_pdf_arguments(parser) - - args = parser.parse_args() - - # Set logging level from behavior args - if getattr(args, "quiet", False): - logging.getLogger().setLevel(logging.WARNING) - elif getattr(args, "verbose", False): - logging.getLogger().setLevel(logging.DEBUG) - - # Handle --dry-run - if getattr(args, "dry_run", False): - source = args.pdf or args.config or args.from_json or "(none)" - print(f"\n{'=' * 60}") - print(f"DRY RUN: PDF Extraction") - print(f"{'=' * 60}") - print(f"Source: {source}") - print(f"Name: {getattr(args, 'name', None) or '(auto-detect)'}") - print(f"Enhance level: {getattr(args, 'enhance_level', 0)}") - print(f"\nāœ… Dry run complete") - return - - # Validate inputs - if not (args.config or args.pdf or args.from_json): - parser.error("Must specify --config, --pdf, or --from-json") - - # Load or create config - if args.config: - with open(args.config) as f: - config = json.load(f) - elif args.from_json: - # Build from extracted JSON - name = Path(args.from_json).stem.replace("_extracted", "") - config = { - "name": name, - "description": args.description or f"Use when referencing {name} documentation", - } - converter = PDFToSkillConverter(config) - converter.load_extracted_data(args.from_json) - converter.build_skill() - return - else: - # Direct PDF mode - if not args.name: - parser.error("Must specify --name with --pdf") - config = { - "name": args.name, - "pdf_path": args.pdf, - "description": args.description or f"Use when referencing {args.name} documentation", - "extract_options": { - "chunk_size": 10, - "min_quality": 5.0, - "extract_images": True, - "min_image_size": 100, - }, - } - - # Create converter - try: - converter = PDFToSkillConverter(config) - - # Extract if needed - if config.get("pdf_path") and not converter.extract_pdf(): - print("\nāŒ PDF extraction failed - see error above", file=sys.stderr) - 
sys.exit(1) - - # Build skill - converter.build_skill() - - # ═══════════════════════════════════════════════════════════════════════════ - # Enhancement Workflow Integration (Phase 2 - PDF Support) - # ═══════════════════════════════════════════════════════════════════════════ - from skill_seekers.cli.workflow_runner import run_workflows - - workflow_executed, workflow_names = run_workflows(args) - workflow_name = ", ".join(workflow_names) if workflow_names else None - - # ═══════════════════════════════════════════════════════════════════════════ - # Traditional Enhancement (complements workflow system) - # ═══════════════════════════════════════════════════════════════════════════ - if getattr(args, "enhance_level", 0) > 0: - import os - - api_key = getattr(args, "api_key", None) or os.environ.get("ANTHROPIC_API_KEY") - mode = "API" if api_key else "LOCAL" - - print("\n" + "=" * 80) - print(f"šŸ¤– Traditional AI Enhancement ({mode} mode, level {args.enhance_level})") - print("=" * 80) - if workflow_executed: - print(f" Running after workflow: {workflow_name}") - print( - " (Workflow provides specialized analysis, enhancement provides general improvements)" - ) - print("") - - skill_dir = converter.skill_dir - if api_key: - try: - from skill_seekers.cli.enhance_skill import enhance_skill_md - - enhance_skill_md(skill_dir, api_key) - print("āœ… API enhancement complete!") - except ImportError: - print("āŒ API enhancement not available. Falling back to LOCAL mode...") - from pathlib import Path - from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer - - agent = getattr(args, "agent", None) if args else None - agent_cmd = getattr(args, "agent_cmd", None) if args else None - enhancer = LocalSkillEnhancer(Path(skill_dir), agent=agent, agent_cmd=agent_cmd) - enhancer.run(headless=True) - agent_name = agent or "claude" - print(f"āœ… Local enhancement complete! 
(via {agent_name})") - else: - from pathlib import Path - from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer - - agent = getattr(args, "agent", None) if args else None - agent_cmd = getattr(args, "agent_cmd", None) if args else None - enhancer = LocalSkillEnhancer(Path(skill_dir), agent=agent, agent_cmd=agent_cmd) - enhancer.run(headless=True) - agent_name = agent or "claude" - print(f"āœ… Local enhancement complete! (via {agent_name})") - - except RuntimeError as e: - print(f"\nāŒ Error: {e}", file=sys.stderr) - sys.exit(1) - except Exception as e: - print(f"\nāŒ Unexpected error during PDF processing: {e}", file=sys.stderr) - import traceback - - traceback.print_exc() - sys.exit(1) - - -if __name__ == "__main__": - main() diff --git a/src/skill_seekers/cli/pptx_scraper.py b/src/skill_seekers/cli/pptx_scraper.py index 179ff18..f692c44 100644 --- a/src/skill_seekers/cli/pptx_scraper.py +++ b/src/skill_seekers/cli/pptx_scraper.py @@ -15,12 +15,10 @@ Usage: skill-seekers pptx --from-json presentation_extracted.json """ -import argparse import json import logging import os import re -import sys from pathlib import Path # Optional dependency guard @@ -33,6 +31,8 @@ try: except ImportError: PPTX_AVAILABLE = False +from skill_seekers.cli.skill_converter import SkillConverter + logger = logging.getLogger(__name__) # --------------------------------------------------------------------------- @@ -147,7 +147,7 @@ def infer_description_from_pptx( # --------------------------------------------------------------------------- -class PptxToSkillConverter: +class PptxToSkillConverter(SkillConverter): """Convert PowerPoint presentation (.pptx) to an AI-ready skill. Follows the same pipeline pattern as the Word, EPUB, and PDF scrapers: @@ -165,6 +165,8 @@ class PptxToSkillConverter: .pptx files (merged into a single skill). """ + SOURCE_TYPE = "pptx" + def __init__(self, config: dict) -> None: """Initialize the converter with a configuration dictionary. 
@@ -175,6 +177,7 @@ class PptxToSkillConverter: - description (str): Skill description (optional, inferred if absent) - categories (dict): Manual category assignments (optional) """ + super().__init__(config) self.config = config self.name: str = config["name"] self.pptx_path: str = config.get("pptx_path", "") @@ -192,6 +195,10 @@ class PptxToSkillConverter: # Extracted data (populated by extract_pptx or load_extracted_data) self.extracted_data: dict | None = None + def extract(self): + """Extract content from PowerPoint files (SkillConverter interface).""" + self.extract_pptx() + # ------------------------------------------------------------------ # Extraction # ------------------------------------------------------------------ @@ -1661,165 +1668,3 @@ def _score_code_quality(code: str) -> float: score -= 2.0 return min(10.0, max(0.0, score)) - - -# --------------------------------------------------------------------------- -# CLI entry point -# --------------------------------------------------------------------------- - - -def main() -> int: - """CLI entry point for the PowerPoint scraper. - - Parses command-line arguments and runs the extraction and skill-building - pipeline. Supports direct .pptx input, directory input, and loading from - previously extracted JSON. - - Returns: - Exit code (0 for success, non-zero for errors). 
- """ - from skill_seekers.cli.arguments.pptx import add_pptx_arguments - - parser = argparse.ArgumentParser( - description="Convert PowerPoint presentation (.pptx) to skill", - formatter_class=argparse.RawDescriptionHelpFormatter, - ) - - add_pptx_arguments(parser) - - args = parser.parse_args() - - # Set logging level - if getattr(args, "quiet", False): - logging.getLogger().setLevel(logging.WARNING) - elif getattr(args, "verbose", False): - logging.getLogger().setLevel(logging.DEBUG) - - # Handle --dry-run - if getattr(args, "dry_run", False): - source = getattr(args, "pptx", None) or getattr(args, "from_json", None) or "(none)" - print(f"\n{'=' * 60}") - print("DRY RUN: PowerPoint Extraction") - print(f"{'=' * 60}") - print(f"Source: {source}") - print(f"Name: {getattr(args, 'name', None) or '(auto-detect)'}") - print(f"Enhance level: {getattr(args, 'enhance_level', 0)}") - print(f"\nāœ… Dry run complete") - return 0 - - # Validate inputs - if not (getattr(args, "pptx", None) or getattr(args, "from_json", None)): - parser.error("Must specify --pptx or --from-json") - - # Build from JSON workflow - if getattr(args, "from_json", None): - name = Path(args.from_json).stem.replace("_extracted", "") - config = { - "name": getattr(args, "name", None) or name, - "description": getattr(args, "description", None) - or f"Use when referencing {name} presentation", - } - try: - converter = PptxToSkillConverter(config) - converter.load_extracted_data(args.from_json) - converter.build_skill() - except Exception as e: - print(f"\nāŒ Error: {e}", file=sys.stderr) - sys.exit(1) - return 0 - - # Direct PPTX mode - if not getattr(args, "name", None): - # Auto-detect name from filename or directory name - pptx_path = Path(args.pptx) - args.name = pptx_path.stem if pptx_path.is_file() else pptx_path.name - - config = { - "name": args.name, - "pptx_path": args.pptx, - # Pass None so extract_pptx() can infer from presentation metadata - "description": getattr(args, "description", 
None), - } - - try: - converter = PptxToSkillConverter(config) - - # Extract - if not converter.extract_pptx(): - print( - "\nāŒ PowerPoint extraction failed - see error above", - file=sys.stderr, - ) - sys.exit(1) - - # Build skill - converter.build_skill() - - # Enhancement Workflow Integration - from skill_seekers.cli.workflow_runner import run_workflows - - workflow_executed, workflow_names = run_workflows(args) - workflow_name = ", ".join(workflow_names) if workflow_names else None - - # Traditional enhancement (complements workflow system) - if getattr(args, "enhance_level", 0) > 0: - api_key = getattr(args, "api_key", None) or os.environ.get("ANTHROPIC_API_KEY") - mode = "API" if api_key else "LOCAL" - - print("\n" + "=" * 80) - print(f"šŸ¤– Traditional AI Enhancement ({mode} mode, level {args.enhance_level})") - print("=" * 80) - if workflow_executed: - print(f" Running after workflow: {workflow_name}") - print( - " (Workflow provides specialized analysis," - " enhancement provides general improvements)" - ) - print("") - - skill_dir = converter.skill_dir - if api_key: - try: - from skill_seekers.cli.enhance_skill import enhance_skill_md - - enhance_skill_md(skill_dir, api_key) - print("āœ… API enhancement complete!") - except ImportError: - print("āŒ API enhancement not available. 
Falling back to LOCAL mode...") - from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer - - agent = getattr(args, "agent", None) if args else None - agent_cmd = getattr(args, "agent_cmd", None) if args else None - enhancer = LocalSkillEnhancer(Path(skill_dir), agent=agent, agent_cmd=agent_cmd) - enhancer.run(headless=True) - print("āœ… Local enhancement complete!") - else: - from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer - - agent = getattr(args, "agent", None) if args else None - agent_cmd = getattr(args, "agent_cmd", None) if args else None - enhancer = LocalSkillEnhancer(Path(skill_dir), agent=agent, agent_cmd=agent_cmd) - enhancer.run(headless=True) - print("āœ… Local enhancement complete!") - - except (FileNotFoundError, ValueError) as e: - print(f"\nāŒ Input error: {e}", file=sys.stderr) - sys.exit(1) - except RuntimeError as e: - print(f"\nāŒ Error: {e}", file=sys.stderr) - sys.exit(1) - except Exception as e: - print( - f"\nāŒ Unexpected error during PowerPoint processing: {e}", - file=sys.stderr, - ) - import traceback - - traceback.print_exc() - sys.exit(1) - - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/src/skill_seekers/cli/rss_scraper.py b/src/skill_seekers/cli/rss_scraper.py index 9e97c90..b8365d3 100644 --- a/src/skill_seekers/cli/rss_scraper.py +++ b/src/skill_seekers/cli/rss_scraper.py @@ -19,16 +19,13 @@ Usage: python3 -m skill_seekers.cli.rss_scraper --feed-url https://example.com/atom.xml --name myblog """ -import argparse import hashlib import json import logging import os import re -import sys import time from datetime import datetime -from pathlib import Path from typing import Any # Optional dependency guard — feedparser is not in core deps @@ -42,6 +39,8 @@ except ImportError: # BeautifulSoup is a core dependency (always available) from bs4 import BeautifulSoup, Comment, Tag +from skill_seekers.cli.skill_converter import SkillConverter + logger = 
logging.getLogger(__name__) # Feed type constants @@ -109,7 +108,7 @@ def infer_description_from_feed( ) -class RssToSkillConverter: +class RssToSkillConverter(SkillConverter): """Convert RSS/Atom feeds to AI-ready skills. Parses RSS 2.0, RSS 1.0 (RDF), and Atom feeds using feedparser. @@ -117,6 +116,8 @@ class RssToSkillConverter: requests + BeautifulSoup. """ + SOURCE_TYPE = "rss" + def __init__(self, config: dict[str, Any]) -> None: """Initialize the converter with configuration. @@ -125,6 +126,7 @@ class RssToSkillConverter: follow_links (default True), max_articles (default 50), and description (optional). """ + super().__init__(config) self.config = config self.name: str = config["name"] self.feed_url: str = config.get("feed_url", "") @@ -142,6 +144,10 @@ class RssToSkillConverter: # Internal state self.extracted_data: dict[str, Any] | None = None + def extract(self): + """Extract content from RSS/Atom feed (SkillConverter interface).""" + self.extract_feed() + # ────────────────────────────────────────────────────────────────────── # Public API # ────────────────────────────────────────────────────────────────────── @@ -865,227 +871,3 @@ class RssToSkillConverter: safe = re.sub(r"[^\w\s-]", "", name.lower()) safe = re.sub(r"[-\s]+", "_", safe) return safe or "unnamed" - - -# ────────────────────────────────────────────────────────────────────────── -# CLI entry point -# ────────────────────────────────────────────────────────────────────────── - - -def main() -> int: - """CLI entry point for the RSS/Atom feed scraper.""" - from .arguments.common import add_all_standard_arguments - - parser = argparse.ArgumentParser( - description="Convert RSS/Atom feed to AI-ready skill", - formatter_class=argparse.RawDescriptionHelpFormatter, - epilog=( - "Examples:\n" - " %(prog)s --feed-url https://example.com/feed.xml --name myblog\n" - " %(prog)s --feed-path ./feed.xml --name myblog\n" - " %(prog)s --feed-url https://example.com/rss --no-follow-links --name myblog\n" - 
" %(prog)s --from-json myblog_extracted.json\n" - ), - ) - - # Standard arguments (name, description, output, enhance-level, etc.) - add_all_standard_arguments(parser) - - # Override enhance-level default to 0 for RSS - for action in parser._actions: - if hasattr(action, "dest") and action.dest == "enhance_level": - action.default = 0 - action.help = ( - "AI enhancement level (auto-detects API vs LOCAL mode): " - "0=disabled (default for RSS), 1=SKILL.md only, " - "2=+architecture/config, 3=full enhancement. " - "Mode selection: uses API if ANTHROPIC_API_KEY is set, " - "otherwise LOCAL (Claude Code, Kimi, etc.)" - ) - - # RSS-specific arguments - parser.add_argument( - "--feed-url", - type=str, - help="URL of the RSS/Atom feed to scrape", - metavar="URL", - ) - parser.add_argument( - "--feed-path", - type=str, - help="Local file path to an RSS/Atom XML file", - metavar="PATH", - ) - parser.add_argument( - "--follow-links", - action="store_true", - default=True, - dest="follow_links", - help="Follow article links to scrape full content (default: enabled)", - ) - parser.add_argument( - "--no-follow-links", - action="store_false", - dest="follow_links", - help="Do not follow article links — use feed content only", - ) - parser.add_argument( - "--max-articles", - type=int, - default=50, - metavar="N", - help="Maximum number of articles to process (default: 50)", - ) - parser.add_argument( - "--from-json", - type=str, - help="Build skill from previously extracted JSON file", - metavar="FILE", - ) - - args = parser.parse_args() - - # Set logging level - if getattr(args, "quiet", False): - logging.getLogger().setLevel(logging.WARNING) - elif getattr(args, "verbose", False): - logging.getLogger().setLevel(logging.DEBUG) - - # Handle --dry-run - if getattr(args, "dry_run", False): - source = ( - getattr(args, "feed_url", None) - or getattr(args, "feed_path", None) - or getattr(args, "from_json", None) - or "(none)" - ) - print(f"\n{'=' * 60}") - print("DRY RUN: RSS/Atom 
Feed Extraction") - print(f"{'=' * 60}") - print(f"Source: {source}") - print(f"Name: {getattr(args, 'name', None) or '(auto-detect)'}") - print(f"Follow links: {getattr(args, 'follow_links', True)}") - print(f"Max articles: {getattr(args, 'max_articles', 50)}") - print(f"Enhance level: {getattr(args, 'enhance_level', 0)}") - print(f"\nāœ… Dry run complete") - return 0 - - # Validate inputs - has_source = ( - getattr(args, "feed_url", None) - or getattr(args, "feed_path", None) - or getattr(args, "from_json", None) - ) - if not has_source: - parser.error("Must specify --feed-url, --feed-path, or --from-json") - - # Build from JSON workflow - if getattr(args, "from_json", None): - name = Path(args.from_json).stem.replace("_extracted", "") - config: dict[str, Any] = { - "name": getattr(args, "name", None) or name, - "description": getattr(args, "description", None) - or f"Use when referencing {name} feed content", - } - try: - converter = RssToSkillConverter(config) - converter.load_extracted_data(args.from_json) - converter.build_skill() - except Exception as e: - print(f"\nāŒ Error: {e}", file=sys.stderr) - sys.exit(1) - return 0 - - # Feed extraction workflow - if not getattr(args, "name", None): - # Auto-detect name from URL or file path - if getattr(args, "feed_url", None): - from urllib.parse import urlparse - - parsed_url = urlparse(args.feed_url) - args.name = parsed_url.hostname.replace(".", "-") if parsed_url.hostname else "feed" - elif getattr(args, "feed_path", None): - args.name = Path(args.feed_path).stem - - config = { - "name": args.name, - "feed_url": getattr(args, "feed_url", "") or "", - "feed_path": getattr(args, "feed_path", "") or "", - "follow_links": getattr(args, "follow_links", True), - "max_articles": getattr(args, "max_articles", 50), - "description": getattr(args, "description", None), - } - - try: - converter = RssToSkillConverter(config) - - # Extract feed - if not converter.extract_feed(): - print("\nāŒ Feed extraction failed — see 
error above", file=sys.stderr) - sys.exit(1) - - # Build skill - converter.build_skill() - - # Enhancement Workflow Integration - from skill_seekers.cli.workflow_runner import run_workflows - - workflow_executed, workflow_names = run_workflows(args) - workflow_name = ", ".join(workflow_names) if workflow_names else None - - # Traditional enhancement (complements workflow system) - if getattr(args, "enhance_level", 0) > 0: - api_key = getattr(args, "api_key", None) or os.environ.get("ANTHROPIC_API_KEY") - mode = "API" if api_key else "LOCAL" - - print("\n" + "=" * 80) - print(f"šŸ¤– Traditional AI Enhancement ({mode} mode, level {args.enhance_level})") - print("=" * 80) - if workflow_executed: - print(f" Running after workflow: {workflow_name}") - print( - " (Workflow provides specialized analysis, " - "enhancement provides general improvements)" - ) - print("") - - skill_dir = converter.skill_dir - if api_key: - try: - from skill_seekers.cli.enhance_skill import enhance_skill_md - - enhance_skill_md(skill_dir, api_key) - print("āœ… API enhancement complete!") - except ImportError: - print("āŒ API enhancement not available. 
Falling back to LOCAL mode...") - from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer - - agent = getattr(args, "agent", None) if args else None - agent_cmd = getattr(args, "agent_cmd", None) if args else None - enhancer = LocalSkillEnhancer(Path(skill_dir), agent=agent, agent_cmd=agent_cmd) - enhancer.run(headless=True) - print("āœ… Local enhancement complete!") - else: - from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer - - agent = getattr(args, "agent", None) if args else None - agent_cmd = getattr(args, "agent_cmd", None) if args else None - enhancer = LocalSkillEnhancer(Path(skill_dir), agent=agent, agent_cmd=agent_cmd) - enhancer.run(headless=True) - print("āœ… Local enhancement complete!") - - except RuntimeError as e: - print(f"\nāŒ Error: {e}", file=sys.stderr) - sys.exit(1) - except Exception as e: - print(f"\nāŒ Unexpected error during feed processing: {e}", file=sys.stderr) - import traceback - - traceback.print_exc() - sys.exit(1) - - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/src/skill_seekers/cli/skill_converter.py b/src/skill_seekers/cli/skill_converter.py new file mode 100644 index 0000000..4f30e64 --- /dev/null +++ b/src/skill_seekers/cli/skill_converter.py @@ -0,0 +1,115 @@ +""" +SkillConverter — Base interface for all source type converters. + +Every scraper/converter inherits this and implements extract(). +The create command calls converter.run() — same interface for all 18 types. + +Usage: + converter = get_converter("web", config) + converter.run() # extract + build + return exit code +""" + +import logging +from typing import Any + +logger = logging.getLogger(__name__) + + +class SkillConverter: + """Base interface for all skill converters. + + Subclasses must implement extract() at minimum. + build_skill() has a default implementation that most converters override. 
+ """ + + # Override in subclass + SOURCE_TYPE: str = "unknown" + + def __init__(self, config: dict[str, Any]): + self.config = config + self.name = config.get("name", "unnamed") + self.skill_dir = f"output/{self.name}" + + def run(self) -> int: + """Main entry point — extract source and build skill. + + Returns: + Exit code (0 for success, non-zero for failure). + """ + try: + logger.info(f"Extracting from {self.SOURCE_TYPE} source: {self.name}") + self.extract() + result = self.build_skill() + if result is False: + logger.error(f"āŒ {self.SOURCE_TYPE} build_skill() reported failure") + return 1 + logger.info(f"āœ… Skill built: {self.skill_dir}/") + return 0 + except Exception as e: + logger.exception(f"āŒ {self.SOURCE_TYPE} extraction failed: {e}") + return 1 + + def extract(self): + """Extract content from source. Override in subclass.""" + raise NotImplementedError(f"{self.__class__.__name__} must implement extract()") + + def build_skill(self): + """Build SKILL.md from extracted data. 
Override in subclass.""" + raise NotImplementedError(f"{self.__class__.__name__} must implement build_skill()") + + +# Registry mapping source type → (module_path, class_name) +CONVERTER_REGISTRY: dict[str, tuple[str, str]] = { + "web": ("skill_seekers.cli.doc_scraper", "DocToSkillConverter"), + "github": ("skill_seekers.cli.github_scraper", "GitHubScraper"), + "pdf": ("skill_seekers.cli.pdf_scraper", "PDFToSkillConverter"), + "word": ("skill_seekers.cli.word_scraper", "WordToSkillConverter"), + "epub": ("skill_seekers.cli.epub_scraper", "EpubToSkillConverter"), + "video": ("skill_seekers.cli.video_scraper", "VideoToSkillConverter"), + "local": ("skill_seekers.cli.codebase_scraper", "CodebaseAnalyzer"), + "jupyter": ("skill_seekers.cli.jupyter_scraper", "JupyterToSkillConverter"), + "html": ("skill_seekers.cli.html_scraper", "HtmlToSkillConverter"), + "openapi": ("skill_seekers.cli.openapi_scraper", "OpenAPIToSkillConverter"), + "asciidoc": ("skill_seekers.cli.asciidoc_scraper", "AsciiDocToSkillConverter"), + "pptx": ("skill_seekers.cli.pptx_scraper", "PptxToSkillConverter"), + "rss": ("skill_seekers.cli.rss_scraper", "RssToSkillConverter"), + "manpage": ("skill_seekers.cli.man_scraper", "ManPageToSkillConverter"), + "confluence": ("skill_seekers.cli.confluence_scraper", "ConfluenceToSkillConverter"), + "notion": ("skill_seekers.cli.notion_scraper", "NotionToSkillConverter"), + "chat": ("skill_seekers.cli.chat_scraper", "ChatToSkillConverter"), + # NOTE: UnifiedScraper takes (config_path: str), not (config: dict). + # Callers must construct it directly, not via get_converter(). + "config": ("skill_seekers.cli.unified_scraper", "UnifiedScraper"), +} + + +def get_converter(source_type: str, config: dict[str, Any]) -> SkillConverter: + """Get the appropriate converter for a source type. + + Args: + source_type: Source type from SourceDetector (web, github, pdf, etc.) + config: Configuration dict for the converter. + + Returns: + Initialized converter instance. 
+ + Raises: + ValueError: If source type is not supported. + """ + import importlib + + if source_type not in CONVERTER_REGISTRY: + raise ValueError( + f"Unknown source type: {source_type}. " + f"Supported: {', '.join(sorted(CONVERTER_REGISTRY))}" + ) + + module_path, class_name = CONVERTER_REGISTRY[source_type] + module = importlib.import_module(module_path) + converter_class = getattr(module, class_name, None) + if converter_class is None: + raise ValueError( + f"Class '{class_name}' not found in module '{module_path}'. " + f"Check CONVERTER_REGISTRY entry for '{source_type}'." + ) + return converter_class(config) diff --git a/src/skill_seekers/cli/unified_scraper.py b/src/skill_seekers/cli/unified_scraper.py index 76894b4..38a90ca 100644 --- a/src/skill_seekers/cli/unified_scraper.py +++ b/src/skill_seekers/cli/unified_scraper.py @@ -12,7 +12,6 @@ Usage: skill-seekers unified --config configs/react_unified.json --merge-mode ai-enhanced """ -import argparse import json import logging import os @@ -28,8 +27,8 @@ try: from skill_seekers.cli.config_validator import validate_config from skill_seekers.cli.conflict_detector import ConflictDetector from skill_seekers.cli.merge_sources import AIEnhancedMerger, RuleBasedMerger + from skill_seekers.cli.skill_converter import SkillConverter from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder - from skill_seekers.cli.utils import setup_logging except ImportError as e: print(f"Error importing modules: {e}") print("Make sure you're running from the project root directory") @@ -38,7 +37,7 @@ except ImportError as e: logger = logging.getLogger(__name__) -class UnifiedScraper: +class UnifiedScraper(SkillConverter): """ Orchestrates multi-source scraping and merging. @@ -50,6 +49,8 @@ class UnifiedScraper: 5. Build unified skill """ + SOURCE_TYPE = "config" + def __init__(self, config_path: str, merge_mode: str | None = None): """ Initialize unified scraper. 
@@ -58,6 +59,7 @@ class UnifiedScraper: config_path: Path to unified config JSON merge_mode: Override config merge_mode ('rule-based' or 'claude-enhanced') """ + super().__init__({"name": "unified", "config_path": config_path}) self.config_path = config_path # Validate and load config @@ -157,6 +159,42 @@ class UnifiedScraper: logger.info(f"šŸ“ Logging to: {log_file}") logger.info(f"šŸ—‚ļø Cache directory: {self.cache_dir}") + @staticmethod + def _enrich_docs_json(docs_json: dict, data_file_path: str) -> dict: + """Enrich docs summary with page content from individual page files. + + summary.json only has {title, url} per page; full content lives in pages/*.json. + ConflictDetector needs content to extract APIs, so we load page files and convert + to the dict format {url: page_data} that the detector's dict branch understands. + """ + pages = docs_json.get("pages", []) + if not isinstance(pages, list) or not pages or "content" in pages[0]: + return docs_json + + pages_dir = os.path.join(os.path.dirname(data_file_path), "pages") + if not os.path.isdir(pages_dir): + return docs_json + + enriched_pages = {} + for page_file in os.listdir(pages_dir): + if page_file.endswith(".json"): + try: + with open(os.path.join(pages_dir, page_file), encoding="utf-8") as pf: + page_data = json.load(pf) + url = page_data.get("url", "") + if url: + enriched_pages[url] = page_data + except (json.JSONDecodeError, OSError): + continue + + if enriched_pages: + docs_json = {**docs_json, "pages": enriched_pages} + logger.info( + f"Enriched docs data with {len(enriched_pages)} page files for API extraction" + ) + + return docs_json + def scrape_all_sources(self): """ Scrape all configured sources. 
@@ -259,50 +297,41 @@ class UnifiedScraper: "sources": [doc_source], } - # Write temporary config - temp_config_path = os.path.join(self.data_dir, "temp_docs_config.json") - with open(temp_config_path, "w", encoding="utf-8") as f: - json.dump(doc_config, f, indent=2) - - # Run doc_scraper as subprocess + # Run doc_scraper directly (no subprocess needed with ExecutionContext) logger.info(f"Scraping documentation from {source['base_url']}") - doc_scraper_path = Path(__file__).parent / "doc_scraper.py" - cmd = [sys.executable, str(doc_scraper_path), "--config", temp_config_path, "--fresh"] - - # Forward agent-related CLI args so doc scraper enhancement respects - # the user's chosen agent instead of defaulting to claude. - cli_args = getattr(self, "_cli_args", None) - if cli_args is not None: - if getattr(cli_args, "agent", None): - cmd.extend(["--agent", cli_args.agent]) - if getattr(cli_args, "agent_cmd", None): - cmd.extend(["--agent-cmd", cli_args.agent_cmd]) - if getattr(cli_args, "api_key", None): - cmd.extend(["--api-key", cli_args.api_key]) - # Support "browser": true in source config for JavaScript SPA sites if source.get("browser", False): - cmd.append("--browser") + doc_config["browser"] = True logger.info(" 🌐 Browser mode enabled (JavaScript rendering via Playwright)") + # Import and call directly try: - result = subprocess.run( - cmd, capture_output=True, text=True, stdin=subprocess.DEVNULL, timeout=3600 + from skill_seekers.cli.doc_scraper import scrape_documentation + from skill_seekers.cli.execution_context import ExecutionContext + + # Create child context with doc-specific overrides + doc_ctx = ExecutionContext.get().override( + output__name=f"{self.name}_docs", + scraping__max_pages=source.get("max_pages", 500), ) - except subprocess.TimeoutExpired: - logger.error("Documentation scraping timed out after 60 minutes") - return - if result.returncode != 0: - logger.error(f"Documentation scraping failed with return code {result.returncode}") - 
logger.error(f"STDERR: {result.stderr}") - logger.error(f"STDOUT: {result.stdout}") - return + with doc_ctx: + result = scrape_documentation( + config=doc_config, + ctx=ExecutionContext.get(), + ) - # Log subprocess output for debugging - if result.stdout: - logger.info(f"Doc scraper output: {result.stdout[-500:]}") # Last 500 chars + if result != 0: + logger.error(f"Documentation scraping failed with return code {result}") + return + + except Exception as e: + logger.error(f"Documentation scraping failed: {e}") + import traceback + + logger.debug(f"Traceback: {traceback.format_exc()}") + return # Load scraped data docs_data_file = f"output/{doc_config['name']}_data/summary.json" @@ -327,10 +356,6 @@ class UnifiedScraper: else: logger.warning("Documentation data file not found") - # Clean up temp config - if os.path.exists(temp_config_path): - os.remove(temp_config_path) - # Move intermediate files to cache to keep output/ clean docs_output_dir = f"output/{doc_config['name']}" docs_data_dir = f"output/{doc_config['name']}_data" @@ -1704,13 +1729,17 @@ class UnifiedScraper: docs_data = docs_list[0] github_data = github_list[0] - # Load data files + # Load data files (cached for reuse in merge_sources) with open(docs_data["data_file"], encoding="utf-8") as f: docs_json = json.load(f) + docs_json = self._enrich_docs_json(docs_json, docs_data["data_file"]) with open(github_data["data_file"], encoding="utf-8") as f: github_json = json.load(f) + self._cached_docs_json = docs_json + self._cached_github_json = github_json + # Detect conflicts detector = ConflictDetector(docs_json, github_json) conflicts = detector.detect_all_conflicts() @@ -1758,15 +1787,18 @@ class UnifiedScraper: logger.warning("Missing documentation or GitHub data for merging") return None - docs_data = docs_list[0] - github_data = github_list[0] + # Reuse cached data from detect_conflicts() to avoid redundant disk I/O + docs_json = getattr(self, "_cached_docs_json", None) + github_json = getattr(self, 
"_cached_github_json", None) - # Load data - with open(docs_data["data_file"], encoding="utf-8") as f: - docs_json = json.load(f) - - with open(github_data["data_file"], encoding="utf-8") as f: - github_json = json.load(f) + if docs_json is None or github_json is None: + docs_data = docs_list[0] + github_data = github_list[0] + with open(docs_data["data_file"], encoding="utf-8") as f: + docs_json = json.load(f) + docs_json = self._enrich_docs_json(docs_json, docs_data["data_file"]) + with open(github_data["data_file"], encoding="utf-8") as f: + github_json = json.load(f) # Choose merger if self.merge_mode in ("ai-enhanced", "claude-enhanced"): @@ -1786,6 +1818,10 @@ class UnifiedScraper: return merged_data + def extract(self): + """SkillConverter interface — delegates to scrape_all_sources().""" + self.scrape_all_sources() + def build_skill(self, merged_data: dict | None = None): """ Build final unified skill. @@ -1892,17 +1928,28 @@ class UnifiedScraper: run_workflows(effective_args, context=unified_context) # Phase 6: AI Enhancement of SKILL.md - # Triggered by config "enhancement" block or CLI --enhance-level + # Read from ExecutionContext first (has correct priority resolution), + # fall back to raw config dict for backward compatibility. 
enhancement_config = self.config.get("enhancement", {}) - enhancement_enabled = enhancement_config.get("enabled", False) - enhancement_level = enhancement_config.get("level", 0) - enhancement_mode = enhancement_config.get("mode", "AUTO").upper() + try: + from skill_seekers.cli.execution_context import ExecutionContext - # CLI --enhance-level overrides config - cli_enhance_level = getattr(args, "enhance_level", None) if args is not None else None - if cli_enhance_level is not None: - enhancement_enabled = cli_enhance_level > 0 - enhancement_level = cli_enhance_level + ctx = ExecutionContext.get() + enhancement_enabled = ctx.enhancement.enabled + enhancement_level = ctx.enhancement.level + enhancement_mode = ctx.enhancement.mode.upper() + except (RuntimeError, Exception): + # Fallback to raw config + args + enhancement_enabled = enhancement_config.get("enabled", False) + enhancement_level = enhancement_config.get("level", 0) + enhancement_mode = enhancement_config.get("mode", "AUTO").upper() + + cli_enhance_level = ( + getattr(args, "enhance_level", None) if args is not None else None + ) + if cli_enhance_level is not None: + enhancement_enabled = cli_enhance_level > 0 + enhancement_level = cli_enhance_level if enhancement_enabled and enhancement_level > 0: logger.info("\n" + "=" * 60) @@ -1918,16 +1965,19 @@ class UnifiedScraper: try: from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer - # Get agent from CLI args, config enhancement block, or env var - agent = None - agent_cmd = None - if args is not None: - agent = getattr(args, "agent", None) - agent_cmd = getattr(args, "agent_cmd", None) - if not agent: - agent = enhancement_config.get("agent", None) - if not agent: - agent = os.environ.get("SKILL_SEEKER_AGENT", "").strip() or None + # Get agent from ExecutionContext (already resolved with correct priority) + try: + ctx = ExecutionContext.get() + agent = ctx.enhancement.agent + agent_cmd = ctx.enhancement.agent_cmd + except (RuntimeError, 
Exception): + agent = None + agent_cmd = None + if args is not None: + agent = getattr(args, "agent", None) + agent_cmd = getattr(args, "agent_cmd", None) + if not agent: + agent = os.environ.get("SKILL_SEEKER_AGENT", "").strip() or None # Read timeout from config enhancement block timeout_val = enhancement_config.get("timeout") @@ -2016,183 +2066,14 @@ class UnifiedScraper: logger.info(f"šŸ“ Output: {self.output_dir}/") logger.info(f"šŸ“ Data: {self.data_dir}/") + return 0 + except KeyboardInterrupt: logger.info("\n\nāš ļø Scraping interrupted by user") - sys.exit(1) + return 130 except Exception as e: logger.error(f"\n\nāŒ Error during scraping: {e}") import traceback traceback.print_exc() - sys.exit(1) - - -def main(): - """Main entry point.""" - parser = argparse.ArgumentParser( - description="Unified multi-source scraper", - formatter_class=argparse.RawDescriptionHelpFormatter, - epilog=""" -Examples: - # Basic usage with unified config - skill-seekers unified --config configs/godot_unified.json - - # Override merge mode - skill-seekers unified --config configs/react_unified.json --merge-mode ai-enhanced - - # Backward compatible with legacy configs - skill-seekers unified --config configs/react.json - """, - ) - - parser.add_argument("--config", "-c", required=True, help="Path to unified config JSON file") - parser.add_argument( - "--merge-mode", - "-m", - choices=["rule-based", "ai-enhanced", "claude-enhanced"], - help="Override config merge mode (ai-enhanced or rule-based). 
'claude-enhanced' accepted as alias.", - ) - parser.add_argument( - "--skip-codebase-analysis", - action="store_true", - help="Skip C3.x codebase analysis for GitHub sources (default: enabled)", - ) - parser.add_argument( - "--fresh", - action="store_true", - help="Clear any existing data and start fresh (ignore checkpoints)", - ) - parser.add_argument( - "--dry-run", - action="store_true", - help="Preview what will be scraped without actually scraping", - ) - # Enhancement Workflow arguments (mirrors scrape/github/pdf/codebase scrapers) - parser.add_argument( - "--enhance-workflow", - action="append", - dest="enhance_workflow", - help="Apply enhancement workflow (file path or preset). Can use multiple times to chain workflows.", - metavar="WORKFLOW", - ) - parser.add_argument( - "--enhance-stage", - action="append", - dest="enhance_stage", - help="Add inline enhancement stage (format: 'name:prompt'). Can be used multiple times.", - metavar="STAGE", - ) - parser.add_argument( - "--var", - action="append", - dest="var", - help="Override workflow variable (format: 'key=value'). Can be used multiple times.", - metavar="VAR", - ) - parser.add_argument( - "--workflow-dry-run", - action="store_true", - dest="workflow_dry_run", - help="Preview workflow stages without executing (requires --enhance-workflow)", - ) - parser.add_argument( - "--api-key", - type=str, - metavar="KEY", - help="Anthropic API key (or set ANTHROPIC_API_KEY env var)", - ) - parser.add_argument( - "--enhance-level", - type=int, - choices=[0, 1, 2, 3], - default=None, - metavar="LEVEL", - help=( - "Global AI enhancement level override for all sources " - "(0=off, 1=SKILL.md, 2=+arch/config, 3=full). " - "Overrides per-source enhance_level in config." 
- ), - ) - parser.add_argument( - "--agent", - type=str, - choices=["claude", "codex", "copilot", "opencode", "kimi", "custom"], - metavar="AGENT", - help="Local coding agent for enhancement (default: AI agent from SKILL_SEEKER_AGENT env var)", - ) - parser.add_argument( - "--agent-cmd", - type=str, - metavar="CMD", - help="Override agent command template (advanced)", - ) - - args = parser.parse_args() - setup_logging() - - # Create scraper - scraper = UnifiedScraper(args.config, args.merge_mode) - - # Disable codebase analysis if requested - if args.skip_codebase_analysis: - for source in scraper.config.get("sources", []): - if source["type"] == "github": - source["enable_codebase_analysis"] = False - logger.info( - f"ā­ļø Skipping codebase analysis for GitHub source: {source.get('repo', 'unknown')}" - ) - - # Handle --fresh flag (clear cache) - if args.fresh: - import shutil - - if os.path.exists(scraper.cache_dir): - logger.info(f"🧹 Clearing cache: {scraper.cache_dir}") - shutil.rmtree(scraper.cache_dir) - # Recreate directories - os.makedirs(scraper.sources_dir, exist_ok=True) - os.makedirs(scraper.data_dir, exist_ok=True) - os.makedirs(scraper.repos_dir, exist_ok=True) - os.makedirs(scraper.logs_dir, exist_ok=True) - - # Handle --dry-run flag - if args.dry_run: - logger.info("šŸ” DRY RUN MODE - Preview only, no scraping will occur") - logger.info(f"\nWould scrape {len(scraper.config.get('sources', []))} sources:") - # Source type display config: type -> (label, key for detail) - _SOURCE_DISPLAY = { - "documentation": ("Documentation", "base_url"), - "github": ("GitHub", "repo"), - "pdf": ("PDF", "path"), - "word": ("Word", "path"), - "epub": ("EPUB", "path"), - "video": ("Video", "url"), - "local": ("Local Codebase", "path"), - "jupyter": ("Jupyter Notebook", "path"), - "html": ("HTML", "path"), - "openapi": ("OpenAPI Spec", "path"), - "asciidoc": ("AsciiDoc", "path"), - "pptx": ("PowerPoint", "path"), - "confluence": ("Confluence", "base_url"), - "notion": 
("Notion", "page_id"), - "rss": ("RSS/Atom Feed", "url"), - "manpage": ("Man Page", "names"), - "chat": ("Chat Export", "path"), - } - for idx, source in enumerate(scraper.config.get("sources", []), 1): - source_type = source.get("type", "unknown") - label, key = _SOURCE_DISPLAY.get(source_type, (source_type.title(), "path")) - detail = source.get(key, "N/A") - if isinstance(detail, list): - detail = ", ".join(str(d) for d in detail) - logger.info(f" {idx}. {label}: {detail}") - logger.info(f"\nOutput directory: {scraper.output_dir}") - logger.info(f"Merge mode: {scraper.merge_mode}") - return - - # Run scraper (pass args for workflow integration) - scraper.run(args=args) - - -if __name__ == "__main__": - main() + return 1 diff --git a/src/skill_seekers/cli/unified_skill_builder.py b/src/skill_seekers/cli/unified_skill_builder.py index 28a29cd..7bcd532 100644 --- a/src/skill_seekers/cli/unified_skill_builder.py +++ b/src/skill_seekers/cli/unified_skill_builder.py @@ -15,6 +15,7 @@ discrepancies transparently. import json import logging import os +import re import shutil from pathlib import Path @@ -532,12 +533,49 @@ This skill synthesizes knowledge from multiple sources: logger.warning("No source SKILL.md files found, generating minimal SKILL.md (legacy)") content = self._generate_minimal_skill_md() + # Ensure frontmatter uses config name/description, not auto-generated slugs + content = self._normalize_frontmatter(content) + # Write final content with open(skill_path, "w", encoding="utf-8") as f: f.write(content) logger.info(f"Created SKILL.md ({len(content)} chars, ~{len(content.split())} words)") + def _normalize_frontmatter(self, content: str) -> str: + """Ensure SKILL.md frontmatter uses the config name and description. + + Standalone source SKILL.md files may have auto-generated slugs + (e.g., 'primetween-github-0-kyrylokuzyk-primetween'). This replaces + the name and description with the canonical values from the config. 
+ """ + if not content.startswith("---"): + return content + + end = content.find("---", 3) + if end == -1: + return content + + frontmatter = content[3:end] + body = content[end + 3 :] + + canonical_name = self.name.lower().replace("_", "-").replace(" ", "-")[:64] + frontmatter = re.sub( + r"^name:.*$", f"name: {canonical_name}", frontmatter, count=1, flags=re.MULTILINE + ) + + # Handle both single-line and multiline YAML description values + desc = self.description[:1024] if len(self.description) > 1024 else self.description + frontmatter = re.sub( + r"^description:.*(?:\n[ \t]+.*)*$", + f"description: {desc}", + frontmatter, + count=1, + flags=re.MULTILINE, + ) + + return f"---{frontmatter}---{body}" + def _synthesize_docs_pdf(self, skill_mds: dict[str, str]) -> str: """Synthesize documentation + PDF sources. @@ -958,8 +996,8 @@ This skill combines knowledge from multiple sources: def _format_api_entry(self, api_data: dict, inline_conflict: bool = False) -> str: """Format a single API entry.""" name = api_data.get("name", "Unknown") - signature = api_data.get("merged_signature", name) - description = api_data.get("merged_description", "") + signature = api_data.get("merged_signature", api_data.get("signature", name)) + description = api_data.get("merged_description", api_data.get("description", "")) warning = api_data.get("warning", "") entry = f"#### `{signature}`\n\n" @@ -1302,7 +1340,7 @@ This skill combines knowledge from multiple sources: apis = self.merged_data.get("apis", {}) for api_name in sorted(apis.keys()): - api_data = apis[api_name] + api_data = {**apis[api_name], "name": api_name} entry = self._format_api_entry(api_data, inline_conflict=True) f.write(entry) @@ -1386,16 +1424,33 @@ This skill combines knowledge from multiple sources: if c3_data.get("architecture"): languages = c3_data["architecture"].get("languages", {}) - # If no languages from C3.7, try to get from GitHub data - # github_data already available from method scope - if not languages 
and github_data.get("languages"): - # GitHub data has languages as list, convert to dict with count 1 - languages = dict.fromkeys(github_data["languages"], 1) + # If no languages from C3.7, try to get from code_analysis or GitHub data + if not languages: + code_analysis = github_data.get("code_analysis", {}) + if code_analysis.get("files_analyzed") and code_analysis.get("languages_analyzed"): + # Use code_analysis file counts per language + files = code_analysis.get("files", []) + lang_counts = {} + for file_info in files: + lang = file_info.get("language", "Unknown") + lang_counts[lang] = lang_counts.get(lang, 0) + 1 + if lang_counts: + languages = lang_counts + else: + # Fallback: total count attributed to primary language + for lang in code_analysis["languages_analyzed"]: + languages[lang] = code_analysis["files_analyzed"] + elif github_data.get("languages"): + gh_langs = github_data["languages"] + if isinstance(gh_langs, dict): + languages = dict.fromkeys(gh_langs, 0) + elif isinstance(gh_langs, list): + languages = dict.fromkeys(gh_langs, 0) if languages: f.write("**Languages Detected**:\n") for lang, count in sorted(languages.items(), key=lambda x: x[1], reverse=True)[:5]: - if isinstance(count, int): + if isinstance(count, int) and count > 0: f.write(f"- {lang}: {count} files\n") else: f.write(f"- {lang}\n") @@ -1534,6 +1589,24 @@ This skill combines knowledge from multiple sources: logger.info("šŸ“ Created ARCHITECTURE.md") + @staticmethod + def _make_path_relative(file_path: str) -> str: + """Strip absolute path prefixes, keeping only the repo-relative path.""" + # Strip .skillseeker-cache repo clone paths + if ".skillseeker-cache" in file_path: + # Pattern: ...repos/{idx}_{owner}_{repo}/relative/path + parts = file_path.split("/repos/") + if len(parts) > 1: + # Skip the repo dir name (e.g., '0_Owner_Repo/') + remainder = parts[1] + slash_idx = remainder.find("/") + if slash_idx != -1: + return remainder[slash_idx + 1 :] + # Generic: if it looks absolute, 
try to make it relative + if os.path.isabs(file_path): + return os.path.basename(file_path) + return file_path + def _generate_pattern_references(self, c3_dir: str, patterns_data: dict): """Generate design pattern references (C3.1).""" if not patterns_data: @@ -1556,7 +1629,8 @@ This skill combines knowledge from multiple sources: for file_data in patterns_data: patterns = file_data.get("patterns", []) if patterns: - f.write(f"## {file_data['file_path']}\n\n") + display_path = self._make_path_relative(file_data["file_path"]) + f.write(f"## {display_path}\n\n") for p in patterns: f.write(f"### {p['pattern_type']}\n\n") if p.get("class_name"): diff --git a/src/skill_seekers/cli/video_scraper.py b/src/skill_seekers/cli/video_scraper.py index 6dd4956..0f69483 100644 --- a/src/skill_seekers/cli/video_scraper.py +++ b/src/skill_seekers/cli/video_scraper.py @@ -14,14 +14,13 @@ Usage: python3 video_scraper.py --from-json video_extracted.json """ -import argparse import json import logging import os import re -import sys import time +from skill_seekers.cli.skill_converter import SkillConverter from skill_seekers.cli.video_models import ( AudioVisualAlignment, TextGroupTimeline, @@ -318,9 +317,11 @@ def _ai_clean_reference(ref_path: str, content: str, api_key: str | None = None) # ============================================================================= -class VideoToSkillConverter: +class VideoToSkillConverter(SkillConverter): """Convert video content to AI skill.""" + SOURCE_TYPE = "video" + def __init__(self, config: dict): """Initialize converter. 
@@ -333,6 +334,7 @@ class VideoToSkillConverter: - visual: Whether to enable visual extraction - whisper_model: Whisper model size """ + super().__init__(config) self.config = config self.name = config["name"] self.description = config.get("description", "") @@ -355,6 +357,10 @@ class VideoToSkillConverter: # Results self.result: VideoScraperResult | None = None + def extract(self): + """Extract content from video source (SkillConverter interface).""" + self.process() + def process(self) -> VideoScraperResult: """Run the full video processing pipeline. @@ -1015,241 +1021,3 @@ class VideoToSkillConverter: lines.append(f"- [{video.title}](references/{ref_filename})") return "\n".join(lines) - - -# ============================================================================= -# CLI Entry Point -# ============================================================================= - - -def main() -> int: - """Entry point for video scraper CLI. - - Returns: - Exit code (0 for success, non-zero for error). - """ - from skill_seekers.cli.arguments.video import add_video_arguments - - parser = argparse.ArgumentParser( - prog="skill-seekers-video", - description="Extract transcripts and metadata from videos and generate skill", - formatter_class=argparse.RawDescriptionHelpFormatter, - epilog="""\ -Examples: - skill-seekers video --url https://www.youtube.com/watch?v=... - skill-seekers video --video-file recording.mp4 - skill-seekers video --playlist https://www.youtube.com/playlist?list=... - skill-seekers video --from-json video_extracted.json - skill-seekers video --url https://youtu.be/... 
--languages en,es -""", - ) - - add_video_arguments(parser) - args = parser.parse_args() - - # --setup: run GPU detection + dependency installation, then exit - if getattr(args, "setup", False): - from skill_seekers.cli.video_setup import run_setup - - return run_setup(interactive=True) - - # Setup logging - log_level = logging.DEBUG if args.verbose else (logging.WARNING if args.quiet else logging.INFO) - logging.basicConfig(level=log_level, format="%(levelname)s: %(message)s") - - # Validate inputs - has_source = any( - [ - getattr(args, "url", None), - getattr(args, "video_file", None), - getattr(args, "playlist", None), - ] - ) - has_json = getattr(args, "from_json", None) - - if not has_source and not has_json: - parser.error("Must specify --url, --video-file, --playlist, or --from-json") - - # Parse and validate time clipping - raw_start = getattr(args, "start_time", None) - raw_end = getattr(args, "end_time", None) - clip_start: float | None = None - clip_end: float | None = None - - if raw_start is not None: - try: - clip_start = parse_time_to_seconds(raw_start) - except ValueError as exc: - parser.error(f"--start-time: {exc}") - if raw_end is not None: - try: - clip_end = parse_time_to_seconds(raw_end) - except ValueError as exc: - parser.error(f"--end-time: {exc}") - - if clip_start is not None or clip_end is not None: - if getattr(args, "playlist", None): - parser.error("--start-time/--end-time cannot be used with --playlist") - if clip_start is not None and clip_end is not None and clip_start >= clip_end: - parser.error(f"--start-time ({clip_start}s) must be before --end-time ({clip_end}s)") - - # Build config - config = { - "name": args.name or "video_skill", - "description": getattr(args, "description", None) or "", - "output": getattr(args, "output", None), - "url": getattr(args, "url", None), - "video_file": getattr(args, "video_file", None), - "playlist": getattr(args, "playlist", None), - "languages": getattr(args, "languages", "en"), - "visual": 
getattr(args, "visual", False), - "whisper_model": getattr(args, "whisper_model", "base"), - "visual_interval": getattr(args, "visual_interval", 0.7), - "visual_min_gap": getattr(args, "visual_min_gap", 0.5), - "visual_similarity": getattr(args, "visual_similarity", 3.0), - "vision_ocr": getattr(args, "vision_ocr", False), - "start_time": clip_start, - "end_time": clip_end, - } - - converter = VideoToSkillConverter(config) - - # Dry run - if args.dry_run: - logger.info("DRY RUN — would process:") - for key in ["url", "video_file", "playlist"]: - if config.get(key): - logger.info(f" {key}: {config[key]}") - logger.info(f" name: {config['name']}") - logger.info(f" languages: {config['languages']}") - logger.info(f" visual: {config['visual']}") - if clip_start is not None or clip_end is not None: - start_str = _format_duration(clip_start) if clip_start is not None else "start" - end_str = _format_duration(clip_end) if clip_end is not None else "end" - logger.info(f" clip range: {start_str} - {end_str}") - return 0 - - # Workflow 1: Build from JSON - if has_json: - logger.info(f"Loading extracted data from {args.from_json}") - converter.load_extracted_data(args.from_json) - converter.build_skill() - logger.info(f"Skill built at {converter.skill_dir}") - return 0 - - # Workflow 2: Full extraction + build - try: - result = converter.process() - if not result.videos: - logger.error("No videos were successfully processed") - if result.errors: - for err in result.errors: - logger.error(f" {err['source']}: {err['error']}") - return 1 - - converter.save_extracted_data() - converter.build_skill() - - logger.info(f"\nSkill built successfully at {converter.skill_dir}") - logger.info(f" Videos: {len(result.videos)}") - logger.info(f" Segments: {result.total_segments}") - logger.info(f" Duration: {_format_duration(result.total_duration_seconds)}") - logger.info(f" Processing time: {result.processing_time_seconds:.1f}s") - - if result.warnings: - for w in result.warnings: - 
logger.warning(f" {w}") - - except RuntimeError as e: - logger.error(str(e)) - return 1 - - # Enhancement - enhance_level = getattr(args, "enhance_level", 0) - if enhance_level > 0: - # Pass 1: Clean reference files (Code Timeline reconstruction) - converter._enhance_reference_files(enhance_level, args) - - # Auto-inject video-tutorial workflow if no workflow specified - if not getattr(args, "enhance_workflow", None): - args.enhance_workflow = ["video-tutorial"] - - # Pass 2: Run workflow stages (specialized video analysis) - try: - from skill_seekers.cli.workflow_runner import run_workflows - - video_context = { - "skill_name": converter.name, - "skill_dir": converter.skill_dir, - "source_type": "video_tutorial", - } - run_workflows(args, context=video_context) - except ImportError: - logger.debug("Workflow runner not available, skipping workflow stages") - - # Run traditional SKILL.md enhancement (reads references + rewrites) - _run_video_enhancement(converter.skill_dir, enhance_level, args) - - return 0 - - -def _run_video_enhancement(skill_dir: str, enhance_level: int, args) -> None: - """Run traditional SKILL.md enhancement with video-aware prompt. - - This calls the same SkillEnhancer used by other scrapers, but the prompt - auto-detects video_tutorial source type and uses a video-specific prompt. 
- """ - import os - import subprocess - - has_api_key = bool( - os.environ.get("ANTHROPIC_API_KEY") - or os.environ.get("ANTHROPIC_AUTH_TOKEN") - or getattr(args, "api_key", None) - or os.environ.get("MOONSHOT_API_KEY") - ) - - agent = getattr(args, "agent", None) - - if not has_api_key and not agent: - logger.info("\nšŸ’” Enhance your video skill with AI:") - logger.info(f" export ANTHROPIC_API_KEY=sk-ant-...") - logger.info(f" skill-seekers enhance {skill_dir} --enhance-level {enhance_level}") - return - - logger.info(f"\nšŸ¤– Running video-aware SKILL.md enhancement (level {enhance_level})...") - - try: - enhance_cmd = ["skill-seekers-enhance", skill_dir] - api_key = getattr(args, "api_key", None) - if api_key: - enhance_cmd.extend(["--api-key", api_key]) - if agent: - enhance_cmd.extend(["--agent", agent]) - - logger.info( - "Starting video skill enhancement (this may take 10+ minutes " - "for large videos with AI enhancement)..." - ) - subprocess.run(enhance_cmd, check=True, timeout=1800) - logger.info("Video skill enhancement complete!") - except subprocess.TimeoutExpired: - logger.warning( - "⚠ Enhancement timed out after 30 minutes. " - "The skill was still built without enhancement. " - "You can retry manually with:\n" - f" skill-seekers enhance {skill_dir} --enhance-level {enhance_level}" - ) - except subprocess.CalledProcessError as exc: - logger.warning( - f"⚠ Enhancement failed (exit code {exc.returncode}), " - "but skill was still built. You can retry manually with:\n" - f" skill-seekers enhance {skill_dir} --enhance-level {enhance_level}" - ) - except FileNotFoundError: - logger.warning("⚠ skill-seekers-enhance not found. 
Run manually:") - logger.info(f" skill-seekers enhance {skill_dir} --enhance-level {enhance_level}") - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/src/skill_seekers/cli/word_scraper.py b/src/skill_seekers/cli/word_scraper.py index 23866c2..a6248ec 100644 --- a/src/skill_seekers/cli/word_scraper.py +++ b/src/skill_seekers/cli/word_scraper.py @@ -10,12 +10,10 @@ Usage: python3 word_scraper.py --from-json document_extracted.json """ -import argparse import json import logging import os import re -import sys from pathlib import Path # Optional dependency guard @@ -27,6 +25,8 @@ try: except ImportError: WORD_AVAILABLE = False +from .skill_converter import SkillConverter + logger = logging.getLogger(__name__) @@ -72,10 +72,13 @@ def infer_description_from_word(metadata: dict = None, name: str = "") -> str: ) -class WordToSkillConverter: +class WordToSkillConverter(SkillConverter): """Convert Word document (.docx) to AI skill.""" + SOURCE_TYPE = "word" + def __init__(self, config): + super().__init__(config) self.config = config self.name = config["name"] self.docx_path = config.get("docx_path", "") @@ -93,6 +96,10 @@ class WordToSkillConverter: # Extracted data self.extracted_data = None + def extract(self): + """SkillConverter interface — delegates to extract_docx().""" + return self.extract_docx() + def extract_docx(self): """Extract content from Word document using mammoth + python-docx. 
@@ -918,146 +925,3 @@ def _score_code_quality(code: str) -> float: score -= 2.0 return min(10.0, max(0.0, score)) - - -def main(): - from .arguments.word import add_word_arguments - - parser = argparse.ArgumentParser( - description="Convert Word document (.docx) to AI skill", - formatter_class=argparse.RawDescriptionHelpFormatter, - ) - - add_word_arguments(parser) - - args = parser.parse_args() - - # Set logging level - if getattr(args, "quiet", False): - logging.getLogger().setLevel(logging.WARNING) - elif getattr(args, "verbose", False): - logging.getLogger().setLevel(logging.DEBUG) - - # Handle --dry-run - if getattr(args, "dry_run", False): - source = getattr(args, "docx", None) or getattr(args, "from_json", None) or "(none)" - print(f"\n{'=' * 60}") - print("DRY RUN: Word Document Extraction") - print(f"{'=' * 60}") - print(f"Source: {source}") - print(f"Name: {getattr(args, 'name', None) or '(auto-detect)'}") - print(f"Enhance level: {getattr(args, 'enhance_level', 0)}") - print(f"\nāœ… Dry run complete") - return 0 - - # Validate inputs - if not (getattr(args, "docx", None) or getattr(args, "from_json", None)): - parser.error("Must specify --docx or --from-json") - - # Build from JSON workflow - if getattr(args, "from_json", None): - name = Path(args.from_json).stem.replace("_extracted", "") - config = { - "name": getattr(args, "name", None) or name, - "description": getattr(args, "description", None) - or f"Use when referencing {name} documentation", - } - try: - converter = WordToSkillConverter(config) - converter.load_extracted_data(args.from_json) - converter.build_skill() - except Exception as e: - print(f"\nāŒ Error: {e}", file=sys.stderr) - sys.exit(1) - return 0 - - # Direct DOCX mode - if not getattr(args, "name", None): - # Auto-detect name from filename - args.name = Path(args.docx).stem - - config = { - "name": args.name, - "docx_path": args.docx, - # Pass None so extract_docx() can infer from document metadata (subject/title) - "description": 
getattr(args, "description", None), - } - if getattr(args, "categories", None): - config["categories"] = args.categories - - try: - converter = WordToSkillConverter(config) - - # Extract - if not converter.extract_docx(): - print("\nāŒ Word extraction failed - see error above", file=sys.stderr) - sys.exit(1) - - # Build skill - converter.build_skill() - - # Enhancement Workflow Integration - from skill_seekers.cli.workflow_runner import run_workflows - - workflow_executed, workflow_names = run_workflows(args) - workflow_name = ", ".join(workflow_names) if workflow_names else None - - # Traditional enhancement (complements workflow system) - if getattr(args, "enhance_level", 0) > 0: - import os - - api_key = getattr(args, "api_key", None) or os.environ.get("ANTHROPIC_API_KEY") - mode = "API" if api_key else "LOCAL" - - print("\n" + "=" * 80) - print(f"šŸ¤– Traditional AI Enhancement ({mode} mode, level {args.enhance_level})") - print("=" * 80) - if workflow_executed: - print(f" Running after workflow: {workflow_name}") - print( - " (Workflow provides specialized analysis, enhancement provides general improvements)" - ) - print("") - - skill_dir = converter.skill_dir - if api_key: - try: - from skill_seekers.cli.enhance_skill import enhance_skill_md - - enhance_skill_md(skill_dir, api_key) - print("āœ… API enhancement complete!") - except ImportError: - print("āŒ API enhancement not available. 
Falling back to LOCAL mode...") - from pathlib import Path - from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer - - agent = getattr(args, "agent", None) if args else None - agent_cmd = getattr(args, "agent_cmd", None) if args else None - enhancer = LocalSkillEnhancer(Path(skill_dir), agent=agent, agent_cmd=agent_cmd) - enhancer.run(headless=True) - print("āœ… Local enhancement complete!") - else: - from pathlib import Path - from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer - - agent = getattr(args, "agent", None) if args else None - agent_cmd = getattr(args, "agent_cmd", None) if args else None - enhancer = LocalSkillEnhancer(Path(skill_dir), agent=agent, agent_cmd=agent_cmd) - enhancer.run(headless=True) - print("āœ… Local enhancement complete!") - - except RuntimeError as e: - print(f"\nāŒ Error: {e}", file=sys.stderr) - sys.exit(1) - except Exception as e: - print(f"\nāŒ Unexpected error during Word processing: {e}", file=sys.stderr) - import traceback - - traceback.print_exc() - sys.exit(1) - - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/src/skill_seekers/mcp/tools/scraping_tools.py b/src/skill_seekers/mcp/tools/scraping_tools.py index c8f99f2..cba7c92 100644 --- a/src/skill_seekers/mcp/tools/scraping_tools.py +++ b/src/skill_seekers/mcp/tools/scraping_tools.py @@ -13,7 +13,9 @@ This module contains all scraping-related MCP tool implementations: Extracted from server.py for better modularity and organization. """ +import io import json +import logging import sys from pathlib import Path @@ -34,6 +36,48 @@ except ImportError: CLI_DIR = Path(__file__).parent.parent.parent / "cli" +def _run_converter(converter, progress_msg: str) -> list: + """Run a converter in-process with log capture. + + Args: + converter: An initialized SkillConverter instance. + progress_msg: Progress message to prepend to output. + + Returns: + List[TextContent] with success/error message. 
+ """ + log_capture = io.StringIO() + handler = logging.StreamHandler(log_capture) + handler.setLevel(logging.INFO) + sk_logger = logging.getLogger("skill_seekers") + sk_logger.addHandler(handler) + try: + result = converter.run() + except Exception as exc: + captured = log_capture.getvalue() + return [ + TextContent( + type="text", + text=f"{progress_msg}{captured}\n\nāŒ Converter raised an exception:\n{exc}", + ) + ] + finally: + sk_logger.removeHandler(handler) + + captured = log_capture.getvalue() + output = progress_msg + captured + + if result == 0: + return [TextContent(type="text", text=output)] + else: + return [ + TextContent( + type="text", + text=f"{output}\n\nāŒ Converter returned non-zero exit code ({result})", + ) + ] + + def run_subprocess_with_streaming(cmd: list[str], timeout: int = None) -> tuple: """ Run subprocess with real-time output streaming. @@ -141,10 +185,11 @@ async def estimate_pages_tool(args: dict) -> list[TextContent]: # Estimate: 0.5s per page discovered timeout = max(300, max_discovery // 2) # Minimum 5 minutes - # Run estimate_pages.py + # Run estimate_pages module cmd = [ sys.executable, - str(CLI_DIR / "estimate_pages.py"), + "-m", + "skill_seekers.cli.estimate_pages", config_path, "--max-discovery", str(max_discovery), @@ -185,8 +230,6 @@ async def scrape_docs_tool(args: dict) -> list[TextContent]: """ config_path = args["config_path"] unlimited = args.get("unlimited", False) - enhance_local = args.get("enhance_local", False) - skip_scrape = args.get("skip_scrape", False) dry_run = args.get("dry_run", False) merge_mode = args.get("merge_mode") @@ -218,80 +261,52 @@ async def scrape_docs_tool(args: dict) -> list[TextContent]: else: config_to_use = config_path - # Choose scraper based on format + # Build progress message if is_unified: - scraper_script = "unified_scraper.py" progress_msg = "šŸ”„ Starting unified multi-source scraping...\n" progress_msg += "šŸ“¦ Config format: Unified (multiple sources)\n" else: - 
scraper_script = "doc_scraper.py" progress_msg = "šŸ”„ Starting scraping process...\n" progress_msg += "šŸ“¦ Config format: Legacy (single source)\n" - # Build command - cmd = [sys.executable, str(CLI_DIR / scraper_script), "--config", config_to_use] - - # Add merge mode for unified configs - if is_unified and merge_mode: - cmd.extend(["--merge-mode", merge_mode]) - - # Add --fresh to avoid user input prompts when existing data found - if not skip_scrape: - cmd.append("--fresh") - - if enhance_local: - cmd.append("--enhance-local") - if skip_scrape: - cmd.append("--skip-scrape") - if dry_run: - cmd.append("--dry-run") - - # Determine timeout based on operation type - if dry_run: - timeout = 300 # 5 minutes for dry run - elif skip_scrape: - timeout = 600 # 10 minutes for building from cache - elif unlimited: - timeout = None # No timeout for unlimited mode (user explicitly requested) - else: - # Read config to estimate timeout - try: - if is_unified: - # For unified configs, estimate based on all sources - total_pages = 0 - for source in config.get("sources", []): - if source.get("type") == "documentation": - total_pages += source.get("max_pages", 500) - max_pages = total_pages or 500 - else: - max_pages = config.get("max_pages", 500) - - # Estimate: 30s per page + buffer - timeout = max(3600, max_pages * 35) # Minimum 1 hour, or 35s per page - except Exception: - timeout = 14400 # Default: 4 hours - - # Add progress message - if timeout: - progress_msg += f"ā±ļø Maximum time allowed: {timeout // 60} minutes\n" - else: - progress_msg += "ā±ļø Unlimited mode - no timeout\n" progress_msg += "šŸ“ Progress will be shown below:\n\n" - # Run scraper with streaming - stdout, stderr, returncode = run_subprocess_with_streaming(cmd, timeout=timeout) + # Run converter in-process + try: + if is_unified: + from skill_seekers.cli.unified_scraper import UnifiedScraper - # Clean up temporary config - if unlimited and Path(config_to_use).exists(): - Path(config_to_use).unlink() 
+ converter = UnifiedScraper(config_to_use, merge_mode=merge_mode) + else: + from skill_seekers.cli.skill_converter import get_converter - output = progress_msg + stdout + # For legacy format, detect type from config keys + with open(config_to_use) as f: + config_to_pass = json.load(f) - if returncode == 0: - return [TextContent(type="text", text=output)] - else: - error_output = output + f"\n\nāŒ Error:\n{stderr}" - return [TextContent(type="text", text=error_output)] + # Detect source type from config content + if "base_url" in config_to_pass: + source_type = "web" + elif "repo" in config_to_pass: + source_type = "github" + elif "pdf_path" in config_to_pass: + source_type = "pdf" + elif "directory" in config_to_pass: + source_type = "local" + else: + source_type = "web" # default fallback + + converter = get_converter(source_type, config_to_pass) + if dry_run: + converter.dry_run = True + + result = _run_converter(converter, progress_msg) + finally: + # Clean up temporary config + if unlimited and Path(config_to_use).exists(): + Path(config_to_use).unlink() + + return result async def scrape_pdf_tool(args: dict) -> list[TextContent]: @@ -318,44 +333,48 @@ async def scrape_pdf_tool(args: dict) -> list[TextContent]: description = args.get("description") from_json = args.get("from_json") - # Build command - cmd = [sys.executable, str(CLI_DIR / "pdf_scraper.py")] + progress_msg = "šŸ“„ Scraping PDF documentation...\n\n" # Mode 1: Config file if config_path: - cmd.extend(["--config", config_path]) + with open(config_path) as f: + pdf_config = json.load(f) # Mode 2: Direct PDF elif pdf_path and name: - cmd.extend(["--pdf", pdf_path, "--name", name]) + pdf_config = {"name": name, "pdf_path": pdf_path} if description: - cmd.extend(["--description", description]) + pdf_config["description"] = description - # Mode 3: From JSON + # Mode 3: From JSON — use PDFToSkillConverter.load_extracted_data elif from_json: - cmd.extend(["--from-json", from_json]) + from 
skill_seekers.cli.pdf_scraper import PDFToSkillConverter + + # Build a minimal config; name is derived from the JSON filename + json_name = Path(from_json).stem.replace("_extracted", "") + pdf_config = {"name": json_name} + converter = PDFToSkillConverter(pdf_config) + converter.load_extracted_data(from_json) + converter.build_skill() + return [ + TextContent( + type="text", + text=f"{progress_msg}āœ… Skill built from extracted JSON: {from_json}", + ) + ] else: return [ TextContent( - type="text", text="āŒ Error: Must specify --config, --pdf + --name, or --from-json" + type="text", + text="āŒ Error: Must specify --config, --pdf + --name, or --from-json", ) ] - # Run pdf_scraper.py with streaming (can take a while) - timeout = 600 # 10 minutes for PDF extraction + from skill_seekers.cli.skill_converter import get_converter - progress_msg = "šŸ“„ Scraping PDF documentation...\n" - progress_msg += f"ā±ļø Maximum time: {timeout // 60} minutes\n\n" - - stdout, stderr, returncode = run_subprocess_with_streaming(cmd, timeout=timeout) - - output = progress_msg + stdout - - if returncode == 0: - return [TextContent(type="text", text=output)] - else: - return [TextContent(type="text", text=f"{output}\n\nāŒ Error:\n{stderr}")] + converter = get_converter("pdf", pdf_config) + return _run_converter(converter, progress_msg) async def scrape_video_tool(args: dict) -> list[TextContent]: @@ -411,29 +430,23 @@ async def scrape_video_tool(args: dict) -> list[TextContent]: start_time = args.get("start_time") end_time = args.get("end_time") - # Build command - cmd = [sys.executable, str(CLI_DIR / "video_scraper.py")] + # Build config dict for the converter + video_config: dict = {} if from_json: - cmd.extend(["--from-json", from_json]) + video_config["from_json"] = from_json + video_config["name"] = name or Path(from_json).stem.replace("_video_extracted", "") elif url: - cmd.extend(["--url", url]) - if name: - cmd.extend(["--name", name]) - if description: - 
cmd.extend(["--description", description]) - if languages: - cmd.extend(["--languages", languages]) + video_config["url"] = url + if not name: + return [TextContent(type="text", text="āŒ Error: --name is required with --url")] + video_config["name"] = name elif video_file: - cmd.extend(["--video-file", video_file]) - if name: - cmd.extend(["--name", name]) - if description: - cmd.extend(["--description", description]) + video_config["video_file"] = video_file + video_config["name"] = name or Path(video_file).stem elif playlist: - cmd.extend(["--playlist", playlist]) - if name: - cmd.extend(["--name", name]) + video_config["playlist"] = playlist + video_config["name"] = name or "playlist" else: return [ TextContent( @@ -442,38 +455,31 @@ async def scrape_video_tool(args: dict) -> list[TextContent]: ) ] - # Visual extraction parameters - if visual: - cmd.append("--visual") + if description: + video_config["description"] = description + if languages: + video_config["languages"] = languages + video_config["visual"] = visual if whisper_model: - cmd.extend(["--whisper-model", whisper_model]) + video_config["whisper_model"] = whisper_model if visual_interval is not None: - cmd.extend(["--visual-interval", str(visual_interval)]) + video_config["visual_interval"] = visual_interval if visual_min_gap is not None: - cmd.extend(["--visual-min-gap", str(visual_min_gap)]) + video_config["visual_min_gap"] = visual_min_gap if visual_similarity is not None: - cmd.extend(["--visual-similarity", str(visual_similarity)]) - if vision_ocr: - cmd.append("--vision-ocr") + video_config["visual_similarity"] = visual_similarity + video_config["vision_ocr"] = vision_ocr if start_time: - cmd.extend(["--start-time", str(start_time)]) + video_config["start_time"] = start_time if end_time: - cmd.extend(["--end-time", str(end_time)]) + video_config["end_time"] = end_time - # Run video_scraper.py with streaming - timeout = 600 # 10 minutes for video extraction + progress_msg = "šŸŽ¬ Scraping video 
content...\n\n" - progress_msg = "šŸŽ¬ Scraping video content...\n" - progress_msg += f"ā±ļø Maximum time: {timeout // 60} minutes\n\n" + from skill_seekers.cli.skill_converter import get_converter - stdout, stderr, returncode = run_subprocess_with_streaming(cmd, timeout=timeout) - - output = progress_msg + stdout - - if returncode == 0: - return [TextContent(type="text", text=output)] - else: - return [TextContent(type="text", text=f"{output}\n\nāŒ Error:\n{stderr}")] + converter = get_converter("video", video_config) + return _run_converter(converter, progress_msg) async def scrape_github_tool(args: dict) -> list[TextContent]: @@ -510,50 +516,37 @@ async def scrape_github_tool(args: dict) -> list[TextContent]: max_issues = args.get("max_issues", 100) scrape_only = args.get("scrape_only", False) - # Build command - cmd = [sys.executable, str(CLI_DIR / "github_scraper.py")] - - # Mode 1: Config file + # Build config dict for the converter if config_path: - cmd.extend(["--config", config_path]) - - # Mode 2: Direct repo + with open(config_path) as f: + github_config = json.load(f) elif repo: - cmd.extend(["--repo", repo]) + github_config: dict = {"repo": repo} if name: - cmd.extend(["--name", name]) + github_config["name"] = name if description: - cmd.extend(["--description", description]) + github_config["description"] = description if token: - cmd.extend(["--token", token]) + github_config["token"] = token if no_issues: - cmd.append("--no-issues") + github_config["no_issues"] = True if no_changelog: - cmd.append("--no-changelog") + github_config["no_changelog"] = True if no_releases: - cmd.append("--no-releases") + github_config["no_releases"] = True if max_issues != 100: - cmd.extend(["--max-issues", str(max_issues)]) + github_config["max_issues"] = max_issues if scrape_only: - cmd.append("--scrape-only") - + github_config["scrape_only"] = True else: return [TextContent(type="text", text="āŒ Error: Must specify --repo or --config")] - # Run github_scraper.py 
with streaming (can take a while) - timeout = 600 # 10 minutes for GitHub scraping + progress_msg = "šŸ™ Scraping GitHub repository...\n\n" - progress_msg = "šŸ™ Scraping GitHub repository...\n" - progress_msg += f"ā±ļø Maximum time: {timeout // 60} minutes\n\n" + from skill_seekers.cli.skill_converter import get_converter - stdout, stderr, returncode = run_subprocess_with_streaming(cmd, timeout=timeout) - - output = progress_msg + stdout - - if returncode == 0: - return [TextContent(type="text", text=output)] - else: - return [TextContent(type="text", text=f"{output}\n\nāŒ Error:\n{stderr}")] + converter = get_converter("github", github_config) + return _run_converter(converter, progress_msg) async def scrape_codebase_tool(args: dict) -> list[TextContent]: @@ -605,7 +598,7 @@ async def scrape_codebase_tool(args: dict) -> list[TextContent]: if not directory: return [TextContent(type="text", text="āŒ Error: directory parameter is required")] - output = args.get("output", "output/codebase/") + output_dir = args.get("output", "output/codebase/") depth = args.get("depth", "deep") languages = args.get("languages", "") file_patterns = args.get("file_patterns", "") @@ -620,43 +613,28 @@ async def scrape_codebase_tool(args: dict) -> list[TextContent]: skip_config_patterns = args.get("skip_config_patterns", False) skip_docs = args.get("skip_docs", False) - # Build command - cmd = [sys.executable, "-m", "skill_seekers.cli.codebase_scraper"] - cmd.extend(["--directory", directory]) + # Derive a name from the directory for the converter + dir_name = Path(directory).resolve().name or "codebase" - if output: - cmd.extend(["--output", output]) - if depth: - cmd.extend(["--depth", depth]) + # Build config dict for CodebaseAnalyzer + codebase_config: dict = { + "name": dir_name, + "directory": directory, + "output_dir": output_dir, + "depth": depth, + "enhance_level": enhance_level, + "build_api_reference": not skip_api_reference, + "build_dependency_graph": not 
skip_dependency_graph, + "detect_patterns": not skip_patterns, + "extract_test_examples": not skip_test_examples, + "build_how_to_guides": not skip_how_to_guides, + "extract_config_patterns": not skip_config_patterns, + "extract_docs": not skip_docs, + } if languages: - cmd.extend(["--languages", languages]) + codebase_config["languages"] = languages if file_patterns: - cmd.extend(["--file-patterns", file_patterns]) - if enhance_level > 0: - cmd.extend(["--enhance-level", str(enhance_level)]) - - # Skip flags - if skip_api_reference: - cmd.append("--skip-api-reference") - if skip_dependency_graph: - cmd.append("--skip-dependency-graph") - if skip_patterns: - cmd.append("--skip-patterns") - if skip_test_examples: - cmd.append("--skip-test-examples") - if skip_how_to_guides: - cmd.append("--skip-how-to-guides") - if skip_config_patterns: - cmd.append("--skip-config-patterns") - if skip_docs: - cmd.append("--skip-docs") - - # Adjust timeout based on enhance_level - timeout = 600 # 10 minutes base - if enhance_level >= 2: - timeout = 1200 # 20 minutes with AI enhancement - if enhance_level >= 3: - timeout = 3600 # 60 minutes for full enhancement + codebase_config["file_patterns"] = file_patterns level_names = {0: "off", 1: "SKILL.md only", 2: "standard", 3: "full"} progress_msg = "šŸ” Analyzing local codebase...\n" @@ -664,16 +642,12 @@ async def scrape_codebase_tool(args: dict) -> list[TextContent]: progress_msg += f"šŸ“Š Depth: {depth}\n" if enhance_level > 0: progress_msg += f"šŸ¤– AI Enhancement: Level {enhance_level} ({level_names.get(enhance_level, 'unknown')})\n" - progress_msg += f"ā±ļø Maximum time: {timeout // 60} minutes\n\n" + progress_msg += "\n" - stdout, stderr, returncode = run_subprocess_with_streaming(cmd, timeout=timeout) + from skill_seekers.cli.skill_converter import get_converter - output_text = progress_msg + stdout - - if returncode == 0: - return [TextContent(type="text", text=output_text)] - else: - return [TextContent(type="text", 
text=f"{output_text}\n\nāŒ Error:\n{stderr}")] + converter = get_converter("local", codebase_config) + return _run_converter(converter, progress_msg) async def detect_patterns_tool(args: dict) -> list[TextContent]: @@ -1096,51 +1070,35 @@ async def scrape_generic_tool(args: dict) -> list[TextContent]: ) ] - # Build the subprocess command - # Map source type to module name (most are _scraper, but some differ) - _MODULE_NAMES = { - "manpage": "man_scraper", + # Build config dict for the converter — map MCP args to the keys + # each converter expects in its __init__. + _CONFIG_KEY: dict[str, str] = { + "jupyter": "notebook_path", + "html": "html_path", + "openapi": "spec_path", + "asciidoc": "asciidoc_path", + "pptx": "pptx_path", + "manpage": "man_path", + "confluence": "export_path", + "notion": "export_path", + "rss": "feed_path", + "chat": "export_path", } - module_name = _MODULE_NAMES.get(source_type, f"{source_type}_scraper") - cmd = [sys.executable, "-m", f"skill_seekers.cli.{module_name}"] - - # Map source type to the correct CLI flag for file/path input and URL input. - # Each scraper has its own flag name — using a generic --path or --url would fail. 
- _PATH_FLAGS: dict[str, str] = { - "jupyter": "--notebook", - "html": "--html-path", - "openapi": "--spec", - "asciidoc": "--asciidoc-path", - "pptx": "--pptx", - "manpage": "--man-path", - "confluence": "--export-path", - "notion": "--export-path", - "rss": "--feed-path", - "chat": "--export-path", - } - _URL_FLAGS: dict[str, str] = { - "confluence": "--base-url", - "notion": "--page-id", - "rss": "--feed-url", - "openapi": "--spec-url", + _URL_CONFIG_KEY: dict[str, str] = { + "confluence": "base_url", + "notion": "page_id", + "rss": "feed_url", + "openapi": "spec_url", } - # Determine the input flag based on source type + config: dict = {"name": name} + if source_type in _URL_BASED_TYPES and url: - url_flag = _URL_FLAGS.get(source_type, "--url") - cmd.extend([url_flag, url]) + config[_URL_CONFIG_KEY.get(source_type, "url")] = url elif path: - path_flag = _PATH_FLAGS.get(source_type, "--path") - cmd.extend([path_flag, path]) + config[_CONFIG_KEY.get(source_type, "path")] = path elif url: - # Allow url fallback for file-based types (some may accept URLs too) - url_flag = _URL_FLAGS.get(source_type, "--url") - cmd.extend([url_flag, url]) - - cmd.extend(["--name", name]) - - # Set a reasonable timeout - timeout = 600 # 10 minutes + config[_URL_CONFIG_KEY.get(source_type, "url")] = url emoji = _SOURCE_EMOJIS.get(source_type, "šŸ”§") progress_msg = f"{emoji} Scraping {source_type} source...\n" @@ -1148,14 +1106,9 @@ async def scrape_generic_tool(args: dict) -> list[TextContent]: progress_msg += f"šŸ“ Path: {path}\n" if url: progress_msg += f"šŸ”— URL: {url}\n" - progress_msg += f"šŸ“› Name: {name}\n" - progress_msg += f"ā±ļø Maximum time: {timeout // 60} minutes\n\n" + progress_msg += f"šŸ“› Name: {name}\n\n" - stdout, stderr, returncode = run_subprocess_with_streaming(cmd, timeout=timeout) + from skill_seekers.cli.skill_converter import get_converter - output = progress_msg + stdout - - if returncode == 0: - return [TextContent(type="text", text=output)] - else: - 
return [TextContent(type="text", text=f"{output}\n\nāŒ Error:\n{stderr}")] + converter = get_converter(source_type, config) + return _run_converter(converter, progress_msg) diff --git a/tests/conftest.py b/tests/conftest.py index 6886a04..e7079b3 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -29,3 +29,17 @@ def pytest_configure(config): # noqa: ARG001 def anyio_backend(): """Override anyio backend to only use asyncio (not trio).""" return "asyncio" + + +@pytest.fixture(autouse=True) +def _reset_execution_context(): + """Reset the ExecutionContext singleton before and after every test. + + Without this, a test that calls ExecutionContext.initialize() poisons + all subsequent tests in the same process. + """ + from skill_seekers.cli.execution_context import ExecutionContext + + ExecutionContext.reset() + yield + ExecutionContext.reset() diff --git a/tests/test_analyze_command.py b/tests/test_analyze_command.py deleted file mode 100644 index 93bd739..0000000 --- a/tests/test_analyze_command.py +++ /dev/null @@ -1,263 +0,0 @@ -#!/usr/bin/env python3 -"""Tests for analyze subcommand integration in main CLI.""" - -import sys -import unittest -from pathlib import Path - -sys.path.insert(0, str(Path(__file__).parent.parent / "src")) - -from skill_seekers.cli.main import create_parser - - -class TestAnalyzeSubcommand(unittest.TestCase): - """Test analyze subcommand registration and argument parsing.""" - - def setUp(self): - """Create parser for testing.""" - self.parser = create_parser() - - def test_analyze_subcommand_exists(self): - """Test that analyze subcommand is registered.""" - args = self.parser.parse_args(["analyze", "--directory", "."]) - self.assertEqual(args.command, "analyze") - self.assertEqual(args.directory, ".") - - def test_analyze_with_output_directory(self): - """Test analyze with custom output directory.""" - args = self.parser.parse_args(["analyze", "--directory", ".", "--output", "custom/"]) - self.assertEqual(args.output, "custom/") - 
- def test_quick_preset_flag(self): - """Test --quick preset flag parsing.""" - args = self.parser.parse_args(["analyze", "--directory", ".", "--quick"]) - self.assertTrue(args.quick) - self.assertFalse(args.comprehensive) - - def test_comprehensive_preset_flag(self): - """Test --comprehensive preset flag parsing.""" - args = self.parser.parse_args(["analyze", "--directory", ".", "--comprehensive"]) - self.assertTrue(args.comprehensive) - self.assertFalse(args.quick) - - def test_quick_and_comprehensive_mutually_exclusive(self): - """Test that both flags can be parsed (mutual exclusion enforced at runtime).""" - # The parser allows both flags; runtime logic prevents simultaneous use - args = self.parser.parse_args(["analyze", "--directory", ".", "--quick", "--comprehensive"]) - self.assertTrue(args.quick) - self.assertTrue(args.comprehensive) - # Note: Runtime will catch this and return error code 1 - - def test_enhance_level_flag(self): - """Test --enhance-level flag parsing.""" - args = self.parser.parse_args(["analyze", "--directory", ".", "--enhance-level", "2"]) - self.assertEqual(args.enhance_level, 2) - - def test_skip_flags_passed_through(self): - """Test that skip flags are recognized.""" - args = self.parser.parse_args( - ["analyze", "--directory", ".", "--skip-patterns", "--skip-test-examples"] - ) - self.assertTrue(args.skip_patterns) - self.assertTrue(args.skip_test_examples) - - def test_all_skip_flags(self): - """Test all skip flags are properly parsed.""" - args = self.parser.parse_args( - [ - "analyze", - "--directory", - ".", - "--skip-api-reference", - "--skip-dependency-graph", - "--skip-patterns", - "--skip-test-examples", - "--skip-how-to-guides", - "--skip-config-patterns", - "--skip-docs", - ] - ) - self.assertTrue(args.skip_api_reference) - self.assertTrue(args.skip_dependency_graph) - self.assertTrue(args.skip_patterns) - self.assertTrue(args.skip_test_examples) - self.assertTrue(args.skip_how_to_guides) - 
self.assertTrue(args.skip_config_patterns) - self.assertTrue(args.skip_docs) - - def test_backward_compatible_depth_flag(self): - """Test that deprecated --depth flag still works.""" - args = self.parser.parse_args(["analyze", "--directory", ".", "--depth", "full"]) - self.assertEqual(args.depth, "full") - - def test_depth_flag_choices(self): - """Test that depth flag accepts correct values.""" - for depth in ["surface", "deep", "full"]: - args = self.parser.parse_args(["analyze", "--directory", ".", "--depth", depth]) - self.assertEqual(args.depth, depth) - - def test_languages_flag(self): - """Test languages flag parsing.""" - args = self.parser.parse_args( - ["analyze", "--directory", ".", "--languages", "Python,JavaScript"] - ) - self.assertEqual(args.languages, "Python,JavaScript") - - def test_file_patterns_flag(self): - """Test file patterns flag parsing.""" - args = self.parser.parse_args( - ["analyze", "--directory", ".", "--file-patterns", "*.py,src/**/*.js"] - ) - self.assertEqual(args.file_patterns, "*.py,src/**/*.js") - - def test_no_comments_flag(self): - """Test no-comments flag parsing.""" - args = self.parser.parse_args(["analyze", "--directory", ".", "--no-comments"]) - self.assertTrue(args.no_comments) - - def test_verbose_flag(self): - """Test verbose flag parsing.""" - args = self.parser.parse_args(["analyze", "--directory", ".", "--verbose"]) - self.assertTrue(args.verbose) - - def test_complex_command_combination(self): - """Test complex command with multiple flags.""" - args = self.parser.parse_args( - [ - "analyze", - "--directory", - "./src", - "--output", - "analysis/", - "--quick", - "--languages", - "Python", - "--skip-patterns", - "--verbose", - ] - ) - self.assertEqual(args.directory, "./src") - self.assertEqual(args.output, "analysis/") - self.assertTrue(args.quick) - self.assertEqual(args.languages, "Python") - self.assertTrue(args.skip_patterns) - self.assertTrue(args.verbose) - - def test_directory_is_required(self): - """Test 
that directory argument is required.""" - with self.assertRaises(SystemExit): - self.parser.parse_args(["analyze"]) - - def test_default_output_directory(self): - """Test default output directory value.""" - args = self.parser.parse_args(["analyze", "--directory", "."]) - self.assertEqual(args.output, "output/codebase/") - - -class TestAnalyzePresetBehavior(unittest.TestCase): - """Test preset flag behavior and argument transformation.""" - - def setUp(self): - """Create parser for testing.""" - self.parser = create_parser() - - def test_quick_preset_implies_surface_depth(self): - """Test that --quick preset should trigger surface depth.""" - args = self.parser.parse_args(["analyze", "--directory", ".", "--quick"]) - self.assertTrue(args.quick) - # Note: Depth transformation happens in dispatch handler - - def test_comprehensive_preset_implies_full_depth(self): - """Test that --comprehensive preset should trigger full depth.""" - args = self.parser.parse_args(["analyze", "--directory", ".", "--comprehensive"]) - self.assertTrue(args.comprehensive) - # Note: Depth transformation happens in dispatch handler - - def test_enhance_level_standalone(self): - """Test --enhance-level can be used without presets.""" - args = self.parser.parse_args(["analyze", "--directory", ".", "--enhance-level", "3"]) - self.assertEqual(args.enhance_level, 3) - self.assertFalse(args.quick) - self.assertFalse(args.comprehensive) - - -class TestAnalyzeWorkflowFlags(unittest.TestCase): - """Test workflow and parity flags added to the analyze subcommand.""" - - def setUp(self): - """Create parser for testing.""" - self.parser = create_parser() - - def test_enhance_workflow_accepted_as_list(self): - """Test --enhance-workflow is accepted and stored as a list.""" - args = self.parser.parse_args( - ["analyze", "--directory", ".", "--enhance-workflow", "security-focus"] - ) - self.assertEqual(args.enhance_workflow, ["security-focus"]) - - def test_enhance_workflow_chained_twice(self): - """Test 
--enhance-workflow can be chained to produce a two-item list.""" - args = self.parser.parse_args( - [ - "analyze", - "--directory", - ".", - "--enhance-workflow", - "security-focus", - "--enhance-workflow", - "minimal", - ] - ) - self.assertEqual(args.enhance_workflow, ["security-focus", "minimal"]) - - def test_enhance_stage_accepted_as_list(self): - """Test --enhance-stage is accepted with action=append.""" - args = self.parser.parse_args( - ["analyze", "--directory", ".", "--enhance-stage", "sec:Analyze security"] - ) - self.assertEqual(args.enhance_stage, ["sec:Analyze security"]) - - def test_var_accepted_as_list(self): - """Test --var is accepted with action=append (dest is 'var').""" - args = self.parser.parse_args(["analyze", "--directory", ".", "--var", "focus=performance"]) - self.assertEqual(args.var, ["focus=performance"]) - - def test_workflow_dry_run_flag(self): - """Test --workflow-dry-run sets the flag.""" - args = self.parser.parse_args(["analyze", "--directory", ".", "--workflow-dry-run"]) - self.assertTrue(args.workflow_dry_run) - - def test_api_key_stored_correctly(self): - """Test --api-key is stored in args.""" - args = self.parser.parse_args(["analyze", "--directory", ".", "--api-key", "sk-ant-test"]) - self.assertEqual(args.api_key, "sk-ant-test") - - def test_dry_run_stored_correctly(self): - """Test --dry-run is stored in args.""" - args = self.parser.parse_args(["analyze", "--directory", ".", "--dry-run"]) - self.assertTrue(args.dry_run) - - def test_workflow_flags_combined(self): - """Test workflow flags can be combined with other analyze flags.""" - args = self.parser.parse_args( - [ - "analyze", - "--directory", - ".", - "--enhance-workflow", - "security-focus", - "--api-key", - "sk-ant-test", - "--dry-run", - "--enhance-level", - "1", - ] - ) - self.assertEqual(args.enhance_workflow, ["security-focus"]) - self.assertEqual(args.api_key, "sk-ant-test") - self.assertTrue(args.dry_run) - self.assertEqual(args.enhance_level, 1) - - -if 
__name__ == "__main__": - unittest.main() diff --git a/tests/test_analyze_e2e.py b/tests/test_analyze_e2e.py deleted file mode 100644 index a5b484e..0000000 --- a/tests/test_analyze_e2e.py +++ /dev/null @@ -1,344 +0,0 @@ -#!/usr/bin/env python3 -""" -End-to-End tests for the new 'analyze' command. -Tests real-world usage scenarios with actual command execution. -""" - -import json -import shutil -import subprocess -import sys -import tempfile -import unittest -from pathlib import Path - -sys.path.insert(0, str(Path(__file__).parent.parent / "src")) - - -class TestAnalyzeCommandE2E(unittest.TestCase): - """End-to-end tests for skill-seekers analyze command.""" - - @classmethod - def setUpClass(cls): - """Set up test fixtures once for all tests.""" - cls.test_dir = Path(tempfile.mkdtemp(prefix="analyze_e2e_")) - cls.create_sample_codebase() - - @classmethod - def tearDownClass(cls): - """Clean up test directory.""" - if cls.test_dir.exists(): - shutil.rmtree(cls.test_dir) - - @classmethod - def create_sample_codebase(cls): - """Create a sample Python codebase for testing.""" - # Create directory structure - (cls.test_dir / "src").mkdir() - (cls.test_dir / "tests").mkdir() - - # Create sample Python files - (cls.test_dir / "src" / "__init__.py").write_text("") - - (cls.test_dir / "src" / "main.py").write_text(''' -"""Main application module.""" - -class Application: - """Main application class.""" - - def __init__(self, name: str): - """Initialize application. 
- - Args: - name: Application name - """ - self.name = name - - def run(self): - """Run the application.""" - print(f"Running {self.name}") - return True -''') - - (cls.test_dir / "tests" / "test_main.py").write_text(''' -"""Tests for main module.""" -import unittest -from src.main import Application - -class TestApplication(unittest.TestCase): - """Test Application class.""" - - def test_init(self): - """Test application initialization.""" - app = Application("Test") - self.assertEqual(app.name, "Test") - - def test_run(self): - """Test application run.""" - app = Application("Test") - self.assertTrue(app.run()) -''') - - def run_command(self, *args, timeout=120): - """Run skill-seekers command and return result.""" - cmd = ["skill-seekers"] + list(args) - result = subprocess.run( - cmd, capture_output=True, text=True, timeout=timeout, cwd=str(self.test_dir) - ) - return result - - def test_analyze_help_shows_command(self): - """Test that analyze command appears in main help.""" - result = self.run_command("--help", timeout=5) - self.assertEqual(result.returncode, 0, f"Help failed: {result.stderr}") - self.assertIn("analyze", result.stdout) - self.assertIn("Analyze local codebase", result.stdout) - - def test_analyze_subcommand_help(self): - """Test that analyze subcommand has proper help.""" - result = self.run_command("analyze", "--help", timeout=5) - self.assertEqual(result.returncode, 0, f"Analyze help failed: {result.stderr}") - self.assertIn("--quick", result.stdout) - self.assertIn("--comprehensive", result.stdout) - self.assertIn("--enhance", result.stdout) - self.assertIn("--directory", result.stdout) - - def test_analyze_quick_preset(self): - """Test quick analysis preset (real execution).""" - output_dir = self.test_dir / "output_quick" - - result = self.run_command( - "analyze", "--directory", str(self.test_dir), "--output", str(output_dir), "--quick" - ) - - # Check command succeeded - self.assertEqual( - result.returncode, - 0, - f"Quick analysis 
failed:\nSTDOUT: {result.stdout}\nSTDERR: {result.stderr}", - ) - - # Verify output directory was created - self.assertTrue(output_dir.exists(), "Output directory not created") - - # Verify SKILL.md was generated - skill_file = output_dir / "SKILL.md" - self.assertTrue(skill_file.exists(), "SKILL.md not generated") - - # Verify SKILL.md has content and valid structure - skill_content = skill_file.read_text() - self.assertGreater(len(skill_content), 100, "SKILL.md is too short") - - # Check for expected structure (works even with 0 files analyzed) - self.assertIn("Codebase", skill_content, "Missing codebase header") - self.assertIn("Analysis", skill_content, "Missing analysis section") - - # Verify it's valid markdown with frontmatter - self.assertTrue(skill_content.startswith("---"), "Missing YAML frontmatter") - self.assertIn("name:", skill_content, "Missing name in frontmatter") - - def test_analyze_with_custom_output(self): - """Test analysis with custom output directory.""" - output_dir = self.test_dir / "custom_output" - - result = self.run_command( - "analyze", "--directory", str(self.test_dir), "--output", str(output_dir), "--quick" - ) - - self.assertEqual(result.returncode, 0, f"Analysis failed: {result.stderr}") - self.assertTrue(output_dir.exists(), "Custom output directory not created") - self.assertTrue((output_dir / "SKILL.md").exists(), "SKILL.md not in custom directory") - - def test_analyze_skip_flags_work(self): - """Test that skip flags are properly handled.""" - output_dir = self.test_dir / "output_skip" - - result = self.run_command( - "analyze", - "--directory", - str(self.test_dir), - "--output", - str(output_dir), - "--quick", - "--skip-patterns", - "--skip-test-examples", - ) - - self.assertEqual(result.returncode, 0, f"Analysis with skip flags failed: {result.stderr}") - self.assertTrue( - (output_dir / "SKILL.md").exists(), "SKILL.md not generated with skip flags" - ) - - def test_analyze_invalid_directory(self): - """Test analysis with 
non-existent directory.""" - result = self.run_command( - "analyze", "--directory", "/nonexistent/directory/path", "--quick", timeout=10 - ) - - # Should fail with error - self.assertNotEqual(result.returncode, 0, "Should fail with invalid directory") - self.assertTrue( - "not found" in result.stderr.lower() or "does not exist" in result.stderr.lower(), - f"Expected directory error, got: {result.stderr}", - ) - - def test_analyze_missing_directory_arg(self): - """Test that --directory is required.""" - result = self.run_command("analyze", "--quick", timeout=5) - - # Should fail without --directory - self.assertNotEqual(result.returncode, 0, "Should fail without --directory") - self.assertTrue( - "required" in result.stderr.lower() or "directory" in result.stderr.lower(), - f"Expected missing argument error, got: {result.stderr}", - ) - - def test_backward_compatibility_depth_flag(self): - """Test that old --depth flag still works.""" - output_dir = self.test_dir / "output_depth" - - result = self.run_command( - "analyze", - "--directory", - str(self.test_dir), - "--output", - str(output_dir), - "--depth", - "surface", - ) - - self.assertEqual(result.returncode, 0, f"Depth flag failed: {result.stderr}") - self.assertTrue((output_dir / "SKILL.md").exists(), "SKILL.md not generated with --depth") - - def test_analyze_generates_references(self): - """Test that references directory is created.""" - output_dir = self.test_dir / "output_refs" - - result = self.run_command( - "analyze", "--directory", str(self.test_dir), "--output", str(output_dir), "--quick" - ) - - self.assertEqual(result.returncode, 0, f"Analysis failed: {result.stderr}") - - # Check for references directory - refs_dir = output_dir / "references" - if refs_dir.exists(): # Optional, depends on content - self.assertTrue(refs_dir.is_dir(), "References is not a directory") - - def test_analyze_output_structure(self): - """Test that output has expected structure.""" - output_dir = self.test_dir / 
"output_structure" - - result = self.run_command( - "analyze", "--directory", str(self.test_dir), "--output", str(output_dir), "--quick" - ) - - self.assertEqual(result.returncode, 0, f"Analysis failed: {result.stderr}") - - # Verify expected files/directories - self.assertTrue((output_dir / "SKILL.md").exists(), "SKILL.md missing") - - # Check for code_analysis.json if it exists - analysis_file = output_dir / "code_analysis.json" - if analysis_file.exists(): - # Verify it's valid JSON - with open(analysis_file) as f: - data = json.load(f) - self.assertIsInstance(data, (dict, list), "code_analysis.json is not valid JSON") - - -class TestAnalyzeOldCommand(unittest.TestCase): - """Test that old skill-seekers-codebase command still works.""" - - def test_old_command_still_exists(self): - """Test that skill-seekers-codebase still exists.""" - result = subprocess.run( - ["skill-seekers-codebase", "--help"], capture_output=True, text=True, timeout=5 - ) - - # Command should exist and show help - self.assertEqual(result.returncode, 0, f"Old command doesn't work: {result.stderr}") - self.assertIn("--directory", result.stdout) - - -class TestAnalyzeIntegration(unittest.TestCase): - """Integration tests for analyze command with other features.""" - - def setUp(self): - """Set up test directory.""" - self.test_dir = Path(tempfile.mkdtemp(prefix="analyze_int_")) - - # Create minimal Python project - (self.test_dir / "main.py").write_text(''' -def hello(): - """Say hello.""" - return "Hello, World!" 
-''') - - def tearDown(self): - """Clean up test directory.""" - if self.test_dir.exists(): - shutil.rmtree(self.test_dir) - - def test_analyze_then_check_output(self): - """Test analyzing and verifying output can be read.""" - output_dir = self.test_dir / "output" - - # Run analysis - result = subprocess.run( - [ - "skill-seekers", - "analyze", - "--directory", - str(self.test_dir), - "--output", - str(output_dir), - "--quick", - ], - capture_output=True, - text=True, - timeout=120, - ) - - self.assertEqual(result.returncode, 0, f"Analysis failed: {result.stderr}") - - # Read and verify SKILL.md - skill_file = output_dir / "SKILL.md" - self.assertTrue(skill_file.exists(), "SKILL.md not created") - - content = skill_file.read_text() - # Check for valid structure instead of specific content - # (file detection may vary in temp directories) - self.assertGreater(len(content), 50, "Output too short") - self.assertIn("Codebase", content, "Missing codebase header") - self.assertTrue(content.startswith("---"), "Missing YAML frontmatter") - - def test_analyze_verbose_flag(self): - """Test that verbose flag works.""" - output_dir = self.test_dir / "output" - - result = subprocess.run( - [ - "skill-seekers", - "analyze", - "--directory", - str(self.test_dir), - "--output", - str(output_dir), - "--quick", - "--verbose", - ], - capture_output=True, - text=True, - timeout=120, - ) - - self.assertEqual(result.returncode, 0, f"Verbose analysis failed: {result.stderr}") - - # Verbose should produce more output - combined_output = result.stdout + result.stderr - self.assertGreater(len(combined_output), 100, "Verbose mode didn't produce extra output") - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/test_bootstrap_skill.py b/tests/test_bootstrap_skill.py index 330f462..6f08e40 100644 --- a/tests/test_bootstrap_skill.py +++ b/tests/test_bootstrap_skill.py @@ -37,9 +37,7 @@ class TestBootstrapSkillScript: # Must have commands table assert "## Commands" in content, 
"Header must have Commands section" - assert "skill-seekers analyze" in content, "Header must mention analyze command" - assert "skill-seekers scrape" in content, "Header must mention scrape command" - assert "skill-seekers github" in content, "Header must mention github command" + assert "skill-seekers create" in content, "Header must mention create command" def test_header_has_yaml_frontmatter(self, project_root): """Test that header has valid YAML frontmatter.""" diff --git a/tests/test_browser_renderer.py b/tests/test_browser_renderer.py index e266740..8f05935 100644 --- a/tests/test_browser_renderer.py +++ b/tests/test_browser_renderer.py @@ -147,18 +147,31 @@ class TestDocScraperBrowserIntegration: class TestBrowserArgument: - """Test --browser argument is registered in CLI.""" + """Test --browser argument is accepted by DocToSkillConverter config.""" - def test_scrape_parser_accepts_browser_flag(self): - from skill_seekers.cli.doc_scraper import setup_argument_parser + def test_browser_config_true(self): + """Test that DocToSkillConverter accepts browser=True in config.""" + from skill_seekers.cli.doc_scraper import DocToSkillConverter - parser = setup_argument_parser() - args = parser.parse_args(["--name", "test", "--url", "https://example.com", "--browser"]) - assert args.browser is True + config = { + "name": "test", + "base_url": "https://example.com", + "browser": True, + "selectors": {}, + "url_patterns": {"include": [], "exclude": []}, + } + scraper = DocToSkillConverter(config) + assert scraper.browser_mode is True - def test_scrape_parser_browser_default_false(self): - from skill_seekers.cli.doc_scraper import setup_argument_parser + def test_browser_config_default_false(self): + """Test that DocToSkillConverter defaults browser to False.""" + from skill_seekers.cli.doc_scraper import DocToSkillConverter - parser = setup_argument_parser() - args = parser.parse_args(["--name", "test", "--url", "https://example.com"]) - assert args.browser is False + 
config = { + "name": "test", + "base_url": "https://example.com", + "selectors": {}, + "url_patterns": {"include": [], "exclude": []}, + } + scraper = DocToSkillConverter(config) + assert scraper.browser_mode is False diff --git a/tests/test_cli_parsers.py b/tests/test_cli_parsers.py index 7a2fc3a..cc3b629 100644 --- a/tests/test_cli_parsers.py +++ b/tests/test_cli_parsers.py @@ -14,8 +14,6 @@ from skill_seekers.cli.parsers import ( get_parser_names, register_parsers, ) -from skill_seekers.cli.parsers.scrape_parser import ScrapeParser -from skill_seekers.cli.parsers.github_parser import GitHubParser from skill_seekers.cli.parsers.package_parser import PackageParser @@ -24,20 +22,17 @@ class TestParserRegistry: def test_all_parsers_registered(self): """Test that all parsers are registered.""" - assert len(PARSERS) == 36, f"Expected 36 parsers, got {len(PARSERS)}" + assert len(PARSERS) == 18, f"Expected 18 parsers, got {len(PARSERS)}" def test_get_parser_names(self): """Test getting list of parser names.""" names = get_parser_names() - assert len(names) == 36 - assert "scrape" in names - assert "github" in names + assert len(names) == 18 + assert "create" in names assert "package" in names assert "upload" in names - assert "analyze" in names assert "config" in names assert "workflows" in names - assert "video" in names def test_all_parsers_are_subcommand_parsers(self): """Test that all parsers inherit from SubcommandParser.""" @@ -71,29 +66,6 @@ class TestParserRegistry: class TestParserCreation: """Test parser creation functionality.""" - def test_scrape_parser_creates_subparser(self): - """Test that ScrapeParser creates valid subparser.""" - main_parser = argparse.ArgumentParser() - subparsers = main_parser.add_subparsers() - - scrape_parser = ScrapeParser() - subparser = scrape_parser.create_parser(subparsers) - - assert subparser is not None - assert scrape_parser.name == "scrape" - assert scrape_parser.help == "Scrape documentation website" - - def 
test_github_parser_creates_subparser(self): - """Test that GitHubParser creates valid subparser.""" - main_parser = argparse.ArgumentParser() - subparsers = main_parser.add_subparsers() - - github_parser = GitHubParser() - subparser = github_parser.create_parser(subparsers) - - assert subparser is not None - assert github_parser.name == "github" - def test_package_parser_creates_subparser(self): """Test that PackageParser creates valid subparser.""" main_parser = argparse.ArgumentParser() @@ -106,21 +78,18 @@ class TestParserCreation: assert package_parser.name == "package" def test_register_parsers_creates_all_subcommands(self): - """Test that register_parsers creates all 19 subcommands.""" + """Test that register_parsers creates all subcommands.""" main_parser = argparse.ArgumentParser() subparsers = main_parser.add_subparsers(dest="command") # Register all parsers register_parsers(subparsers) - # Test that all commands can be parsed + # Test that existing commands can be parsed test_commands = [ "config --show", - "scrape --config test.json", - "github --repo owner/repo", "package output/test/", "upload test.zip", - "analyze --directory .", "enhance output/test/", "estimate test.json", ] @@ -133,40 +102,6 @@ class TestParserCreation: class TestSpecificParsers: """Test specific parser implementations.""" - def test_scrape_parser_arguments(self): - """Test ScrapeParser has correct arguments.""" - main_parser = argparse.ArgumentParser() - subparsers = main_parser.add_subparsers(dest="command") - - scrape_parser = ScrapeParser() - scrape_parser.create_parser(subparsers) - - # Test various argument combinations - args = main_parser.parse_args(["scrape", "--config", "test.json"]) - assert args.command == "scrape" - assert args.config == "test.json" - - args = main_parser.parse_args(["scrape", "--config", "test.json", "--max-pages", "100"]) - assert args.max_pages == 100 - - args = main_parser.parse_args(["scrape", "--enhance-level", "2"]) - assert args.enhance_level 
== 2 - - def test_github_parser_arguments(self): - """Test GitHubParser has correct arguments.""" - main_parser = argparse.ArgumentParser() - subparsers = main_parser.add_subparsers(dest="command") - - github_parser = GitHubParser() - github_parser.create_parser(subparsers) - - args = main_parser.parse_args(["github", "--repo", "owner/repo"]) - assert args.command == "github" - assert args.repo == "owner/repo" - - args = main_parser.parse_args(["github", "--repo", "owner/repo", "--non-interactive"]) - assert args.non_interactive is True - def test_package_parser_arguments(self): """Test PackageParser has correct arguments.""" main_parser = argparse.ArgumentParser() @@ -185,44 +120,19 @@ class TestSpecificParsers: args = main_parser.parse_args(["package", "output/test/", "--no-open"]) assert args.no_open is True - def test_analyze_parser_arguments(self): - """Test AnalyzeParser has correct arguments.""" - main_parser = argparse.ArgumentParser() - subparsers = main_parser.add_subparsers(dest="command") - from skill_seekers.cli.parsers.analyze_parser import AnalyzeParser +class TestCurrentCommands: + """Test current CLI commands after Grand Unification.""" - analyze_parser = AnalyzeParser() - analyze_parser.create_parser(subparsers) - - args = main_parser.parse_args(["analyze", "--directory", "."]) - assert args.command == "analyze" - assert args.directory == "." 
- - args = main_parser.parse_args(["analyze", "--directory", ".", "--quick"]) - assert args.quick is True - - args = main_parser.parse_args(["analyze", "--directory", ".", "--comprehensive"]) - assert args.comprehensive is True - - args = main_parser.parse_args(["analyze", "--directory", ".", "--skip-patterns"]) - assert args.skip_patterns is True - - -class TestBackwardCompatibility: - """Test backward compatibility with old CLI.""" - - def test_all_original_commands_still_work(self): - """Test that all original commands are still registered.""" + def test_all_current_commands_registered(self): + """Test that all current commands are registered.""" names = get_parser_names() - # Original commands from old main.py - original_commands = [ + # Commands that survived the Grand Unification + # (individual scraper commands removed; use 'create' instead) + current_commands = [ "config", - "scrape", - "github", - "pdf", - "unified", + "create", "enhance", "enhance-status", "package", @@ -230,22 +140,50 @@ class TestBackwardCompatibility: "estimate", "extract-test-examples", "install-agent", - "analyze", "install", "resume", "stream", "update", "multilang", "quality", + "doctor", + "workflows", + "sync-config", ] - for cmd in original_commands: + for cmd in current_commands: assert cmd in names, f"Command '{cmd}' not found in parser registry!" + def test_removed_scraper_commands_not_present(self): + """Test that individual scraper commands were removed.""" + names = get_parser_names() + + removed_commands = [ + "scrape", + "github", + "pdf", + "video", + "word", + "epub", + "jupyter", + "html", + "openapi", + "asciidoc", + "pptx", + "rss", + "manpage", + "confluence", + "notion", + "chat", + ] + + for cmd in removed_commands: + assert cmd not in names, f"Removed command '{cmd}' still in parser registry!" 
+ def test_command_count_matches(self): - """Test that we have exactly 35 commands (25 original + 10 new source types).""" - assert len(PARSERS) == 36 - assert len(get_parser_names()) == 36 + """Test that we have exactly 18 commands.""" + assert len(PARSERS) == 18 + assert len(get_parser_names()) == 18 if __name__ == "__main__": diff --git a/tests/test_cli_refactor_e2e.py b/tests/test_cli_refactor_e2e.py index 9ed6a67..2c1a1d5 100644 --- a/tests/test_cli_refactor_e2e.py +++ b/tests/test_cli_refactor_e2e.py @@ -14,152 +14,6 @@ import subprocess import argparse -class TestParserSync: - """E2E tests for parser synchronization (Issue #285).""" - - def test_scrape_interactive_flag_works(self): - """Test that --interactive flag (previously missing) now works.""" - result = subprocess.run( - ["skill-seekers", "scrape", "--interactive", "--help"], capture_output=True, text=True - ) - assert result.returncode == 0, "Command should execute successfully" - assert "--interactive" in result.stdout, "Help should show --interactive flag" - assert "-i" in result.stdout, "Help should show short form -i" - - def test_scrape_chunk_for_rag_flag_works(self): - """Test that --chunk-for-rag flag (previously missing) now works.""" - result = subprocess.run( - ["skill-seekers", "scrape", "--help"], capture_output=True, text=True - ) - assert "--chunk-for-rag" in result.stdout, "Help should show --chunk-for-rag flag" - assert "--chunk-tokens" in result.stdout, "Help should show --chunk-tokens flag" - assert "--chunk-overlap-tokens" in result.stdout, ( - "Help should show --chunk-overlap-tokens flag" - ) - - def test_scrape_verbose_flag_works(self): - """Test that --verbose flag (previously missing) now works.""" - result = subprocess.run( - ["skill-seekers", "scrape", "--help"], capture_output=True, text=True - ) - assert "--verbose" in result.stdout, "Help should show --verbose flag" - assert "-v" in result.stdout, "Help should show short form -v" - - def test_scrape_url_flag_works(self): 
- """Test that --url flag (previously missing) now works.""" - result = subprocess.run( - ["skill-seekers", "scrape", "--help"], capture_output=True, text=True - ) - assert "--url URL" in result.stdout, "Help should show --url flag" - - def test_github_all_flags_present(self): - """Test that github command has all expected flags.""" - result = subprocess.run( - ["skill-seekers", "github", "--help"], capture_output=True, text=True - ) - # Key github flags that should be present - expected_flags = [ - "--repo", - "--api-key", - "--profile", - "--non-interactive", - ] - for flag in expected_flags: - assert flag in result.stdout, f"Help should show {flag} flag" - - -class TestPresetSystem: - """E2E tests for preset system (Issue #268).""" - - def test_analyze_preset_flag_exists(self): - """Test that analyze command has --preset flag.""" - result = subprocess.run( - ["skill-seekers", "analyze", "--help"], capture_output=True, text=True - ) - assert "--preset" in result.stdout, "Help should show --preset flag" - assert "quick" in result.stdout, "Help should mention 'quick' preset" - assert "standard" in result.stdout, "Help should mention 'standard' preset" - assert "comprehensive" in result.stdout, "Help should mention 'comprehensive' preset" - - def test_analyze_preset_list_flag_exists(self): - """Test that analyze command has --preset-list flag.""" - result = subprocess.run( - ["skill-seekers", "analyze", "--help"], capture_output=True, text=True - ) - assert "--preset-list" in result.stdout, "Help should show --preset-list flag" - - def test_preset_list_shows_presets(self): - """Test that --preset-list shows all available presets.""" - result = subprocess.run( - ["skill-seekers", "analyze", "--preset-list"], capture_output=True, text=True - ) - assert result.returncode == 0, "Command should execute successfully" - assert "Available presets" in result.stdout, "Should show preset list header" - assert "quick" in result.stdout, "Should show quick preset" - assert 
"standard" in result.stdout, "Should show standard preset" - assert "comprehensive" in result.stdout, "Should show comprehensive preset" - assert "1-2 minutes" in result.stdout, "Should show time estimates" - - def test_deprecated_quick_flag_shows_warning(self, tmp_path): - """Test that --quick flag shows deprecation warning.""" - result = subprocess.run( - ["skill-seekers", "analyze", "--directory", str(tmp_path), "--quick"], - capture_output=True, - text=True, - ) - # Note: Deprecation warnings go to stderr or stdout - output = result.stdout + result.stderr - assert "DEPRECATED" in output, "Should show deprecation warning" - assert "--preset quick" in output, "Should suggest alternative" - - def test_deprecated_comprehensive_flag_shows_warning(self, tmp_path): - """Test that --comprehensive flag shows deprecation warning.""" - result = subprocess.run( - ["skill-seekers", "analyze", "--directory", str(tmp_path), "--comprehensive"], - capture_output=True, - text=True, - ) - output = result.stdout + result.stderr - assert "DEPRECATED" in output, "Should show deprecation warning" - assert "--preset comprehensive" in output, "Should suggest alternative" - - -class TestBackwardCompatibility: - """E2E tests for backward compatibility.""" - - def test_old_scrape_command_still_works(self): - """Test that old scrape command invocations still work.""" - result = subprocess.run(["skill-seekers-scrape", "--help"], capture_output=True, text=True) - assert result.returncode == 0, "Old command should still work" - assert "documentation" in result.stdout.lower(), "Help should mention documentation" - - def test_unified_cli_and_standalone_have_same_args(self): - """Test that unified CLI and standalone have identical arguments.""" - # Get help from unified CLI - unified_result = subprocess.run( - ["skill-seekers", "scrape", "--help"], capture_output=True, text=True - ) - - # Get help from standalone - standalone_result = subprocess.run( - ["skill-seekers-scrape", "--help"], 
capture_output=True, text=True - ) - - # Both should have the same key flags - key_flags = [ - "--interactive", - "--url", - "--verbose", - "--chunk-for-rag", - "--config", - "--max-pages", - ] - - for flag in key_flags: - assert flag in unified_result.stdout, f"Unified should have {flag}" - assert flag in standalone_result.stdout, f"Standalone should have {flag}" - - class TestProgrammaticAPI: """Test that the shared argument functions work programmatically.""" @@ -211,11 +65,7 @@ class TestIntegration: # All major commands should be listed expected_commands = [ - "scrape", - "github", - "pdf", - "unified", - "analyze", + "create", "enhance", "package", "upload", @@ -224,75 +74,6 @@ class TestIntegration: for cmd in expected_commands: assert cmd in result.stdout, f"Should list {cmd} command" - def test_scrape_help_detailed(self): - """Test that scrape help shows all argument details.""" - result = subprocess.run( - ["skill-seekers", "scrape", "--help"], capture_output=True, text=True - ) - - # Check for argument categories - assert "url" in result.stdout.lower(), "Should show url argument" - assert "scraping options" in result.stdout.lower() or "options" in result.stdout.lower() - assert "enhancement" in result.stdout.lower(), "Should mention enhancement options" - - def test_analyze_help_shows_presets(self): - """Test that analyze help prominently shows preset information.""" - result = subprocess.run( - ["skill-seekers", "analyze", "--help"], capture_output=True, text=True - ) - - assert "--preset" in result.stdout, "Should show --preset flag" - assert "DEFAULT" in result.stdout or "default" in result.stdout, ( - "Should indicate default preset" - ) - - -class TestE2EWorkflow: - """End-to-end workflow tests.""" - - @pytest.mark.slow - def test_dry_run_scrape_with_new_args(self, tmp_path): - """Test scraping with previously missing arguments (dry run).""" - result = subprocess.run( - [ - "skill-seekers", - "scrape", - "--url", - "https://example.com", - 
"--interactive", - "false", # Would fail if arg didn't exist - "--verbose", # Would fail if arg didn't exist - "--dry-run", - ], - capture_output=True, - text=True, - timeout=10, - ) - - # Dry run should complete without errors - # (it may return non-zero if --interactive false isn't valid, - # but it shouldn't crash with "unrecognized arguments") - assert "unrecognized arguments" not in result.stderr.lower() - - @pytest.mark.slow - def test_analyze_with_preset_flag(self, tmp_path): - """Test analyze with preset flag (no dry-run available).""" - # Create a dummy directory to analyze - test_dir = tmp_path / "test_code" - test_dir.mkdir() - (test_dir / "test.py").write_text("def hello(): pass") - - # Just verify the flag is recognized (no execution) - result = subprocess.run( - ["skill-seekers", "analyze", "--help"], - capture_output=True, - text=True, - ) - - # Verify preset flag exists - assert "--preset" in result.stdout, "Should have --preset flag" - assert "unrecognized arguments" not in result.stderr.lower() - class TestVarFlagRouting: """Test that --var flag is correctly routed through create command.""" @@ -306,15 +87,6 @@ class TestVarFlagRouting: ) assert "--var" in result.stdout, "create --help should show --var flag" - def test_var_flag_accepted_by_analyze(self): - """Test that --var flag is accepted by analyze command.""" - result = subprocess.run( - ["skill-seekers", "analyze", "--help"], - capture_output=True, - text=True, - ) - assert "--var" in result.stdout, "analyze --help should show --var flag" - @pytest.mark.slow def test_var_flag_not_rejected_in_create_local(self, tmp_path): """Test --var KEY=VALUE doesn't cause 'unrecognized arguments' in create.""" @@ -354,15 +126,6 @@ class TestBackwardCompatibleFlags: # but should not cause an error if used assert result.returncode == 0 - def test_no_preserve_code_alias_accepted_by_scrape(self): - """Test --no-preserve-code (old name) is still accepted by scrape command.""" - result = subprocess.run( - 
["skill-seekers", "scrape", "--help"], - capture_output=True, - text=True, - ) - assert result.returncode == 0 - def test_no_preserve_code_alias_accepted_by_create(self): """Test --no-preserve-code (old name) is still accepted by create command.""" result = subprocess.run( diff --git a/tests/test_create_integration_basic.py b/tests/test_create_integration_basic.py index 2678c5f..f8baf36 100644 --- a/tests/test_create_integration_basic.py +++ b/tests/test_create_integration_basic.py @@ -101,395 +101,96 @@ class TestCreateCommandBasic: # Verify help works assert result.returncode in [0, 2] - def test_create_invalid_source_shows_error(self): - """Test that invalid sources raise a helpful ValueError.""" - from skill_seekers.cli.source_detector import SourceDetector - with pytest.raises(ValueError) as exc_info: - SourceDetector.detect("not_a_valid_source_123_xyz") +class TestCreateCommandConverterRouting: + """Tests that create command routes to correct converters.""" - error_message = str(exc_info.value) - assert "Cannot determine source type" in error_message - # Error should include helpful examples - assert "https://" in error_message or "github" in error_message.lower() + def test_get_converter_web(self): + """Test that get_converter returns DocToSkillConverter for web.""" + from skill_seekers.cli.skill_converter import get_converter - def test_create_supports_universal_flags(self): - """Test that universal flags are accepted.""" - import subprocess + config = {"name": "test", "base_url": "https://example.com"} + converter = get_converter("web", config) - result = subprocess.run( - ["skill-seekers", "create", "--help"], capture_output=True, text=True, timeout=10 - ) - assert result.returncode == 0 + assert converter.SOURCE_TYPE == "web" + assert converter.name == "test" - # Check that universal flags are present - assert "--name" in result.stdout - assert "--enhance" in result.stdout - assert "--chunk-for-rag" in result.stdout - assert "--preset" in result.stdout - 
assert "--dry-run" in result.stdout + def test_get_converter_github(self): + """Test that get_converter returns GitHubScraper for github.""" + from skill_seekers.cli.skill_converter import get_converter + + config = {"name": "test", "repo": "owner/repo"} + converter = get_converter("github", config) + + assert converter.SOURCE_TYPE == "github" + assert converter.name == "test" + + def test_get_converter_pdf(self): + """Test that get_converter returns PDFToSkillConverter for pdf.""" + from skill_seekers.cli.skill_converter import get_converter + + config = {"name": "test", "pdf_path": "/tmp/test.pdf"} + converter = get_converter("pdf", config) + + assert converter.SOURCE_TYPE == "pdf" + assert converter.name == "test" + + def test_get_converter_unknown_raises(self): + """Test that get_converter raises ValueError for unknown type.""" + from skill_seekers.cli.skill_converter import get_converter + + with pytest.raises(ValueError, match="Unknown source type"): + get_converter("unknown_type", {}) -class TestCreateCommandArgvForwarding: - """Unit tests for _build_argv argument forwarding.""" +class TestExecutionContextIntegration: + """Tests that ExecutionContext flows correctly through the system.""" - def _make_args(self, **kwargs): + def test_execution_context_auto_initializes(self): + """ExecutionContext.get() returns defaults without explicit init.""" + from skill_seekers.cli.execution_context import ExecutionContext + + # Reset to ensure clean state + ExecutionContext.reset() + + # Should not raise - returns default context + ctx = ExecutionContext.get() + assert ctx is not None + assert ctx.output.name is None # Default value + + ExecutionContext.reset() + + def test_execution_context_values_preserved(self): + """Values set in context are preserved and accessible.""" + from skill_seekers.cli.execution_context import ExecutionContext import argparse - defaults = { - "source": "https://example.com", - "enhance_workflow": None, - "enhance_stage": None, - "var": None, 
- "workflow_dry_run": False, - "enhance_level": 2, - "output": None, - "name": None, - "description": None, - "config": None, - "api_key": None, - "dry_run": False, - "verbose": False, - "quiet": False, - "chunk_for_rag": False, - "chunk_size": 512, - "chunk_overlap": 50, - "preset": None, - "no_preserve_code_blocks": False, - "no_preserve_paragraphs": False, - "interactive_enhancement": False, - "agent": None, - "agent_cmd": None, - "doc_version": "", - } - defaults.update(kwargs) - return argparse.Namespace(**defaults) + ExecutionContext.reset() - def _collect_argv(self, args): - from skill_seekers.cli.create_command import CreateCommand - from skill_seekers.cli.source_detector import SourceDetector - - cmd = CreateCommand(args) - cmd.source_info = SourceDetector.detect(args.source) - return cmd._build_argv("test_module", []) - - def test_single_enhance_workflow_forwarded(self): - args = self._make_args(enhance_workflow=["security-focus"]) - argv = self._collect_argv(args) - assert argv.count("--enhance-workflow") == 1 - assert "security-focus" in argv - - def test_multiple_enhance_workflows_all_forwarded(self): - """Each workflow must appear as a separate --enhance-workflow flag.""" - args = self._make_args(enhance_workflow=["security-focus", "minimal"]) - argv = self._collect_argv(args) - assert argv.count("--enhance-workflow") == 2 - idx1 = argv.index("security-focus") - idx2 = argv.index("minimal") - assert argv[idx1 - 1] == "--enhance-workflow" - assert argv[idx2 - 1] == "--enhance-workflow" - - def test_no_enhance_workflow_not_forwarded(self): - args = self._make_args(enhance_workflow=None) - argv = self._collect_argv(args) - assert "--enhance-workflow" not in argv - - # ── enhance_stage ──────────────────────────────────────────────────────── - - def test_single_enhance_stage_forwarded(self): - args = self._make_args(enhance_stage=["security:Check for vulnerabilities"]) - argv = self._collect_argv(args) - assert "--enhance-stage" in argv - assert 
"security:Check for vulnerabilities" in argv - - def test_multiple_enhance_stages_all_forwarded(self): - stages = ["sec:Check security", "cleanup:Remove boilerplate"] - args = self._make_args(enhance_stage=stages) - argv = self._collect_argv(args) - assert argv.count("--enhance-stage") == 2 - for stage in stages: - assert stage in argv - - def test_enhance_stage_none_not_forwarded(self): - args = self._make_args(enhance_stage=None) - argv = self._collect_argv(args) - assert "--enhance-stage" not in argv - - # ── var ────────────────────────────────────────────────────────────────── - - def test_single_var_forwarded(self): - args = self._make_args(var=["depth=comprehensive"]) - argv = self._collect_argv(args) - assert "--var" in argv - assert "depth=comprehensive" in argv - - def test_multiple_vars_all_forwarded(self): - args = self._make_args(var=["depth=comprehensive", "focus=security"]) - argv = self._collect_argv(args) - assert argv.count("--var") == 2 - assert "depth=comprehensive" in argv - assert "focus=security" in argv - - def test_var_none_not_forwarded(self): - args = self._make_args(var=None) - argv = self._collect_argv(args) - assert "--var" not in argv - - # ── workflow_dry_run ───────────────────────────────────────────────────── - - def test_workflow_dry_run_forwarded(self): - args = self._make_args(workflow_dry_run=True) - argv = self._collect_argv(args) - assert "--workflow-dry-run" in argv - - def test_workflow_dry_run_false_not_forwarded(self): - args = self._make_args(workflow_dry_run=False) - argv = self._collect_argv(args) - assert "--workflow-dry-run" not in argv - - # ── mixed ──────────────────────────────────────────────────────────────── - - def test_workflow_and_stage_both_forwarded(self): - args = self._make_args( - enhance_workflow=["security-focus"], - enhance_stage=["cleanup:Remove boilerplate"], - var=["depth=basic"], - workflow_dry_run=True, - ) - argv = self._collect_argv(args) - assert "--enhance-workflow" in argv - assert 
"security-focus" in argv - assert "--enhance-stage" in argv - assert "--var" in argv - assert "--workflow-dry-run" in argv - - # ── _SKIP_ARGS exclusion ──────────────────────────────────────────────── - - def test_source_never_forwarded(self): - """'source' is in _SKIP_ARGS and must never appear in argv.""" - args = self._make_args(source="https://example.com") - argv = self._collect_argv(args) - assert "--source" not in argv - - def test_func_never_forwarded(self): - """'func' is in _SKIP_ARGS and must never appear in argv.""" - args = self._make_args(func=lambda: None) - argv = self._collect_argv(args) - assert "--func" not in argv - - def test_config_never_forwarded_by_build_argv(self): - """'config' is in _SKIP_ARGS; forwarded manually by specific routes.""" - args = self._make_args(config="/path/to/config.json") - argv = self._collect_argv(args) - assert "--config" not in argv - - def test_subcommand_never_forwarded(self): - """'subcommand' is in _SKIP_ARGS.""" - args = self._make_args(subcommand="create") - argv = self._collect_argv(args) - assert "--subcommand" not in argv - - def test_command_never_forwarded(self): - """'command' is in _SKIP_ARGS.""" - args = self._make_args(command="create") - argv = self._collect_argv(args) - assert "--command" not in argv - - # ── _DEST_TO_FLAG mapping ─────────────────────────────────────────────── - - def test_async_mode_maps_to_async_flag(self): - """async_mode dest should produce --async flag, not --async-mode.""" - args = self._make_args(async_mode=True) - argv = self._collect_argv(args) - assert "--async" in argv - assert "--async-mode" not in argv - - def test_skip_config_maps_to_skip_config_patterns(self): - """skip_config dest should produce --skip-config-patterns flag.""" - args = self._make_args(skip_config=True) - argv = self._collect_argv(args) - assert "--skip-config-patterns" in argv - assert "--skip-config" not in argv - - # ── Boolean arg forwarding ────────────────────────────────────────────── - - def 
test_boolean_true_appends_flag(self): - args = self._make_args(dry_run=True) - argv = self._collect_argv(args) - assert "--dry-run" in argv - - def test_boolean_false_does_not_append_flag(self): - args = self._make_args(dry_run=False) - argv = self._collect_argv(args) - assert "--dry-run" not in argv - - def test_verbose_true_forwarded(self): - args = self._make_args(verbose=True) - argv = self._collect_argv(args) - assert "--verbose" in argv - - def test_quiet_true_forwarded(self): - args = self._make_args(quiet=True) - argv = self._collect_argv(args) - assert "--quiet" in argv - - # ── List arg forwarding ───────────────────────────────────────────────── - - def test_list_arg_each_item_gets_separate_flag(self): - """Each list item gets its own --flag value pair.""" - args = self._make_args(enhance_workflow=["a", "b", "c"]) - argv = self._collect_argv(args) - assert argv.count("--enhance-workflow") == 3 - for item in ["a", "b", "c"]: - idx = argv.index(item) - assert argv[idx - 1] == "--enhance-workflow" - - # ── _is_explicitly_set ────────────────────────────────────────────────── - - def test_is_explicitly_set_none_is_not_set(self): - """None values should NOT be considered explicitly set.""" - from skill_seekers.cli.create_command import CreateCommand - - args = self._make_args() - cmd = CreateCommand(args) - assert cmd._is_explicitly_set("name", None) is False - - def test_is_explicitly_set_bool_true_is_set(self): - from skill_seekers.cli.create_command import CreateCommand - - args = self._make_args() - cmd = CreateCommand(args) - assert cmd._is_explicitly_set("dry_run", True) is True - - def test_is_explicitly_set_bool_false_is_not_set(self): - from skill_seekers.cli.create_command import CreateCommand - - args = self._make_args() - cmd = CreateCommand(args) - assert cmd._is_explicitly_set("dry_run", False) is False - - def test_is_explicitly_set_default_doc_version_empty_not_set(self): - """doc_version defaults to '' which means not explicitly set.""" - 
from skill_seekers.cli.create_command import CreateCommand - - args = self._make_args() - cmd = CreateCommand(args) - assert cmd._is_explicitly_set("doc_version", "") is False - - def test_is_explicitly_set_nonempty_string_is_set(self): - from skill_seekers.cli.create_command import CreateCommand - - args = self._make_args() - cmd = CreateCommand(args) - assert cmd._is_explicitly_set("name", "my-skill") is True - - def test_is_explicitly_set_non_default_value_is_set(self): - """A value that differs from the known default IS explicitly set.""" - from skill_seekers.cli.create_command import CreateCommand - - args = self._make_args() - cmd = CreateCommand(args) - # max_issues default is 100; setting to 50 means explicitly set - assert cmd._is_explicitly_set("max_issues", 50) is True - # Setting to default value means NOT explicitly set - assert cmd._is_explicitly_set("max_issues", 100) is False - - # ── Allowlist filtering ───────────────────────────────────────────────── - - def test_allowlist_only_forwards_allowed_args(self): - """When allowlist is provided, only those args are forwarded.""" - from skill_seekers.cli.create_command import CreateCommand - from skill_seekers.cli.source_detector import SourceDetector - - args = self._make_args( + args = argparse.Namespace( + source="https://example.com", + name="test_skill", + enhance_level=3, dry_run=True, - verbose=True, - name="test-skill", ) - cmd = CreateCommand(args) - cmd.source_info = SourceDetector.detect(args.source) - # Only allow dry_run in the allowlist - allowlist = frozenset({"dry_run"}) - argv = cmd._build_argv("test_module", [], allowlist=allowlist) + ctx = ExecutionContext.initialize(args=args) + assert ctx.output.name == "test_skill" + assert ctx.enhancement.level == 3 + assert ctx.output.dry_run is True - assert "--dry-run" in argv - assert "--verbose" not in argv - assert "--name" not in argv + # Getting context again returns same values + ctx2 = ExecutionContext.get() + assert ctx2.output.name == 
"test_skill" - def test_allowlist_skips_non_allowed_even_if_set(self): - """Args not in the allowlist are excluded even if explicitly set.""" - from skill_seekers.cli.create_command import CreateCommand - from skill_seekers.cli.source_detector import SourceDetector - - args = self._make_args( - enhance_workflow=["security-focus"], - quiet=True, - ) - cmd = CreateCommand(args) - cmd.source_info = SourceDetector.detect(args.source) - - allowlist = frozenset({"quiet"}) - argv = cmd._build_argv("test_module", [], allowlist=allowlist) - - assert "--quiet" in argv - assert "--enhance-workflow" not in argv - - def test_allowlist_empty_forwards_nothing(self): - """Empty allowlist should forward no user args (auto-name may still be added).""" - from skill_seekers.cli.create_command import CreateCommand - from skill_seekers.cli.source_detector import SourceDetector - - args = self._make_args(dry_run=True, verbose=True) - cmd = CreateCommand(args) - cmd.source_info = SourceDetector.detect(args.source) - - allowlist = frozenset() - argv = cmd._build_argv("test_module", ["pos"], allowlist=allowlist) - - # User-set args (dry_run, verbose) should NOT be forwarded - assert "--dry-run" not in argv - assert "--verbose" not in argv - # Only module name, positional, and possibly auto-added --name - assert argv[0] == "test_module" - assert "pos" in argv + ExecutionContext.reset() -class TestBackwardCompatibility: - """Test that old commands still work.""" +class TestUnifiedCommands: + """Test that unified commands still work.""" - def test_scrape_command_still_works(self): - """Old scrape command should still function.""" - import subprocess - - result = subprocess.run( - ["skill-seekers", "scrape", "--help"], capture_output=True, text=True, timeout=10 - ) - assert result.returncode == 0 - assert "scrape" in result.stdout.lower() - - def test_github_command_still_works(self): - """Old github command should still function.""" - import subprocess - - result = subprocess.run( - 
["skill-seekers", "github", "--help"], capture_output=True, text=True, timeout=10 - ) - assert result.returncode == 0 - assert "github" in result.stdout.lower() - - def test_analyze_command_still_works(self): - """Old analyze command should still function.""" - import subprocess - - result = subprocess.run( - ["skill-seekers", "analyze", "--help"], capture_output=True, text=True, timeout=10 - ) - assert result.returncode == 0 - assert "analyze" in result.stdout.lower() - - def test_main_help_shows_all_commands(self): - """Main help should show both old and new commands.""" + def test_main_help_shows_available_commands(self): + """Main help should show available commands.""" import subprocess result = subprocess.run( @@ -498,14 +199,11 @@ class TestBackwardCompatibility: assert result.returncode == 0 # Should show create command assert "create" in result.stdout - - # Should still show old commands - assert "scrape" in result.stdout - assert "github" in result.stdout - assert "analyze" in result.stdout + # Should show enhance command + assert "enhance" in result.stdout def test_workflows_command_still_works(self): - """The new workflows subcommand is accessible via the main CLI.""" + """The workflows subcommand is accessible via the main CLI.""" import subprocess result = subprocess.run( @@ -515,4 +213,29 @@ class TestBackwardCompatibility: timeout=10, ) assert result.returncode == 0 - assert "workflow" in result.stdout.lower() + + +class TestRemovedCommands: + """Test that old individual scraper commands are properly removed.""" + + def test_scrape_command_removed(self): + """Old scrape command should not exist.""" + import subprocess + + result = subprocess.run( + ["skill-seekers", "scrape", "--help"], capture_output=True, text=True, timeout=10 + ) + # Should fail - command removed + assert result.returncode == 2 + assert "invalid choice" in result.stderr + + def test_github_command_removed(self): + """Old github command should not exist.""" + import subprocess + + 
result = subprocess.run( + ["skill-seekers", "github", "--help"], capture_output=True, text=True, timeout=10 + ) + # Should fail - command removed + assert result.returncode == 2 + assert "invalid choice" in result.stderr diff --git a/tests/test_execution_context.py b/tests/test_execution_context.py new file mode 100644 index 0000000..4676441 --- /dev/null +++ b/tests/test_execution_context.py @@ -0,0 +1,511 @@ +"""Tests for ExecutionContext singleton. + +This module tests the ExecutionContext class which provides a single source +of truth for all configuration in Skill Seekers. +""" + +import argparse +import json +import os +import tempfile + +import pytest + +from skill_seekers.cli.execution_context import ( + ExecutionContext, + get_context, +) + + +class TestExecutionContextBasics: + """Basic functionality tests.""" + + def setup_method(self): + """Reset singleton before each test.""" + ExecutionContext.reset() + + def teardown_method(self): + """Clean up after each test.""" + ExecutionContext.reset() + + def test_get_returns_defaults_when_not_initialized(self): + """Should return default context when not explicitly initialized.""" + ctx = ExecutionContext.get() + assert ctx is not None + assert ctx.enhancement.level == 2 # default + assert ctx.output.name is None # default + + def test_get_context_shortcut(self): + """get_context() should be equivalent to ExecutionContext.get().""" + args = argparse.Namespace(name="test-skill") + ExecutionContext.initialize(args=args) + + ctx = get_context() + assert ctx.output.name == "test-skill" + + def test_initialize_returns_instance(self): + """initialize() should return the context instance.""" + args = argparse.Namespace(name="test") + ctx = ExecutionContext.initialize(args=args) + + assert isinstance(ctx, ExecutionContext) + assert ctx.output.name == "test" + + def test_singleton_behavior(self): + """Multiple calls should return same instance.""" + args = argparse.Namespace(name="first") + ctx1 = 
ExecutionContext.initialize(args=args) + ctx2 = ExecutionContext.get() + + assert ctx1 is ctx2 + + def test_reset_clears_instance(self): + """reset() should clear the initialized instance, get() returns fresh defaults.""" + args = argparse.Namespace(name="test-skill") + ExecutionContext.initialize(args=args) + assert ExecutionContext.get().output.name == "test-skill" + + ExecutionContext.reset() + + # After reset, get() returns default context (not the old one) + ctx = ExecutionContext.get() + assert ctx.output.name is None # default, not "test-skill" + + +class TestExecutionContextFromArgs: + """Tests for building context from CLI args.""" + + def setup_method(self): + ExecutionContext.reset() + + def teardown_method(self): + ExecutionContext.reset() + + def test_basic_args(self): + """Should extract basic args correctly.""" + args = argparse.Namespace( + name="react-docs", + output="custom/output", + doc_version="18.2", + dry_run=True, + enhance_level=3, + agent="kimi", + ) + + ctx = ExecutionContext.initialize(args=args) + + assert ctx.output.name == "react-docs" + assert ctx.output.output_dir == "custom/output" + assert ctx.output.doc_version == "18.2" + assert ctx.output.dry_run is True + assert ctx.enhancement.level == 3 + assert ctx.enhancement.agent == "kimi" + + def test_scraping_args(self): + """Should extract scraping args correctly.""" + args = argparse.Namespace( + name="test", + max_pages=100, + rate_limit=1.5, + browser=True, + workers=4, + async_mode=True, + resume=True, + fresh=False, + skip_scrape=True, + ) + + ctx = ExecutionContext.initialize(args=args) + + assert ctx.scraping.max_pages == 100 + assert ctx.scraping.rate_limit == 1.5 + assert ctx.scraping.browser is True + assert ctx.scraping.workers == 4 + assert ctx.scraping.async_mode is True + assert ctx.scraping.resume is True + assert ctx.scraping.skip_scrape is True + + def test_analysis_args(self): + """Should extract analysis args correctly.""" + args = argparse.Namespace( + name="test", 
+ depth="full", + skip_patterns=True, + skip_test_examples=True, + skip_how_to_guides=True, + file_patterns="*.py,*.js", + ) + + ctx = ExecutionContext.initialize(args=args) + + assert ctx.analysis.depth == "full" + assert ctx.analysis.skip_patterns is True + assert ctx.analysis.skip_test_examples is True + assert ctx.analysis.skip_how_to_guides is True + assert ctx.analysis.file_patterns == ["*.py", "*.js"] + + def test_workflow_args(self): + """Should extract workflow args correctly.""" + args = argparse.Namespace( + name="test", + enhance_workflow=["security-focus", "api-docs"], + enhance_stage=["stage1:prompt1"], + var=["key1=value1", "key2=value2"], + ) + + ctx = ExecutionContext.initialize(args=args) + + assert ctx.enhancement.workflows == ["security-focus", "api-docs"] + assert ctx.enhancement.stages == ["stage1:prompt1"] + assert ctx.enhancement.workflow_vars == {"key1": "value1", "key2": "value2"} + + def test_rag_args(self): + """Should extract RAG args correctly.""" + args = argparse.Namespace( + name="test", + chunk_for_rag=True, + chunk_tokens=1024, + ) + + ctx = ExecutionContext.initialize(args=args) + + assert ctx.rag.chunk_for_rag is True + assert ctx.rag.chunk_tokens == 1024 + + def test_api_mode_detection(self): + """Should detect API mode from api_key.""" + args = argparse.Namespace( + name="test", + api_key="test-key", + ) + + ctx = ExecutionContext.initialize(args=args) + + assert ctx.enhancement.mode == "api" + + def test_local_mode_detection(self): + """Should default to local/auto mode without API key.""" + # Clean API key env vars to ensure test isolation + api_keys = ["ANTHROPIC_API_KEY", "OPENAI_API_KEY", "MOONSHOT_API_KEY", "GOOGLE_API_KEY"] + saved = {k: os.environ.pop(k, None) for k in api_keys} + try: + args = argparse.Namespace(name="test") + ctx = ExecutionContext.initialize(args=args) + assert ctx.enhancement.mode in ("local", "auto") + finally: + for k, v in saved.items(): + if v is not None: + os.environ[k] = v + + def 
test_raw_args_access(self): + """Should provide access to raw args for backward compatibility.""" + args = argparse.Namespace( + name="test", + custom_field="custom_value", + ) + + ctx = ExecutionContext.initialize(args=args) + + assert ctx.get_raw("name") == "test" + assert ctx.get_raw("custom_field") == "custom_value" + assert ctx.get_raw("nonexistent", "default") == "default" + + +class TestExecutionContextFromConfigFile: + """Tests for building context from config files.""" + + def setup_method(self): + ExecutionContext.reset() + + def teardown_method(self): + ExecutionContext.reset() + + def test_unified_config_format(self): + """Should load unified config with sources array.""" + config = { + "name": "unity-docs", + "version": "2022.3", + "enhancement": { + "enabled": True, + "level": 2, + "mode": "local", + "agent": "kimi", + "timeout": "unlimited", + }, + "workflows": ["unity-game-dev"], + "workflow_stages": ["custom:stage"], + "workflow_vars": {"var1": "value1"}, + "sources": [{"type": "documentation", "base_url": "https://docs.unity3d.com/"}], + } + + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: + json.dump(config, f) + config_path = f.name + + try: + ctx = ExecutionContext.initialize(config_path=config_path) + + assert ctx.output.name == "unity-docs" + assert ctx.output.doc_version == "2022.3" + assert ctx.enhancement.enabled is True + assert ctx.enhancement.level == 2 + assert ctx.enhancement.mode == "local" + assert ctx.enhancement.agent == "kimi" + assert ctx.enhancement.workflows == ["unity-game-dev"] + assert ctx.enhancement.stages == ["custom:stage"] + assert ctx.enhancement.workflow_vars == {"var1": "value1"} + finally: + os.unlink(config_path) + + def test_simple_web_config_format(self): + """Should load simple web config format.""" + config = { + "name": "react-docs", + "version": "18.2", + "base_url": "https://react.dev/", + "max_pages": 500, + "rate_limit": 0.5, + "browser": True, + } + + with 
tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: + json.dump(config, f) + config_path = f.name + + try: + ctx = ExecutionContext.initialize(config_path=config_path) + + assert ctx.output.name == "react-docs" + assert ctx.output.doc_version == "18.2" + assert ctx.scraping.max_pages == 500 + assert ctx.scraping.rate_limit == 0.5 + assert ctx.scraping.browser is True + finally: + os.unlink(config_path) + + def test_timeout_integer(self): + """Should handle integer timeout in config.""" + config = { + "name": "test", + "enhancement": {"timeout": 3600}, + "sources": [], + } + + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: + json.dump(config, f) + config_path = f.name + + try: + ctx = ExecutionContext.initialize(config_path=config_path) + assert ctx.enhancement.timeout == 3600 + finally: + os.unlink(config_path) + + +class TestExecutionContextPriority: + """Tests for configuration priority (CLI > Config > Env > Defaults).""" + + def setup_method(self): + ExecutionContext.reset() + self._original_env = {} + + def teardown_method(self): + ExecutionContext.reset() + # Restore env vars + for key, value in self._original_env.items(): + if value is not None: + os.environ[key] = value + else: + os.environ.pop(key, None) + + def test_cli_overrides_config(self): + """CLI args should override config file values.""" + config = {"name": "config-name", "sources": []} + + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: + json.dump(config, f) + config_path = f.name + + try: + args = argparse.Namespace(name="cli-name") + ctx = ExecutionContext.initialize(args=args, config_path=config_path) + + # CLI should win + assert ctx.output.name == "cli-name" + finally: + os.unlink(config_path) + + def test_config_overrides_defaults(self): + """Config file should override default values.""" + config = { + "name": "config-name", + "enhancement": {"level": 3}, + "sources": [], + } + + with 
tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: + json.dump(config, f) + config_path = f.name + + try: + ctx = ExecutionContext.initialize(config_path=config_path) + + # Config should override default (level=2) + assert ctx.enhancement.level == 3 + finally: + os.unlink(config_path) + + def test_env_overrides_defaults(self): + """Environment variables should override defaults.""" + self._original_env["SKILL_SEEKER_AGENT"] = os.environ.get("SKILL_SEEKER_AGENT") + os.environ["SKILL_SEEKER_AGENT"] = "claude" + + ctx = ExecutionContext.initialize() + + # Env var should override default (None) + assert ctx.enhancement.agent == "claude" + + +class TestExecutionContextSourceInfo: + """Tests for source info integration.""" + + def setup_method(self): + ExecutionContext.reset() + + def teardown_method(self): + ExecutionContext.reset() + + def test_source_info_integration(self): + """Should integrate source info from source_detector.""" + + class MockSourceInfo: + type = "web" + raw_source = "https://react.dev/" + parsed = {"url": "https://react.dev/"} + suggested_name = "react" + + ctx = ExecutionContext.initialize(source_info=MockSourceInfo()) + + assert ctx.source is not None + assert ctx.source.type == "web" + assert ctx.source.raw_source == "https://react.dev/" + assert ctx.source.suggested_name == "react" + + +class TestExecutionContextOverride: + """Tests for the override context manager.""" + + def setup_method(self): + ExecutionContext.reset() + + def teardown_method(self): + ExecutionContext.reset() + + def test_override_temporarily_changes_values(self): + """override() should temporarily change values.""" + args = argparse.Namespace(name="original", enhance_level=2) + ctx = ExecutionContext.initialize(args=args) + + assert ctx.enhancement.level == 2 + + with ctx.override(enhancement__level=3): + ctx_from_get = ExecutionContext.get() + assert ctx_from_get.enhancement.level == 3 + + # After exit, original value restored + assert 
ExecutionContext.get().enhancement.level == 2 + + def test_override_restores_on_exception(self): + """override() should restore values even on exception.""" + args = argparse.Namespace(name="original", enhance_level=2) + ctx = ExecutionContext.initialize(args=args) + + try: + with ctx.override(enhancement__level=3): + assert ExecutionContext.get().enhancement.level == 3 + raise ValueError("Test error") + except ValueError: + pass + + # Should still be restored + assert ExecutionContext.get().enhancement.level == 2 + + +class TestExecutionContextValidation: + """Tests for Pydantic validation.""" + + def setup_method(self): + ExecutionContext.reset() + + def teardown_method(self): + ExecutionContext.reset() + + def test_enhancement_level_bounds(self): + """Enhancement level should be 0-3.""" + args = argparse.Namespace(name="test", enhance_level=5) + + with pytest.raises(ValueError) as exc_info: + ExecutionContext.initialize(args=args) + + assert "level" in str(exc_info.value) + + def test_analysis_depth_choices(self): + """Analysis depth should reject invalid values.""" + import pydantic + + args = argparse.Namespace(name="test", depth="invalid") + with pytest.raises(pydantic.ValidationError): + ExecutionContext.initialize(args=args) + + def test_analysis_depth_valid_choices(self): + """Analysis depth should accept surface, deep, full.""" + for depth in ("surface", "deep", "full"): + ExecutionContext.reset() + args = argparse.Namespace(name="test", depth=depth) + ctx = ExecutionContext.initialize(args=args) + assert ctx.analysis.depth == depth + + +class TestExecutionContextDefaults: + """Tests for default values.""" + + def setup_method(self): + ExecutionContext.reset() + + def teardown_method(self): + ExecutionContext.reset() + + def test_default_values(self): + """Should have sensible defaults.""" + # Clear API key env vars so mode defaults to "auto" regardless of environment + api_keys = ("ANTHROPIC_API_KEY", "OPENAI_API_KEY", "MOONSHOT_API_KEY", 
"GOOGLE_API_KEY") + saved = {k: os.environ.pop(k, None) for k in api_keys} + try: + ctx = ExecutionContext.initialize() + + # Enhancement defaults + assert ctx.enhancement.enabled is True + assert ctx.enhancement.level == 2 + assert ctx.enhancement.mode == "auto" # Default is auto, resolved at runtime + assert ctx.enhancement.timeout == 2700 # 45 minutes + finally: + for k, v in saved.items(): + if v is not None: + os.environ[k] = v + + # Output defaults + assert ctx.output.name is None + assert ctx.output.dry_run is False + + # Scraping defaults + assert ctx.scraping.browser is False + assert ctx.scraping.workers == 1 + assert ctx.scraping.languages == ["en"] + + # Analysis defaults + assert ctx.analysis.depth == "surface" + assert ctx.analysis.skip_patterns is False + + # RAG defaults + assert ctx.rag.chunk_for_rag is False + assert ctx.rag.chunk_tokens == 512 diff --git a/tests/test_framework_detection.py b/tests/test_framework_detection.py index 49c41e3..0042299 100644 --- a/tests/test_framework_detection.py +++ b/tests/test_framework_detection.py @@ -46,31 +46,20 @@ class TestFrameworkDetection(unittest.TestCase): " return render_template('index.html')\n" ) - # Run codebase analyzer - from skill_seekers.cli.codebase_scraper import main as scraper_main - import sys + # Run codebase analyzer directly + from skill_seekers.cli.codebase_scraper import analyze_codebase - old_argv = sys.argv - try: - sys.argv = [ - "skill-seekers-codebase", - "--directory", - str(self.test_project), - "--output", - str(self.output_dir), - "--depth", - "deep", - "--ai-mode", - "none", - "--skip-patterns", - "--skip-test-examples", - "--skip-how-to-guides", - "--skip-config-patterns", - "--skip-docs", - ] - scraper_main() - finally: - sys.argv = old_argv + analyze_codebase( + directory=self.test_project, + output_dir=self.output_dir, + depth="deep", + enhance_level=0, + detect_patterns=False, + extract_test_examples=False, + build_how_to_guides=False, + extract_config_patterns=False, + 
extract_docs=False, + ) # Verify Flask was detected arch_file = self.output_dir / "references" / "architecture" / "architectural_patterns.json" @@ -91,26 +80,15 @@ class TestFrameworkDetection(unittest.TestCase): "import django\nfrom flask import Flask\nimport requests" ) - # Run codebase analyzer - from skill_seekers.cli.codebase_scraper import main as scraper_main - import sys + # Run codebase analyzer directly + from skill_seekers.cli.codebase_scraper import analyze_codebase - old_argv = sys.argv - try: - sys.argv = [ - "skill-seekers-codebase", - "--directory", - str(self.test_project), - "--output", - str(self.output_dir), - "--depth", - "deep", - "--ai-mode", - "none", - ] - scraper_main() - finally: - sys.argv = old_argv + analyze_codebase( + directory=self.test_project, + output_dir=self.output_dir, + depth="deep", + enhance_level=0, + ) # Verify file was analyzed code_analysis = self.output_dir / "code_analysis.json" @@ -143,26 +121,15 @@ class TestFrameworkDetection(unittest.TestCase): # File with no framework imports (app_dir / "utils.py").write_text("def my_function():\n return 'hello'\n") - # Run codebase analyzer - from skill_seekers.cli.codebase_scraper import main as scraper_main - import sys + # Run codebase analyzer directly + from skill_seekers.cli.codebase_scraper import analyze_codebase - old_argv = sys.argv - try: - sys.argv = [ - "skill-seekers-codebase", - "--directory", - str(self.test_project), - "--output", - str(self.output_dir), - "--depth", - "deep", - "--ai-mode", - "none", - ] - scraper_main() - finally: - sys.argv = old_argv + analyze_codebase( + directory=self.test_project, + output_dir=self.output_dir, + depth="deep", + enhance_level=0, + ) # Check frameworks detected arch_file = self.output_dir / "references" / "architecture" / "architectural_patterns.json" diff --git a/tests/test_git_sources_e2e.py b/tests/test_git_sources_e2e.py index 773e48f..444bd25 100644 --- a/tests/test_git_sources_e2e.py +++ 
b/tests/test_git_sources_e2e.py @@ -52,8 +52,8 @@ class TestGitSourcesE2E: """Create a temporary git repository with sample configs.""" repo_dir = tempfile.mkdtemp(prefix="ss_repo_") - # Initialize git repository - repo = git.Repo.init(repo_dir) + # Initialize git repository with 'master' branch for test consistency + repo = git.Repo.init(repo_dir, initial_branch="master") # Create sample config files configs = { @@ -685,8 +685,8 @@ class TestMCPToolsE2E: """Create a temporary git repository with sample configs.""" repo_dir = tempfile.mkdtemp(prefix="ss_mcp_repo_") - # Initialize git repository - repo = git.Repo.init(repo_dir) + # Initialize git repository with 'master' branch for test consistency + repo = git.Repo.init(repo_dir, initial_branch="master") # Create sample config config = { diff --git a/tests/test_issue_219_e2e.py b/tests/test_issue_219_e2e.py index 856455c..918ba56 100644 --- a/tests/test_issue_219_e2e.py +++ b/tests/test_issue_219_e2e.py @@ -8,7 +8,6 @@ Tests verify complete fixes for: 3. 
Custom API endpoint support (ANTHROPIC_BASE_URL, ANTHROPIC_AUTH_TOKEN) """ -import contextlib import os import shutil import subprocess @@ -117,82 +116,48 @@ class TestIssue219Problem1LargeFiles(unittest.TestCase): class TestIssue219Problem2CLIFlags(unittest.TestCase): - """E2E Test: Problem #2 - CLI flags working through main.py dispatcher""" + """E2E Test: Problem #2 - CLI flags working through create command""" - def test_github_command_has_enhancement_flags(self): - """E2E: Verify --enhance-level flag exists in github command help""" + def test_create_command_has_enhancement_flags(self): + """E2E: Verify --enhance-level flag exists in create command help""" result = subprocess.run( - ["skill-seekers", "github", "--help"], capture_output=True, text=True + ["skill-seekers", "create", "--help"], capture_output=True, text=True ) # VERIFY: Command succeeds - self.assertEqual(result.returncode, 0, "github --help should succeed") + self.assertEqual(result.returncode, 0, "create --help should succeed") # VERIFY: Enhancement flags present self.assertIn("--enhance-level", result.stdout, "Missing --enhance-level flag") - self.assertIn("--api-key", result.stdout, "Missing --api-key flag") - def test_github_command_accepts_enhance_level_flag(self): - """E2E: Verify --enhance-level flag doesn't cause 'unrecognized arguments' error""" - # Strategy: Parse arguments directly without executing to avoid network hangs on CI - # This tests that the CLI accepts the flag without actually running the command - import argparse + def test_enhance_level_flag_accepted_by_create(self): + """E2E: Verify --enhance-level flag is accepted by create command parser""" + from skill_seekers.cli.main import create_parser - # Get the argument parser from github_scraper - parser = argparse.ArgumentParser() - # Add the same arguments as github_scraper.main() - parser.add_argument("--repo", required=True) - parser.add_argument("--enhance-level", type=int, choices=[0, 1, 2, 3], default=2) - 
parser.add_argument("--api-key") + parser = create_parser() # VERIFY: Parsing succeeds without "unrecognized arguments" error try: - args = parser.parse_args(["--repo", "test/test", "--enhance-level", "2"]) - # If we get here, argument parsing succeeded + args = parser.parse_args(["create", "owner/repo", "--enhance-level", "2"]) self.assertEqual(args.enhance_level, 2, "Flag should be parsed as 2") - self.assertEqual(args.repo, "test/test") except SystemExit as e: - # Argument parsing failed self.fail(f"Argument parsing failed with: {e}") - def test_cli_dispatcher_forwards_flags_to_github_scraper(self): - """E2E: Verify main.py dispatcher forwards flags to github_scraper.py""" - from skill_seekers.cli import main + def test_github_scraper_class_accepts_enhance_level(self): + """E2E: Verify GitHubScraper config accepts enhance_level.""" + from skill_seekers.cli.github_scraper import GitHubScraper - # Mock sys.argv to simulate CLI call - test_args = [ - "skill-seekers", - "github", - "--repo", - "test/test", - "--name", - "test", - "--enhance-level", - "2", - ] + config = { + "repo": "test/test", + "name": "test", + "github_token": None, + "enhance_level": 2, + } - with ( - patch("sys.argv", test_args), - patch("skill_seekers.cli.github_scraper.main") as mock_github_main, - ): - mock_github_main.return_value = 0 - - # Call main dispatcher - with patch("sys.exit"), contextlib.suppress(SystemExit): - main.main() - - # VERIFY: github_scraper.main was called - mock_github_main.assert_called_once() - - # VERIFY: sys.argv contains --enhance-level flag - # (main.py should have added it before calling github_scraper) - called_with_enhance = any( - "--enhance-level" in str(call) for call in mock_github_main.call_args_list - ) - self.assertTrue( - called_with_enhance or "--enhance-level" in sys.argv, - "Flag should be forwarded to github_scraper", - ) + with patch("skill_seekers.cli.github_scraper.Github"): + scraper = GitHubScraper(config) + # Just verify it doesn't crash with 
enhance_level in config + self.assertIsNotNone(scraper) @unittest.skipIf(not ANTHROPIC_AVAILABLE, "anthropic package not installed") @@ -338,17 +303,16 @@ class TestIssue219IntegrationAll(unittest.TestCase): def test_all_fixes_work_together(self): """E2E: Verify all 3 fixes work in combination""" # This test verifies the complete workflow: - # 1. CLI accepts --enhance-level + # 1. CLI accepts --enhance-level via create command # 2. Large files are downloaded # 3. Custom API endpoints work result = subprocess.run( - ["skill-seekers", "github", "--help"], capture_output=True, text=True + ["skill-seekers", "create", "--help"], capture_output=True, text=True ) # Enhancement flags present self.assertIn("--enhance-level", result.stdout) - self.assertIn("--api-key", result.stdout) # Verify we can import all fixed modules try: diff --git a/tests/test_mcp_server.py b/tests/test_mcp_server.py index 90db63f..16e66dc 100644 --- a/tests/test_mcp_server.py +++ b/tests/test_mcp_server.py @@ -280,58 +280,69 @@ class TestScrapeDocsTool(unittest.IsolatedAsyncioTestCase): os.chdir(self.original_cwd) shutil.rmtree(self.temp_dir, ignore_errors=True) - @patch("skill_seekers.mcp.tools.scraping_tools.run_subprocess_with_streaming") - async def test_scrape_docs_basic(self, mock_streaming): - """Test basic documentation scraping""" - # Mock successful subprocess run with streaming - mock_streaming.return_value = ("Scraping completed successfully", "", 0) + @patch("skill_seekers.mcp.tools.scraping_tools._run_converter") + @patch("skill_seekers.cli.skill_converter.get_converter") + async def test_scrape_docs_basic(self, mock_get_converter, mock_run_converter): + """Test basic documentation scraping via in-process converter""" + from skill_seekers.mcp.tools.scraping_tools import TextContent + + mock_run_converter.return_value = [ + TextContent(type="text", text="Scraping completed successfully") + ] args = {"config_path": str(self.config_path)} - result = await 
skill_seeker_server.scrape_docs_tool(args) self.assertIsInstance(result, list) self.assertIn("success", result[0].text.lower()) + mock_get_converter.assert_called_once() + mock_run_converter.assert_called_once() - @patch("skill_seekers.mcp.tools.scraping_tools.run_subprocess_with_streaming") - async def test_scrape_docs_with_skip_scrape(self, mock_streaming): + @patch("skill_seekers.mcp.tools.scraping_tools._run_converter") + @patch("skill_seekers.cli.skill_converter.get_converter") + async def test_scrape_docs_with_skip_scrape(self, mock_get_converter, mock_run_converter): """Test scraping with skip_scrape flag""" - # Mock successful subprocess run with streaming - mock_streaming.return_value = ("Using cached data", "", 0) + from skill_seekers.mcp.tools.scraping_tools import TextContent + + mock_run_converter.return_value = [TextContent(type="text", text="Using cached data")] args = {"config_path": str(self.config_path), "skip_scrape": True} + result = await skill_seeker_server.scrape_docs_tool(args) - _result = await skill_seeker_server.scrape_docs_tool(args) + self.assertIsInstance(result, list) + mock_get_converter.assert_called_once() - # Verify --skip-scrape was passed - call_args = mock_streaming.call_args[0][0] - self.assertIn("--skip-scrape", call_args) + @patch("skill_seekers.mcp.tools.scraping_tools._run_converter") + @patch("skill_seekers.cli.skill_converter.get_converter") + async def test_scrape_docs_with_dry_run(self, mock_get_converter, mock_run_converter): + """Test scraping with dry_run flag sets converter.dry_run""" + from skill_seekers.mcp.tools.scraping_tools import TextContent - @patch("skill_seekers.mcp.tools.scraping_tools.run_subprocess_with_streaming") - async def test_scrape_docs_with_dry_run(self, mock_streaming): - """Test scraping with dry_run flag""" - # Mock successful subprocess run with streaming - mock_streaming.return_value = ("Dry run completed", "", 0) + mock_converter = mock_get_converter.return_value + 
mock_run_converter.return_value = [TextContent(type="text", text="Dry run completed")] args = {"config_path": str(self.config_path), "dry_run": True} + result = await skill_seeker_server.scrape_docs_tool(args) - _result = await skill_seeker_server.scrape_docs_tool(args) + self.assertIsInstance(result, list) + # Verify dry_run was set on the converter instance + self.assertTrue(mock_converter.dry_run) - call_args = mock_streaming.call_args[0][0] - self.assertIn("--dry-run", call_args) + @patch("skill_seekers.mcp.tools.scraping_tools._run_converter") + @patch("skill_seekers.cli.skill_converter.get_converter") + async def test_scrape_docs_with_enhance_local(self, mock_get_converter, mock_run_converter): + """Test scraping with local enhancement flag""" + from skill_seekers.mcp.tools.scraping_tools import TextContent - @patch("skill_seekers.mcp.tools.scraping_tools.run_subprocess_with_streaming") - async def test_scrape_docs_with_enhance_local(self, mock_streaming): - """Test scraping with local enhancement""" - # Mock successful subprocess run with streaming - mock_streaming.return_value = ("Scraping with enhancement", "", 0) + mock_run_converter.return_value = [ + TextContent(type="text", text="Scraping with enhancement") + ] args = {"config_path": str(self.config_path), "enhance_local": True} + result = await skill_seeker_server.scrape_docs_tool(args) - _result = await skill_seeker_server.scrape_docs_tool(args) - - call_args = mock_streaming.call_args[0][0] - self.assertIn("--enhance-local", call_args) + self.assertIsInstance(result, list) + mock_get_converter.assert_called_once() @unittest.skipUnless(MCP_AVAILABLE, "MCP package not installed") diff --git a/tests/test_new_source_types.py b/tests/test_new_source_types.py index 3c106eb..a2fb019 100644 --- a/tests/test_new_source_types.py +++ b/tests/test_new_source_types.py @@ -13,8 +13,6 @@ import textwrap import pytest from skill_seekers.cli.config_validator import ConfigValidator -from skill_seekers.cli.main import 
COMMAND_MODULES -from skill_seekers.cli.parsers import PARSERS, get_parser_names from skill_seekers.cli.source_detector import SourceDetector, SourceInfo from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder @@ -554,58 +552,11 @@ class TestUnifiedSkillBuilderGenericMerge: # --------------------------------------------------------------------------- -# 4. COMMAND_MODULES and parser wiring +# 4. New source types accessible via 'create' command # --------------------------------------------------------------------------- - - -class TestCommandModules: - """Test that all 10 new source types are wired into CLI.""" - - NEW_COMMAND_NAMES = [ - "jupyter", - "html", - "openapi", - "asciidoc", - "pptx", - "rss", - "manpage", - "confluence", - "notion", - "chat", - ] - - def test_new_types_in_command_modules(self): - """Test all 10 new source types are in COMMAND_MODULES.""" - for cmd in self.NEW_COMMAND_NAMES: - assert cmd in COMMAND_MODULES, f"'{cmd}' not in COMMAND_MODULES" - - def test_command_modules_values_are_module_paths(self): - """Test COMMAND_MODULES values look like importable module paths.""" - for cmd in self.NEW_COMMAND_NAMES: - module_path = COMMAND_MODULES[cmd] - assert module_path.startswith("skill_seekers.cli."), ( - f"Module path for '{cmd}' doesn't start with 'skill_seekers.cli.'" - ) - - def test_new_parser_names_include_all_10(self): - """Test that get_parser_names() includes all 10 new source types.""" - names = get_parser_names() - for cmd in self.NEW_COMMAND_NAMES: - assert cmd in names, f"Parser '{cmd}' not registered" - - def test_total_parser_count(self): - """Test total PARSERS count is 36 (25 original + 10 new + 1 doctor).""" - assert len(PARSERS) == 36 - - def test_no_duplicate_parser_names(self): - """Test no duplicate parser names exist.""" - names = get_parser_names() - assert len(names) == len(set(names)), "Duplicate parser names found!" 
- - def test_command_module_count(self): - """Test COMMAND_MODULES has expected number of entries.""" - # 25 original + 10 new + 1 doctor = 36 - assert len(COMMAND_MODULES) == 36 +# Individual scraper CLI commands (jupyter, html, etc.) were removed in the +# Grand Unification refactor. All 18 source types are now accessed via +# `skill-seekers create`. The routing is tested in TestCreateCommandRouting. # --------------------------------------------------------------------------- @@ -769,29 +720,37 @@ class TestSourceDetectorValidation: class TestCreateCommandRouting: - """Test that CreateCommand._route_to_scraper maps new types to _route_generic.""" + """Test that CreateCommand uses get_converter for all source types.""" - # We can't easily call _route_to_scraper (it imports real scrapers), - # but we verify the routing table is correct by checking the method source. + NEW_SOURCE_TYPES = [ + "jupyter", + "html", + "openapi", + "asciidoc", + "pptx", + "rss", + "manpage", + "confluence", + "notion", + "chat", + ] - GENERIC_ROUTES = { - "jupyter": ("jupyter_scraper", "--notebook"), - "html": ("html_scraper", "--html-path"), - "openapi": ("openapi_scraper", "--spec"), - "asciidoc": ("asciidoc_scraper", "--asciidoc-path"), - "pptx": ("pptx_scraper", "--pptx"), - "rss": ("rss_scraper", "--feed-path"), - "manpage": ("man_scraper", "--man-path"), - "confluence": ("confluence_scraper", "--export-path"), - "notion": ("notion_scraper", "--export-path"), - "chat": ("chat_scraper", "--export-path"), - } + def test_get_converter_handles_all_new_types(self): + """Test get_converter returns a converter for each new source type.""" + from skill_seekers.cli.skill_converter import get_converter - def test_route_to_scraper_source_coverage(self): - """Test _route_to_scraper method handles all 10 new types. 
+ for source_type in self.NEW_SOURCE_TYPES: + # get_converter should not raise for known types + # (it may raise ImportError for missing optional deps, which is OK) + try: + converter_cls = get_converter(source_type, {"name": "test"}) + assert converter_cls is not None, f"get_converter returned None for '{source_type}'" + except ImportError: + # Optional dependency not installed - that's fine + pass - We inspect the method source to verify each type has a branch. - """ + def test_route_to_scraper_uses_get_converter(self): + """Test _route_to_scraper delegates to get_converter (not per-type branches).""" import inspect source = inspect.getsource( @@ -800,24 +759,9 @@ class TestCreateCommandRouting: fromlist=["CreateCommand"], ).CreateCommand._route_to_scraper ) - for source_type in self.GENERIC_ROUTES: - assert f'"{source_type}"' in source, ( - f"_route_to_scraper missing branch for '{source_type}'" - ) - - def test_generic_route_module_names(self): - """Test _route_generic is called with correct module names.""" - import inspect - - source = inspect.getsource( - __import__( - "skill_seekers.cli.create_command", - fromlist=["CreateCommand"], - ).CreateCommand._route_to_scraper + assert "get_converter" in source, ( + "_route_to_scraper should use get_converter for unified routing" ) - for source_type, (module, flag) in self.GENERIC_ROUTES.items(): - assert f'"{module}"' in source, f"Module name '{module}' not found for '{source_type}'" - assert f'"{flag}"' in source, f"Flag '{flag}' not found for '{source_type}'" if __name__ == "__main__": diff --git a/tests/test_parser_sync.py b/tests/test_parser_sync.py deleted file mode 100644 index c58380b..0000000 --- a/tests/test_parser_sync.py +++ /dev/null @@ -1,188 +0,0 @@ -"""Test that unified CLI parsers stay in sync with scraper modules. - -This test ensures that the unified CLI (skill-seekers ) has exactly -the same arguments as the standalone scraper modules. 
This prevents the - parsers from drifting out of sync (Issue #285). -""" - -import argparse - - -class TestScrapeParserSync: - """Ensure scrape_parser has all arguments from doc_scraper.""" - - def test_scrape_argument_count_matches(self): - """Verify unified CLI parser has same argument count as doc_scraper.""" - from skill_seekers.cli.doc_scraper import setup_argument_parser - from skill_seekers.cli.parsers.scrape_parser import ScrapeParser - - # Get source arguments from doc_scraper - source_parser = setup_argument_parser() - source_count = len([a for a in source_parser._actions if a.dest != "help"]) - - # Get target arguments from unified CLI parser - target_parser = argparse.ArgumentParser() - ScrapeParser().add_arguments(target_parser) - target_count = len([a for a in target_parser._actions if a.dest != "help"]) - - assert source_count == target_count, ( - f"Argument count mismatch: doc_scraper has {source_count}, " - f"but unified CLI parser has {target_count}" - ) - - def test_scrape_argument_dests_match(self): - """Verify unified CLI parser has same argument destinations as doc_scraper.""" - from skill_seekers.cli.doc_scraper import setup_argument_parser - from skill_seekers.cli.parsers.scrape_parser import ScrapeParser - - # Get source arguments from doc_scraper - source_parser = setup_argument_parser() - source_dests = {a.dest for a in source_parser._actions if a.dest != "help"} - - # Get target arguments from unified CLI parser - target_parser = argparse.ArgumentParser() - ScrapeParser().add_arguments(target_parser) - target_dests = {a.dest for a in target_parser._actions if a.dest != "help"} - - # Check for missing arguments - missing = source_dests - target_dests - extra = target_dests - source_dests - - assert not missing, f"scrape_parser missing arguments: {missing}" - assert not extra, f"scrape_parser has extra arguments not in doc_scraper: {extra}" - - def test_scrape_specific_arguments_present(self): - """Verify key scrape arguments are present 
in unified CLI.""" - from skill_seekers.cli.main import create_parser - - parser = create_parser() - - # Get the scrape subparser - subparsers_action = None - for action in parser._actions: - if isinstance(action, argparse._SubParsersAction): - subparsers_action = action - break - - assert subparsers_action is not None, "No subparsers found" - assert "scrape" in subparsers_action.choices, "scrape subparser not found" - - scrape_parser = subparsers_action.choices["scrape"] - arg_dests = {a.dest for a in scrape_parser._actions if a.dest != "help"} - - # Check key arguments that were missing in Issue #285 - required_args = [ - "interactive", - "url", - "verbose", - "quiet", - "resume", - "fresh", - "rate_limit", - "no_rate_limit", - "chunk_for_rag", - ] - - for arg in required_args: - assert arg in arg_dests, f"Required argument '{arg}' missing from scrape parser" - - -class TestGitHubParserSync: - """Ensure github_parser has all arguments from github_scraper.""" - - def test_github_argument_count_matches(self): - """Verify unified CLI parser has same argument count as github_scraper.""" - from skill_seekers.cli.github_scraper import setup_argument_parser - from skill_seekers.cli.parsers.github_parser import GitHubParser - - # Get source arguments from github_scraper - source_parser = setup_argument_parser() - source_count = len([a for a in source_parser._actions if a.dest != "help"]) - - # Get target arguments from unified CLI parser - target_parser = argparse.ArgumentParser() - GitHubParser().add_arguments(target_parser) - target_count = len([a for a in target_parser._actions if a.dest != "help"]) - - assert source_count == target_count, ( - f"Argument count mismatch: github_scraper has {source_count}, " - f"but unified CLI parser has {target_count}" - ) - - def test_github_argument_dests_match(self): - """Verify unified CLI parser has same argument destinations as github_scraper.""" - from skill_seekers.cli.github_scraper import setup_argument_parser - from 
skill_seekers.cli.parsers.github_parser import GitHubParser - - # Get source arguments from github_scraper - source_parser = setup_argument_parser() - source_dests = {a.dest for a in source_parser._actions if a.dest != "help"} - - # Get target arguments from unified CLI parser - target_parser = argparse.ArgumentParser() - GitHubParser().add_arguments(target_parser) - target_dests = {a.dest for a in target_parser._actions if a.dest != "help"} - - # Check for missing arguments - missing = source_dests - target_dests - extra = target_dests - source_dests - - assert not missing, f"github_parser missing arguments: {missing}" - assert not extra, f"github_parser has extra arguments not in github_scraper: {extra}" - - -class TestUnifiedCLI: - """Test the unified CLI main parser.""" - - def test_main_parser_creates_successfully(self): - """Verify the main parser can be created without errors.""" - from skill_seekers.cli.main import create_parser - - parser = create_parser() - assert parser is not None - - def test_all_subcommands_present(self): - """Verify all expected subcommands are present.""" - from skill_seekers.cli.main import create_parser - - parser = create_parser() - - # Find subparsers action - subparsers_action = None - for action in parser._actions: - if isinstance(action, argparse._SubParsersAction): - subparsers_action = action - break - - assert subparsers_action is not None, "No subparsers found" - - # Check expected subcommands - expected_commands = ["scrape", "github"] - for cmd in expected_commands: - assert cmd in subparsers_action.choices, f"Subcommand '{cmd}' not found" - - def test_scrape_help_works(self): - """Verify scrape subcommand help can be generated.""" - from skill_seekers.cli.main import create_parser - - parser = create_parser() - - # This should not raise an exception - try: - parser.parse_args(["scrape", "--help"]) - except SystemExit as e: - # --help causes SystemExit(0) which is expected - assert e.code == 0 - - def 
test_github_help_works(self): - """Verify github subcommand help can be generated.""" - from skill_seekers.cli.main import create_parser - - parser = create_parser() - - # This should not raise an exception - try: - parser.parse_args(["github", "--help"]) - except SystemExit as e: - # --help causes SystemExit(0) which is expected - assert e.code == 0 diff --git a/tests/test_pdf_scraper.py b/tests/test_pdf_scraper.py index 46b1403..d16280d 100644 --- a/tests/test_pdf_scraper.py +++ b/tests/test_pdf_scraper.py @@ -519,38 +519,5 @@ class TestJSONWorkflow(unittest.TestCase): self.assertEqual(converter.extracted_data["total_pages"], 1) -class TestPDFCLIArguments(unittest.TestCase): - """Test PDF subcommand CLI argument parsing via the main CLI.""" - - def setUp(self): - import sys - from pathlib import Path - - sys.path.insert(0, str(Path(__file__).parent.parent / "src")) - from skill_seekers.cli.main import create_parser - - self.parser = create_parser() - - def test_api_key_stored_correctly(self): - """Test --api-key is accepted and stored correctly after switching to add_pdf_arguments.""" - args = self.parser.parse_args(["pdf", "--pdf", "test.pdf", "--api-key", "sk-ant-test"]) - self.assertEqual(args.api_key, "sk-ant-test") - - def test_enhance_level_accepted(self): - """Test --enhance-level is accepted for pdf subcommand.""" - args = self.parser.parse_args(["pdf", "--pdf", "test.pdf", "--enhance-level", "1"]) - self.assertEqual(args.enhance_level, 1) - - def test_enhance_workflow_accepted(self): - """Test --enhance-workflow is accepted and stores a list.""" - args = self.parser.parse_args(["pdf", "--pdf", "test.pdf", "--enhance-workflow", "minimal"]) - self.assertEqual(args.enhance_workflow, ["minimal"]) - - def test_workflow_dry_run_accepted(self): - """Test --workflow-dry-run is accepted.""" - args = self.parser.parse_args(["pdf", "--pdf", "test.pdf", "--workflow-dry-run"]) - self.assertTrue(args.workflow_dry_run) - - if __name__ == "__main__": unittest.main() 
diff --git a/tests/test_preset_system.py b/tests/test_preset_system.py index cee2455..a4a761b 100644 --- a/tests/test_preset_system.py +++ b/tests/test_preset_system.py @@ -207,100 +207,6 @@ class TestPresetApplication: PresetManager.apply_preset("nonexistent", args) -class TestDeprecationWarnings: - """Test deprecation warning functionality.""" - - def test_check_deprecated_flags_quick(self, capsys): - """Test deprecation warning for --quick flag.""" - from skill_seekers.cli.codebase_scraper import _check_deprecated_flags - import argparse - - args = argparse.Namespace(quick=True, comprehensive=False, depth=None, ai_mode="auto") - - _check_deprecated_flags(args) - - captured = capsys.readouterr() - assert "DEPRECATED" in captured.out - assert "--quick" in captured.out - assert "--preset quick" in captured.out - assert "v4.0.0" in captured.out - - def test_check_deprecated_flags_comprehensive(self, capsys): - """Test deprecation warning for --comprehensive flag.""" - from skill_seekers.cli.codebase_scraper import _check_deprecated_flags - import argparse - - args = argparse.Namespace(quick=False, comprehensive=True, depth=None, ai_mode="auto") - - _check_deprecated_flags(args) - - captured = capsys.readouterr() - assert "DEPRECATED" in captured.out - assert "--comprehensive" in captured.out - assert "--preset comprehensive" in captured.out - assert "v4.0.0" in captured.out - - def test_check_deprecated_flags_depth(self, capsys): - """Test deprecation warning for --depth flag.""" - from skill_seekers.cli.codebase_scraper import _check_deprecated_flags - import argparse - - args = argparse.Namespace(quick=False, comprehensive=False, depth="full", ai_mode="auto") - - _check_deprecated_flags(args) - - captured = capsys.readouterr() - assert "DEPRECATED" in captured.out - assert "--depth full" in captured.out - assert "--preset comprehensive" in captured.out - assert "v4.0.0" in captured.out - - def test_check_deprecated_flags_ai_mode(self, capsys): - """Test 
deprecation warning for --ai-mode flag.""" - from skill_seekers.cli.codebase_scraper import _check_deprecated_flags - import argparse - - args = argparse.Namespace(quick=False, comprehensive=False, depth=None, ai_mode="api") - - _check_deprecated_flags(args) - - captured = capsys.readouterr() - assert "DEPRECATED" in captured.out - assert "--ai-mode api" in captured.out - assert "--enhance-level" in captured.out - assert "v4.0.0" in captured.out - - def test_check_deprecated_flags_multiple(self, capsys): - """Test deprecation warnings for multiple flags.""" - from skill_seekers.cli.codebase_scraper import _check_deprecated_flags - import argparse - - args = argparse.Namespace(quick=True, comprehensive=False, depth="surface", ai_mode="local") - - _check_deprecated_flags(args) - - captured = capsys.readouterr() - assert "DEPRECATED" in captured.out - assert "--depth surface" in captured.out - assert "--ai-mode local" in captured.out - assert "--quick" in captured.out - assert "MIGRATION TIP" in captured.out - assert "v4.0.0" in captured.out - - def test_check_deprecated_flags_none(self, capsys): - """Test no warnings when no deprecated flags used.""" - from skill_seekers.cli.codebase_scraper import _check_deprecated_flags - import argparse - - args = argparse.Namespace(quick=False, comprehensive=False, depth=None, ai_mode="auto") - - _check_deprecated_flags(args) - - captured = capsys.readouterr() - assert "DEPRECATED" not in captured.out - assert "v4.0.0" not in captured.out - - class TestBackwardCompatibility: """Test backward compatibility with old flags.""" diff --git a/tests/test_unified.py b/tests/test_unified.py index 16b849f..cfcc2b3 100644 --- a/tests/test_unified.py +++ b/tests/test_unified.py @@ -574,62 +574,6 @@ def test_config_file_validation(): os.unlink(config_path) -# =========================== -# Unified CLI Argument Tests -# =========================== - - -class TestUnifiedCLIArguments: - """Test that unified subcommand parser exposes the expected 
CLI flags.""" - - @pytest.fixture - def parser(self): - import sys - - sys.path.insert(0, str(Path(__file__).parent.parent / "src")) - from skill_seekers.cli.main import create_parser - - return create_parser() - - def test_api_key_stored_correctly(self, parser): - """Test --api-key KEY is stored in args.""" - args = parser.parse_args(["unified", "--config", "my.json", "--api-key", "sk-ant-test"]) - assert args.api_key == "sk-ant-test" - - def test_enhance_level_stored_correctly(self, parser): - """Test --enhance-level 2 is stored in args.""" - args = parser.parse_args(["unified", "--config", "my.json", "--enhance-level", "2"]) - assert args.enhance_level == 2 - - def test_enhance_level_default_is_none(self, parser): - """Test --enhance-level defaults to None (per-source values apply).""" - args = parser.parse_args(["unified", "--config", "my.json"]) - assert args.enhance_level is None - - def test_enhance_level_all_choices(self, parser): - """Test all valid --enhance-level choices are accepted.""" - for level in [0, 1, 2, 3]: - args = parser.parse_args( - ["unified", "--config", "my.json", "--enhance-level", str(level)] - ) - assert args.enhance_level == level - - def test_enhance_workflow_accepted(self, parser): - """Test --enhance-workflow is accepted.""" - args = parser.parse_args( - ["unified", "--config", "my.json", "--enhance-workflow", "security-focus"] - ) - assert args.enhance_workflow == ["security-focus"] - - def test_api_key_and_enhance_level_combined(self, parser): - """Test --api-key and --enhance-level can be combined.""" - args = parser.parse_args( - ["unified", "--config", "my.json", "--api-key", "sk-ant-test", "--enhance-level", "3"] - ) - assert args.api_key == "sk-ant-test" - assert args.enhance_level == 3 - - # =========================== # Workflow JSON Config Tests # =========================== diff --git a/tests/test_unified_scraper_orchestration.py b/tests/test_unified_scraper_orchestration.py index da2f8b0..1cdea7b 100644 --- 
a/tests/test_unified_scraper_orchestration.py +++ b/tests/test_unified_scraper_orchestration.py @@ -168,35 +168,32 @@ class TestScrapeAllSourcesRouting: class TestScrapeDocumentation: - """_scrape_documentation() writes a temp config and runs doc_scraper as subprocess.""" + """_scrape_documentation() calls scrape_documentation() directly.""" - def test_subprocess_called_with_config_and_fresh_flag(self, tmp_path): - """subprocess.run is called with --config and --fresh for the doc scraper.""" + def test_scrape_documentation_called_directly(self, tmp_path): + """scrape_documentation is called directly (not via subprocess).""" scraper = _make_scraper(tmp_path=tmp_path) source = {"base_url": "https://docs.example.com/", "type": "documentation"} - with patch("skill_seekers.cli.unified_scraper.subprocess.run") as mock_run: - mock_run.return_value = MagicMock(returncode=1, stdout="", stderr="error") + with patch("skill_seekers.cli.doc_scraper.scrape_documentation") as mock_scrape: + mock_scrape.return_value = 1 # simulate failure scraper._scrape_documentation(source) - assert mock_run.called - cmd_args = mock_run.call_args[0][0] - assert "--fresh" in cmd_args - assert "--config" in cmd_args + assert mock_scrape.called - def test_nothing_appended_on_subprocess_failure(self, tmp_path): - """If subprocess returns non-zero, scraped_data["documentation"] stays empty.""" + def test_nothing_appended_on_scrape_failure(self, tmp_path): + """If scrape_documentation returns non-zero, scraped_data["documentation"] stays empty.""" scraper = _make_scraper(tmp_path=tmp_path) source = {"base_url": "https://docs.example.com/", "type": "documentation"} - with patch("skill_seekers.cli.unified_scraper.subprocess.run") as mock_run: - mock_run.return_value = MagicMock(returncode=1, stdout="", stderr="err") + with patch("skill_seekers.cli.doc_scraper.scrape_documentation") as mock_scrape: + mock_scrape.return_value = 1 scraper._scrape_documentation(source) assert 
scraper.scraped_data["documentation"] == [] def test_llms_txt_url_forwarded_to_doc_config(self, tmp_path): - """llms_txt_url from source is forwarded to the temporary doc config.""" + """llms_txt_url from source is forwarded to the doc config.""" scraper = _make_scraper(tmp_path=tmp_path) source = { "base_url": "https://docs.example.com/", @@ -204,30 +201,21 @@ class TestScrapeDocumentation: "llms_txt_url": "https://docs.example.com/llms.txt", } - written_configs = [] + captured_config = {} - original_json_dump = json.dumps + def fake_scrape(config, ctx=None): # noqa: ARG001 + captured_config.update(config) + return 1 # fail so we don't need to set up output files - def capture_dump(obj, f, **kwargs): - if isinstance(f, str): - return original_json_dump(obj, f, **kwargs) - written_configs.append(obj) - return original_json_dump(obj) - - with ( - patch("skill_seekers.cli.unified_scraper.subprocess.run") as mock_run, - patch( - "skill_seekers.cli.unified_scraper.json.dump", - side_effect=lambda obj, _f, **_kw: written_configs.append(obj), - ), - ): - mock_run.return_value = MagicMock(returncode=1, stdout="", stderr="") + with patch("skill_seekers.cli.doc_scraper.scrape_documentation", side_effect=fake_scrape): scraper._scrape_documentation(source) - assert any("llms_txt_url" in s for c in written_configs for s in c.get("sources", [c])) + # The llms_txt_url should be in the sources list of the doc config + sources = captured_config.get("sources", []) + assert any("llms_txt_url" in s for s in sources) def test_start_urls_forwarded_to_doc_config(self, tmp_path): - """start_urls from source is forwarded to the temporary doc config.""" + """start_urls from source is forwarded to the doc config.""" scraper = _make_scraper(tmp_path=tmp_path) source = { "base_url": "https://docs.example.com/", @@ -235,19 +223,17 @@ class TestScrapeDocumentation: "start_urls": ["https://docs.example.com/intro"], } - written_configs = [] + captured_config = {} - with ( - 
patch("skill_seekers.cli.unified_scraper.subprocess.run") as mock_run, - patch( - "skill_seekers.cli.unified_scraper.json.dump", - side_effect=lambda obj, _f, **_kw: written_configs.append(obj), - ), - ): - mock_run.return_value = MagicMock(returncode=1, stdout="", stderr="") + def fake_scrape(config, ctx=None): # noqa: ARG001 + captured_config.update(config) + return 1 + + with patch("skill_seekers.cli.doc_scraper.scrape_documentation", side_effect=fake_scrape): scraper._scrape_documentation(source) - assert any("start_urls" in s for c in written_configs for s in c.get("sources", [c])) + sources = captured_config.get("sources", []) + assert any("start_urls" in s for s in sources) # =========================================================================== diff --git a/tests/test_video_scraper.py b/tests/test_video_scraper.py index 21b4275..b941063 100644 --- a/tests/test_video_scraper.py +++ b/tests/test_video_scraper.py @@ -728,13 +728,12 @@ class TestVideoArguments(unittest.TestCase): args = parser.parse_args([]) self.assertEqual(args.enhance_level, 0) - def test_unified_parser_has_video(self): - """Test video subcommand is registered in main parser.""" - from skill_seekers.cli.main import create_parser + def test_video_accessible_via_create(self): + """Test video source is accessible via 'create' command (not as subcommand).""" + from skill_seekers.cli.source_detector import SourceDetector - parser = create_parser() - args = parser.parse_args(["video", "--url", "https://youtube.com/watch?v=test"]) - self.assertEqual(args.url, "https://youtube.com/watch?v=test") + info = SourceDetector.detect("https://youtube.com/watch?v=test") + self.assertEqual(info.type, "video") # ============================================================================= diff --git a/tests/test_video_setup.py b/tests/test_video_setup.py index d336e76..21fb861 100644 --- a/tests/test_video_setup.py +++ b/tests/test_video_setup.py @@ -711,21 +711,16 @@ class 
TestVideoArgumentSetup(unittest.TestCase): class TestVideoScraperSetupEarlyExit(unittest.TestCase): - """Test that --setup exits before source validation.""" + """Test that --setup triggers run_setup via video setup module.""" @patch("skill_seekers.cli.video_setup.run_setup", return_value=0) - def test_setup_skips_source_validation(self, mock_setup): - """--setup without --url should NOT error about missing source.""" - from skill_seekers.cli.video_scraper import main + def test_setup_runs_successfully(self, mock_setup): + """run_setup(interactive=True) should return 0 on success.""" + from skill_seekers.cli.video_setup import run_setup - old_argv = sys.argv - try: - sys.argv = ["video_scraper", "--setup"] - rc = main() - assert rc == 0 - mock_setup.assert_called_once_with(interactive=True) - finally: - sys.argv = old_argv + rc = run_setup(interactive=True) + assert rc == 0 + mock_setup.assert_called_once_with(interactive=True) if __name__ == "__main__": diff --git a/tests/test_word_scraper.py b/tests/test_word_scraper.py index 2b13f43..60e48e2 100644 --- a/tests/test_word_scraper.py +++ b/tests/test_word_scraper.py @@ -572,61 +572,6 @@ class TestWordJSONWorkflow(unittest.TestCase): self.assertTrue(skill_md.exists()) -class TestWordCLIArguments(unittest.TestCase): - """Test word subcommand CLI argument parsing via the main CLI.""" - - def setUp(self): - import sys - from pathlib import Path as P - - sys.path.insert(0, str(P(__file__).parent.parent / "src")) - from skill_seekers.cli.main import create_parser - - self.parser = create_parser() - - def test_docx_argument_accepted(self): - """--docx flag is accepted for the word subcommand.""" - args = self.parser.parse_args(["word", "--docx", "test.docx"]) - self.assertEqual(args.docx, "test.docx") - - def test_api_key_accepted(self): - """--api-key is accepted for word subcommand.""" - args = self.parser.parse_args(["word", "--docx", "test.docx", "--api-key", "sk-ant-test"]) - self.assertEqual(args.api_key, 
"sk-ant-test") - - def test_enhance_level_accepted(self): - """--enhance-level is accepted for word subcommand.""" - args = self.parser.parse_args(["word", "--docx", "test.docx", "--enhance-level", "1"]) - self.assertEqual(args.enhance_level, 1) - - def test_enhance_workflow_accepted(self): - """--enhance-workflow is accepted and stores a list.""" - args = self.parser.parse_args( - ["word", "--docx", "test.docx", "--enhance-workflow", "minimal"] - ) - self.assertEqual(args.enhance_workflow, ["minimal"]) - - def test_workflow_dry_run_accepted(self): - """--workflow-dry-run is accepted.""" - args = self.parser.parse_args(["word", "--docx", "test.docx", "--workflow-dry-run"]) - self.assertTrue(args.workflow_dry_run) - - def test_dry_run_accepted(self): - """--dry-run is accepted for word subcommand.""" - args = self.parser.parse_args(["word", "--docx", "test.docx", "--dry-run"]) - self.assertTrue(args.dry_run) - - def test_from_json_accepted(self): - """--from-json is accepted.""" - args = self.parser.parse_args(["word", "--from-json", "data.json"]) - self.assertEqual(args.from_json, "data.json") - - def test_name_accepted(self): - """--name is accepted.""" - args = self.parser.parse_args(["word", "--docx", "test.docx", "--name", "myskill"]) - self.assertEqual(args.name, "myskill") - - class TestWordHelperFunctions(unittest.TestCase): """Test module-level helper functions."""