"""
Scraping Tools Module for MCP Server

This module contains all scraping-related MCP tool implementations:
- estimate_pages_tool: Estimate page count before scraping
- scrape_docs_tool: Scrape documentation (legacy or unified)
- scrape_github_tool: Scrape GitHub repositories
- scrape_pdf_tool: Scrape PDF documentation
- scrape_video_tool: Scrape video content
- scrape_codebase_tool: Analyze local codebase and extract code knowledge
- detect_patterns_tool: Detect design patterns in source code
- extract_test_examples_tool: Extract usage examples from test files
- build_how_to_guides_tool: Build how-to guides from test examples
- extract_config_patterns_tool: Extract configuration patterns
- scrape_generic_tool: Generic scraper for new source types
  (jupyter, html, openapi, asciidoc, pptx, confluence, notion, rss, manpage, chat)

Every tool builds a command line, runs it in a subprocess and returns the
captured output wrapped in a single-element list of ``TextContent``.

Extracted from server.py for better modularity and organization.
"""

import json
import subprocess
import sys
import threading
from pathlib import Path
from typing import Optional

# MCP types - with graceful fallback for testing
try:
    from mcp.types import TextContent
except ImportError:
    # Graceful degradation: Create a simple fallback class for testing
    class TextContent:
        """Fallback TextContent for when MCP is not installed"""

        # ``type`` shadows the builtin on purpose: callers pass ``type="text"``
        # by keyword, matching the real mcp.types.TextContent signature.
        def __init__(self, type: str, text: str):
            self.type = type
            self.text = text


# Path to CLI tools
CLI_DIR = Path(__file__).parent.parent.parent / "cli"

# How long to wait for the pipe reader threads after the child has exited.
# They normally finish immediately at EOF; the bound only matters when a
# grandchild process keeps the pipe open.
_PIPE_DRAIN_GRACE_SECONDS = 5


def _drain_pipe(pipe, sink: list) -> None:
    """Append every line read from *pipe* to *sink* until EOF, then close the pipe."""
    with pipe:
        for line in pipe:
            sink.append(line)


def run_subprocess_with_streaming(cmd: list[str], timeout: Optional[int] = None) -> tuple:
    """
    Run a subprocess and collect its stdout/stderr while it is running.

    Both pipes are drained continuously by background threads, so a child
    that writes a lot of output can never block on a full pipe. This works
    on every platform (``select`` on pipes is not available on Windows).

    Args:
        cmd: Command list to execute
        timeout: Optional timeout in seconds. ``None`` or ``0`` means no timeout.

    Returns:
        Tuple of (stdout, stderr, returncode). If the process could not be
        started or another error occurs, returns
        ``("", "Error running subprocess: ...", 1)`` instead of raising.
        On timeout the process is killed and a notice is appended to stderr.
    """
    stdout_lines: list[str] = []
    stderr_lines: list[str] = []

    try:
        process = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            bufsize=1,  # Line buffered
            errors="replace",  # Undecodable bytes must not abort the whole run
        )

        readers = [
            threading.Thread(
                target=_drain_pipe, args=(process.stdout, stdout_lines), daemon=True
            ),
            threading.Thread(
                target=_drain_pipe, args=(process.stderr, stderr_lines), daemon=True
            ),
        ]
        for reader in readers:
            reader.start()

        timed_out = False
        try:
            # ``timeout or None`` keeps the original meaning of a falsy timeout:
            # wait indefinitely.
            process.wait(timeout=timeout or None)
        except subprocess.TimeoutExpired:
            process.kill()
            process.wait()  # Reap the killed child so returncode is set
            timed_out = True

        for reader in readers:
            reader.join(timeout=_PIPE_DRAIN_GRACE_SECONDS)

        if timed_out:
            stderr_lines.append(f"\nโš ๏ธ Process killed after {timeout}s timeout")

        # Copy the lists before joining: a reader may still be alive if the
        # grace period expired.
        stdout = "".join(list(stdout_lines))
        stderr = "".join(list(stderr_lines))
        return stdout, stderr, process.returncode

    except Exception as e:
        # Best-effort boundary: tools report the failure as text instead of raising.
        return "", f"Error running subprocess: {str(e)}", 1


def _text(message: str) -> list[TextContent]:
    """Wrap *message* in the single-element TextContent list every tool returns."""
    return [TextContent(type="text", text=message)]


def _run_and_report(cmd: list[str], timeout: Optional[int], progress_msg: str) -> list[TextContent]:
    """Run *cmd* and return progress message + stdout, plus stderr on a non-zero exit."""
    stdout, stderr, returncode = run_subprocess_with_streaming(cmd, timeout=timeout)
    output = progress_msg + stdout
    if returncode == 0:
        return _text(output)
    return _text(f"{output}\n\nโŒ Error:\n{stderr}")


def _unlimited_temp_config_path(config_path: str) -> Path:
    """Return a sibling path ``<stem>_unlimited_temp.json`` that never equals *config_path*."""
    original = Path(config_path)
    return original.with_name(f"{original.stem}_unlimited_temp.json")


async def estimate_pages_tool(args: dict) -> list[TextContent]:
    """
    Estimate page count from a config file.

    Runs ``estimate_pages.py`` from CLI_DIR on the given config.

    Args:
        args: Dictionary containing:
            - config_path (str): Path to config JSON file
            - max_discovery (int, optional): Maximum pages to discover (default: 1000)
            - unlimited (bool, optional): Remove discovery limit (default: False)

    Returns:
        List[TextContent]: Tool execution results

    Raises:
        KeyError: If ``config_path`` is missing from *args*.
    """
    config_path = args["config_path"]
    max_discovery = args.get("max_discovery", 1000)
    unlimited = args.get("unlimited", False)

    # Handle unlimited mode
    if unlimited or max_discovery == -1:
        max_discovery = -1
        timeout = 1800  # 30 minutes for unlimited discovery
    else:
        # Estimate: 0.5s per page discovered
        timeout = max(300, max_discovery // 2)  # Minimum 5 minutes

    # Run estimate_pages.py
    cmd = [
        sys.executable,
        str(CLI_DIR / "estimate_pages.py"),
        config_path,
        "--max-discovery",
        str(max_discovery),
    ]

    progress_msg = "๐Ÿ”„ Estimating page count...\n"
    progress_msg += f"โฑ๏ธ Maximum time: {timeout // 60} minutes\n\n"

    return _run_and_report(cmd, timeout, progress_msg)


async def scrape_docs_tool(args: dict) -> list[TextContent]:
    """
    Scrape documentation and build skill.

    Auto-detects unified vs legacy format (a config with a ``sources`` list is
    unified) and routes to ``unified_scraper.py`` or ``doc_scraper.py``.

    In unlimited mode a modified copy of the config (``max_pages`` set to
    None) is written next to the original as ``<stem>_unlimited_temp.json``
    and removed again once the scraper has finished. The original config file
    is never modified.

    Args:
        args: Dictionary containing:
            - config_path (str): Path to config JSON file
            - unlimited (bool, optional): Remove page limit (default: False)
            - enhance_local (bool, optional): Open terminal for local enhancement (default: False)
            - skip_scrape (bool, optional): Skip scraping, use cached data (default: False)
            - dry_run (bool, optional): Preview without saving (default: False)
            - merge_mode (str, optional): Override merge mode for unified configs

    Returns:
        List[TextContent]: Tool execution results

    Raises:
        KeyError: If ``config_path`` is missing from *args*.
        OSError, json.JSONDecodeError: If the config cannot be read or parsed.
    """
    config_path = args["config_path"]
    unlimited = args.get("unlimited", False)
    enhance_local = args.get("enhance_local", False)
    skip_scrape = args.get("skip_scrape", False)
    dry_run = args.get("dry_run", False)
    merge_mode = args.get("merge_mode")

    # Load config to detect format
    with open(config_path, encoding="utf-8") as f:
        config = json.load(f)

    # Detect if unified format (has 'sources' array)
    is_unified = "sources" in config and isinstance(config["sources"], list)

    # Handle unlimited mode by writing a modified temporary copy of the config
    temp_config_path: Optional[Path] = None
    if unlimited:
        if is_unified:
            # For unified configs, set max_pages on documentation sources
            for source in config.get("sources", []):
                if source.get("type") == "documentation":
                    source["max_pages"] = None
        else:
            # For legacy configs
            config["max_pages"] = None

        # The temp path is derived with pathlib so it can never equal the
        # original path (str.replace returned the original when it had no
        # ".json", which overwrote and then deleted the user's config).
        temp_config_path = _unlimited_temp_config_path(config_path)
        with open(temp_config_path, "w", encoding="utf-8") as f:
            json.dump(config, f, indent=2)
        config_to_use = str(temp_config_path)
    else:
        config_to_use = config_path

    # Choose scraper based on format
    if is_unified:
        scraper_script = "unified_scraper.py"
        progress_msg = "๐Ÿ”„ Starting unified multi-source scraping...\n"
        progress_msg += "๐Ÿ“ฆ Config format: Unified (multiple sources)\n"
    else:
        scraper_script = "doc_scraper.py"
        progress_msg = "๐Ÿ”„ Starting scraping process...\n"
        progress_msg += "๐Ÿ“ฆ Config format: Legacy (single source)\n"

    # Build command
    cmd = [sys.executable, str(CLI_DIR / scraper_script), "--config", config_to_use]

    # Add merge mode for unified configs
    if is_unified and merge_mode:
        cmd.extend(["--merge-mode", merge_mode])

    # Add --fresh to avoid user input prompts when existing data found
    if not skip_scrape:
        cmd.append("--fresh")

    if enhance_local:
        cmd.append("--enhance-local")
    if skip_scrape:
        cmd.append("--skip-scrape")
    if dry_run:
        cmd.append("--dry-run")

    # Determine timeout based on operation type
    if dry_run:
        timeout = 300  # 5 minutes for dry run
    elif skip_scrape:
        timeout = 600  # 10 minutes for building from cache
    elif unlimited:
        timeout = None  # No timeout for unlimited mode (user explicitly requested)
    else:
        # Read config to estimate timeout
        try:
            if is_unified:
                # For unified configs, estimate based on all documentation sources
                total_pages = 0
                for source in config.get("sources", []):
                    if source.get("type") == "documentation":
                        total_pages += source.get("max_pages", 500)
                max_pages = total_pages or 500
            else:
                max_pages = config.get("max_pages", 500)

            # Estimate: 30s per page + buffer
            timeout = max(3600, max_pages * 35)  # Minimum 1 hour, or 35s per page
        except Exception:
            # Deliberate best-effort: e.g. an explicit ``"max_pages": null``
            # makes the arithmetic fail; fall back to a generous default.
            timeout = 14400  # Default: 4 hours

    # Add progress message
    if timeout:
        progress_msg += f"โฑ๏ธ Maximum time allowed: {timeout // 60} minutes\n"
    else:
        progress_msg += "โฑ๏ธ Unlimited mode - no timeout\n"
    progress_msg += "๐Ÿ“ Progress will be shown below:\n\n"

    try:
        # Run scraper with streaming
        return _run_and_report(cmd, timeout, progress_msg)
    finally:
        # Clean up temporary config even if something above went wrong
        if temp_config_path is not None:
            temp_config_path.unlink(missing_ok=True)


async def scrape_pdf_tool(args: dict) -> list[TextContent]:
    """
    Scrape PDF documentation and build Claude skill.

    Runs ``pdf_scraper.py`` from CLI_DIR in one of three modes, checked in
    this order: config file, direct PDF (requires ``name``), or from JSON.

    Args:
        args: Dictionary containing:
            - config_path (str, optional): Path to PDF config JSON file
            - pdf_path (str, optional): Direct PDF path (alternative to config_path)
            - name (str, optional): Skill name (required with pdf_path)
            - description (str, optional): Skill description
            - from_json (str, optional): Build from extracted JSON file

    Returns:
        List[TextContent]: Tool execution results, or an error message when no
        valid mode is specified.
    """
    config_path = args.get("config_path")
    pdf_path = args.get("pdf_path")
    name = args.get("name")
    description = args.get("description")
    from_json = args.get("from_json")

    # Build command
    cmd = [sys.executable, str(CLI_DIR / "pdf_scraper.py")]

    if config_path:
        # Mode 1: Config file
        cmd.extend(["--config", config_path])
    elif pdf_path and name:
        # Mode 2: Direct PDF
        cmd.extend(["--pdf", pdf_path, "--name", name])
        if description:
            cmd.extend(["--description", description])
    elif from_json:
        # Mode 3: From JSON
        cmd.extend(["--from-json", from_json])
    else:
        return _text("โŒ Error: Must specify --config, --pdf + --name, or --from-json")

    # Run pdf_scraper.py with streaming (can take a while)
    timeout = 600  # 10 minutes for PDF extraction

    progress_msg = "๐Ÿ“„ Scraping PDF documentation...\n"
    progress_msg += f"โฑ๏ธ Maximum time: {timeout // 60} minutes\n\n"

    return _run_and_report(cmd, timeout, progress_msg)


async def scrape_video_tool(args: dict) -> list[TextContent]:
    """
    Scrape video content (YouTube, local files) and build Claude skill.

    Runs ``video_scraper.py`` from CLI_DIR. The input mode is chosen in this
    order: from_json, url, video_file, playlist.

    Args:
        args: Dictionary containing:
            - url (str, optional): Video URL (YouTube, Vimeo)
            - video_file (str, optional): Local video file path
            - playlist (str, optional): Playlist URL
            - name (str, optional): Skill name
            - description (str, optional): Skill description
            - languages (str, optional): Language preferences (comma-separated)
            - from_json (str, optional): Build from extracted JSON file
            - visual (bool, optional): Enable visual frame extraction (default: False)
            - whisper_model (str, optional): Whisper model size (default: base)
            - visual_interval (float, optional): Seconds between frame captures (default: 5.0)
            - visual_min_gap (float, optional): Minimum seconds between kept frames (default: 2.0)
            - visual_similarity (float, optional): Similarity threshold to skip duplicate
              frames (default: 0.95)
            - vision_ocr (bool, optional): Use vision model for OCR on frames (default: False)
            - start_time (str, optional): Start time for extraction (seconds, MM:SS, or HH:MM:SS)
            - end_time (str, optional): End time for extraction (seconds, MM:SS, or HH:MM:SS)
            - setup (bool, optional): Auto-detect GPU and install visual extraction deps

    Returns:
        List[TextContent]: Tool execution results
    """
    # Handle --setup early exit
    if args.get("setup", False):
        # Imported lazily so the module loads without the video extras.
        from skill_seekers.cli.video_setup import run_setup

        rc = run_setup(interactive=False)
        msg = "Setup completed successfully." if rc == 0 else "Setup failed. Check logs."
        return _text(msg)

    url = args.get("url")
    video_file = args.get("video_file")
    playlist = args.get("playlist")
    name = args.get("name")
    description = args.get("description")
    languages = args.get("languages")
    from_json = args.get("from_json")
    visual = args.get("visual", False)
    whisper_model = args.get("whisper_model")
    visual_interval = args.get("visual_interval")
    visual_min_gap = args.get("visual_min_gap")
    visual_similarity = args.get("visual_similarity")
    vision_ocr = args.get("vision_ocr", False)
    start_time = args.get("start_time")
    end_time = args.get("end_time")

    # Build command
    cmd = [sys.executable, str(CLI_DIR / "video_scraper.py")]

    if from_json:
        cmd.extend(["--from-json", from_json])
    elif url:
        cmd.extend(["--url", url])
        if name:
            cmd.extend(["--name", name])
        if description:
            cmd.extend(["--description", description])
        if languages:
            cmd.extend(["--languages", languages])
    elif video_file:
        cmd.extend(["--video-file", video_file])
        if name:
            cmd.extend(["--name", name])
        if description:
            cmd.extend(["--description", description])
    elif playlist:
        cmd.extend(["--playlist", playlist])
        if name:
            cmd.extend(["--name", name])
    else:
        return _text("โŒ Error: Must specify --url, --video-file, --playlist, or --from-json")

    # Visual extraction parameters
    if visual:
        cmd.append("--visual")
    if whisper_model:
        cmd.extend(["--whisper-model", whisper_model])
    # Numeric options use ``is not None`` so an explicit 0 is still forwarded.
    if visual_interval is not None:
        cmd.extend(["--visual-interval", str(visual_interval)])
    if visual_min_gap is not None:
        cmd.extend(["--visual-min-gap", str(visual_min_gap)])
    if visual_similarity is not None:
        cmd.extend(["--visual-similarity", str(visual_similarity)])
    if vision_ocr:
        cmd.append("--vision-ocr")
    if start_time:
        cmd.extend(["--start-time", str(start_time)])
    if end_time:
        cmd.extend(["--end-time", str(end_time)])

    # Run video_scraper.py with streaming
    timeout = 600  # 10 minutes for video extraction

    progress_msg = "๐ŸŽฌ Scraping video content...\n"
    progress_msg += f"โฑ๏ธ Maximum time: {timeout // 60} minutes\n\n"

    return _run_and_report(cmd, timeout, progress_msg)


async def scrape_github_tool(args: dict) -> list[TextContent]:
    """
    Scrape GitHub repository and build Claude skill.

    Runs ``github_scraper.py`` from CLI_DIR. ``config_path`` takes precedence
    over ``repo``; the remaining options are only forwarded in direct-repo mode.

    Args:
        args: Dictionary containing:
            - repo (str, optional): GitHub repository (owner/repo)
            - config_path (str, optional): Path to GitHub config JSON file
            - name (str, optional): Skill name (default: repo name)
            - description (str, optional): Skill description
            - token (str, optional): GitHub personal access token
            - no_issues (bool, optional): Skip GitHub issues extraction (default: False)
            - no_changelog (bool, optional): Skip CHANGELOG extraction (default: False)
            - no_releases (bool, optional): Skip releases extraction (default: False)
            - max_issues (int, optional): Maximum issues to fetch (default: 100)
            - scrape_only (bool, optional): Only scrape, don't build skill (default: False)

    Returns:
        List[TextContent]: Tool execution results
    """
    repo = args.get("repo")
    config_path = args.get("config_path")
    name = args.get("name")
    description = args.get("description")
    token = args.get("token")
    no_issues = args.get("no_issues", False)
    no_changelog = args.get("no_changelog", False)
    no_releases = args.get("no_releases", False)
    max_issues = args.get("max_issues", 100)
    scrape_only = args.get("scrape_only", False)

    # Build command
    cmd = [sys.executable, str(CLI_DIR / "github_scraper.py")]

    if config_path:
        # Mode 1: Config file
        cmd.extend(["--config", config_path])
    elif repo:
        # Mode 2: Direct repo
        cmd.extend(["--repo", repo])
        if name:
            cmd.extend(["--name", name])
        if description:
            cmd.extend(["--description", description])
        if token:
            # NOTE(review): the token is passed on the command line and may be
            # visible in process listings - confirm this is acceptable.
            cmd.extend(["--token", token])
        if no_issues:
            cmd.append("--no-issues")
        if no_changelog:
            cmd.append("--no-changelog")
        if no_releases:
            cmd.append("--no-releases")
        if max_issues != 100:
            # Only forwarded when it differs from the documented default.
            cmd.extend(["--max-issues", str(max_issues)])
        if scrape_only:
            cmd.append("--scrape-only")
    else:
        return _text("โŒ Error: Must specify --repo or --config")

    # Run github_scraper.py with streaming (can take a while)
    timeout = 600  # 10 minutes for GitHub scraping

    progress_msg = "๐Ÿ™ Scraping GitHub repository...\n"
    progress_msg += f"โฑ๏ธ Maximum time: {timeout // 60} minutes\n\n"

    return _run_and_report(cmd, timeout, progress_msg)


async def scrape_codebase_tool(args: dict) -> list[TextContent]:
    """
    Analyze local codebase and extract code knowledge.

    Runs ``python -m skill_seekers.cli.codebase_scraper``. All features are ON
    by default. Use skip_* parameters to disable specific features.

    Args:
        args: Dictionary containing:
            - directory (str): Directory to analyze
            - output (str, optional): Output directory for results (default: output/codebase/)
            - depth (str, optional): Analysis depth - surface, deep, full (default: deep)
            - languages (str, optional): Comma-separated languages (e.g., "Python,JavaScript,C++")
            - file_patterns (str, optional): Comma-separated file patterns
              (e.g., "*.py,src/**/*.js")
            - enhance_level (int, optional): AI enhancement level 0-3 (default: 0)
                - 0: No AI enhancement
                - 1: SKILL.md enhancement only
                - 2: SKILL.md + Architecture + Config enhancement
                - 3: Full enhancement (patterns, tests, config, architecture, SKILL.md)
            - skip_api_reference (bool, optional): Skip API reference generation (default: False)
            - skip_dependency_graph (bool, optional): Skip dependency graph (default: False)
            - skip_patterns (bool, optional): Skip design pattern detection (default: False)
            - skip_test_examples (bool, optional): Skip test example extraction (default: False)
            - skip_how_to_guides (bool, optional): Skip how-to guide generation (default: False)
            - skip_config_patterns (bool, optional): Skip config pattern extraction
              (default: False)
            - skip_docs (bool, optional): Skip project documentation extraction (default: False)

    Returns:
        List[TextContent]: Tool execution results

    Example:
        scrape_codebase(
            directory="/path/to/repo",
            depth="deep",
            enhance_level=1
        )
        scrape_codebase(
            directory="/path/to/repo",
            enhance_level=2,
            skip_patterns=True
        )
    """
    directory = args.get("directory")
    if not directory:
        return _text("โŒ Error: directory parameter is required")

    output = args.get("output", "output/codebase/")
    depth = args.get("depth", "deep")
    languages = args.get("languages", "")
    file_patterns = args.get("file_patterns", "")
    enhance_level = args.get("enhance_level", 0)

    # Build command
    cmd = [sys.executable, "-m", "skill_seekers.cli.codebase_scraper"]
    cmd.extend(["--directory", directory])

    if output:
        cmd.extend(["--output", output])
    if depth:
        cmd.extend(["--depth", depth])
    if languages:
        cmd.extend(["--languages", languages])
    if file_patterns:
        cmd.extend(["--file-patterns", file_patterns])
    if enhance_level > 0:
        cmd.extend(["--enhance-level", str(enhance_level)])

    # Skip flags (features are ON by default)
    skip_flags = (
        ("skip_api_reference", "--skip-api-reference"),
        ("skip_dependency_graph", "--skip-dependency-graph"),
        ("skip_patterns", "--skip-patterns"),
        ("skip_test_examples", "--skip-test-examples"),
        ("skip_how_to_guides", "--skip-how-to-guides"),
        ("skip_config_patterns", "--skip-config-patterns"),
        ("skip_docs", "--skip-docs"),
    )
    cmd.extend(flag for key, flag in skip_flags if args.get(key, False))

    # Adjust timeout based on enhance_level
    timeout = 600  # 10 minutes base
    if enhance_level >= 2:
        timeout = 1200  # 20 minutes with AI enhancement
    if enhance_level >= 3:
        timeout = 3600  # 60 minutes for full enhancement

    level_names = {0: "off", 1: "SKILL.md only", 2: "standard", 3: "full"}
    progress_msg = "๐Ÿ” Analyzing local codebase...\n"
    progress_msg += f"๐Ÿ“ Directory: {directory}\n"
    progress_msg += f"๐Ÿ“Š Depth: {depth}\n"
    if enhance_level > 0:
        progress_msg += f"๐Ÿค– AI Enhancement: Level {enhance_level} ({level_names.get(enhance_level, 'unknown')})\n"
    progress_msg += f"โฑ๏ธ Maximum time: {timeout // 60} minutes\n\n"

    return _run_and_report(cmd, timeout, progress_msg)


async def detect_patterns_tool(args: dict) -> list[TextContent]:
    """
    Detect design patterns in source code.

    Runs ``python -m skill_seekers.cli.pattern_recognizer`` on a file and/or
    directory to detect common design patterns (Singleton, Factory, Observer,
    Strategy, Decorator, Builder, Adapter, Command, Template Method, Chain of
    Responsibility).

    Supported languages: Python, JavaScript, TypeScript, C++, C, C#, Go, Rust,
    Java, Ruby, PHP.

    Args:
        args: Dictionary containing:
            - file (str, optional): Single file to analyze
            - directory (str, optional): Directory to analyze (analyzes all source files)
            - output (str, optional): Output directory for JSON results
            - depth (str, optional): Detection depth - surface, deep, full (default: deep)
            - json (bool, optional): Output JSON format (default: False)

    Returns:
        List[TextContent]: Pattern detection results

    Example:
        detect_patterns(file="src/database.py", depth="deep")
        detect_patterns(directory="src/", output="patterns/", json=True)
    """
    file_path = args.get("file")
    directory = args.get("directory")

    if not file_path and not directory:
        return _text("โŒ Error: Must specify either 'file' or 'directory' parameter")

    output = args.get("output", "")
    depth = args.get("depth", "deep")
    json_output = args.get("json", False)

    # Build command
    cmd = [sys.executable, "-m", "skill_seekers.cli.pattern_recognizer"]

    if file_path:
        cmd.extend(["--file", file_path])
    if directory:
        cmd.extend(["--directory", directory])
    if output:
        cmd.extend(["--output", output])
    if depth:
        cmd.extend(["--depth", depth])
    if json_output:
        cmd.append("--json")

    timeout = 300  # 5 minutes for pattern detection

    progress_msg = "๐Ÿ” Detecting design patterns...\n"
    if file_path:
        progress_msg += f"๐Ÿ“„ File: {file_path}\n"
    if directory:
        progress_msg += f"๐Ÿ“ Directory: {directory}\n"
    progress_msg += f"๐ŸŽฏ Detection depth: {depth}\n"
    progress_msg += f"โฑ๏ธ Maximum time: {timeout // 60} minutes\n\n"

    return _run_and_report(cmd, timeout, progress_msg)


async def extract_test_examples_tool(args: dict) -> list[TextContent]:
    """
    Extract usage examples from test files.

    Runs ``python -m skill_seekers.cli.test_example_extractor`` to extract
    real API usage patterns including:
    - Object instantiation with real parameters
    - Method calls with expected behaviors
    - Configuration examples
    - Setup patterns from fixtures/setUp()
    - Multi-step workflows from integration tests

    Supports 9 languages: Python (AST-based deep analysis), JavaScript,
    TypeScript, Go, Rust, Java, C#, PHP, Ruby (regex-based).

    Args:
        args: Dictionary containing:
            - file (str, optional): Single test file to analyze
            - directory (str, optional): Directory containing test files
            - language (str, optional): Filter by language (python, javascript, etc.)
            - min_confidence (float, optional): Minimum confidence threshold 0.0-1.0
              (default: 0.5)
            - max_per_file (int, optional): Maximum examples per file (default: 10)
            - json (bool, optional): Output JSON format (default: False)
            - markdown (bool, optional): Output Markdown format (default: False)

    Returns:
        List[TextContent]: Extracted test examples

    Example:
        extract_test_examples(directory="tests/", language="python")
        extract_test_examples(file="tests/test_scraper.py", json=True)
    """
    file_path = args.get("file")
    directory = args.get("directory")

    if not file_path and not directory:
        return _text("โŒ Error: Must specify either 'file' or 'directory' parameter")

    language = args.get("language", "")
    min_confidence = args.get("min_confidence", 0.5)
    max_per_file = args.get("max_per_file", 10)
    json_output = args.get("json", False)
    markdown_output = args.get("markdown", False)

    # Build command
    cmd = [sys.executable, "-m", "skill_seekers.cli.test_example_extractor"]

    if directory:
        cmd.append(directory)
    if file_path:
        cmd.extend(["--file", file_path])
    if language:
        cmd.extend(["--language", language])
    # 0.0 is a valid threshold, so test against None rather than truthiness.
    if min_confidence is not None:
        cmd.extend(["--min-confidence", str(min_confidence)])
    if max_per_file:
        cmd.extend(["--max-per-file", str(max_per_file)])
    if json_output:
        cmd.append("--json")
    if markdown_output:
        cmd.append("--markdown")

    timeout = 180  # 3 minutes for test example extraction

    progress_msg = "๐Ÿงช Extracting usage examples from test files...\n"
    if file_path:
        progress_msg += f"๐Ÿ“„ File: {file_path}\n"
    if directory:
        progress_msg += f"๐Ÿ“ Directory: {directory}\n"
    if language:
        progress_msg += f"๐Ÿ”ค Language: {language}\n"
    progress_msg += f"๐ŸŽฏ Min confidence: {min_confidence}\n"
    progress_msg += f"๐Ÿ“Š Max per file: {max_per_file}\n"
    progress_msg += f"โฑ๏ธ Maximum time: {timeout // 60} minutes\n\n"

    return _run_and_report(cmd, timeout, progress_msg)


async def build_how_to_guides_tool(args: dict) -> list[TextContent]:
    """
    Build how-to guides from workflow test examples.

    Runs ``python -m skill_seekers.cli.how_to_guide_builder`` on the JSON
    produced by extract_test_examples to generate step-by-step markdown guides.

    Features:
    - Python AST-based step extraction (heuristic for other languages)
    - 4 grouping strategies: ai-tutorial-group, file-path, test-name, complexity
    - Detects prerequisites, setup code, and verification points
    - Generates troubleshooting tips and next steps
    - Creates index with difficulty levels

    Args:
        args: Dictionary containing:
            - input (str): Path to test_examples.json from extract_test_examples
            - output (str, optional): Output directory for guides
              (default: output/codebase/tutorials)
            - group_by (str, optional): Grouping strategy - ai-tutorial-group, file-path,
              test-name, complexity (default: ai-tutorial-group)
            - no_ai (bool, optional): Disable AI enhancement for grouping (default: False)
            - json_output (bool, optional): Output JSON format alongside markdown
              (default: False)

    Returns:
        List[TextContent]: Guide building results

    Example:
        build_how_to_guides(
            input="output/codebase/test_examples/test_examples.json",
            group_by="ai-tutorial-group",
            output="output/codebase/tutorials"
        )
    """
    input_file = args.get("input")
    if not input_file:
        return _text("โŒ Error: input parameter is required (path to test_examples.json)")

    output = args.get("output", "output/codebase/tutorials")
    group_by = args.get("group_by", "ai-tutorial-group")
    no_ai = args.get("no_ai", False)
    json_output = args.get("json_output", False)

    # Build command
    cmd = [sys.executable, "-m", "skill_seekers.cli.how_to_guide_builder"]
    cmd.append(input_file)

    if output:
        cmd.extend(["--output", output])
    if group_by:
        cmd.extend(["--group-by", group_by])
    if no_ai:
        cmd.append("--no-ai")
    if json_output:
        cmd.append("--json-output")

    timeout = 180  # 3 minutes for guide building

    progress_msg = "๐Ÿ“š Building how-to guides from workflow examples...\n"
    progress_msg += f"๐Ÿ“„ Input: {input_file}\n"
    progress_msg += f"๐Ÿ“ Output: {output}\n"
    progress_msg += f"๐Ÿ”€ Grouping: {group_by}\n"
    if no_ai:
        progress_msg += "๐Ÿšซ AI enhancement disabled\n"
    progress_msg += f"โฑ๏ธ Maximum time: {timeout // 60} minutes\n\n"

    return _run_and_report(cmd, timeout, progress_msg)


async def extract_config_patterns_tool(args: dict) -> list[TextContent]:
    """
    Extract configuration patterns from config files (C3.4).

    Runs ``python -m skill_seekers.cli.config_extractor`` on a directory to
    extract settings, detect common patterns (database, API, logging, cache,
    etc.), and generate documentation.

    Supports 9 config formats: JSON, YAML, TOML, ENV, INI, Python modules,
    JavaScript/TypeScript configs, Dockerfile, Docker Compose.

    Detects 7 common patterns:
    - Database configuration (host, port, credentials)
    - API configuration (endpoints, keys, timeouts)
    - Logging configuration (level, format, handlers)
    - Cache configuration (backend, TTL, keys)
    - Email configuration (SMTP, credentials)
    - Authentication configuration (providers, secrets)
    - Server configuration (host, port, workers)

    Args:
        args: Dictionary containing:
            - directory (str): Directory to analyze
            - output (str, optional): Output directory
              (default: output/codebase/config_patterns)
            - max_files (int, optional): Maximum config files to process (default: 100)
            - enhance (bool, optional): Enable AI enhancement - API mode
              (default: False, requires ANTHROPIC_API_KEY)
            - enhance_local (bool, optional): Enable AI enhancement - LOCAL mode
              (default: False, uses Claude Code CLI)
            - ai_mode (str, optional): AI mode - auto, api, local, none (default: none)
            - json (bool, optional): Output JSON format (default: True)
            - markdown (bool, optional): Output Markdown format (default: True)

    Returns:
        List[TextContent]: Config extraction results with optional AI enhancements

    Example:
        extract_config_patterns(directory=".", output="output/configs")
        extract_config_patterns(directory="/path/to/repo", max_files=50, enhance_local=True)
    """
    directory = args.get("directory")
    if not directory:
        return _text("โŒ Error: directory parameter is required")

    output = args.get("output", "output/codebase/config_patterns")
    max_files = args.get("max_files", 100)
    enhance = args.get("enhance", False)
    enhance_local = args.get("enhance_local", False)
    ai_mode = args.get("ai_mode", "none")
    json_output = args.get("json", True)
    markdown_output = args.get("markdown", True)

    # One flag drives both the timeout and the progress message, so an
    # empty/None ai_mode is treated as "no AI" in both places.
    ai_requested = bool(enhance or enhance_local or (ai_mode and ai_mode != "none"))

    # Build command
    cmd = [sys.executable, "-m", "skill_seekers.cli.config_extractor"]
    cmd.extend(["--directory", directory])

    if output:
        cmd.extend(["--output", output])
    if max_files:
        cmd.extend(["--max-files", str(max_files)])
    if enhance:
        cmd.append("--enhance")
    if enhance_local:
        cmd.append("--enhance-local")
    if ai_mode and ai_mode != "none":
        cmd.extend(["--ai-mode", ai_mode])
    if json_output:
        cmd.append("--json")
    if markdown_output:
        cmd.append("--markdown")

    # 3 minutes base, 6 minutes with AI enhancement
    timeout = 360 if ai_requested else 180

    progress_msg = "โš™๏ธ Extracting configuration patterns...\n"
    progress_msg += f"๐Ÿ“ Directory: {directory}\n"
    progress_msg += f"๐Ÿ“„ Max files: {max_files}\n"
    if ai_requested:
        progress_msg += f"๐Ÿค– AI enhancement: {ai_mode if ai_mode != 'none' else ('api' if enhance else 'local')}\n"
    progress_msg += f"โฑ๏ธ Maximum time: {timeout // 60} minutes\n\n"

    return _run_and_report(cmd, timeout, progress_msg)


# Valid source types for the generic scraper
GENERIC_SOURCE_TYPES = (
    "jupyter",
    "html",
    "openapi",
    "asciidoc",
    "pptx",
    "confluence",
    "notion",
    "rss",
    "manpage",
    "chat",
)

# Source types whose primary input is a URL; for these a given ``url`` wins
# over ``path``. All other types prefer ``path``.
_URL_BASED_TYPES = {"confluence", "notion", "rss"}

# Friendly emoji labels per source type
_SOURCE_EMOJIS = {
    "jupyter": "๐Ÿ““",
    "html": "๐ŸŒ",
    "openapi": "๐Ÿ“ก",
    "asciidoc": "๐Ÿ“„",
    "pptx": "๐Ÿ“Š",
    "confluence": "๐Ÿข",
    "notion": "๐Ÿ“",
    "rss": "๐Ÿ“ฐ",
    "manpage": "๐Ÿ“–",
    "chat": "๐Ÿ’ฌ",
}

# Source type -> CLI module name, for types that do not follow "<type>_scraper".
_GENERIC_MODULE_NAMES = {
    "manpage": "man_scraper",
}

# Source type -> CLI flag for file/path input. Each scraper has its own flag name.
_GENERIC_PATH_FLAGS: dict[str, str] = {
    "jupyter": "--notebook",
    "html": "--html-path",
    "openapi": "--spec",
    "asciidoc": "--asciidoc-path",
    "pptx": "--pptx",
    "manpage": "--man-path",
    "confluence": "--export-path",
    "notion": "--export-path",
    "rss": "--feed-path",
    "chat": "--export-path",
}

# Source type -> CLI flag for URL input.
_GENERIC_URL_FLAGS: dict[str, str] = {
    "confluence": "--base-url",
    "notion": "--page-id",
    "rss": "--feed-url",
    "openapi": "--spec-url",
}


async def scrape_generic_tool(args: dict) -> list[TextContent]:
    """
    Generic scraper for new source types.

    Builds the subprocess command for the given source type and runs
    ``python -m skill_seekers.cli.<type>_scraper`` (``man_scraper`` for
    ``manpage``).

    Supported source types: jupyter, html, openapi, asciidoc, pptx,
    confluence, notion, rss, manpage, chat.

    Args:
        args: Dictionary containing:
            - source_type (str): One of the supported source types
            - path (str, optional): File or directory path (for file-based sources)
            - url (str, optional): URL (for URL-based sources like confluence, notion, rss)
            - name (str): Skill name for the output

    Returns:
        List[TextContent]: Tool execution results, or an error message when
        validation of source_type / path / url / name fails.
    """
    source_type = args.get("source_type", "")
    path = args.get("path")
    url = args.get("url")
    name = args.get("name")

    # Validate source_type
    if source_type not in GENERIC_SOURCE_TYPES:
        return _text(
            f"โŒ Error: Unknown source_type '{source_type}'. "
            f"Must be one of: {', '.join(GENERIC_SOURCE_TYPES)}"
        )

    # Validate that we have either path or url
    if not path and not url:
        return _text("โŒ Error: Must specify either 'path' (file/directory) or 'url'")

    if not name:
        return _text("โŒ Error: 'name' parameter is required")

    # Build the subprocess command
    module_name = _GENERIC_MODULE_NAMES.get(source_type, f"{source_type}_scraper")
    cmd = [sys.executable, "-m", f"skill_seekers.cli.{module_name}"]

    # Determine the input flag based on source type
    if source_type in _URL_BASED_TYPES and url:
        cmd.extend([_GENERIC_URL_FLAGS.get(source_type, "--url"), url])
    elif path:
        cmd.extend([_GENERIC_PATH_FLAGS.get(source_type, "--path"), path])
    elif url:
        # Allow url fallback for file-based types (some may accept URLs too)
        cmd.extend([_GENERIC_URL_FLAGS.get(source_type, "--url"), url])

    cmd.extend(["--name", name])

    # Set a reasonable timeout
    timeout = 600  # 10 minutes

    emoji = _SOURCE_EMOJIS.get(source_type, "๐Ÿ”ง")
    progress_msg = f"{emoji} Scraping {source_type} source...\n"
    if path:
        progress_msg += f"๐Ÿ“ Path: {path}\n"
    if url:
        progress_msg += f"๐Ÿ”— URL: {url}\n"
    progress_msg += f"๐Ÿ“› Name: {name}\n"
    progress_msg += f"โฑ๏ธ Maximum time: {timeout // 60} minutes\n\n"

    return _run_and_report(cmd, timeout, progress_msg)