skill-seekers-reference/src/skill_seekers/mcp/tools/scraping_tools.py
YusufKaraaslanSpyke 170dd0fd75 feat(C3.9): Add project documentation extraction from markdown files
- Scan ALL .md files in project (README, docs/, etc.)
- Smart categorization by folder/filename (overview, architecture, guides, etc.)
- Processing depth: surface=raw copy, deep=parse+summarize, full=AI-enhanced
- AI enhancement at level 2+ adds topic extraction and cross-references
- New "Project Documentation" section in SKILL.md with summaries
- Output to references/documentation/ organized by category
- Default ON, use --skip-docs to disable
- Add skip_docs parameter to MCP scrape_codebase_tool
- Add 15 new tests for markdown documentation features

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-31 13:54:56 +03:00


"""
Scraping Tools Module for MCP Server
This module contains all scraping-related MCP tool implementations:
- estimate_pages_tool: Estimate page count before scraping
- scrape_docs_tool: Scrape documentation (legacy or unified)
- scrape_github_tool: Scrape GitHub repositories
- scrape_pdf_tool: Scrape PDF documentation
- scrape_codebase_tool: Analyze local codebase and extract code knowledge
Extracted from server.py for better modularity and organization.
"""
import json
import sys
from pathlib import Path
# MCP types - with graceful fallback for testing
try:
from mcp.types import TextContent
except ImportError:
# Graceful degradation: Create a simple fallback class for testing
class TextContent:
"""Fallback TextContent for when MCP is not installed"""
def __init__(self, type: str, text: str):
self.type = type
self.text = text
# Path to CLI tools
CLI_DIR = Path(__file__).parent.parent.parent / "cli"
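# Given this module's location (src/skill_seekers/mcp/tools/), the three
# .parent hops resolve CLI_DIR to src/skill_seekers/cli/, where the CLI
# scripts invoked below live.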


def run_subprocess_with_streaming(cmd: list[str], timeout: int | None = None) -> tuple[str, str, int]:
    """
    Run subprocess with real-time output streaming.

    This solves the blocking issue where long-running processes (like scraping)
    would cause MCP to appear frozen. Now we stream output as it comes.

    Args:
        cmd: Command list to execute
        timeout: Optional timeout in seconds

    Returns:
        Tuple of (stdout, stderr, returncode)
    """
    import subprocess
    import time

    try:
        process = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            bufsize=1,  # Line buffered
            universal_newlines=True,
        )

        stdout_lines = []
        stderr_lines = []
        start_time = time.time()

        # Read output line by line as it comes
        while True:
            # Check timeout
            if timeout and (time.time() - start_time) > timeout:
                process.kill()
                stderr_lines.append(f"\n⚠️ Process killed after {timeout}s timeout")
                break

            # Check if process finished
            if process.poll() is not None:
                break

            # Read available output (non-blocking)
            try:
                import select

                readable, _, _ = select.select([process.stdout, process.stderr], [], [], 0.1)
                if process.stdout in readable:
                    line = process.stdout.readline()
                    if line:
                        stdout_lines.append(line)
                if process.stderr in readable:
                    line = process.stderr.readline()
                    if line:
                        stderr_lines.append(line)
            except Exception:
                # Fallback for Windows (select does not work on pipes there)
                time.sleep(0.1)

        # Get any remaining output
        remaining_stdout, remaining_stderr = process.communicate()
        if remaining_stdout:
            stdout_lines.append(remaining_stdout)
        if remaining_stderr:
            stderr_lines.append(remaining_stderr)

        stdout = "".join(stdout_lines)
        stderr = "".join(stderr_lines)
        returncode = process.returncode

        return stdout, stderr, returncode
    except Exception as e:
        return "", f"Error running subprocess: {str(e)}", 1


async def estimate_pages_tool(args: dict) -> list[TextContent]:
    """
    Estimate page count from a config file.

    Performs a fast preview without downloading content to estimate
    how many pages will be scraped.

    Args:
        args: Dictionary containing:
            - config_path (str): Path to config JSON file
            - max_discovery (int, optional): Maximum pages to discover (default: 1000)
            - unlimited (bool, optional): Remove discovery limit (default: False)

    Returns:
        List[TextContent]: Tool execution results
    """
    config_path = args["config_path"]
    max_discovery = args.get("max_discovery", 1000)
    unlimited = args.get("unlimited", False)

    # Handle unlimited mode
    if unlimited or max_discovery == -1:
        max_discovery = -1
        timeout = 1800  # 30 minutes for unlimited discovery
    else:
        # Estimate: 0.5s per page discovered
        timeout = max(300, max_discovery // 2)  # Minimum 5 minutes

    # Run estimate_pages.py
    cmd = [
        sys.executable,
        str(CLI_DIR / "estimate_pages.py"),
        config_path,
        "--max-discovery",
        str(max_discovery),
    ]

    progress_msg = "🔄 Estimating page count...\n"
    progress_msg += f"⏱️ Maximum time: {timeout // 60} minutes\n\n"

    stdout, stderr, returncode = run_subprocess_with_streaming(cmd, timeout=timeout)

    output = progress_msg + stdout
    if returncode == 0:
        return [TextContent(type="text", text=output)]
    else:
        return [TextContent(type="text", text=f"{output}\n\n❌ Error:\n{stderr}")]


async def scrape_docs_tool(args: dict) -> list[TextContent]:
    """
    Scrape documentation and build skill.

    Auto-detects unified vs legacy format and routes to the appropriate scraper.
    Supports both single-source (legacy) and unified multi-source configs.
    Creates SKILL.md and reference files.

    Args:
        args: Dictionary containing:
            - config_path (str): Path to config JSON file
            - unlimited (bool, optional): Remove page limit (default: False)
            - enhance_local (bool, optional): Open terminal for local enhancement (default: False)
            - skip_scrape (bool, optional): Skip scraping, use cached data (default: False)
            - dry_run (bool, optional): Preview without saving (default: False)
            - merge_mode (str, optional): Override merge mode for unified configs

    Returns:
        List[TextContent]: Tool execution results
    """
    config_path = args["config_path"]
    unlimited = args.get("unlimited", False)
    enhance_local = args.get("enhance_local", False)
    skip_scrape = args.get("skip_scrape", False)
    dry_run = args.get("dry_run", False)
    merge_mode = args.get("merge_mode")

    # Load config to detect format
    with open(config_path) as f:
        config = json.load(f)

    # Detect if unified format (has 'sources' array)
    is_unified = "sources" in config and isinstance(config["sources"], list)

    # Handle unlimited mode by modifying config temporarily
    if unlimited:
        # Set max_pages to None (unlimited)
        if is_unified:
            # For unified configs, set max_pages on documentation sources
            for source in config.get("sources", []):
                if source.get("type") == "documentation":
                    source["max_pages"] = None
        else:
            # For legacy configs
            config["max_pages"] = None

        # Create temporary config file
        temp_config_path = config_path.replace(".json", "_unlimited_temp.json")
        with open(temp_config_path, "w") as f:
            json.dump(config, f, indent=2)
        config_to_use = temp_config_path
    else:
        config_to_use = config_path

    # Choose scraper based on format
    if is_unified:
        scraper_script = "unified_scraper.py"
        progress_msg = "🔄 Starting unified multi-source scraping...\n"
        progress_msg += "📦 Config format: Unified (multiple sources)\n"
    else:
        scraper_script = "doc_scraper.py"
        progress_msg = "🔄 Starting scraping process...\n"
        progress_msg += "📦 Config format: Legacy (single source)\n"

    # Build command
    cmd = [sys.executable, str(CLI_DIR / scraper_script), "--config", config_to_use]

    # Add merge mode for unified configs
    if is_unified and merge_mode:
        cmd.extend(["--merge-mode", merge_mode])

    # Add --fresh to avoid user input prompts when existing data is found
    if not skip_scrape:
        cmd.append("--fresh")
    if enhance_local:
        cmd.append("--enhance-local")
    if skip_scrape:
        cmd.append("--skip-scrape")
    if dry_run:
        cmd.append("--dry-run")

    # Determine timeout based on operation type
    if dry_run:
        timeout = 300  # 5 minutes for dry run
    elif skip_scrape:
        timeout = 600  # 10 minutes for building from cache
    elif unlimited:
        timeout = None  # No timeout for unlimited mode (user explicitly requested)
    else:
        # Read config to estimate timeout
        try:
            if is_unified:
                # For unified configs, estimate based on all sources
                total_pages = 0
                for source in config.get("sources", []):
                    if source.get("type") == "documentation":
                        total_pages += source.get("max_pages", 500)
                max_pages = total_pages or 500
            else:
                max_pages = config.get("max_pages", 500)
            # Estimate: ~30s per page plus buffer, so 35s per page
            timeout = max(3600, max_pages * 35)  # Minimum 1 hour
        except Exception:
            timeout = 14400  # Default: 4 hours

    # Add progress message
    if timeout:
        progress_msg += f"⏱️ Maximum time allowed: {timeout // 60} minutes\n"
    else:
        progress_msg += "⏱️ Unlimited mode - no timeout\n"
    progress_msg += "📝 Progress will be shown below:\n\n"

    # Run scraper with streaming
    stdout, stderr, returncode = run_subprocess_with_streaming(cmd, timeout=timeout)

    # Clean up temporary config
    if unlimited and Path(config_to_use).exists():
        Path(config_to_use).unlink()

    output = progress_msg + stdout
    if returncode == 0:
        return [TextContent(type="text", text=output)]
    else:
        error_output = output + f"\n\n❌ Error:\n{stderr}"
        return [TextContent(type="text", text=error_output)]


async def scrape_pdf_tool(args: dict) -> list[TextContent]:
    """
    Scrape PDF documentation and build Claude skill.

    Extracts text, code, and images from PDF files and builds
    a skill package with organized references.

    Args:
        args: Dictionary containing:
            - config_path (str, optional): Path to PDF config JSON file
            - pdf_path (str, optional): Direct PDF path (alternative to config_path)
            - name (str, optional): Skill name (required with pdf_path)
            - description (str, optional): Skill description
            - from_json (str, optional): Build from extracted JSON file

    Returns:
        List[TextContent]: Tool execution results
    """
    config_path = args.get("config_path")
    pdf_path = args.get("pdf_path")
    name = args.get("name")
    description = args.get("description")
    from_json = args.get("from_json")

    # Build command
    cmd = [sys.executable, str(CLI_DIR / "pdf_scraper.py")]

    # Mode 1: Config file
    if config_path:
        cmd.extend(["--config", config_path])
    # Mode 2: Direct PDF
    elif pdf_path and name:
        cmd.extend(["--pdf", pdf_path, "--name", name])
        if description:
            cmd.extend(["--description", description])
    # Mode 3: From JSON
    elif from_json:
        cmd.extend(["--from-json", from_json])
    else:
        return [
            TextContent(
                type="text",
                text="❌ Error: Must specify config_path, pdf_path + name, or from_json",
            )
        ]

    # Run pdf_scraper.py with streaming (can take a while)
    timeout = 600  # 10 minutes for PDF extraction
    progress_msg = "📄 Scraping PDF documentation...\n"
    progress_msg += f"⏱️ Maximum time: {timeout // 60} minutes\n\n"

    stdout, stderr, returncode = run_subprocess_with_streaming(cmd, timeout=timeout)

    output = progress_msg + stdout
    if returncode == 0:
        return [TextContent(type="text", text=output)]
    else:
        return [TextContent(type="text", text=f"{output}\n\n❌ Error:\n{stderr}")]


async def scrape_github_tool(args: dict) -> list[TextContent]:
    """
    Scrape GitHub repository and build Claude skill.

    Extracts README, Issues, Changelog, Releases, and code structure
    from GitHub repositories to create comprehensive skills.

    Args:
        args: Dictionary containing:
            - repo (str, optional): GitHub repository (owner/repo)
            - config_path (str, optional): Path to GitHub config JSON file
            - name (str, optional): Skill name (default: repo name)
            - description (str, optional): Skill description
            - token (str, optional): GitHub personal access token
            - no_issues (bool, optional): Skip GitHub issues extraction (default: False)
            - no_changelog (bool, optional): Skip CHANGELOG extraction (default: False)
            - no_releases (bool, optional): Skip releases extraction (default: False)
            - max_issues (int, optional): Maximum issues to fetch (default: 100)
            - scrape_only (bool, optional): Only scrape, don't build skill (default: False)

    Returns:
        List[TextContent]: Tool execution results
    """
    repo = args.get("repo")
    config_path = args.get("config_path")
    name = args.get("name")
    description = args.get("description")
    token = args.get("token")
    no_issues = args.get("no_issues", False)
    no_changelog = args.get("no_changelog", False)
    no_releases = args.get("no_releases", False)
    max_issues = args.get("max_issues", 100)
    scrape_only = args.get("scrape_only", False)

    # Build command
    cmd = [sys.executable, str(CLI_DIR / "github_scraper.py")]

    # Mode 1: Config file
    if config_path:
        cmd.extend(["--config", config_path])
    # Mode 2: Direct repo
    elif repo:
        cmd.extend(["--repo", repo])
        if name:
            cmd.extend(["--name", name])
        if description:
            cmd.extend(["--description", description])
        if token:
            cmd.extend(["--token", token])
        if no_issues:
            cmd.append("--no-issues")
        if no_changelog:
            cmd.append("--no-changelog")
        if no_releases:
            cmd.append("--no-releases")
        if max_issues != 100:
            cmd.extend(["--max-issues", str(max_issues)])
        if scrape_only:
            cmd.append("--scrape-only")
    else:
        return [TextContent(type="text", text="❌ Error: Must specify repo or config_path")]

    # Run github_scraper.py with streaming (can take a while)
    timeout = 600  # 10 minutes for GitHub scraping
    progress_msg = "🐙 Scraping GitHub repository...\n"
    progress_msg += f"⏱️ Maximum time: {timeout // 60} minutes\n\n"

    stdout, stderr, returncode = run_subprocess_with_streaming(cmd, timeout=timeout)

    output = progress_msg + stdout
    if returncode == 0:
        return [TextContent(type="text", text=output)]
    else:
        return [TextContent(type="text", text=f"{output}\n\n❌ Error:\n{stderr}")]
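
# Illustrative call ("owner/repo" is a placeholder); unset optional args
# simply fall back to the CLI defaults:
#
#     await scrape_github_tool({"repo": "owner/repo", "max_issues": 50,
#                               "no_releases": True})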


async def scrape_codebase_tool(args: dict) -> list[TextContent]:
    """
    Analyze local codebase and extract code knowledge.

    Walks the directory tree, analyzes code files, extracts signatures and
    docstrings, and generates API reference documentation, dependency graphs,
    design patterns, test examples, and how-to guides.

    All features are ON by default. Use skip_* parameters to disable specific features.

    Args:
        args: Dictionary containing:
            - directory (str): Directory to analyze
            - output (str, optional): Output directory for results (default: output/codebase/)
            - depth (str, optional): Analysis depth - surface, deep, full (default: deep)
            - languages (str, optional): Comma-separated languages (e.g., "Python,JavaScript,C++")
            - file_patterns (str, optional): Comma-separated file patterns (e.g., "*.py,src/**/*.js")
            - enhance_level (int, optional): AI enhancement level 0-3 (default: 0)
                - 0: No AI enhancement
                - 1: SKILL.md enhancement only
                - 2: SKILL.md + Architecture + Config enhancement
                - 3: Full enhancement (patterns, tests, config, architecture, SKILL.md)
            - skip_api_reference (bool, optional): Skip API reference generation (default: False)
            - skip_dependency_graph (bool, optional): Skip dependency graph (default: False)
            - skip_patterns (bool, optional): Skip design pattern detection (default: False)
            - skip_test_examples (bool, optional): Skip test example extraction (default: False)
            - skip_how_to_guides (bool, optional): Skip how-to guide generation (default: False)
            - skip_config_patterns (bool, optional): Skip config pattern extraction (default: False)
            - skip_docs (bool, optional): Skip project documentation extraction (default: False)

    Returns:
        List[TextContent]: Tool execution results

    Example:
        scrape_codebase(
            directory="/path/to/repo",
            depth="deep",
            enhance_level=1
        )
        scrape_codebase(
            directory="/path/to/repo",
            enhance_level=2,
            skip_patterns=True
        )
    """
    directory = args.get("directory")
    if not directory:
        return [TextContent(type="text", text="❌ Error: directory parameter is required")]

    output = args.get("output", "output/codebase/")
    depth = args.get("depth", "deep")
    languages = args.get("languages", "")
    file_patterns = args.get("file_patterns", "")
    enhance_level = args.get("enhance_level", 0)

    # Skip flags (features are ON by default)
    skip_api_reference = args.get("skip_api_reference", False)
    skip_dependency_graph = args.get("skip_dependency_graph", False)
    skip_patterns = args.get("skip_patterns", False)
    skip_test_examples = args.get("skip_test_examples", False)
    skip_how_to_guides = args.get("skip_how_to_guides", False)
    skip_config_patterns = args.get("skip_config_patterns", False)
    skip_docs = args.get("skip_docs", False)

    # Build command
    cmd = [sys.executable, "-m", "skill_seekers.cli.codebase_scraper"]
    cmd.extend(["--directory", directory])
    if output:
        cmd.extend(["--output", output])
    if depth:
        cmd.extend(["--depth", depth])
    if languages:
        cmd.extend(["--languages", languages])
    if file_patterns:
        cmd.extend(["--file-patterns", file_patterns])
    if enhance_level > 0:
        cmd.extend(["--enhance-level", str(enhance_level)])

    # Skip flags
    if skip_api_reference:
        cmd.append("--skip-api-reference")
    if skip_dependency_graph:
        cmd.append("--skip-dependency-graph")
    if skip_patterns:
        cmd.append("--skip-patterns")
    if skip_test_examples:
        cmd.append("--skip-test-examples")
    if skip_how_to_guides:
        cmd.append("--skip-how-to-guides")
    if skip_config_patterns:
        cmd.append("--skip-config-patterns")
    if skip_docs:
        cmd.append("--skip-docs")

    # Adjust timeout based on enhance_level
    timeout = 600  # 10 minutes base
    if enhance_level >= 2:
        timeout = 1200  # 20 minutes with AI enhancement
    if enhance_level >= 3:
        timeout = 3600  # 60 minutes for full enhancement

    level_names = {0: "off", 1: "SKILL.md only", 2: "standard", 3: "full"}
    progress_msg = "🔍 Analyzing local codebase...\n"
    progress_msg += f"📁 Directory: {directory}\n"
    progress_msg += f"📊 Depth: {depth}\n"
    if enhance_level > 0:
        progress_msg += (
            f"🤖 AI Enhancement: Level {enhance_level} ({level_names.get(enhance_level, 'unknown')})\n"
        )
    progress_msg += f"⏱️ Maximum time: {timeout // 60} minutes\n\n"

    stdout, stderr, returncode = run_subprocess_with_streaming(cmd, timeout=timeout)

    output_text = progress_msg + stdout
    if returncode == 0:
        return [TextContent(type="text", text=output_text)]
    else:
        return [TextContent(type="text", text=f"{output_text}\n\n❌ Error:\n{stderr}")]
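
# Project documentation extraction (C3.9) is on by default; like every other
# feature it is opt-out via its skip flag (directory path hypothetical):
#
#     await scrape_codebase_tool({"directory": "/path/to/repo", "skip_docs": True})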


async def detect_patterns_tool(args: dict) -> list[TextContent]:
    """
    Detect design patterns in source code.

    Analyzes source files or directories to detect common design patterns
    (Singleton, Factory, Observer, Strategy, Decorator, Builder, Adapter,
    Command, Template Method, Chain of Responsibility).

    Supported languages: Python, JavaScript, TypeScript, C++, C, C#,
    Go, Rust, Java, Ruby, PHP.

    Args:
        args: Dictionary containing:
            - file (str, optional): Single file to analyze
            - directory (str, optional): Directory to analyze (analyzes all source files)
            - output (str, optional): Output directory for JSON results
            - depth (str, optional): Detection depth - surface, deep, full (default: deep)
            - json (bool, optional): Output JSON format (default: False)

    Returns:
        List[TextContent]: Pattern detection results

    Example:
        detect_patterns(file="src/database.py", depth="deep")
        detect_patterns(directory="src/", output="patterns/", json=True)
    """
    file_path = args.get("file")
    directory = args.get("directory")
    if not file_path and not directory:
        return [
            TextContent(
                type="text", text="❌ Error: Must specify either 'file' or 'directory' parameter"
            )
        ]

    output = args.get("output", "")
    depth = args.get("depth", "deep")
    json_output = args.get("json", False)

    # Build command
    cmd = [sys.executable, "-m", "skill_seekers.cli.pattern_recognizer"]
    if file_path:
        cmd.extend(["--file", file_path])
    if directory:
        cmd.extend(["--directory", directory])
    if output:
        cmd.extend(["--output", output])
    if depth:
        cmd.extend(["--depth", depth])
    if json_output:
        cmd.append("--json")

    timeout = 300  # 5 minutes for pattern detection
    progress_msg = "🔍 Detecting design patterns...\n"
    if file_path:
        progress_msg += f"📄 File: {file_path}\n"
    if directory:
        progress_msg += f"📁 Directory: {directory}\n"
    progress_msg += f"🎯 Detection depth: {depth}\n"
    progress_msg += f"⏱️ Maximum time: {timeout // 60} minutes\n\n"

    stdout, stderr, returncode = run_subprocess_with_streaming(cmd, timeout=timeout)

    output_text = progress_msg + stdout
    if returncode == 0:
        return [TextContent(type="text", text=output_text)]
    else:
        return [TextContent(type="text", text=f"{output_text}\n\n❌ Error:\n{stderr}")]
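
# The args above translate into a CLI invocation of this shape (mirrors the
# docstring's first example):
#
#     python -m skill_seekers.cli.pattern_recognizer \
#         --file src/database.py --depth deep --json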


async def extract_test_examples_tool(args: dict) -> list[TextContent]:
    """
    Extract usage examples from test files.

    Analyzes test files to extract real API usage patterns including:
    - Object instantiation with real parameters
    - Method calls with expected behaviors
    - Configuration examples
    - Setup patterns from fixtures/setUp()
    - Multi-step workflows from integration tests

    Supports 9 languages: Python (AST-based deep analysis), JavaScript,
    TypeScript, Go, Rust, Java, C#, PHP, Ruby (regex-based).

    Args:
        args: Dictionary containing:
            - file (str, optional): Single test file to analyze
            - directory (str, optional): Directory containing test files
            - language (str, optional): Filter by language (python, javascript, etc.)
            - min_confidence (float, optional): Minimum confidence threshold 0.0-1.0 (default: 0.5)
            - max_per_file (int, optional): Maximum examples per file (default: 10)
            - json (bool, optional): Output JSON format (default: False)
            - markdown (bool, optional): Output Markdown format (default: False)

    Returns:
        List[TextContent]: Extracted test examples

    Example:
        extract_test_examples(directory="tests/", language="python")
        extract_test_examples(file="tests/test_scraper.py", json=True)
    """
    file_path = args.get("file")
    directory = args.get("directory")
    if not file_path and not directory:
        return [
            TextContent(
                type="text", text="❌ Error: Must specify either 'file' or 'directory' parameter"
            )
        ]

    language = args.get("language", "")
    min_confidence = args.get("min_confidence", 0.5)
    max_per_file = args.get("max_per_file", 10)
    json_output = args.get("json", False)
    markdown_output = args.get("markdown", False)

    # Build command (the directory is passed positionally)
    cmd = [sys.executable, "-m", "skill_seekers.cli.test_example_extractor"]
    if directory:
        cmd.append(directory)
    if file_path:
        cmd.extend(["--file", file_path])
    if language:
        cmd.extend(["--language", language])
    # Note: falsy values (0 / 0.0) fall through to the CLI defaults here
    if min_confidence:
        cmd.extend(["--min-confidence", str(min_confidence)])
    if max_per_file:
        cmd.extend(["--max-per-file", str(max_per_file)])
    if json_output:
        cmd.append("--json")
    if markdown_output:
        cmd.append("--markdown")

    timeout = 180  # 3 minutes for test example extraction
    progress_msg = "🧪 Extracting usage examples from test files...\n"
    if file_path:
        progress_msg += f"📄 File: {file_path}\n"
    if directory:
        progress_msg += f"📁 Directory: {directory}\n"
    if language:
        progress_msg += f"🔤 Language: {language}\n"
    progress_msg += f"🎯 Min confidence: {min_confidence}\n"
    progress_msg += f"📊 Max per file: {max_per_file}\n"
    progress_msg += f"⏱️ Maximum time: {timeout // 60} minutes\n\n"

    stdout, stderr, returncode = run_subprocess_with_streaming(cmd, timeout=timeout)

    output_text = progress_msg + stdout
    if returncode == 0:
        return [TextContent(type="text", text=output_text)]
    else:
        return [TextContent(type="text", text=f"{output_text}\n\n❌ Error:\n{stderr}")]
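
# Equivalent CLI invocation built from the docstring's first example, with
# the defaults made explicit (directory is positional):
#
#     python -m skill_seekers.cli.test_example_extractor tests/ \
#         --language python --min-confidence 0.5 --max-per-file 10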


async def build_how_to_guides_tool(args: dict) -> list[TextContent]:
    """
    Build how-to guides from workflow test examples.

    Transforms workflow examples extracted from test files into step-by-step
    educational guides. Automatically groups related workflows, extracts steps,
    and generates comprehensive markdown guides.

    Features:
    - Python AST-based step extraction (heuristic for other languages)
    - 4 grouping strategies: ai-tutorial-group, file-path, test-name, complexity
    - Detects prerequisites, setup code, and verification points
    - Generates troubleshooting tips and next steps
    - Creates index with difficulty levels

    Args:
        args: Dictionary containing:
            - input (str): Path to test_examples.json from extract_test_examples
            - output (str, optional): Output directory for guides (default: output/codebase/tutorials)
            - group_by (str, optional): Grouping strategy - ai-tutorial-group, file-path, test-name, complexity
            - no_ai (bool, optional): Disable AI enhancement for grouping (default: False)
            - json_output (bool, optional): Output JSON format alongside markdown (default: False)

    Returns:
        List[TextContent]: Guide building results

    Example:
        build_how_to_guides(
            input="output/codebase/test_examples/test_examples.json",
            group_by="ai-tutorial-group",
            output="output/codebase/tutorials"
        )
    """
    input_file = args.get("input")
    if not input_file:
        return [
            TextContent(
                type="text",
                text="❌ Error: input parameter is required (path to test_examples.json)",
            )
        ]

    output = args.get("output", "output/codebase/tutorials")
    group_by = args.get("group_by", "ai-tutorial-group")
    no_ai = args.get("no_ai", False)
    json_output = args.get("json_output", False)

    # Build command
    cmd = [sys.executable, "-m", "skill_seekers.cli.how_to_guide_builder"]
    cmd.append(input_file)
    if output:
        cmd.extend(["--output", output])
    if group_by:
        cmd.extend(["--group-by", group_by])
    if no_ai:
        cmd.append("--no-ai")
    if json_output:
        cmd.append("--json-output")

    timeout = 180  # 3 minutes for guide building
    progress_msg = "📚 Building how-to guides from workflow examples...\n"
    progress_msg += f"📄 Input: {input_file}\n"
    progress_msg += f"📁 Output: {output}\n"
    progress_msg += f"🔀 Grouping: {group_by}\n"
    if no_ai:
        progress_msg += "🚫 AI enhancement disabled\n"
    progress_msg += f"⏱️ Maximum time: {timeout // 60} minutes\n\n"

    stdout, stderr, returncode = run_subprocess_with_streaming(cmd, timeout=timeout)

    output_text = progress_msg + stdout
    if returncode == 0:
        return [TextContent(type="text", text=output_text)]
    else:
        return [TextContent(type="text", text=f"{output_text}\n\n❌ Error:\n{stderr}")]
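
# This tool chains after extract_test_examples_tool; the input path below is
# the location used in the docstring example above:
#
#     await extract_test_examples_tool({"directory": "tests/", "json": True})
#     await build_how_to_guides_tool(
#         {"input": "output/codebase/test_examples/test_examples.json"})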


async def extract_config_patterns_tool(args: dict) -> list[TextContent]:
    """
    Extract configuration patterns from config files (C3.4).

    Analyzes configuration files in the codebase to extract settings,
    detect common patterns (database, API, logging, cache, etc.), and
    generate comprehensive documentation.

    Supports 9 config formats: JSON, YAML, TOML, ENV, INI, Python modules,
    JavaScript/TypeScript configs, Dockerfile, Docker Compose.

    Detects 7 common patterns:
    - Database configuration (host, port, credentials)
    - API configuration (endpoints, keys, timeouts)
    - Logging configuration (level, format, handlers)
    - Cache configuration (backend, TTL, keys)
    - Email configuration (SMTP, credentials)
    - Authentication configuration (providers, secrets)
    - Server configuration (host, port, workers)

    Args:
        args: Dictionary containing:
            - directory (str): Directory to analyze
            - output (str, optional): Output directory (default: output/codebase/config_patterns)
            - max_files (int, optional): Maximum config files to process (default: 100)
            - enhance (bool, optional): Enable AI enhancement - API mode (default: False, requires ANTHROPIC_API_KEY)
            - enhance_local (bool, optional): Enable AI enhancement - LOCAL mode (default: False, uses Claude Code CLI)
            - ai_mode (str, optional): AI mode - auto, api, local, none (default: none)
            - json (bool, optional): Output JSON format (default: True)
            - markdown (bool, optional): Output Markdown format (default: True)

    Returns:
        List[TextContent]: Config extraction results with optional AI enhancements

    Example:
        extract_config_patterns(directory=".", output="output/configs")
        extract_config_patterns(directory="/path/to/repo", max_files=50, enhance_local=True)
    """
    directory = args.get("directory")
    if not directory:
        return [TextContent(type="text", text="❌ Error: directory parameter is required")]

    output = args.get("output", "output/codebase/config_patterns")
    max_files = args.get("max_files", 100)
    enhance = args.get("enhance", False)
    enhance_local = args.get("enhance_local", False)
    ai_mode = args.get("ai_mode", "none")
    json_output = args.get("json", True)
    markdown_output = args.get("markdown", True)

    # Build command
    cmd = [sys.executable, "-m", "skill_seekers.cli.config_extractor"]
    cmd.extend(["--directory", directory])
    if output:
        cmd.extend(["--output", output])
    if max_files:
        cmd.extend(["--max-files", str(max_files)])
    if enhance:
        cmd.append("--enhance")
    if enhance_local:
        cmd.append("--enhance-local")
    if ai_mode and ai_mode != "none":
        cmd.extend(["--ai-mode", ai_mode])
    if json_output:
        cmd.append("--json")
    if markdown_output:
        cmd.append("--markdown")

    # Adjust timeout for AI enhancement
    timeout = 180  # 3 minutes base
    if enhance or enhance_local or ai_mode != "none":
        timeout = 360  # 6 minutes with AI enhancement

    progress_msg = "⚙️ Extracting configuration patterns...\n"
    progress_msg += f"📁 Directory: {directory}\n"
    progress_msg += f"📄 Max files: {max_files}\n"
    if enhance or enhance_local or (ai_mode and ai_mode != "none"):
        progress_msg += f"🤖 AI enhancement: {ai_mode if ai_mode != 'none' else ('api' if enhance else 'local')}\n"
    progress_msg += f"⏱️ Maximum time: {timeout // 60} minutes\n\n"

    stdout, stderr, returncode = run_subprocess_with_streaming(cmd, timeout=timeout)

    output_text = progress_msg + stdout
    if returncode == 0:
        return [TextContent(type="text", text=output_text)]
    else:
        return [TextContent(type="text", text=f"{output_text}\n\n❌ Error:\n{stderr}")]
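
# Illustrative end-of-pipeline call; with AI enhancement requested the timeout
# doubles as coded above (enhance=True requires ANTHROPIC_API_KEY):
#
#     await extract_config_patterns_tool({"directory": ".", "enhance": True})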