Files
skill-seekers-reference/src/skill_seekers/cli/utils.py
YusufKaraaslanSpyke 62071c4aa9 feat: add video tutorial scraping pipeline with per-panel OCR and AI enhancement
Add complete video tutorial extraction system that converts YouTube videos
and local video files into AI-consumable skills. The pipeline extracts
transcripts, performs visual OCR on code editor panels independently,
tracks code evolution across frames, and generates structured SKILL.md output.

Key features:
- Video metadata extraction (YouTube, local files, playlists)
- Multi-source transcript extraction (YouTube API, yt-dlp, Whisper fallback)
- Chapter-based and time-window segmentation
- Visual extraction: keyframe detection, frame classification, panel detection
- Per-panel sub-section OCR (each IDE panel OCR'd independently)
- Parallel OCR with ThreadPoolExecutor for multi-panel frames
- Narrow panel filtering (300px min width) to skip UI chrome
- Text block tracking with spatial panel position matching
- Code timeline with edit tracking across frames
- Audio-visual alignment (code + narrator pairs)
- Video-specific AI enhancement prompt for OCR denoising and code reconstruction
- video-tutorial.yaml workflow with 4 stages (OCR cleanup, language detection,
  tutorial synthesis, skill polish)
- CLI integration: skill-seekers video --url/--video-file/--playlist
- MCP tool: scrape_video for automation
- 161 tests passing

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-27 23:10:19 +03:00

453 lines
14 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
Utility functions for Skill Seeker CLI tools
"""
import logging
import os
import platform
import subprocess
import time
from collections.abc import Callable
from pathlib import Path
from typing import TypeVar
logger = logging.getLogger(__name__)
T = TypeVar("T")
def setup_logging(verbose: bool = False, quiet: bool = False) -> None:
    """Configure root logging level from CLI verbosity flags.

    Args:
        verbose: Enable DEBUG level logging.
        quiet: Enable WARNING level logging only (suppresses INFO);
            takes precedence over ``verbose``.
    """
    if quiet:
        chosen_level = logging.WARNING
    else:
        chosen_level = logging.DEBUG if verbose else logging.INFO
    # force=True replaces any handlers installed by an earlier basicConfig call
    logging.basicConfig(level=chosen_level, format="%(message)s", force=True)
def open_folder(folder_path: str | Path) -> bool:
"""
Open a folder in the system file browser
Args:
folder_path: Path to folder to open
Returns:
bool: True if successful, False otherwise
"""
folder_path = Path(folder_path).resolve()
if not folder_path.exists():
print(f"⚠️ Folder not found: {folder_path}")
return False
system = platform.system()
try:
if system == "Linux":
# Try xdg-open first (standard)
subprocess.run(["xdg-open", str(folder_path)], check=True)
elif system == "Darwin": # macOS
subprocess.run(["open", str(folder_path)], check=True)
elif system == "Windows":
subprocess.run(["explorer", str(folder_path)], check=True)
else:
print(f"⚠️ Unknown operating system: {system}")
return False
return True
except subprocess.CalledProcessError:
print("⚠️ Could not open folder automatically")
return False
except FileNotFoundError:
print("⚠️ File browser not found on system")
return False
def has_api_key() -> bool:
    """
    Check whether ANTHROPIC_API_KEY is set to a non-blank value.

    Returns:
        bool: True if the API key is set (ignoring surrounding whitespace),
        False otherwise.
    """
    return bool(os.environ.get("ANTHROPIC_API_KEY", "").strip())
def get_api_key() -> str | None:
"""
Get ANTHROPIC_API_KEY from environment
Returns:
str: API key or None if not set
"""
api_key = os.environ.get("ANTHROPIC_API_KEY", "").strip()
return api_key if api_key else None
def get_upload_url() -> str:
    """
    Return the Claude skills upload page URL.

    Returns:
        str: Claude skills upload URL
    """
    return "https://claude.ai/skills"
def print_upload_instructions(zip_path: str | Path) -> None:
    """
    Print clear step-by-step instructions for manually uploading a skill.

    Args:
        zip_path: Path to the .zip file to upload
    """
    zip_path = Path(zip_path)
    url = get_upload_url()
    banner = [
        "",
        "╔══════════════════════════════════════════════════════════╗",
        "║ NEXT STEP ║",
        "╚══════════════════════════════════════════════════════════╝",
        "",
        f"📤 Upload to Claude: {url}",
        "",
        f"1. Go to {url}",
        '2. Click "Upload Skill"',
        f"3. Select: {zip_path}",
        "4. Done! ✅",
        "",
    ]
    for line in banner:
        print(line)
def format_file_size(size_bytes: int) -> str:
    """
    Format file size in human-readable format.

    Sizes below 1 KB are shown in bytes, then KB, MB, and GB with one
    decimal place. The GB tier keeps very large files readable instead of
    e.g. "2048.0 MB".

    Args:
        size_bytes: Size in bytes

    Returns:
        str: Formatted size (e.g., "45.3 KB")
    """
    kb = 1024
    mb = kb * 1024
    gb = mb * 1024
    if size_bytes < kb:
        return f"{size_bytes} bytes"
    if size_bytes < mb:
        return f"{size_bytes / kb:.1f} KB"
    if size_bytes < gb:
        return f"{size_bytes / mb:.1f} MB"
    return f"{size_bytes / gb:.1f} GB"
def validate_skill_directory(skill_dir: str | Path) -> tuple[bool, str | None]:
"""
Validate that a directory is a valid skill directory
Args:
skill_dir: Path to skill directory
Returns:
tuple: (is_valid, error_message)
"""
skill_path = Path(skill_dir)
if not skill_path.exists():
return False, f"Directory not found: {skill_dir}"
if not skill_path.is_dir():
return False, f"Not a directory: {skill_dir}"
skill_md = skill_path / "SKILL.md"
if not skill_md.exists():
return False, f"SKILL.md not found in {skill_dir}"
return True, None
def validate_zip_file(zip_path: str | Path) -> tuple[bool, str | None]:
"""
Validate that a file is a valid skill .zip file
Args:
zip_path: Path to .zip file
Returns:
tuple: (is_valid, error_message)
"""
zip_path = Path(zip_path)
if not zip_path.exists():
return False, f"File not found: {zip_path}"
if not zip_path.is_file():
return False, f"Not a file: {zip_path}"
if zip_path.suffix != ".zip":
return False, f"Not a .zip file: {zip_path}"
return True, None
def read_reference_files(
skill_dir: str | Path, max_chars: int = 100000, preview_limit: int = 40000
) -> dict[str, dict]:
"""Read reference files from a skill directory with enriched metadata.
This function reads markdown files from the references/ subdirectory
of a skill, applying both per-file and total content limits.
Returns enriched metadata including source type, confidence, and path.
Args:
skill_dir (str or Path): Path to skill directory
max_chars (int): Maximum total characters to read (default: 100000)
preview_limit (int): Maximum characters per file (default: 40000)
Returns:
dict: Dictionary mapping filename to metadata dict with keys:
- 'content': File content
- 'source': Source type (documentation/github/pdf/api/codebase_analysis)
- 'confidence': Confidence level (high/medium/low)
- 'path': Relative path from references directory
- 'repo_id': Repository identifier for multi-source (e.g., 'encode_httpx'), None for single-source
Example:
>>> refs = read_reference_files('output/react/', max_chars=50000)
>>> refs['documentation/api.md']['source']
'documentation'
>>> refs['documentation/api.md']['confidence']
'high'
"""
from pathlib import Path
skill_path = Path(skill_dir)
references_dir = skill_path / "references"
references: dict[str, dict] = {}
if not references_dir.exists():
print(f"⚠ No references directory found at {references_dir}")
return references
def _determine_source_metadata(relative_path: Path) -> tuple[str, str, str | None]:
"""Determine source type, confidence level, and repo_id from path.
For multi-source support, extracts repo_id from paths like:
- codebase_analysis/encode_httpx/ARCHITECTURE.md -> repo_id='encode_httpx'
- github/README.md -> repo_id=None (single source)
Returns:
tuple: (source_type, confidence_level, repo_id)
"""
path_str = str(relative_path)
repo_id = None # Default: no repo identity
# Documentation sources (official docs)
if path_str.startswith("documentation/"):
return "documentation", "high", None
# GitHub sources
elif path_str.startswith("github/"):
# README and releases are medium confidence
if "README" in path_str or "releases" in path_str:
return "github", "medium", None
# Issues are low confidence (user reports)
elif "issues" in path_str:
return "github", "low", None
else:
return "github", "medium", None
# PDF sources (books, manuals)
elif path_str.startswith("pdf/"):
return "pdf", "high", None
# Merged API (synthesized from multiple sources)
elif path_str.startswith("api/"):
return "api", "high", None
# Codebase analysis (C3.x automated analysis)
elif path_str.startswith("codebase_analysis/"):
# Extract repo_id from path: codebase_analysis/{repo_id}/...
parts = Path(path_str).parts
if len(parts) >= 2:
repo_id = parts[1] # e.g., 'encode_httpx', 'encode_httpcore'
# ARCHITECTURE.md is high confidence (comprehensive)
if "ARCHITECTURE" in path_str:
return "codebase_analysis", "high", repo_id
# Patterns and examples are medium (heuristic-based)
elif "patterns" in path_str or "examples" in path_str:
return "codebase_analysis", "medium", repo_id
# Configuration is high (direct extraction)
elif "configuration" in path_str:
return "codebase_analysis", "high", repo_id
else:
return "codebase_analysis", "medium", repo_id
# Video tutorial sources (video_*.md from video scraper)
elif relative_path.name.startswith("video_"):
return "video_tutorial", "high", None
# Conflicts report (discrepancy detection)
elif "conflicts" in path_str:
return "conflicts", "medium", None
# Fallback
else:
return "unknown", "medium", None
total_chars = 0
# Search recursively for all .md files (including subdirectories like github/README.md)
for ref_file in sorted(references_dir.rglob("*.md")):
# Note: We now include index.md files as they contain important content
# (patterns, examples, configuration analysis)
content = ref_file.read_text(encoding="utf-8")
# Limit size per file
truncated = False
if len(content) > preview_limit:
content = content[:preview_limit] + "\n\n[Content truncated...]"
truncated = True
# Use relative path from references_dir as key for nested files
relative_path = ref_file.relative_to(references_dir)
source_type, confidence, repo_id = _determine_source_metadata(relative_path)
# Build enriched metadata (with repo_id for multi-source support)
references[str(relative_path)] = {
"content": content,
"source": source_type,
"confidence": confidence,
"path": str(relative_path),
"truncated": truncated,
"size": len(content),
"repo_id": repo_id, # None for single-source, repo identifier for multi-source
}
total_chars += len(content)
# Stop if we've read enough
if total_chars > max_chars:
print(f" Limiting input to {max_chars:,} characters")
break
return references
def retry_with_backoff(
    operation: Callable[[], T],
    max_attempts: int = 3,
    base_delay: float = 1.0,
    operation_name: str = "operation",
) -> T:
    """Run *operation*, retrying with exponential backoff on any exception.

    Intended for network operations that can fail transiently. The wait
    doubles after each failed attempt (base_delay, 2*base_delay, ...).

    Args:
        operation: Zero-argument callable to retry
        max_attempts: Maximum number of attempts (default: 3)
        base_delay: Base delay in seconds, doubles each retry (default: 1.0)
        operation_name: Name for logging purposes (default: "operation")

    Returns:
        Result of successful operation

    Raises:
        Exception: Last exception if all retries fail

    Example:
        >>> def fetch_page():
        ...     response = requests.get(url, timeout=30)
        ...     response.raise_for_status()
        ...     return response.text
        >>> content = retry_with_backoff(fetch_page, max_attempts=3, operation_name=f"fetch {url}")
    """
    last_error: Exception | None = None
    for attempt in range(1, max_attempts + 1):
        try:
            return operation()
        except Exception as exc:
            last_error = exc
            is_final = attempt == max_attempts
            if is_final:
                logger.error("%s failed after %d attempts: %s", operation_name, max_attempts, exc)
            else:
                wait = base_delay * 2 ** (attempt - 1)
                logger.warning(
                    "%s failed (attempt %d/%d), retrying in %.1fs: %s",
                    operation_name,
                    attempt,
                    max_attempts,
                    wait,
                    exc,
                )
                time.sleep(wait)
    # Unreachable unless the loop never captured an exception; keeps mypy happy
    if last_error is None:
        raise RuntimeError(f"{operation_name} failed with no exception captured")
    raise last_error
async def retry_with_backoff_async(
    operation: Callable[[], T],
    max_attempts: int = 3,
    base_delay: float = 1.0,
    operation_name: str = "operation",
) -> T:
    """Async version of retry_with_backoff for async operations.

    Args:
        operation: Async zero-argument callable to retry
        max_attempts: Maximum number of attempts (default: 3)
        base_delay: Base delay in seconds, doubles each retry (default: 1.0)
        operation_name: Name for logging purposes (default: "operation")

    Returns:
        Result of successful operation

    Raises:
        Exception: Last exception if all retries fail

    Example:
        >>> async def fetch_page():
        ...     response = await client.get(url, timeout=30.0)
        ...     response.raise_for_status()
        ...     return response.text
        >>> content = await retry_with_backoff_async(fetch_page, operation_name=f"fetch {url}")
    """
    import asyncio

    last_error: Exception | None = None
    for attempt in range(1, max_attempts + 1):
        try:
            return await operation()
        except Exception as exc:
            last_error = exc
            is_final = attempt == max_attempts
            if is_final:
                logger.error("%s failed after %d attempts: %s", operation_name, max_attempts, exc)
            else:
                wait = base_delay * 2 ** (attempt - 1)
                logger.warning(
                    "%s failed (attempt %d/%d), retrying in %.1fs: %s",
                    operation_name,
                    attempt,
                    max_attempts,
                    wait,
                    exc,
                )
                await asyncio.sleep(wait)
    # Unreachable unless the loop never captured an exception; keeps mypy happy
    if last_error is None:
        raise RuntimeError(f"{operation_name} failed with no exception captured")
    raise last_error