# claude-code-skills-reference/transcript-fixer/scripts/utils/health_check.py

#!/usr/bin/env python3
"""
Health Check Module - System Health Monitoring

CRITICAL FIX (P1-4): Production-grade health checks for monitoring

Features:
- Database connectivity and schema validation
- File system access checks
- Configuration validation
- Dependency verification
- Resource availability checks

Health Check Levels:
- Basic: Quick connectivity checks (< 100ms)
- Standard: Full system validation (< 1s)
- Deep: Comprehensive diagnostics (< 5s)
"""

from __future__ import annotations

import json
import logging
import os
import sys
import time
from dataclasses import dataclass, asdict
from enum import Enum
from pathlib import Path
from typing import List, Dict, Optional, Final

logger = logging.getLogger(__name__)

# Import configuration for centralized config management (P1-5 fix)
from .config import get_config

# Health check thresholds
# A complete check_health() run that takes longer than this logs a warning.
RESPONSE_TIME_WARNING: Final[float] = 1.0  # seconds
# A run that takes longer than this is logged as exceeding the critical threshold.
RESPONSE_TIME_CRITICAL: Final[float] = 5.0  # seconds
# The disk space check reports UNHEALTHY when less free space than this remains.
MIN_DISK_SPACE_MB: Final[int] = 100  # MB


class HealthStatus(Enum):
    """
    Health status levels.

    Used both for individual check results and for the aggregated system
    status. The string values are what appears in serialized output.
    """
    HEALTHY = "healthy"      # check passed
    DEGRADED = "degraded"    # a problem was found, reported at the milder level
    UNHEALTHY = "unhealthy"  # a problem was found, reported at the severe level
    UNKNOWN = "unknown"      # status could not be determined (e.g. the check itself failed)


class CheckLevel(Enum):
    """
    Health check thoroughness levels.

    Each level runs everything the previous level runs plus additional
    checks. The timings in the comments are the targets stated by the
    module; they are not enforced here.
    """
    BASIC = "basic"        # Quick checks (< 100ms)
    STANDARD = "standard"  # Full validation (< 1s)
    DEEP = "deep"          # Comprehensive (< 5s)


@dataclass
class HealthCheckResult:
    """Outcome of one individual health check."""
    name: str                       # identifier of the check
    status: HealthStatus            # outcome level of the check
    message: str                    # short human-readable summary
    duration_ms: float              # time the check took, in milliseconds
    details: Optional[Dict] = None  # optional structured diagnostics
    error: Optional[str] = None     # optional error description

    def to_dict(self) -> Dict:
        """
        Return a plain-dict copy of this result.

        The ``status`` entry holds the enum's string value rather than the
        enum member itself; all other fields are copied by ``asdict``.
        """
        # Overriding an existing key keeps its original position in the dict.
        return {**asdict(self), 'status': self.status.value}


@dataclass
class SystemHealth:
    """Aggregated result of one complete health check run."""
    status: HealthStatus             # overall status derived from all checks
    timestamp: str                   # when the run finished, as a formatted string
    duration_ms: float               # total run time in milliseconds
    checks: List[HealthCheckResult]  # individual results, in execution order
    summary: Dict[str, int]          # counts of results keyed by category

    def to_dict(self) -> Dict:
        """
        Return a dictionary representation of the run.

        The status is emitted as its string value, the duration is rounded
        to two decimals and every check is serialized via its ``to_dict``.
        """
        payload: Dict = {'status': self.status.value}
        payload['timestamp'] = self.timestamp
        payload['duration_ms'] = round(self.duration_ms, 2)
        payload['checks'] = [item.to_dict() for item in self.checks]
        payload['summary'] = self.summary
        return payload

    def to_json(self) -> str:
        """Return the dictionary form as indented JSON, keeping non-ASCII text as-is."""
        payload = self.to_dict()
        return json.dumps(payload, ensure_ascii=False, indent=2)


class HealthChecker:
    """
    System health checker with configurable thoroughness levels.

    CRITICAL FIX (P1-4): Enables monitoring and observability
    """

    def __init__(self, config_dir: Optional[Path] = None):
        """
        Initialize health checker

        Args:
            config_dir: Configuration directory (defaults to ~/.transcript-fixer)
        """
        # P1-5 FIX: Use centralized configuration
        config = get_config()

        # For backward compatibility, still accept config_dir parameter
        self.config_dir = config_dir or config.paths.config_dir
        self.db_path = config.database.path

    def check_health(self, level: CheckLevel = CheckLevel.STANDARD) -> SystemHealth:
        """
        Perform health check at specified level

        Args:
            level: Thoroughness level (BASIC, STANDARD, DEEP)

        Returns:
            SystemHealth with overall status and individual check results
        """
        start_time = time.time()
        checks: List[HealthCheckResult] = []

        logger.info(f"Starting health check (level: {level.value})")

        # Always run basic checks
        checks.append(self._check_config_directory())
        checks.append(self._check_database())

        # Standard level: add configuration checks
        if level in (CheckLevel.STANDARD, CheckLevel.DEEP):
            checks.append(self._check_api_key())
            checks.append(self._check_dependencies())
            checks.append(self._check_disk_space())

        # Deep level: add comprehensive diagnostics
        if level == CheckLevel.DEEP:
            checks.append(self._check_database_schema())
            checks.append(self._check_file_permissions())
            checks.append(self._check_python_version())

        # Calculate overall status
        duration_ms = (time.time() - start_time) * 1000
        overall_status = self._calculate_overall_status(checks)

        # Generate summary
        summary = {
            'total': len(checks),
            'healthy': sum(1 for c in checks if c.status == HealthStatus.HEALTHY),
            'degraded': sum(1 for c in checks if c.status == HealthStatus.DEGRADED),
            'unhealthy': sum(1 for c in checks if c.status == HealthStatus.UNHEALTHY),
        }

        # Check for slow response time
        if duration_ms > RESPONSE_TIME_CRITICAL * 1000:
            logger.warning(f"Health check took {duration_ms:.0f}ms (critical threshold)")
        elif duration_ms > RESPONSE_TIME_WARNING * 1000:
            logger.warning(f"Health check took {duration_ms:.0f}ms (warning threshold)")

        return SystemHealth(
            status=overall_status,
            timestamp=time.strftime("%Y-%m-%d %H:%M:%S"),
            duration_ms=duration_ms,
            checks=checks,
            summary=summary
        )

    def _calculate_overall_status(self, checks: List[HealthCheckResult]) -> HealthStatus:
        """Calculate overall system status from individual checks"""
        if not checks:
            return HealthStatus.UNKNOWN

        # Any unhealthy check = system unhealthy
        if any(c.status == HealthStatus.UNHEALTHY for c in checks):
            return HealthStatus.UNHEALTHY

        # Any degraded check = system degraded
        if any(c.status == HealthStatus.DEGRADED for c in checks):
            return HealthStatus.DEGRADED

        # All healthy = system healthy
        if all(c.status == HealthStatus.HEALTHY for c in checks):
            return HealthStatus.HEALTHY

        return HealthStatus.UNKNOWN

    def _check_config_directory(self) -> HealthCheckResult:
        """Check configuration directory exists and is writable"""
        start_time = time.time()
        name = "config_directory"

        try:
            # Check existence
            if not self.config_dir.exists():
                return HealthCheckResult(
                    name=name,
                    status=HealthStatus.UNHEALTHY,
                    message="Configuration directory does not exist",
                    duration_ms=(time.time() - start_time) * 1000,
                    details={'path': str(self.config_dir)},
                    error="Directory not found"
                )

            # Check writability
            test_file = self.config_dir / ".health_check_test"
            try:
                test_file.touch()
                test_file.unlink()
            except (PermissionError, OSError) as e:
                return HealthCheckResult(
                    name=name,
                    status=HealthStatus.DEGRADED,
                    message="Configuration directory not writable",
                    duration_ms=(time.time() - start_time) * 1000,
                    details={'path': str(self.config_dir)},
                    error=str(e)
                )

            return HealthCheckResult(
                name=name,
                status=HealthStatus.HEALTHY,
                message="Configuration directory accessible",
                duration_ms=(time.time() - start_time) * 1000,
                details={'path': str(self.config_dir)}
            )

        except Exception as e:
            logger.exception("Config directory check failed")
            return HealthCheckResult(
                name=name,
                status=HealthStatus.UNHEALTHY,
                message="Configuration directory check failed",
                duration_ms=(time.time() - start_time) * 1000,
                error=str(e)
            )

    def _check_database(self) -> HealthCheckResult:
        """Check database exists and is accessible"""
        start_time = time.time()
        name = "database"

        try:
            if not self.db_path.exists():
                return HealthCheckResult(
                    name=name,
                    status=HealthStatus.DEGRADED,
                    message="Database not initialized",
                    duration_ms=(time.time() - start_time) * 1000,
                    details={'path': str(self.db_path)},
                    error="Database file not found"
                )

            # Try to open database
            import sqlite3
            try:
                conn = sqlite3.connect(str(self.db_path), timeout=5.0)
                cursor = conn.execute("SELECT COUNT(*) FROM sqlite_master WHERE type='table'")
                table_count = cursor.fetchone()[0]
                conn.close()

                return HealthCheckResult(
                    name=name,
                    status=HealthStatus.HEALTHY,
                    message="Database accessible",
                    duration_ms=(time.time() - start_time) * 1000,
                    details={
                        'path': str(self.db_path),
                        'tables': table_count,
                        'size_kb': self.db_path.stat().st_size // 1024
                    }
                )

            except sqlite3.Error as e:
                return HealthCheckResult(
                    name=name,
                    status=HealthStatus.UNHEALTHY,
                    message="Database connection failed",
                    duration_ms=(time.time() - start_time) * 1000,
                    details={'path': str(self.db_path)},
                    error=str(e)
                )

        except Exception as e:
            logger.exception("Database check failed")
            return HealthCheckResult(
                name=name,
                status=HealthStatus.UNHEALTHY,
                message="Database check failed",
                duration_ms=(time.time() - start_time) * 1000,
                error=str(e)
            )

    def _check_api_key(self) -> HealthCheckResult:
        """Check API key is configured"""
        start_time = time.time()
        name = "api_key"

        try:
            # P1-5 FIX: Use centralized configuration
            config = get_config()
            api_key = config.api.api_key

            if not api_key:
                return HealthCheckResult(
                    name=name,
                    status=HealthStatus.DEGRADED,
                    message="API key not configured",
                    duration_ms=(time.time() - start_time) * 1000,
                    details={'env_vars_checked': ['GLM_API_KEY', 'ANTHROPIC_API_KEY']},
                    error="No API key found in environment"
                )

            # Check key format (don't validate by calling API)
            if len(api_key) < 10:
                return HealthCheckResult(
                    name=name,
                    status=HealthStatus.DEGRADED,
                    message="API key format suspicious",
                    duration_ms=(time.time() - start_time) * 1000,
                    details={'key_length': len(api_key)},
                    error="API key too short"
                )

            return HealthCheckResult(
                name=name,
                status=HealthStatus.HEALTHY,
                message="API key configured",
                duration_ms=(time.time() - start_time) * 1000,
                details={'key_length': len(api_key), 'masked_key': api_key[:8] + '***'}
            )

        except Exception as e:
            logger.exception("API key check failed")
            return HealthCheckResult(
                name=name,
                status=HealthStatus.UNHEALTHY,
                message="API key check failed",
                duration_ms=(time.time() - start_time) * 1000,
                error=str(e)
            )

    def _check_dependencies(self) -> HealthCheckResult:
        """Check required dependencies are installed"""
        start_time = time.time()
        name = "dependencies"

        required_modules = ['httpx', 'filelock']
        missing = []
        installed = []

        try:
            for module in required_modules:
                try:
                    __import__(module)
                    installed.append(module)
                except ImportError:
                    missing.append(module)

            if missing:
                return HealthCheckResult(
                    name=name,
                    status=HealthStatus.UNHEALTHY,
                    message=f"Missing dependencies: {', '.join(missing)}",
                    duration_ms=(time.time() - start_time) * 1000,
                    details={'installed': installed, 'missing': missing},
                    error=f"Install with: pip install {' '.join(missing)}"
                )

            return HealthCheckResult(
                name=name,
                status=HealthStatus.HEALTHY,
                message="All dependencies installed",
                duration_ms=(time.time() - start_time) * 1000,
                details={'installed': installed}
            )

        except Exception as e:
            logger.exception("Dependencies check failed")
            return HealthCheckResult(
                name=name,
                status=HealthStatus.UNHEALTHY,
                message="Dependencies check failed",
                duration_ms=(time.time() - start_time) * 1000,
                error=str(e)
            )

    def _check_disk_space(self) -> HealthCheckResult:
        """Check available disk space"""
        start_time = time.time()
        name = "disk_space"

        try:
            import shutil
            stat = shutil.disk_usage(self.config_dir.parent)

            free_mb = stat.free / (1024 * 1024)
            total_mb = stat.total / (1024 * 1024)
            used_percent = (stat.used / stat.total) * 100

            if free_mb < MIN_DISK_SPACE_MB:
                return HealthCheckResult(
                    name=name,
                    status=HealthStatus.UNHEALTHY,
                    message=f"Low disk space: {free_mb:.0f}MB free",
                    duration_ms=(time.time() - start_time) * 1000,
                    details={
                        'free_mb': round(free_mb, 2),
                        'total_mb': round(total_mb, 2),
                        'used_percent': round(used_percent, 1)
                    },
                    error=f"Less than {MIN_DISK_SPACE_MB}MB available"
                )

            return HealthCheckResult(
                name=name,
                status=HealthStatus.HEALTHY,
                message=f"Sufficient disk space: {free_mb:.0f}MB free",
                duration_ms=(time.time() - start_time) * 1000,
                details={
                    'free_mb': round(free_mb, 2),
                    'total_mb': round(total_mb, 2),
                    'used_percent': round(used_percent, 1)
                }
            )

        except Exception as e:
            logger.exception("Disk space check failed")
            return HealthCheckResult(
                name=name,
                status=HealthStatus.UNKNOWN,
                message="Disk space check failed",
                duration_ms=(time.time() - start_time) * 1000,
                error=str(e)
            )

    def _check_database_schema(self) -> HealthCheckResult:
        """Check database schema is valid (deep check)"""
        start_time = time.time()
        name = "database_schema"

        expected_tables = [
            'corrections', 'context_rules', 'correction_history',
            'correction_changes', 'learned_suggestions', 'suggestion_examples',
            'system_config', 'audit_log'
        ]

        try:
            if not self.db_path.exists():
                return HealthCheckResult(
                    name=name,
                    status=HealthStatus.DEGRADED,
                    message="Database not initialized",
                    duration_ms=(time.time() - start_time) * 1000,
                    error="Cannot check schema - database missing"
                )

            import sqlite3
            conn = sqlite3.connect(str(self.db_path), timeout=5.0)
            cursor = conn.execute(
                "SELECT name FROM sqlite_master WHERE type='table' ORDER BY name"
            )
            actual_tables = [row[0] for row in cursor.fetchall()]
            conn.close()

            missing = [t for t in expected_tables if t not in actual_tables]
            extra = [t for t in actual_tables if t not in expected_tables and not t.startswith('sqlite_')]

            if missing:
                return HealthCheckResult(
                    name=name,
                    status=HealthStatus.DEGRADED,
                    message=f"Missing tables: {', '.join(missing)}",
                    duration_ms=(time.time() - start_time) * 1000,
                    details={
                        'expected': expected_tables,
                        'actual': actual_tables,
                        'missing': missing,
                        'extra': extra
                    },
                    error="Schema incomplete"
                )

            return HealthCheckResult(
                name=name,
                status=HealthStatus.HEALTHY,
                message="Database schema valid",
                duration_ms=(time.time() - start_time) * 1000,
                details={
                    'tables': actual_tables,
                    'count': len(actual_tables)
                }
            )

        except Exception as e:
            logger.exception("Database schema check failed")
            return HealthCheckResult(
                name=name,
                status=HealthStatus.UNHEALTHY,
                message="Database schema check failed",
                duration_ms=(time.time() - start_time) * 1000,
                error=str(e)
            )

    def _check_file_permissions(self) -> HealthCheckResult:
        """Check file permissions (deep check)"""
        start_time = time.time()
        name = "file_permissions"

        try:
            issues = []

            # Check config directory permissions
            if not os.access(self.config_dir, os.R_OK | os.W_OK | os.X_OK):
                issues.append(f"Config dir: insufficient permissions")

            # Check database permissions (if exists)
            if self.db_path.exists():
                if not os.access(self.db_path, os.R_OK | os.W_OK):
                    issues.append(f"Database: read/write denied")

            if issues:
                return HealthCheckResult(
                    name=name,
                    status=HealthStatus.DEGRADED,
                    message="Permission issues detected",
                    duration_ms=(time.time() - start_time) * 1000,
                    details={'issues': issues},
                    error='; '.join(issues)
                )

            return HealthCheckResult(
                name=name,
                status=HealthStatus.HEALTHY,
                message="File permissions correct",
                duration_ms=(time.time() - start_time) * 1000
            )

        except Exception as e:
            logger.exception("File permissions check failed")
            return HealthCheckResult(
                name=name,
                status=HealthStatus.UNKNOWN,
                message="File permissions check failed",
                duration_ms=(time.time() - start_time) * 1000,
                error=str(e)
            )

    def _check_python_version(self) -> HealthCheckResult:
        """Check Python version (deep check)"""
        start_time = time.time()
        name = "python_version"

        try:
            version = sys.version_info
            version_str = f"{version.major}.{version.minor}.{version.micro}"

            # Minimum required: Python 3.8
            if version < (3, 8):
                return HealthCheckResult(
                    name=name,
                    status=HealthStatus.UNHEALTHY,
                    message=f"Python version too old: {version_str}",
                    duration_ms=(time.time() - start_time) * 1000,
                    details={'version': version_str, 'minimum': '3.8'},
                    error="Python 3.8+ required"
                )

            # Warn if using Python 3.12+ (may have compatibility issues)
            if version >= (3, 13):
                return HealthCheckResult(
                    name=name,
                    status=HealthStatus.DEGRADED,
                    message=f"Python version very new: {version_str}",
                    duration_ms=(time.time() - start_time) * 1000,
                    details={'version': version_str, 'recommended': '3.8-3.12'},
                    error="May have untested compatibility issues"
                )

            return HealthCheckResult(
                name=name,
                status=HealthStatus.HEALTHY,
                message=f"Python version supported: {version_str}",
                duration_ms=(time.time() - start_time) * 1000,
                details={'version': version_str}
            )

        except Exception as e:
            logger.exception("Python version check failed")
            return HealthCheckResult(
                name=name,
                status=HealthStatus.UNKNOWN,
                message="Python version check failed",
                duration_ms=(time.time() - start_time) * 1000,
                error=str(e)
            )


def format_health_output(health: SystemHealth, verbose: bool = False) -> str:
    """
    Render health check results as human-readable text for the CLI.

    The output consists of a header (overall status, timestamp, duration
    and healthy/total count) followed by one line per check. Error texts
    are always shown; details and per-check durations only when verbose.

    Args:
        health: SystemHealth object
        verbose: Show detailed information

    Returns:
        Formatted string for display
    """
    # Status -> icon lookup
    icons = {
        HealthStatus.HEALTHY: "✅",
        HealthStatus.DEGRADED: "⚠️",
        HealthStatus.UNHEALTHY: "❌",
        HealthStatus.UNKNOWN: "❓"
    }
    headline_icon = icons[health.status]
    rule = "=" * 70

    # Header
    out = [
        f"\n{headline_icon} System Health: {health.status.value.upper()}",
        rule,
        f"Timestamp: {health.timestamp}",
        f"Duration: {health.duration_ms:.1f}ms",
        f"Checks: {health.summary['healthy']}/{health.summary['total']} passed",
        "",
    ]

    # One entry per check, with optional indented sub-lines
    for item in health.checks:
        out.append(f"{icons.get(item.status, '❓')} {item.name}: {item.message}")

        if verbose and item.details:
            out.extend(f"    {key}: {value}" for key, value in item.details.items())

        if item.error:
            out.append(f"    Error: {item.error}")

        if verbose:
            out.append(f"    Duration: {item.duration_ms:.1f}ms")

    out.append(f"\n{rule}")

    return "\n".join(out)