Files
claude-code-skills-reference/transcript-fixer/scripts/utils/health_check.py
daymade 9b724f33e3 Release v1.9.0: Add video-comparer skill and enhance transcript-fixer
## New Skill: video-comparer v1.0.0
- Compare original and compressed videos with interactive HTML reports
- Calculate quality metrics (PSNR, SSIM) for compression analysis
- Generate frame-by-frame visual comparisons (slider, side-by-side, grid)
- Extract video metadata (codec, resolution, bitrate, duration)
- Multi-platform FFmpeg support with security features

## transcript-fixer Enhancements
- Add async AI processor for parallel processing
- Add connection pool management for database operations
- Add concurrency manager and rate limiter
- Add audit log retention and database migrations
- Add health check and metrics monitoring
- Add comprehensive test suite (8 new test files)
- Enhance security with domain and path validators

## Marketplace Updates
- Update marketplace version from 1.8.0 to 1.9.0
- Update skills count from 15 to 16
- Update documentation (README.md, CLAUDE.md, CHANGELOG.md)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-30 00:23:12 +08:00

655 lines
23 KiB
Python

#!/usr/bin/env python3
"""
Health Check Module - System Health Monitoring
CRITICAL FIX (P1-4): Production-grade health checks for monitoring
Features:
- Database connectivity and schema validation
- File system access checks
- Configuration validation
- Dependency verification
- Resource availability checks
Health Check Levels:
- Basic: Quick connectivity checks (< 100ms)
- Standard: Full system validation (< 1s)
- Deep: Comprehensive diagnostics (< 5s)
"""
from __future__ import annotations
import json
import logging
import os
import sys
import time
from dataclasses import dataclass, asdict
from enum import Enum
from pathlib import Path
from typing import List, Dict, Optional, Final
logger = logging.getLogger(__name__)
# Import configuration for centralized config management (P1-5 fix)
from .config import get_config
# Health check thresholds
# Total run time of HealthChecker.check_health above which a warning is logged
# with the "warning threshold" wording.
RESPONSE_TIME_WARNING: Final[float] = 1.0 # seconds
# Total run time above which the logged warning uses the "critical threshold"
# wording instead (it is still emitted via logger.warning).
RESPONSE_TIME_CRITICAL: Final[float] = 5.0 # seconds
# Free space below this amount makes the disk_space check report UNHEALTHY.
MIN_DISK_SPACE_MB: Final[int] = 100 # MB
class HealthStatus(Enum):
    """Outcome of a health check; the string value is what gets serialized."""

    # Check passed.
    HEALTHY = "healthy"
    # Check found a problem that does not make the system unhealthy.
    DEGRADED = "degraded"
    # Check failed.
    UNHEALTHY = "unhealthy"
    # Status could not be determined.
    UNKNOWN = "unknown"
class CheckLevel(Enum):
    """How thorough a health check run should be."""

    # Quick checks (< 100ms)
    BASIC = "basic"
    # Full validation (< 1s)
    STANDARD = "standard"
    # Comprehensive (< 5s)
    DEEP = "deep"
@dataclass
class HealthCheckResult:
    """Outcome of one individual health check."""

    # Identifier of the check (e.g. "database").
    name: str
    # Outcome of the check.
    status: HealthStatus
    # Short human-readable summary.
    message: str
    # Time the check took, in milliseconds.
    duration_ms: float
    # Optional structured diagnostics.
    details: Optional[Dict] = None
    # Optional error description.
    error: Optional[str] = None

    def to_dict(self) -> Dict:
        """Return all fields as a plain dict, with ``status`` as its string value."""
        # Re-assigning an existing key keeps its original position in the dict.
        return {**asdict(self), 'status': self.status.value}
@dataclass
class SystemHealth:
    """Aggregated result of one health check run."""

    # Overall status.
    status: HealthStatus
    # When the run finished, as a preformatted string.
    timestamp: str
    # Total time of the run, in milliseconds.
    duration_ms: float
    # Individual check results, in execution order.
    checks: List[HealthCheckResult]
    # Counters describing the run.
    summary: Dict[str, int]

    def to_dict(self) -> Dict:
        """Return a plain-dict view; the duration is rounded to two decimals."""
        return dict(
            status=self.status.value,
            timestamp=self.timestamp,
            duration_ms=round(self.duration_ms, 2),
            checks=[entry.to_dict() for entry in self.checks],
            summary=self.summary,
        )

    def to_json(self) -> str:
        """Serialize ``to_dict()`` as indented JSON, keeping non-ASCII characters."""
        payload = self.to_dict()
        return json.dumps(payload, ensure_ascii=False, indent=2)
class HealthChecker:
    """
    System health checker with configurable thoroughness levels.

    CRITICAL FIX (P1-4): Enables monitoring and observability

    Each ``_check_*`` method returns a HealthCheckResult; unexpected
    exceptions inside a check are logged and reported as a failed check
    instead of propagating to the caller.
    """

    def __init__(self, config_dir: Optional[Path] = None):
        """
        Initialize health checker

        Args:
            config_dir: Configuration directory. When omitted, the directory
                from the centralized configuration is used.
        """
        # P1-5 FIX: Use centralized configuration
        config = get_config()
        # For backward compatibility, still accept config_dir parameter
        self.config_dir = config_dir or config.paths.config_dir
        # The database path always comes from the centralized configuration,
        # even when config_dir is overridden.
        self.db_path = config.database.path

    @staticmethod
    def _elapsed_ms(started: float) -> float:
        """Return milliseconds elapsed since ``started`` (a perf_counter reading)."""
        # perf_counter is monotonic, so durations cannot go negative or jump
        # when the wall clock is adjusted.
        return (time.perf_counter() - started) * 1000

    def _result(
        self,
        name: str,
        status: HealthStatus,
        message: str,
        started: float,
        details: Optional[Dict] = None,
        error: Optional[str] = None,
    ) -> HealthCheckResult:
        """Build a HealthCheckResult whose duration is measured from ``started``."""
        return HealthCheckResult(
            name=name,
            status=status,
            message=message,
            duration_ms=self._elapsed_ms(started),
            details=details,
            error=error,
        )

    def check_health(self, level: CheckLevel = CheckLevel.STANDARD) -> SystemHealth:
        """
        Perform health check at specified level

        Args:
            level: Thoroughness level (BASIC, STANDARD, DEEP)

        Returns:
            SystemHealth with overall status and individual check results
        """
        started = time.perf_counter()
        checks: List[HealthCheckResult] = []
        logger.info("Starting health check (level: %s)", level.value)

        # Always run basic checks
        checks.append(self._check_config_directory())
        checks.append(self._check_database())

        # Standard level: add configuration checks
        if level in (CheckLevel.STANDARD, CheckLevel.DEEP):
            checks.append(self._check_api_key())
            checks.append(self._check_dependencies())
            checks.append(self._check_disk_space())

        # Deep level: add comprehensive diagnostics
        if level == CheckLevel.DEEP:
            checks.append(self._check_database_schema())
            checks.append(self._check_file_permissions())
            checks.append(self._check_python_version())

        # Calculate overall status
        duration_ms = self._elapsed_ms(started)
        overall_status = self._calculate_overall_status(checks)

        # Generate summary (UNKNOWN results are only reflected in 'total')
        summary = {
            'total': len(checks),
            'healthy': sum(1 for c in checks if c.status == HealthStatus.HEALTHY),
            'degraded': sum(1 for c in checks if c.status == HealthStatus.DEGRADED),
            'unhealthy': sum(1 for c in checks if c.status == HealthStatus.UNHEALTHY),
        }

        # Check for slow response time
        if duration_ms > RESPONSE_TIME_CRITICAL * 1000:
            logger.warning("Health check took %.0fms (critical threshold)", duration_ms)
        elif duration_ms > RESPONSE_TIME_WARNING * 1000:
            logger.warning("Health check took %.0fms (warning threshold)", duration_ms)

        return SystemHealth(
            status=overall_status,
            timestamp=time.strftime("%Y-%m-%d %H:%M:%S"),
            duration_ms=duration_ms,
            checks=checks,
            summary=summary
        )

    def _calculate_overall_status(self, checks: List[HealthCheckResult]) -> HealthStatus:
        """Calculate overall system status from individual checks"""
        if not checks:
            return HealthStatus.UNKNOWN
        statuses = {c.status for c in checks}
        # Any unhealthy check = system unhealthy
        if HealthStatus.UNHEALTHY in statuses:
            return HealthStatus.UNHEALTHY
        # Any degraded check = system degraded
        if HealthStatus.DEGRADED in statuses:
            return HealthStatus.DEGRADED
        # All healthy = system healthy
        if statuses == {HealthStatus.HEALTHY}:
            return HealthStatus.HEALTHY
        # Remaining case: at least one UNKNOWN result and nothing worse
        return HealthStatus.UNKNOWN

    def _check_config_directory(self) -> HealthCheckResult:
        """Check configuration directory exists and is writable"""
        started = time.perf_counter()
        name = "config_directory"
        try:
            path_details = {'path': str(self.config_dir)}

            # Check existence
            if not self.config_dir.exists():
                return self._result(
                    name, HealthStatus.UNHEALTHY,
                    "Configuration directory does not exist", started,
                    details=path_details, error="Directory not found"
                )

            # Check writability by creating and removing a marker file
            test_file = self.config_dir / ".health_check_test"
            try:
                test_file.touch()
                # missing_ok: a concurrent health check shares this marker and
                # may already have removed it; that is not a writability issue.
                test_file.unlink(missing_ok=True)
            except OSError as e:  # includes PermissionError
                return self._result(
                    name, HealthStatus.DEGRADED,
                    "Configuration directory not writable", started,
                    details=path_details, error=str(e)
                )

            return self._result(
                name, HealthStatus.HEALTHY,
                "Configuration directory accessible", started,
                details=path_details
            )
        except Exception as e:
            logger.exception("Config directory check failed")
            return self._result(
                name, HealthStatus.UNHEALTHY,
                "Configuration directory check failed", started,
                error=str(e)
            )

    def _check_database(self) -> HealthCheckResult:
        """Check database exists and is accessible"""
        started = time.perf_counter()
        name = "database"
        try:
            if not self.db_path.exists():
                return self._result(
                    name, HealthStatus.DEGRADED,
                    "Database not initialized", started,
                    details={'path': str(self.db_path)},
                    error="Database file not found"
                )

            # Try to open database
            import sqlite3
            try:
                conn = sqlite3.connect(str(self.db_path), timeout=5.0)
                try:
                    cursor = conn.execute("SELECT COUNT(*) FROM sqlite_master WHERE type='table'")
                    table_count = cursor.fetchone()[0]
                finally:
                    # Close even when the query fails so the handle is not leaked
                    conn.close()
                return self._result(
                    name, HealthStatus.HEALTHY,
                    "Database accessible", started,
                    details={
                        'path': str(self.db_path),
                        'tables': table_count,
                        'size_kb': self.db_path.stat().st_size // 1024
                    }
                )
            except sqlite3.Error as e:
                return self._result(
                    name, HealthStatus.UNHEALTHY,
                    "Database connection failed", started,
                    details={'path': str(self.db_path)},
                    error=str(e)
                )
        except Exception as e:
            logger.exception("Database check failed")
            return self._result(
                name, HealthStatus.UNHEALTHY,
                "Database check failed", started,
                error=str(e)
            )

    def _check_api_key(self) -> HealthCheckResult:
        """Check API key is configured"""
        started = time.perf_counter()
        name = "api_key"
        try:
            # P1-5 FIX: Use centralized configuration
            config = get_config()
            api_key = config.api.api_key

            if not api_key:
                return self._result(
                    name, HealthStatus.DEGRADED,
                    "API key not configured", started,
                    details={'env_vars_checked': ['GLM_API_KEY', 'ANTHROPIC_API_KEY']},
                    error="No API key found in environment"
                )

            # Check key format (don't validate by calling API)
            if len(api_key) < 10:
                return self._result(
                    name, HealthStatus.DEGRADED,
                    "API key format suspicious", started,
                    details={'key_length': len(api_key)},
                    error="API key too short"
                )

            # Reveal only a short prefix: health output may end up in logs or
            # monitoring systems, and keys this check accepts can be as short
            # as 10 characters.
            return self._result(
                name, HealthStatus.HEALTHY,
                "API key configured", started,
                details={'key_length': len(api_key), 'masked_key': api_key[:4] + '***'}
            )
        except Exception as e:
            logger.exception("API key check failed")
            return self._result(
                name, HealthStatus.UNHEALTHY,
                "API key check failed", started,
                error=str(e)
            )

    def _check_dependencies(self) -> HealthCheckResult:
        """Check required dependencies are installed"""
        started = time.perf_counter()
        name = "dependencies"
        required_modules = ['httpx', 'filelock']
        missing = []
        installed = []
        try:
            for module in required_modules:
                try:
                    __import__(module)
                    installed.append(module)
                except ImportError:
                    missing.append(module)

            if missing:
                return self._result(
                    name, HealthStatus.UNHEALTHY,
                    f"Missing dependencies: {', '.join(missing)}", started,
                    details={'installed': installed, 'missing': missing},
                    error=f"Install with: pip install {' '.join(missing)}"
                )

            return self._result(
                name, HealthStatus.HEALTHY,
                "All dependencies installed", started,
                details={'installed': installed}
            )
        except Exception as e:
            logger.exception("Dependencies check failed")
            return self._result(
                name, HealthStatus.UNHEALTHY,
                "Dependencies check failed", started,
                error=str(e)
            )

    def _check_disk_space(self) -> HealthCheckResult:
        """Check available disk space"""
        started = time.perf_counter()
        name = "disk_space"
        try:
            import shutil
            # Measured on the parent of the configuration directory
            usage = shutil.disk_usage(self.config_dir.parent)
            free_mb = usage.free / (1024 * 1024)
            total_mb = usage.total / (1024 * 1024)
            used_percent = (usage.used / usage.total) * 100
            details = {
                'free_mb': round(free_mb, 2),
                'total_mb': round(total_mb, 2),
                'used_percent': round(used_percent, 1)
            }

            if free_mb < MIN_DISK_SPACE_MB:
                return self._result(
                    name, HealthStatus.UNHEALTHY,
                    f"Low disk space: {free_mb:.0f}MB free", started,
                    details=details,
                    error=f"Less than {MIN_DISK_SPACE_MB}MB available"
                )

            return self._result(
                name, HealthStatus.HEALTHY,
                f"Sufficient disk space: {free_mb:.0f}MB free", started,
                details=details
            )
        except Exception as e:
            logger.exception("Disk space check failed")
            return self._result(
                name, HealthStatus.UNKNOWN,
                "Disk space check failed", started,
                error=str(e)
            )

    def _check_database_schema(self) -> HealthCheckResult:
        """Check database schema is valid (deep check)"""
        started = time.perf_counter()
        name = "database_schema"
        expected_tables = [
            'corrections', 'context_rules', 'correction_history',
            'correction_changes', 'learned_suggestions', 'suggestion_examples',
            'system_config', 'audit_log'
        ]
        try:
            if not self.db_path.exists():
                return self._result(
                    name, HealthStatus.DEGRADED,
                    "Database not initialized", started,
                    error="Cannot check schema - database missing"
                )

            import sqlite3
            conn = sqlite3.connect(str(self.db_path), timeout=5.0)
            try:
                cursor = conn.execute(
                    "SELECT name FROM sqlite_master WHERE type='table' ORDER BY name"
                )
                actual_tables = [row[0] for row in cursor.fetchall()]
            finally:
                # Close even when the query fails so the handle is not leaked
                conn.close()

            missing = [t for t in expected_tables if t not in actual_tables]
            # Tables that are neither expected nor SQLite-internal
            extra = [
                t for t in actual_tables
                if t not in expected_tables and not t.startswith('sqlite_')
            ]

            if missing:
                return self._result(
                    name, HealthStatus.DEGRADED,
                    f"Missing tables: {', '.join(missing)}", started,
                    details={
                        'expected': expected_tables,
                        'actual': actual_tables,
                        'missing': missing,
                        'extra': extra
                    },
                    error="Schema incomplete"
                )

            return self._result(
                name, HealthStatus.HEALTHY,
                "Database schema valid", started,
                details={
                    'tables': actual_tables,
                    'count': len(actual_tables)
                }
            )
        except Exception as e:
            logger.exception("Database schema check failed")
            return self._result(
                name, HealthStatus.UNHEALTHY,
                "Database schema check failed", started,
                error=str(e)
            )

    def _check_file_permissions(self) -> HealthCheckResult:
        """Check file permissions (deep check)"""
        started = time.perf_counter()
        name = "file_permissions"
        try:
            issues = []

            # Check config directory permissions
            if not os.access(self.config_dir, os.R_OK | os.W_OK | os.X_OK):
                issues.append("Config dir: insufficient permissions")

            # Check database permissions (if exists)
            if self.db_path.exists() and not os.access(self.db_path, os.R_OK | os.W_OK):
                issues.append("Database: read/write denied")

            if issues:
                return self._result(
                    name, HealthStatus.DEGRADED,
                    "Permission issues detected", started,
                    details={'issues': issues},
                    error='; '.join(issues)
                )

            return self._result(
                name, HealthStatus.HEALTHY,
                "File permissions correct", started
            )
        except Exception as e:
            logger.exception("File permissions check failed")
            return self._result(
                name, HealthStatus.UNKNOWN,
                "File permissions check failed", started,
                error=str(e)
            )

    def _check_python_version(self) -> HealthCheckResult:
        """Check Python version (deep check)"""
        started = time.perf_counter()
        name = "python_version"
        try:
            version = sys.version_info
            version_str = f"{version.major}.{version.minor}.{version.micro}"

            # Minimum required: Python 3.8
            if version < (3, 8):
                return self._result(
                    name, HealthStatus.UNHEALTHY,
                    f"Python version too old: {version_str}", started,
                    details={'version': version_str, 'minimum': '3.8'},
                    error="Python 3.8+ required"
                )

            # Warn if using Python 3.13+ (newer than the recommended 3.8-3.12 range)
            if version >= (3, 13):
                return self._result(
                    name, HealthStatus.DEGRADED,
                    f"Python version very new: {version_str}", started,
                    details={'version': version_str, 'recommended': '3.8-3.12'},
                    error="May have untested compatibility issues"
                )

            return self._result(
                name, HealthStatus.HEALTHY,
                f"Python version supported: {version_str}", started,
                details={'version': version_str}
            )
        except Exception as e:
            logger.exception("Python version check failed")
            return self._result(
                name, HealthStatus.UNKNOWN,
                "Python version check failed", started,
                error=str(e)
            )
def format_health_output(health: SystemHealth, verbose: bool = False) -> str:
    """
    Format health check results for CLI output

    Args:
        health: SystemHealth object
        verbose: Show detailed information (per-check details and durations)

    Returns:
        Formatted string for display
    """
    lines = []

    # Header - icon mapping. Every status gets a visible marker so that a
    # line never starts with a blank where the icon should be.
    status_icon_map = {
        HealthStatus.HEALTHY: "✅",
        HealthStatus.DEGRADED: "⚠️",
        HealthStatus.UNHEALTHY: "❌",
        HealthStatus.UNKNOWN: "❓"
    }
    overall_icon = status_icon_map[health.status]
    lines.append(f"\n{overall_icon} System Health: {health.status.value.upper()}")
    lines.append(f"{'=' * 70}")
    lines.append(f"Timestamp: {health.timestamp}")
    lines.append(f"Duration: {health.duration_ms:.1f}ms")
    lines.append(f"Checks: {health.summary['healthy']}/{health.summary['total']} passed")
    lines.append("")

    # Individual checks
    for check in health.checks:
        icon = status_icon_map.get(check.status, "")
        lines.append(f"{icon} {check.name}: {check.message}")

        if verbose and check.details:
            for key, value in check.details.items():
                lines.append(f" {key}: {value}")

        # Errors are shown regardless of verbosity
        if check.error:
            lines.append(f" Error: {check.error}")

        if verbose:
            lines.append(f" Duration: {check.duration_ms:.1f}ms")

    lines.append(f"\n{'=' * 70}")
    return "\n".join(lines)