## New Skill: video-comparer v1.0.0 - Compare original and compressed videos with interactive HTML reports - Calculate quality metrics (PSNR, SSIM) for compression analysis - Generate frame-by-frame visual comparisons (slider, side-by-side, grid) - Extract video metadata (codec, resolution, bitrate, duration) - Multi-platform FFmpeg support with security features ## transcript-fixer Enhancements - Add async AI processor for parallel processing - Add connection pool management for database operations - Add concurrency manager and rate limiter - Add audit log retention and database migrations - Add health check and metrics monitoring - Add comprehensive test suite (8 new test files) - Enhance security with domain and path validators ## Marketplace Updates - Update marketplace version from 1.8.0 to 1.9.0 - Update skills count from 15 to 16 - Update documentation (README.md, CLAUDE.md, CHANGELOG.md) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
655 lines
23 KiB
Python
655 lines
23 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Health Check Module - System Health Monitoring
|
|
|
|
CRITICAL FIX (P1-4): Production-grade health checks for monitoring
|
|
|
|
Features:
|
|
- Database connectivity and schema validation
|
|
- File system access checks
|
|
- Configuration validation
|
|
- Dependency verification
|
|
- Resource availability checks
|
|
|
|
Health Check Levels:
|
|
- Basic: Quick connectivity checks (< 100ms)
|
|
- Standard: Full system validation (< 1s)
|
|
- Deep: Comprehensive diagnostics (< 5s)
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import logging
|
|
import os
|
|
import sys
|
|
import time
|
|
from dataclasses import dataclass, asdict
|
|
from enum import Enum
|
|
from pathlib import Path
|
|
from typing import List, Dict, Optional, Final
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Import configuration for centralized config management (P1-5 fix)
|
|
from .config import get_config
|
|
|
|
# Time budgets for one complete health-check run, expressed in seconds.
# check_health() logs a warning once a run exceeds either budget.
RESPONSE_TIME_WARNING: Final[float] = 1.0
RESPONSE_TIME_CRITICAL: Final[float] = 5.0

# Free disk space (in megabytes) below which the disk check reports UNHEALTHY.
MIN_DISK_SPACE_MB: Final[int] = 100
|
|
|
|
|
|
class HealthStatus(Enum):
    """Possible outcomes of a health check, for one probe or the whole system."""

    # Everything the probe looked at is in working order.
    HEALTHY = "healthy"
    # Usable, but something needs attention (e.g. missing API key).
    DEGRADED = "degraded"
    # A hard failure was detected.
    UNHEALTHY = "unhealthy"
    # The state could not be determined (e.g. the probe itself failed).
    UNKNOWN = "unknown"
|
|
|
|
|
|
class CheckLevel(Enum):
    """How thorough a health-check run should be."""

    # Connectivity-only probes; target budget under 100ms.
    BASIC = "basic"
    # Adds configuration/dependency validation; target budget under 1s.
    STANDARD = "standard"
    # Adds comprehensive diagnostics; target budget under 5s.
    DEEP = "deep"
|
|
|
|
|
|
@dataclass
class HealthCheckResult:
    """Outcome of one individual health probe."""

    name: str  # identifier of the probe, e.g. "database"
    status: HealthStatus  # verdict reached by the probe
    message: str  # short human-readable summary
    duration_ms: float  # time the probe took, in milliseconds
    details: Optional[Dict] = None  # optional structured diagnostics
    error: Optional[str] = None  # optional description of what went wrong

    def to_dict(self) -> Dict:
        """
        Return the result as a plain dictionary.

        The ``status`` entry holds the enum's string value rather than the
        enum member, so the dictionary can be serialized directly.
        """
        # Re-using the existing 'status' key keeps the original field order.
        return {**asdict(self), 'status': self.status.value}
|
|
|
|
|
|
@dataclass
class SystemHealth:
    """Aggregated result of a complete health-check run."""

    status: HealthStatus  # overall verdict derived from the individual checks
    timestamp: str  # when the run finished, as a formatted string
    duration_ms: float  # total run time in milliseconds
    checks: List[HealthCheckResult]  # individual probe results, in run order
    summary: Dict[str, int]  # counts of checks per outcome

    def to_dict(self) -> Dict:
        """
        Return the run as a plain dictionary.

        The status is emitted as its string value, the duration is rounded to
        two decimals and every check is converted via its own ``to_dict``.
        """
        payload: Dict = {
            'status': self.status.value,
            'timestamp': self.timestamp,
        }
        payload['duration_ms'] = round(self.duration_ms, 2)
        payload['checks'] = [item.to_dict() for item in self.checks]
        payload['summary'] = self.summary
        return payload

    def to_json(self) -> str:
        """Return the run as an indented JSON document (non-ASCII kept as-is)."""
        as_mapping = self.to_dict()
        return json.dumps(as_mapping, indent=2, ensure_ascii=False)
|
|
|
|
|
|
class HealthChecker:
    """
    System health checker with configurable thoroughness levels.

    CRITICAL FIX (P1-4): Enables monitoring and observability

    Each ``_check_*`` method runs one probe and returns a HealthCheckResult.
    Unexpected exceptions inside a probe are logged and reported as a result
    rather than propagated to the caller.
    """

    def __init__(self, config_dir: Optional[Path] = None):
        """
        Initialize health checker

        Args:
            config_dir: Configuration directory. When omitted, the directory
                from the centralized configuration is used.
        """
        # P1-5 FIX: Use centralized configuration
        config = get_config()

        # For backward compatibility, still accept config_dir parameter
        self.config_dir = config_dir or config.paths.config_dir
        # The database location always comes from the centralized configuration.
        self.db_path = config.database.path

    @staticmethod
    def _build_result(
        name: str,
        status: HealthStatus,
        message: str,
        started: float,
        details: Optional[Dict] = None,
        error: Optional[str] = None,
    ) -> HealthCheckResult:
        """Create a HealthCheckResult timed from ``started`` (a perf_counter reading)."""
        return HealthCheckResult(
            name=name,
            status=status,
            message=message,
            duration_ms=(time.perf_counter() - started) * 1000,
            details=details,
            error=error,
        )

    def check_health(self, level: CheckLevel = CheckLevel.STANDARD) -> SystemHealth:
        """
        Perform health check at specified level

        BASIC runs the config-directory and database probes. STANDARD adds the
        API key, dependency and disk-space probes. DEEP additionally runs the
        database-schema, file-permission and Python-version probes.

        Args:
            level: Thoroughness level (BASIC, STANDARD, DEEP)

        Returns:
            SystemHealth with overall status and individual check results
        """
        # perf_counter is monotonic, so the duration cannot go negative or
        # jump when the wall clock is adjusted.
        started = time.perf_counter()

        logger.info("Starting health check (level: %s)", level.value)

        # Always run basic checks
        checks: List[HealthCheckResult] = [
            self._check_config_directory(),
            self._check_database(),
        ]

        # Standard level: add configuration checks
        if level in (CheckLevel.STANDARD, CheckLevel.DEEP):
            checks.append(self._check_api_key())
            checks.append(self._check_dependencies())
            checks.append(self._check_disk_space())

        # Deep level: add comprehensive diagnostics
        if level == CheckLevel.DEEP:
            checks.append(self._check_database_schema())
            checks.append(self._check_file_permissions())
            checks.append(self._check_python_version())

        duration_ms = (time.perf_counter() - started) * 1000
        overall_status = self._calculate_overall_status(checks)

        def count(status: HealthStatus) -> int:
            return sum(1 for c in checks if c.status == status)

        # 'unknown' is included so the per-status counts always add up to 'total'.
        summary = {
            'total': len(checks),
            'healthy': count(HealthStatus.HEALTHY),
            'degraded': count(HealthStatus.DEGRADED),
            'unhealthy': count(HealthStatus.UNHEALTHY),
            'unknown': count(HealthStatus.UNKNOWN),
        }

        # Check for slow response time
        if duration_ms > RESPONSE_TIME_CRITICAL * 1000:
            logger.warning("Health check took %.0fms (critical threshold)", duration_ms)
        elif duration_ms > RESPONSE_TIME_WARNING * 1000:
            logger.warning("Health check took %.0fms (warning threshold)", duration_ms)

        return SystemHealth(
            status=overall_status,
            timestamp=time.strftime("%Y-%m-%d %H:%M:%S"),
            duration_ms=duration_ms,
            checks=checks,
            summary=summary,
        )

    def _calculate_overall_status(self, checks: List[HealthCheckResult]) -> HealthStatus:
        """
        Calculate overall system status from individual checks

        Precedence: any UNHEALTHY check wins, then any DEGRADED check; the
        system is HEALTHY only if every check is HEALTHY. An empty list, or a
        mix of HEALTHY and UNKNOWN results, yields UNKNOWN.
        """
        if not checks:
            return HealthStatus.UNKNOWN

        # Any unhealthy check = system unhealthy
        if any(c.status == HealthStatus.UNHEALTHY for c in checks):
            return HealthStatus.UNHEALTHY

        # Any degraded check = system degraded
        if any(c.status == HealthStatus.DEGRADED for c in checks):
            return HealthStatus.DEGRADED

        # All healthy = system healthy
        if all(c.status == HealthStatus.HEALTHY for c in checks):
            return HealthStatus.HEALTHY

        return HealthStatus.UNKNOWN

    def _check_config_directory(self) -> HealthCheckResult:
        """
        Check configuration directory exists and is writable

        Returns UNHEALTHY when the directory is missing, DEGRADED when a probe
        file cannot be created and removed in it, HEALTHY otherwise.
        """
        started = time.perf_counter()
        name = "config_directory"

        try:
            location = {'path': str(self.config_dir)}

            # Check existence
            if not self.config_dir.exists():
                return self._build_result(
                    name,
                    HealthStatus.UNHEALTHY,
                    "Configuration directory does not exist",
                    started,
                    details=location,
                    error="Directory not found",
                )

            # Check writability by creating and removing a probe file
            test_file = self.config_dir / ".health_check_test"
            try:
                test_file.touch()
                test_file.unlink()
            except OSError as e:  # PermissionError is a subclass of OSError
                return self._build_result(
                    name,
                    HealthStatus.DEGRADED,
                    "Configuration directory not writable",
                    started,
                    details=location,
                    error=str(e),
                )

            return self._build_result(
                name,
                HealthStatus.HEALTHY,
                "Configuration directory accessible",
                started,
                details=location,
            )

        except Exception as e:
            logger.exception("Config directory check failed")
            return self._build_result(
                name,
                HealthStatus.UNHEALTHY,
                "Configuration directory check failed",
                started,
                error=str(e),
            )

    def _check_database(self) -> HealthCheckResult:
        """
        Check database exists and is accessible

        Returns DEGRADED when the database file is missing, UNHEALTHY when
        SQLite cannot open or query it, and HEALTHY (with table count and file
        size) otherwise.
        """
        started = time.perf_counter()
        name = "database"

        try:
            if not self.db_path.exists():
                return self._build_result(
                    name,
                    HealthStatus.DEGRADED,
                    "Database not initialized",
                    started,
                    details={'path': str(self.db_path)},
                    error="Database file not found",
                )

            # Try to open database
            import sqlite3

            try:
                conn = sqlite3.connect(str(self.db_path), timeout=5.0)
                try:
                    cursor = conn.execute(
                        "SELECT COUNT(*) FROM sqlite_master WHERE type='table'"
                    )
                    table_count = cursor.fetchone()[0]
                finally:
                    # Close even when the query fails so the handle is not leaked.
                    conn.close()
            except sqlite3.Error as e:
                return self._build_result(
                    name,
                    HealthStatus.UNHEALTHY,
                    "Database connection failed",
                    started,
                    details={'path': str(self.db_path)},
                    error=str(e),
                )

            return self._build_result(
                name,
                HealthStatus.HEALTHY,
                "Database accessible",
                started,
                details={
                    'path': str(self.db_path),
                    'tables': table_count,
                    'size_kb': self.db_path.stat().st_size // 1024,
                },
            )

        except Exception as e:
            logger.exception("Database check failed")
            return self._build_result(
                name,
                HealthStatus.UNHEALTHY,
                "Database check failed",
                started,
                error=str(e),
            )

    def _check_api_key(self) -> HealthCheckResult:
        """
        Check API key is configured

        Only presence and minimum length are verified; the key is not
        validated against the remote API. Returns DEGRADED when the key is
        missing or shorter than 10 characters.
        """
        started = time.perf_counter()
        name = "api_key"

        try:
            # P1-5 FIX: Use centralized configuration
            config = get_config()
            api_key = config.api.api_key

            if not api_key:
                return self._build_result(
                    name,
                    HealthStatus.DEGRADED,
                    "API key not configured",
                    started,
                    details={'env_vars_checked': ['GLM_API_KEY', 'ANTHROPIC_API_KEY']},
                    error="No API key found in environment",
                )

            # Check key format (don't validate by calling API)
            if len(api_key) < 10:
                return self._build_result(
                    name,
                    HealthStatus.DEGRADED,
                    "API key format suspicious",
                    started,
                    details={'key_length': len(api_key)},
                    error="API key too short",
                )

            # Reveal at most 4 characters and never more than a quarter of the
            # key: health output ends up in logs/JSON, and a fixed 8-character
            # prefix would expose most of a short key.
            visible = min(4, len(api_key) // 4)
            return self._build_result(
                name,
                HealthStatus.HEALTHY,
                "API key configured",
                started,
                details={
                    'key_length': len(api_key),
                    'masked_key': api_key[:visible] + '***',
                },
            )

        except Exception as e:
            logger.exception("API key check failed")
            return self._build_result(
                name,
                HealthStatus.UNHEALTHY,
                "API key check failed",
                started,
                error=str(e),
            )

    def _check_dependencies(self) -> HealthCheckResult:
        """
        Check required dependencies are installed

        Tries to import each required module; any that raise ImportError are
        reported as missing and make the result UNHEALTHY.
        """
        started = time.perf_counter()
        name = "dependencies"

        required_modules = ['httpx', 'filelock']
        missing = []
        installed = []

        try:
            import importlib

            for module in required_modules:
                try:
                    importlib.import_module(module)
                except ImportError:
                    missing.append(module)
                else:
                    installed.append(module)

            if missing:
                return self._build_result(
                    name,
                    HealthStatus.UNHEALTHY,
                    f"Missing dependencies: {', '.join(missing)}",
                    started,
                    details={'installed': installed, 'missing': missing},
                    error=f"Install with: pip install {' '.join(missing)}",
                )

            return self._build_result(
                name,
                HealthStatus.HEALTHY,
                "All dependencies installed",
                started,
                details={'installed': installed},
            )

        except Exception as e:
            logger.exception("Dependencies check failed")
            return self._build_result(
                name,
                HealthStatus.UNHEALTHY,
                "Dependencies check failed",
                started,
                error=str(e),
            )

    def _check_disk_space(self) -> HealthCheckResult:
        """
        Check available disk space

        Measures the filesystem holding the parent of the configuration
        directory. Returns UNHEALTHY below MIN_DISK_SPACE_MB of free space and
        UNKNOWN when the measurement itself fails.
        """
        started = time.perf_counter()
        name = "disk_space"

        try:
            import shutil

            usage = shutil.disk_usage(self.config_dir.parent)

            free_mb = usage.free / (1024 * 1024)
            total_mb = usage.total / (1024 * 1024)
            used_percent = (usage.used / usage.total) * 100

            details = {
                'free_mb': round(free_mb, 2),
                'total_mb': round(total_mb, 2),
                'used_percent': round(used_percent, 1),
            }

            if free_mb < MIN_DISK_SPACE_MB:
                return self._build_result(
                    name,
                    HealthStatus.UNHEALTHY,
                    f"Low disk space: {free_mb:.0f}MB free",
                    started,
                    details=details,
                    error=f"Less than {MIN_DISK_SPACE_MB}MB available",
                )

            return self._build_result(
                name,
                HealthStatus.HEALTHY,
                f"Sufficient disk space: {free_mb:.0f}MB free",
                started,
                details=details,
            )

        except Exception as e:
            logger.exception("Disk space check failed")
            return self._build_result(
                name,
                HealthStatus.UNKNOWN,
                "Disk space check failed",
                started,
                error=str(e),
            )

    def _check_database_schema(self) -> HealthCheckResult:
        """
        Check database schema is valid (deep check)

        Compares the tables present in the database with the expected table
        names. Missing tables (or a missing database file) yield DEGRADED;
        additional tables are reported in the details but are not an error.
        """
        started = time.perf_counter()
        name = "database_schema"

        expected_tables = [
            'corrections', 'context_rules', 'correction_history',
            'correction_changes', 'learned_suggestions', 'suggestion_examples',
            'system_config', 'audit_log',
        ]

        try:
            if not self.db_path.exists():
                return self._build_result(
                    name,
                    HealthStatus.DEGRADED,
                    "Database not initialized",
                    started,
                    error="Cannot check schema - database missing",
                )

            import sqlite3

            conn = sqlite3.connect(str(self.db_path), timeout=5.0)
            try:
                cursor = conn.execute(
                    "SELECT name FROM sqlite_master WHERE type='table' ORDER BY name"
                )
                actual_tables = [row[0] for row in cursor.fetchall()]
            finally:
                # Close even when the query fails so the handle is not leaked.
                conn.close()

            missing = [t for t in expected_tables if t not in actual_tables]
            extra = [
                t for t in actual_tables
                if t not in expected_tables and not t.startswith('sqlite_')
            ]

            if missing:
                return self._build_result(
                    name,
                    HealthStatus.DEGRADED,
                    f"Missing tables: {', '.join(missing)}",
                    started,
                    details={
                        'expected': expected_tables,
                        'actual': actual_tables,
                        'missing': missing,
                        'extra': extra,
                    },
                    error="Schema incomplete",
                )

            return self._build_result(
                name,
                HealthStatus.HEALTHY,
                "Database schema valid",
                started,
                details={
                    'tables': actual_tables,
                    'count': len(actual_tables),
                },
            )

        except Exception as e:
            logger.exception("Database schema check failed")
            return self._build_result(
                name,
                HealthStatus.UNHEALTHY,
                "Database schema check failed",
                started,
                error=str(e),
            )

    def _check_file_permissions(self) -> HealthCheckResult:
        """
        Check file permissions (deep check)

        Requires read/write/execute access on the configuration directory and,
        if the database file exists, read/write access on it. Any shortfall
        yields DEGRADED; a failure of the probe itself yields UNKNOWN.
        """
        started = time.perf_counter()
        name = "file_permissions"

        try:
            issues = []

            # Check config directory permissions
            if not os.access(self.config_dir, os.R_OK | os.W_OK | os.X_OK):
                issues.append("Config dir: insufficient permissions")

            # Check database permissions (if exists)
            if self.db_path.exists() and not os.access(self.db_path, os.R_OK | os.W_OK):
                issues.append("Database: read/write denied")

            if issues:
                return self._build_result(
                    name,
                    HealthStatus.DEGRADED,
                    "Permission issues detected",
                    started,
                    details={'issues': issues},
                    error='; '.join(issues),
                )

            return self._build_result(
                name,
                HealthStatus.HEALTHY,
                "File permissions correct",
                started,
            )

        except Exception as e:
            logger.exception("File permissions check failed")
            return self._build_result(
                name,
                HealthStatus.UNKNOWN,
                "File permissions check failed",
                started,
                error=str(e),
            )

    def _check_python_version(self) -> HealthCheckResult:
        """
        Check Python version (deep check)

        Returns UNHEALTHY below Python 3.8, DEGRADED for 3.13 and newer, and
        HEALTHY for versions in between.
        """
        started = time.perf_counter()
        name = "python_version"

        try:
            version = sys.version_info
            version_str = f"{version.major}.{version.minor}.{version.micro}"

            # Minimum required: Python 3.8
            if version < (3, 8):
                return self._build_result(
                    name,
                    HealthStatus.UNHEALTHY,
                    f"Python version too old: {version_str}",
                    started,
                    details={'version': version_str, 'minimum': '3.8'},
                    error="Python 3.8+ required",
                )

            # Warn if using Python 3.13+ (outside the recommended 3.8-3.12 range)
            if version >= (3, 13):
                return self._build_result(
                    name,
                    HealthStatus.DEGRADED,
                    f"Python version very new: {version_str}",
                    started,
                    details={'version': version_str, 'recommended': '3.8-3.12'},
                    error="May have untested compatibility issues",
                )

            return self._build_result(
                name,
                HealthStatus.HEALTHY,
                f"Python version supported: {version_str}",
                started,
                details={'version': version_str},
            )

        except Exception as e:
            logger.exception("Python version check failed")
            return self._build_result(
                name,
                HealthStatus.UNKNOWN,
                "Python version check failed",
                started,
                error=str(e),
            )
|
|
|
|
|
|
def format_health_output(health: SystemHealth, verbose: bool = False) -> str:
    """
    Render a health-check run as text for the command line.

    The output starts with a header (overall status, timestamp, duration and
    the number of healthy checks), followed by one line per check. A check's
    error, if any, is always shown; its details and duration are shown only
    in verbose mode.

    Args:
        health: SystemHealth object
        verbose: Show detailed information

    Returns:
        Formatted string for display
    """
    # Icon shown in front of each status
    status_icon_map = {
        HealthStatus.HEALTHY: "✅",
        HealthStatus.DEGRADED: "⚠️",
        HealthStatus.UNHEALTHY: "❌",
        HealthStatus.UNKNOWN: "❓"
    }
    divider = '=' * 70

    # Header
    overall_icon = status_icon_map[health.status]
    output = [
        f"\n{overall_icon} System Health: {health.status.value.upper()}",
        divider,
        f"Timestamp: {health.timestamp}",
        f"Duration: {health.duration_ms:.1f}ms",
        f"Checks: {health.summary['healthy']}/{health.summary['total']} passed",
        "",
    ]

    # One entry per individual check
    for check in health.checks:
        icon = status_icon_map.get(check.status, "❓")
        output.append(f"{icon} {check.name}: {check.message}")

        if verbose and check.details:
            output.extend(f" {key}: {value}" for key, value in check.details.items())

        if check.error:
            output.append(f" Error: {check.error}")

        if verbose:
            output.append(f" Duration: {check.duration_ms:.1f}ms")

    output.append(f"\n{divider}")

    return "\n".join(output)
|