## New Skill: video-comparer v1.0.0 - Compare original and compressed videos with interactive HTML reports - Calculate quality metrics (PSNR, SSIM) for compression analysis - Generate frame-by-frame visual comparisons (slider, side-by-side, grid) - Extract video metadata (codec, resolution, bitrate, duration) - Multi-platform FFmpeg support with security features ## transcript-fixer Enhancements - Add async AI processor for parallel processing - Add connection pool management for database operations - Add concurrency manager and rate limiter - Add audit log retention and database migrations - Add health check and metrics monitoring - Add comprehensive test suite (8 new test files) - Enhance security with domain and path validators ## Marketplace Updates - Update marketplace version from 1.8.0 to 1.9.0 - Update skills count from 15 to 16 - Update documentation (README.md, CLAUDE.md, CHANGELOG.md) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
536 lines
17 KiB
Python
536 lines
17 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Metrics Collection and Monitoring
|
|
|
|
CRITICAL FIX (P1-7): Production-grade metrics and observability
|
|
|
|
Features:
|
|
- Real-time metrics collection
|
|
- Time-series data storage (in-memory)
|
|
- Prometheus-compatible export format
|
|
- Common metrics: requests, errors, latency, throughput
|
|
- Custom metric support
|
|
- Thread-safe operations
|
|
|
|
Metrics Types:
|
|
- Counter: Monotonically increasing value (e.g., total requests)
|
|
- Gauge: Point-in-time value (e.g., active connections)
|
|
- Histogram: Distribution of values (e.g., response times)
|
|
- Summary: Statistical summary (e.g., percentiles)
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import threading
|
|
import time
|
|
from collections import defaultdict, deque
|
|
from dataclasses import dataclass, field
|
|
from enum import Enum
|
|
from typing import Dict, List, Optional, Deque, Final
|
|
from contextlib import contextmanager
|
|
import json
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Configuration constants
MAX_HISTOGRAM_SAMPLES: Final[int] = 1000  # Keep last 1000 samples per histogram
# NOTE(review): MAX_TIMESERIES_POINTS is declared but not referenced anywhere
# in this module — confirm it is used elsewhere or remove.
MAX_TIMESERIES_POINTS: Final[int] = 100  # Keep last 100 time series points
PERCENTILES: Final[List[float]] = [0.5, 0.9, 0.95, 0.99]  # P50, P90, P95, P99
|
|
|
|
|
|
class MetricType(Enum):
    """Kind of metric, mirroring the four core Prometheus metric types."""
    COUNTER = "counter"      # monotonically increasing total
    GAUGE = "gauge"          # point-in-time value, may go up or down
    HISTOGRAM = "histogram"  # distribution of observed values
    # NOTE(review): SUMMARY has no corresponding metric class in this module —
    # confirm it is implemented elsewhere or intended for future use.
    SUMMARY = "summary"
|
|
|
|
|
|
@dataclass
class MetricValue:
    """Single metric data point.

    NOTE(review): this type is not referenced anywhere in this module —
    confirm callers elsewhere use it, or remove.
    """
    timestamp: float  # Unix epoch seconds (time.time() convention elsewhere in file)
    value: float
    labels: Dict[str, str] = field(default_factory=dict)  # per-point label set
|
|
|
|
|
|
@dataclass
class MetricSnapshot:
    """Immutable view of a single metric captured at one point in time.

    Produced by the ``snapshot()`` methods of Counter/Gauge/Histogram and
    consumed by the JSON and Prometheus exporters.
    """
    name: str
    type: MetricType
    value: float
    labels: Dict[str, str]
    help_text: str
    timestamp: float

    # Histogram-only statistics; left as None for counters and gauges.
    samples: Optional[int] = None
    sum: Optional[float] = None
    percentiles: Optional[Dict[str, float]] = None

    def to_dict(self) -> Dict:
        """Serialize the snapshot to a plain dict (histogram stats included
        only when present)."""
        payload = {
            'name': self.name,
            'type': self.type.value,
            'value': self.value,
            'labels': self.labels,
            'help': self.help_text,
            'timestamp': self.timestamp,
        }
        # Optional histogram fields: omitted entirely when not populated.
        for key, val in (('samples', self.samples), ('sum', self.sum)):
            if val is not None:
                payload[key] = val
        if self.percentiles:
            payload['percentiles'] = self.percentiles
        return payload
|
|
|
|
|
|
class Counter:
    """
    Counter metric - a value that only ever moves upward.

    Use for: total requests, total errors, total API calls
    """

    def __init__(self, name: str, help_text: str = ""):
        self.name = name
        self.help_text = help_text
        self._value = 0.0
        self._lock = threading.Lock()  # guards _value
        self._labels: Dict[str, str] = {}

    def inc(self, amount: float = 1.0) -> None:
        """Add *amount* to the counter; negative amounts are rejected."""
        if amount < 0:
            raise ValueError("Counter can only increase")
        with self._lock:
            self._value += amount

    def get(self) -> float:
        """Return the counter's current total."""
        with self._lock:
            current = self._value
        return current

    def snapshot(self) -> MetricSnapshot:
        """Freeze the counter's current state into a MetricSnapshot."""
        now = time.time()
        return MetricSnapshot(
            name=self.name,
            type=MetricType.COUNTER,
            value=self.get(),
            labels=dict(self._labels),
            help_text=self.help_text,
            timestamp=now,
        )
|
|
|
|
|
|
class Gauge:
    """
    Gauge metric - a value free to move in either direction.

    Use for: active connections, memory usage, queue size
    """

    def __init__(self, name: str, help_text: str = ""):
        self.name = name
        self.help_text = help_text
        self._value = 0.0
        self._lock = threading.Lock()  # guards _value
        self._labels: Dict[str, str] = {}

    def set(self, value: float) -> None:
        """Overwrite the gauge with *value*."""
        with self._lock:
            self._value = value

    def inc(self, amount: float = 1.0) -> None:
        """Raise the gauge by *amount*."""
        with self._lock:
            self._value += amount

    def dec(self, amount: float = 1.0) -> None:
        """Lower the gauge by *amount*."""
        with self._lock:
            self._value -= amount

    def get(self) -> float:
        """Return the gauge's current reading."""
        with self._lock:
            current = self._value
        return current

    def snapshot(self) -> MetricSnapshot:
        """Freeze the gauge's current state into a MetricSnapshot."""
        now = time.time()
        return MetricSnapshot(
            name=self.name,
            type=MetricType.GAUGE,
            value=self.get(),
            labels=dict(self._labels),
            help_text=self.help_text,
            timestamp=now,
        )
|
|
|
|
|
|
class Histogram:
    """
    Histogram metric - tracks distribution of values.

    Keeps a bounded window of the most recent MAX_HISTOGRAM_SAMPLES
    observations for percentile estimates, plus a running count/sum over
    *all* observations for the mean.

    Use for: request latency, response sizes, processing times
    """

    def __init__(self, name: str, help_text: str = ""):
        self.name = name
        self.help_text = help_text
        # Bounded sample window: oldest observations fall off automatically.
        self._samples: Deque[float] = deque(maxlen=MAX_HISTOGRAM_SAMPLES)
        self._count = 0   # total observations ever recorded (not windowed)
        self._sum = 0.0   # running sum over all observations (not windowed)
        self._lock = threading.Lock()  # guards _samples/_count/_sum
        self._labels: Dict[str, str] = {}

    def observe(self, value: float) -> None:
        """Record a new observation"""
        with self._lock:
            self._samples.append(value)
            self._count += 1
            self._sum += value

    @staticmethod
    def _percentile(sorted_samples: List[float], percentile: float) -> float:
        """Nearest-rank percentile over an already-sorted, non-empty list."""
        index = int(len(sorted_samples) * percentile)
        index = max(0, min(index, len(sorted_samples) - 1))
        return sorted_samples[index]

    def get_percentile(self, percentile: float) -> float:
        """
        Calculate percentile value over the current sample window.

        Args:
            percentile: Value between 0 and 1 (e.g., 0.95 for P95)

        Returns:
            The nearest-rank percentile, or 0.0 when no samples exist.
        """
        with self._lock:
            if not self._samples:
                return 0.0
            return self._percentile(sorted(self._samples), percentile)

    def get_mean(self) -> float:
        """Mean over all observations ever recorded (0.0 when empty)."""
        with self._lock:
            if self._count == 0:
                return 0.0
            return self._sum / self._count

    def snapshot(self) -> MetricSnapshot:
        """
        Get current snapshot with percentiles.

        FIX: the previous implementation read _samples/_sum outside the lock
        (racy against concurrent observe()) and re-sorted the sample window
        once per percentile. Now all state is read under the lock once and
        the window is sorted a single time, yielding a consistent snapshot.
        """
        with self._lock:
            sorted_samples = sorted(self._samples)
            total = self._sum
            count = self._count
            labels = self._labels.copy()

        if sorted_samples:
            percentiles = {
                f"p{int(p * 100)}": self._percentile(sorted_samples, p)
                for p in PERCENTILES
            }
        else:
            # Match previous behavior: empty histogram reports 0.0 everywhere.
            percentiles = {f"p{int(p * 100)}": 0.0 for p in PERCENTILES}

        mean = total / count if count else 0.0

        return MetricSnapshot(
            name=self.name,
            type=MetricType.HISTOGRAM,
            value=mean,
            labels=labels,
            help_text=self.help_text,
            timestamp=time.time(),
            samples=len(sorted_samples),
            sum=total,
            percentiles=percentiles
        )
|
|
|
|
|
|
class MetricsCollector:
    """
    Central metrics collector for the application.

    CRITICAL FIX (P1-7): Thread-safe metrics collection and aggregation

    Owns three registries (counters, gauges, histograms) keyed by metric
    name. Registration and snapshot iteration are guarded by a
    collector-level lock; each metric additionally guards its own value
    with its own per-metric lock.
    """

    def __init__(self):
        # name -> metric maps; mutated only while holding self._lock
        self._counters: Dict[str, Counter] = {}
        self._gauges: Dict[str, Gauge] = {}
        self._histograms: Dict[str, Histogram] = {}
        self._lock = threading.Lock()

        # Initialize standard metrics
        self._init_standard_metrics()

    def _init_standard_metrics(self) -> None:
        """Initialize standard application metrics.

        These names are relied on by track_request/track_api_call/
        track_db_query and get_summary(), which call the get_*() accessors
        without None-checks — do not rename without updating those callers.
        """
        # Request metrics
        self.register_counter("requests_total", "Total number of requests")
        self.register_counter("requests_success", "Total successful requests")
        self.register_counter("requests_failed", "Total failed requests")

        # Performance metrics
        self.register_histogram("request_duration_seconds", "Request duration in seconds")
        self.register_histogram("api_call_duration_seconds", "API call duration in seconds")

        # Resource metrics
        self.register_gauge("active_connections", "Current active connections")
        self.register_gauge("active_tasks", "Current active tasks")

        # Database metrics
        self.register_counter("db_queries_total", "Total database queries")
        self.register_histogram("db_query_duration_seconds", "Database query duration")

        # Error metrics
        # NOTE(review): "errors_by_type" is a single counter with no type
        # dimension (metric labels are never populated in this module) —
        # confirm how per-type errors are meant to be recorded.
        self.register_counter("errors_total", "Total errors")
        self.register_counter("errors_by_type", "Errors by type")

    def register_counter(self, name: str, help_text: str = "") -> Counter:
        """Register a new counter metric.

        Idempotent: if *name* already exists, the existing Counter is
        returned and *help_text* is ignored.
        """
        with self._lock:
            if name not in self._counters:
                self._counters[name] = Counter(name, help_text)
            return self._counters[name]

    def register_gauge(self, name: str, help_text: str = "") -> Gauge:
        """Register a new gauge metric (idempotent; see register_counter)."""
        with self._lock:
            if name not in self._gauges:
                self._gauges[name] = Gauge(name, help_text)
            return self._gauges[name]

    def register_histogram(self, name: str, help_text: str = "") -> Histogram:
        """Register a new histogram metric (idempotent; see register_counter)."""
        with self._lock:
            if name not in self._histograms:
                self._histograms[name] = Histogram(name, help_text)
            return self._histograms[name]

    def get_counter(self, name: str) -> Optional[Counter]:
        """Get counter by name; None if never registered."""
        return self._counters.get(name)

    def get_gauge(self, name: str) -> Optional[Gauge]:
        """Get gauge by name; None if never registered."""
        return self._gauges.get(name)

    def get_histogram(self, name: str) -> Optional[Histogram]:
        """Get histogram by name; None if never registered."""
        return self._histograms.get(name)

    @contextmanager
    def track_request(self, success: bool = True):
        """
        Context manager to track request metrics.

        Increments active_tasks for the duration, records the duration in
        request_duration_seconds, and always increments requests_total.
        On an exception, requests_failed is incremented and the exception
        is re-raised.

        NOTE(review): when the body exits normally with success=False,
        neither requests_success nor requests_failed is incremented —
        confirm this is the intended semantics of the *success* flag.

        Usage:
            with metrics.track_request():
                # Do work
                pass
        """
        start_time = time.time()
        self.get_gauge("active_tasks").inc()

        try:
            yield
            if success:
                self.get_counter("requests_success").inc()
        except Exception:
            self.get_counter("requests_failed").inc()
            raise
        finally:
            # Runs on both success and failure paths.
            duration = time.time() - start_time
            self.get_histogram("request_duration_seconds").observe(duration)
            self.get_counter("requests_total").inc()
            self.get_gauge("active_tasks").dec()

    @contextmanager
    def track_api_call(self):
        """
        Context manager to track API call metrics.

        Records elapsed wall-clock time into api_call_duration_seconds,
        including when the body raises.

        Usage:
            with metrics.track_api_call():
                response = await client.post(...)
        """
        start_time = time.time()

        try:
            yield
        finally:
            duration = time.time() - start_time
            self.get_histogram("api_call_duration_seconds").observe(duration)

    @contextmanager
    def track_db_query(self):
        """
        Context manager to track database query metrics.

        Records elapsed time into db_query_duration_seconds and counts the
        query in db_queries_total, including when the body raises.

        Usage:
            with metrics.track_db_query():
                cursor.execute(query)
        """
        start_time = time.time()

        try:
            yield
        finally:
            duration = time.time() - start_time
            self.get_histogram("db_query_duration_seconds").observe(duration)
            self.get_counter("db_queries_total").inc()

    def get_all_snapshots(self) -> List[MetricSnapshot]:
        """Get snapshots of all metrics.

        Holds the registry lock while iterating; each metric's snapshot()
        takes its own per-metric lock, so individual values are consistent
        but the set as a whole is not a single atomic cut.
        """
        snapshots = []

        with self._lock:
            for counter in self._counters.values():
                snapshots.append(counter.snapshot())

            for gauge in self._gauges.values():
                snapshots.append(gauge.snapshot())

            for histogram in self._histograms.values():
                snapshots.append(histogram.snapshot())

        return snapshots

    def to_json(self) -> str:
        """Export all metrics as a pretty-printed JSON document with a
        top-level export timestamp."""
        snapshots = self.get_all_snapshots()
        data = {
            'timestamp': time.time(),
            'metrics': [s.to_dict() for s in snapshots]
        }
        return json.dumps(data, indent=2)

    def to_prometheus(self) -> str:
        """
        Export metrics in Prometheus text format.

        Format:
            # HELP metric_name Description
            # TYPE metric_name counter
            metric_name{label="value"} 123.45 timestamp

        NOTE(review): histogram lines put the percentile name (e.g. "p95")
        in the `le` label instead of cumulative bucket upper bounds, so the
        output is Prometheus-*style* rather than strictly conformant.
        Also, if a snapshot ever carried labels, the bucket line would emit
        two adjacent brace groups (`..._bucket{le="p50"}{k="v"}`) — safe
        today only because metric labels are never populated. Confirm
        before adding label support.
        """
        lines = []
        snapshots = self.get_all_snapshots()

        for snapshot in snapshots:
            # HELP line
            lines.append(f"# HELP {snapshot.name} {snapshot.help_text}")

            # TYPE line
            lines.append(f"# TYPE {snapshot.name} {snapshot.type.value}")

            # Metric line
            labels_str = ",".join(f'{k}="{v}"' for k, v in snapshot.labels.items())
            if labels_str:
                labels_str = f"{{{labels_str}}}"

            # For histograms, export percentiles
            if snapshot.type == MetricType.HISTOGRAM and snapshot.percentiles:
                for pct_name, pct_value in snapshot.percentiles.items():
                    lines.append(
                        f'{snapshot.name}_bucket{{le="{pct_name}"}}{labels_str} '
                        f'{pct_value} {int(snapshot.timestamp * 1000)}'
                    )
                lines.append(
                    f'{snapshot.name}_count{labels_str} '
                    f'{snapshot.samples} {int(snapshot.timestamp * 1000)}'
                )
                lines.append(
                    f'{snapshot.name}_sum{labels_str} '
                    f'{snapshot.sum} {int(snapshot.timestamp * 1000)}'
                )
            else:
                lines.append(
                    f'{snapshot.name}{labels_str} '
                    f'{snapshot.value} {int(snapshot.timestamp * 1000)}'
                )

            lines.append("")  # Blank line between metrics

        return "\n".join(lines)

    def get_summary(self) -> Dict:
        """Get human-readable summary of key metrics.

        Relies on the standard metrics registered in __init__; the get_*()
        calls below would raise AttributeError if those names were missing.
        Durations are converted from seconds to milliseconds.
        """
        request_duration = self.get_histogram("request_duration_seconds")
        api_duration = self.get_histogram("api_call_duration_seconds")
        db_duration = self.get_histogram("db_query_duration_seconds")

        return {
            'requests': {
                'total': int(self.get_counter("requests_total").get()),
                'success': int(self.get_counter("requests_success").get()),
                'failed': int(self.get_counter("requests_failed").get()),
                'active': int(self.get_gauge("active_tasks").get()),
                'avg_duration_ms': round(request_duration.get_mean() * 1000, 2),
                'p95_duration_ms': round(request_duration.get_percentile(0.95) * 1000, 2),
            },
            'api_calls': {
                'avg_duration_ms': round(api_duration.get_mean() * 1000, 2),
                'p95_duration_ms': round(api_duration.get_percentile(0.95) * 1000, 2),
            },
            'database': {
                'total_queries': int(self.get_counter("db_queries_total").get()),
                'avg_duration_ms': round(db_duration.get_mean() * 1000, 2),
                'p95_duration_ms': round(db_duration.get_percentile(0.95) * 1000, 2),
            },
            'errors': {
                'total': int(self.get_counter("errors_total").get()),
            },
            'resources': {
                'active_connections': int(self.get_gauge("active_connections").get()),
                'active_tasks': int(self.get_gauge("active_tasks").get()),
            }
        }
|
|
|
|
|
|
# Global metrics collector singleton.
# Created lazily by get_metrics(); _metrics_lock guards the one-time
# initialization against concurrent first callers.
_global_metrics: Optional[MetricsCollector] = None
_metrics_lock = threading.Lock()
|
|
|
|
|
|
def get_metrics() -> MetricsCollector:
    """Return the process-wide MetricsCollector, creating it on first use.

    Thread-safe lazy initialization: the fast path returns without locking
    once the singleton exists.
    """
    global _global_metrics

    if _global_metrics is not None:
        return _global_metrics

    with _metrics_lock:
        # Double-checked: another thread may have created it while we waited.
        if _global_metrics is None:
            _global_metrics = MetricsCollector()
            logger.info("Initialized global metrics collector")

    return _global_metrics
|
|
|
|
|
|
def format_metrics_summary(summary: Dict) -> str:
    """Render a MetricsCollector.get_summary() dict as a CLI report."""
    rule = "=" * 70
    req = summary['requests']
    api = summary['api_calls']
    db = summary['database']
    err = summary['errors']
    res = summary['resources']

    lines = ["\n📊 Metrics Summary", rule, ""]

    lines += [
        "Requests:",
        f"  Total: {req['total']}",
        f"  Success: {req['success']}",
        f"  Failed: {req['failed']}",
        f"  Active: {req['active']}",
        f"  Avg Duration: {req['avg_duration_ms']}ms",
        f"  P95 Duration: {req['p95_duration_ms']}ms",
        "",
    ]
    lines += [
        "API Calls:",
        f"  Avg Duration: {api['avg_duration_ms']}ms",
        f"  P95 Duration: {api['p95_duration_ms']}ms",
        "",
    ]
    lines += [
        "Database:",
        f"  Total Queries: {db['total_queries']}",
        f"  Avg Duration: {db['avg_duration_ms']}ms",
        f"  P95 Duration: {db['p95_duration_ms']}ms",
        "",
    ]
    lines += [
        "Errors:",
        f"  Total: {err['total']}",
        "",
    ]
    lines += [
        "Resources:",
        f"  Active Connections: {res['active_connections']}",
        f"  Active Tasks: {res['active_tasks']}",
        "",
        rule,
    ]

    return "\n".join(lines)
|