From 52732f7e2bbd58b2b98d8c418f42697a20c92190 Mon Sep 17 00:00:00 2001 From: Leo Date: Mon, 16 Feb 2026 14:03:12 +0000 Subject: [PATCH] feat: add observability-designer POWERFUL-tier skill - SLO Designer: generates comprehensive SLI/SLO frameworks with error budgets and burn rate alerts - Alert Optimizer: analyzes and optimizes alert configurations to reduce noise and improve effectiveness - Dashboard Generator: creates role-based dashboard specifications with golden signals coverage Includes comprehensive documentation, sample data, and expected outputs for testing. --- engineering/observability-designer/README.md | 384 ++++++ engineering/observability-designer/SKILL.md | 264 ++++ .../assets/sample_alerts.json | 276 ++++ .../assets/sample_service_api.json | 83 ++ .../assets/sample_service_web.json | 113 ++ .../expected_outputs/sample_dashboard.json | 811 +++++++++++ .../sample_slo_framework.json | 545 ++++++++ .../references/alert_design_patterns.md | 469 +++++++ .../references/dashboard_best_practices.md | 571 ++++++++ .../references/slo_cookbook.md | 329 +++++ .../scripts/alert_optimizer.py | 1059 ++++++++++++++ .../scripts/dashboard_generator.py | 1219 +++++++++++++++++ .../scripts/slo_designer.py | 670 +++++++++ 13 files changed, 6793 insertions(+) create mode 100644 engineering/observability-designer/README.md create mode 100644 engineering/observability-designer/SKILL.md create mode 100644 engineering/observability-designer/assets/sample_alerts.json create mode 100644 engineering/observability-designer/assets/sample_service_api.json create mode 100644 engineering/observability-designer/assets/sample_service_web.json create mode 100644 engineering/observability-designer/expected_outputs/sample_dashboard.json create mode 100644 engineering/observability-designer/expected_outputs/sample_slo_framework.json create mode 100644 engineering/observability-designer/references/alert_design_patterns.md create mode 100644 
engineering/observability-designer/references/dashboard_best_practices.md create mode 100644 engineering/observability-designer/references/slo_cookbook.md create mode 100644 engineering/observability-designer/scripts/alert_optimizer.py create mode 100644 engineering/observability-designer/scripts/dashboard_generator.py create mode 100644 engineering/observability-designer/scripts/slo_designer.py diff --git a/engineering/observability-designer/README.md b/engineering/observability-designer/README.md new file mode 100644 index 0000000..d6a1753 --- /dev/null +++ b/engineering/observability-designer/README.md @@ -0,0 +1,384 @@ +# Observability Designer + +A comprehensive toolkit for designing production-ready observability strategies including SLI/SLO frameworks, alert optimization, and dashboard generation. + +## Overview + +The Observability Designer skill provides three powerful Python scripts that help you create, optimize, and maintain observability systems: + +- **SLO Designer**: Generate complete SLI/SLO frameworks with error budgets and burn rate alerts +- **Alert Optimizer**: Analyze and optimize existing alert configurations to reduce noise and improve effectiveness +- **Dashboard Generator**: Create comprehensive dashboard specifications with role-based layouts and drill-down paths + +## Quick Start + +### Prerequisites + +- Python 3.7+ +- No external dependencies required (uses Python standard library only) + +### Basic Usage + +```bash +# Generate SLO framework for a service +python3 scripts/slo_designer.py --service-type api --criticality critical --user-facing true --service-name payment-service + +# Optimize existing alerts +python3 scripts/alert_optimizer.py --input assets/sample_alerts.json --analyze-only + +# Generate a dashboard specification +python3 scripts/dashboard_generator.py --service-type web --name "Customer Portal" --role sre +``` + +## Scripts Documentation + +### SLO Designer (`slo_designer.py`) + +Generates comprehensive SLO frameworks 
based on service characteristics. + +#### Features +- **Automatic SLI Selection**: Recommends appropriate SLIs based on service type +- **Target Setting**: Suggests SLO targets based on service criticality +- **Error Budget Calculation**: Computes error budgets and burn rate thresholds +- **Multi-Window Burn Rate Alerts**: Generates 4-window burn rate alerting rules +- **SLA Recommendations**: Provides customer-facing SLA guidance + +#### Usage Examples + +```bash +# From service definition file +python3 scripts/slo_designer.py --input assets/sample_service_api.json --output slo_framework.json + +# From command line parameters +python3 scripts/slo_designer.py \ + --service-type api \ + --criticality critical \ + --user-facing true \ + --service-name payment-service \ + --output payment_slos.json + +# Generate and display summary only +python3 scripts/slo_designer.py --input assets/sample_service_web.json --summary-only +``` + +#### Service Definition Format + +```json +{ + "name": "payment-service", + "type": "api", + "criticality": "critical", + "user_facing": true, + "description": "Handles payment processing", + "team": "payments", + "environment": "production", + "dependencies": [ + { + "name": "user-service", + "type": "api", + "criticality": "high" + } + ] +} +``` + +#### Supported Service Types +- **api**: REST APIs, GraphQL services +- **web**: Web applications, SPAs +- **database**: Database services, data stores +- **queue**: Message queues, event streams +- **batch**: Batch processing jobs +- **ml**: Machine learning services + +#### Criticality Levels +- **critical**: 99.99% availability, <100ms P95 latency, <0.1% error rate +- **high**: 99.9% availability, <200ms P95 latency, <0.5% error rate +- **medium**: 99.5% availability, <500ms P95 latency, <1% error rate +- **low**: 99% availability, <1s P95 latency, <2% error rate + +### Alert Optimizer (`alert_optimizer.py`) + +Analyzes existing alert configurations and provides optimization recommendations. 
+ +#### Features +- **Noise Detection**: Identifies alerts with high false positive rates +- **Coverage Analysis**: Finds gaps in monitoring coverage +- **Duplicate Detection**: Locates redundant or overlapping alerts +- **Threshold Analysis**: Reviews alert thresholds for appropriateness +- **Fatigue Assessment**: Evaluates alert volume and routing + +#### Usage Examples + +```bash +# Analyze existing alerts +python3 scripts/alert_optimizer.py --input assets/sample_alerts.json --analyze-only + +# Generate optimized configuration +python3 scripts/alert_optimizer.py \ + --input assets/sample_alerts.json \ + --output optimized_alerts.json + +# Generate HTML report +python3 scripts/alert_optimizer.py \ + --input assets/sample_alerts.json \ + --report alert_analysis.html \ + --format html +``` + +#### Alert Configuration Format + +```json +{ + "alerts": [ + { + "alert": "HighLatency", + "expr": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 0.5", + "for": "5m", + "labels": { + "severity": "warning", + "service": "payment-service" + }, + "annotations": { + "summary": "High request latency detected", + "runbook_url": "https://runbooks.company.com/high-latency" + }, + "historical_data": { + "fires_per_day": 2.5, + "false_positive_rate": 0.15 + } + } + ], + "services": [ + { + "name": "payment-service", + "criticality": "critical" + } + ] +} +``` + +#### Analysis Categories +- **Golden Signals**: Latency, traffic, errors, saturation +- **Resource Utilization**: CPU, memory, disk, network +- **Business Metrics**: Revenue, conversion, user engagement +- **Security**: Auth failures, suspicious activity +- **Availability**: Uptime, health checks + +### Dashboard Generator (`dashboard_generator.py`) + +Creates comprehensive dashboard specifications with role-based optimization. 
+ +#### Features +- **Role-Based Layouts**: Optimized for SRE, Developer, Executive, and Ops personas +- **Golden Signals Coverage**: Automatic inclusion of key monitoring metrics +- **Service-Type Specific Panels**: Tailored panels based on service characteristics +- **Interactive Elements**: Template variables, drill-down paths, time range controls +- **Grafana Compatibility**: Generates Grafana-compatible JSON + +#### Usage Examples + +```bash +# From service definition +python3 scripts/dashboard_generator.py \ + --input assets/sample_service_web.json \ + --output dashboard.json + +# With specific role optimization +python3 scripts/dashboard_generator.py \ + --service-type api \ + --name "Payment Service" \ + --role developer \ + --output payment_dev_dashboard.json + +# Generate Grafana-compatible JSON +python3 scripts/dashboard_generator.py \ + --input assets/sample_service_api.json \ + --output dashboard.json \ + --format grafana + +# With documentation +python3 scripts/dashboard_generator.py \ + --service-type web \ + --name "Customer Portal" \ + --output portal_dashboard.json \ + --doc-output portal_docs.md +``` + +#### Target Roles + +- **sre**: Focus on availability, latency, errors, resource utilization +- **developer**: Emphasize latency, errors, throughput, business metrics +- **executive**: Highlight availability, business metrics, user experience +- **ops**: Priority on resource utilization, capacity, alerts, deployments + +#### Panel Types +- **Stat**: Single value displays with thresholds +- **Gauge**: Resource utilization and capacity metrics +- **Timeseries**: Trend analysis and historical data +- **Table**: Top N lists and detailed breakdowns +- **Heatmap**: Distribution and correlation analysis + +## Sample Data + +The `assets/` directory contains sample configurations for testing: + +- `sample_service_api.json`: Critical API service definition +- `sample_service_web.json`: High-priority web application definition +- `sample_alerts.json`: Alert 
configuration with optimization opportunities + +The `expected_outputs/` directory shows example outputs from each script: + +- `sample_slo_framework.json`: Complete SLO framework for API service +- `optimized_alerts.json`: Optimized alert configuration +- `sample_dashboard.json`: SRE dashboard specification + +## Best Practices + +### SLO Design +- Start with 1-2 SLOs per service and iterate +- Choose SLIs that directly impact user experience +- Set targets based on user needs, not technical capabilities +- Use error budgets to balance reliability and velocity + +### Alert Optimization +- Every alert must be actionable +- Alert on symptoms, not causes +- Use multi-window burn rate alerts for SLO protection +- Implement proper escalation and routing policies + +### Dashboard Design +- Follow the F-pattern for visual hierarchy +- Use consistent color semantics across dashboards +- Include drill-down paths for effective troubleshooting +- Optimize for the target role's specific needs + +## Integration Patterns + +### CI/CD Integration +```bash +# Generate SLOs during service onboarding +python3 scripts/slo_designer.py --input service-config.json --output slos.json + +# Validate alert configurations in pipeline +python3 scripts/alert_optimizer.py --input alerts.json --analyze-only --report validation.html + +# Auto-generate dashboards for new services +python3 scripts/dashboard_generator.py --input service-config.json --format grafana --output dashboard.json +``` + +### Monitoring Stack Integration +- **Prometheus**: Generated alert rules and recording rules +- **Grafana**: Dashboard JSON for direct import +- **Alertmanager**: Routing and escalation policies +- **PagerDuty**: Escalation configuration + +### GitOps Workflow +1. Store service definitions in version control +2. Generate observability configurations in CI/CD +3. Deploy configurations via GitOps +4. 
Monitor effectiveness and iterate + +## Advanced Usage + +### Custom SLO Targets +Override default targets by including them in service definitions: + +```json +{ + "name": "special-service", + "type": "api", + "criticality": "high", + "custom_slos": { + "availability_target": 0.9995, + "latency_p95_target_ms": 150, + "error_rate_target": 0.002 + } +} +``` + +### Alert Rule Templates +Use template variables for reusable alert rules: + +```yaml +# Generated Prometheus alert rule +- alert: {{ service_name }}_HighLatency + expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{service="{{ service_name }}"}[5m])) > {{ latency_threshold }} + for: 5m + labels: + severity: warning + service: "{{ service_name }}" +``` + +### Dashboard Variants +Generate multiple dashboard variants for different use cases: + +```bash +# SRE operational dashboard +python3 scripts/dashboard_generator.py --input service.json --role sre --output sre-dashboard.json + +# Developer debugging dashboard +python3 scripts/dashboard_generator.py --input service.json --role developer --output dev-dashboard.json + +# Executive business dashboard +python3 scripts/dashboard_generator.py --input service.json --role executive --output exec-dashboard.json +``` + +## Troubleshooting + +### Common Issues + +#### Script Execution Errors +- Ensure Python 3.7+ is installed +- Check file paths and permissions +- Validate JSON syntax in input files + +#### Invalid Service Definitions +- Required fields: `name`, `type`, `criticality` +- Valid service types: `api`, `web`, `database`, `queue`, `batch`, `ml` +- Valid criticality levels: `critical`, `high`, `medium`, `low` + +#### Missing Historical Data +- Alert historical data is optional but improves analysis +- Include `fires_per_day` and `false_positive_rate` when available +- Use monitoring system APIs to populate historical metrics + +### Debug Mode +Enable verbose logging by setting environment variable: + +```bash +export DEBUG=1 +python3 
scripts/slo_designer.py --input service.json +``` + +## Contributing + +### Development Setup +```bash +# Clone the repository +git clone <repository-url> +cd engineering/observability-designer + +# Run tests +python3 -m pytest tests/ + +# Lint code +python3 -m flake8 scripts/ +``` + +### Adding New Features +1. Follow existing code patterns and error handling +2. Include comprehensive docstrings and type hints +3. Add test cases for new functionality +4. Update documentation and examples + +## Support + +For questions, issues, or feature requests: +- Check existing documentation and examples +- Review the reference materials in `references/` +- Open an issue with detailed reproduction steps +- Include sample configurations when reporting bugs + +--- + +*This skill is part of the Claude Skills marketplace. For more information about observability best practices, see the reference documentation in the `references/` directory.* \ No newline at end of file diff --git a/engineering/observability-designer/SKILL.md b/engineering/observability-designer/SKILL.md new file mode 100644 index 0000000..1cb45d5 --- /dev/null +++ b/engineering/observability-designer/SKILL.md @@ -0,0 +1,264 @@ +# Observability Designer (POWERFUL) + +**Category:** Engineering +**Tier:** POWERFUL +**Description:** Design comprehensive observability strategies for production systems including SLI/SLO frameworks, alerting optimization, and dashboard generation. + +## Overview + +Observability Designer enables you to create production-ready observability strategies that provide deep insights into system behavior, performance, and reliability. This skill combines the three pillars of observability (metrics, logs, traces) with proven frameworks like SLI/SLO design, golden signals monitoring, and alert optimization to create comprehensive observability solutions.
+ +## Core Competencies + +### SLI/SLO/SLA Framework Design +- **Service Level Indicators (SLI):** Define measurable signals that indicate service health +- **Service Level Objectives (SLO):** Set reliability targets based on user experience +- **Service Level Agreements (SLA):** Establish customer-facing commitments with consequences +- **Error Budget Management:** Calculate and track error budget consumption +- **Burn Rate Alerting:** Multi-window burn rate alerts for proactive SLO protection + +### Three Pillars of Observability + +#### Metrics +- **Golden Signals:** Latency, traffic, errors, and saturation monitoring +- **RED Method:** Rate, Errors, and Duration for request-driven services +- **USE Method:** Utilization, Saturation, and Errors for resource monitoring +- **Business Metrics:** Revenue, user engagement, and feature adoption tracking +- **Infrastructure Metrics:** CPU, memory, disk, network, and custom resource metrics + +#### Logs +- **Structured Logging:** JSON-based log formats with consistent fields +- **Log Aggregation:** Centralized log collection and indexing strategies +- **Log Levels:** Appropriate use of DEBUG, INFO, WARN, ERROR, FATAL levels +- **Correlation IDs:** Request tracing through distributed systems +- **Log Sampling:** Volume management for high-throughput systems + +#### Traces +- **Distributed Tracing:** End-to-end request flow visualization +- **Span Design:** Meaningful span boundaries and metadata +- **Trace Sampling:** Intelligent sampling strategies for performance and cost +- **Service Maps:** Automatic dependency discovery through traces +- **Root Cause Analysis:** Trace-driven debugging workflows + +### Dashboard Design Principles + +#### Information Architecture +- **Hierarchy:** Overview → Service → Component → Instance drill-down paths +- **Golden Ratio:** 80% operational metrics, 20% exploratory metrics +- **Cognitive Load:** Maximum 7±2 panels per dashboard screen +- **User Journey:** Role-based dashboard 
personas (SRE, Developer, Executive) + +#### Visualization Best Practices +- **Chart Selection:** Time series for trends, heatmaps for distributions, gauges for status +- **Color Theory:** Red for critical, amber for warning, green for healthy states +- **Reference Lines:** SLO targets, capacity thresholds, and historical baselines +- **Time Ranges:** Default to meaningful windows (4h for incidents, 7d for trends) + +#### Panel Design +- **Metric Queries:** Efficient Prometheus/InfluxDB queries with proper aggregation +- **Alerting Integration:** Visual alert state indicators on relevant panels +- **Interactive Elements:** Template variables, drill-down links, and annotation overlays +- **Performance:** Sub-second render times through query optimization + +### Alert Design and Optimization + +#### Alert Classification +- **Severity Levels:** + - **Critical:** Service down, SLO burn rate high + - **Warning:** Approaching thresholds, non-user-facing issues + - **Info:** Deployment notifications, capacity planning alerts +- **Actionability:** Every alert must have a clear response action +- **Alert Routing:** Escalation policies based on severity and team ownership + +#### Alert Fatigue Prevention +- **Signal vs Noise:** High precision (few false positives) over high recall +- **Hysteresis:** Different thresholds for firing and resolving alerts +- **Suppression:** Dependent alert suppression during known outages +- **Grouping:** Related alerts grouped into single notifications + +#### Alert Rule Design +- **Threshold Selection:** Statistical methods for threshold determination +- **Window Functions:** Appropriate averaging windows and percentile calculations +- **Alert Lifecycle:** Clear firing conditions and automatic resolution criteria +- **Testing:** Alert rule validation against historical data + +### Runbook Generation and Incident Response + +#### Runbook Structure +- **Alert Context:** What the alert means and why it fired +- **Impact Assessment:** User-facing 
vs internal impact evaluation +- **Investigation Steps:** Ordered troubleshooting procedures with time estimates +- **Resolution Actions:** Common fixes and escalation procedures +- **Post-Incident:** Follow-up tasks and prevention measures + +#### Incident Detection Patterns +- **Anomaly Detection:** Statistical methods for detecting unusual patterns +- **Composite Alerts:** Multi-signal alerts for complex failure modes +- **Predictive Alerts:** Capacity and trend-based forward-looking alerts +- **Canary Monitoring:** Early detection through progressive deployment monitoring + +### Golden Signals Framework + +#### Latency Monitoring +- **Request Latency:** P50, P95, P99 response time tracking +- **Queue Latency:** Time spent waiting in processing queues +- **Network Latency:** Inter-service communication delays +- **Database Latency:** Query execution and connection pool metrics + +#### Traffic Monitoring +- **Request Rate:** Requests per second with burst detection +- **Bandwidth Usage:** Network throughput and capacity utilization +- **User Sessions:** Active user tracking and session duration +- **Feature Usage:** API endpoint and feature adoption metrics + +#### Error Monitoring +- **Error Rate:** 4xx and 5xx HTTP response code tracking +- **Error Budget:** SLO-based error rate targets and consumption +- **Error Distribution:** Error type classification and trending +- **Silent Failures:** Detection of processing failures without HTTP errors + +#### Saturation Monitoring +- **Resource Utilization:** CPU, memory, disk, and network usage +- **Queue Depth:** Processing queue length and wait times +- **Connection Pools:** Database and service connection saturation +- **Rate Limiting:** API throttling and quota exhaustion tracking + +### Distributed Tracing Strategies + +#### Trace Architecture +- **Sampling Strategy:** Head-based, tail-based, and adaptive sampling +- **Trace Propagation:** Context propagation across service boundaries +- **Span Correlation:** 
Parent-child relationship modeling +- **Trace Storage:** Retention policies and storage optimization + +#### Service Instrumentation +- **Auto-Instrumentation:** Framework-based automatic trace generation +- **Manual Instrumentation:** Custom span creation for business logic +- **Baggage Handling:** Cross-cutting concern propagation +- **Performance Impact:** Instrumentation overhead measurement and optimization + +### Log Aggregation Patterns + +#### Collection Architecture +- **Agent Deployment:** Log shipping agent strategies (push vs pull) +- **Log Routing:** Topic-based routing and filtering +- **Parsing Strategies:** Structured vs unstructured log handling +- **Schema Evolution:** Log format versioning and migration + +#### Storage and Indexing +- **Index Design:** Optimized field indexing for common query patterns +- **Retention Policies:** Time and volume-based log retention +- **Compression:** Log data compression and archival strategies +- **Search Performance:** Query optimization and result caching + +### Cost Optimization for Observability + +#### Data Management +- **Metric Retention:** Tiered retention based on metric importance +- **Log Sampling:** Intelligent sampling to reduce ingestion costs +- **Trace Sampling:** Cost-effective trace collection strategies +- **Data Archival:** Cold storage for historical observability data + +#### Resource Optimization +- **Query Efficiency:** Optimized metric and log queries +- **Storage Costs:** Appropriate storage tiers for different data types +- **Ingestion Rate Limiting:** Controlled data ingestion to manage costs +- **Cardinality Management:** High-cardinality metric detection and mitigation + +## Scripts Overview + +This skill includes three powerful Python scripts for comprehensive observability design: + +### 1. 
SLO Designer (`slo_designer.py`) +Generates complete SLI/SLO frameworks based on service characteristics: +- **Input:** Service description JSON (type, criticality, dependencies) +- **Output:** SLI definitions, SLO targets, error budgets, burn rate alerts, SLA recommendations +- **Features:** Multi-window burn rate calculations, error budget policies, alert rule generation + +### 2. Alert Optimizer (`alert_optimizer.py`) +Analyzes and optimizes existing alert configurations: +- **Input:** Alert configuration JSON with rules, thresholds, and routing +- **Output:** Optimization report and improved alert configuration +- **Features:** Noise detection, coverage gaps, duplicate identification, threshold optimization + +### 3. Dashboard Generator (`dashboard_generator.py`) +Creates comprehensive dashboard specifications: +- **Input:** Service/system description JSON +- **Output:** Grafana-compatible dashboard JSON and documentation +- **Features:** Golden signals coverage, RED/USE methods, drill-down paths, role-based views + +## Integration Patterns + +### Monitoring Stack Integration +- **Prometheus:** Metric collection and alerting rule generation +- **Grafana:** Dashboard creation and visualization configuration +- **Elasticsearch/Kibana:** Log analysis and dashboard integration +- **Jaeger/Zipkin:** Distributed tracing configuration and analysis + +### CI/CD Integration +- **Pipeline Monitoring:** Build, test, and deployment observability +- **Deployment Correlation:** Release impact tracking and rollback triggers +- **Feature Flag Monitoring:** A/B test and feature rollout observability +- **Performance Regression:** Automated performance monitoring in pipelines + +### Incident Management Integration +- **PagerDuty/VictorOps:** Alert routing and escalation policies +- **Slack/Teams:** Notification and collaboration integration +- **JIRA/ServiceNow:** Incident tracking and resolution workflows +- **Post-Mortem:** Automated incident analysis and improvement tracking 
+ +## Advanced Patterns + +### Multi-Cloud Observability +- **Cross-Cloud Metrics:** Unified metrics across AWS, GCP, Azure +- **Network Observability:** Inter-cloud connectivity monitoring +- **Cost Attribution:** Cloud resource cost tracking and optimization +- **Compliance Monitoring:** Security and compliance posture tracking + +### Microservices Observability +- **Service Mesh Integration:** Istio/Linkerd observability configuration +- **API Gateway Monitoring:** Request routing and rate limiting observability +- **Container Orchestration:** Kubernetes cluster and workload monitoring +- **Service Discovery:** Dynamic service monitoring and health checks + +### Machine Learning Observability +- **Model Performance:** Accuracy, drift, and bias monitoring +- **Feature Store Monitoring:** Feature quality and freshness tracking +- **Pipeline Observability:** ML pipeline execution and performance monitoring +- **A/B Test Analysis:** Statistical significance and business impact measurement + +## Best Practices + +### Organizational Alignment +- **SLO Setting:** Collaborative target setting between product and engineering +- **Alert Ownership:** Clear escalation paths and team responsibilities +- **Dashboard Governance:** Centralized dashboard management and standards +- **Training Programs:** Team education on observability tools and practices + +### Technical Excellence +- **Infrastructure as Code:** Observability configuration version control +- **Testing Strategy:** Alert rule testing and dashboard validation +- **Performance Monitoring:** Observability system performance tracking +- **Security Considerations:** Access control and data privacy in observability + +### Continuous Improvement +- **Metrics Review:** Regular SLI/SLO effectiveness assessment +- **Alert Tuning:** Ongoing alert threshold and routing optimization +- **Dashboard Evolution:** User feedback-driven dashboard improvements +- **Tool Evaluation:** Regular assessment of observability tool 
effectiveness + +## Success Metrics + +### Operational Metrics +- **Mean Time to Detection (MTTD):** How quickly issues are identified +- **Mean Time to Resolution (MTTR):** Time from detection to resolution +- **Alert Precision:** Percentage of actionable alerts +- **SLO Achievement:** Percentage of SLO targets met consistently + +### Business Metrics +- **System Reliability:** Overall uptime and user experience quality +- **Engineering Velocity:** Development team productivity and deployment frequency +- **Cost Efficiency:** Observability cost as percentage of infrastructure spend +- **Customer Satisfaction:** User-reported reliability and performance satisfaction + +This comprehensive observability design skill enables organizations to build robust, scalable monitoring and alerting systems that provide actionable insights while maintaining cost efficiency and operational excellence. \ No newline at end of file diff --git a/engineering/observability-designer/assets/sample_alerts.json b/engineering/observability-designer/assets/sample_alerts.json new file mode 100644 index 0000000..14e1cc1 --- /dev/null +++ b/engineering/observability-designer/assets/sample_alerts.json @@ -0,0 +1,276 @@ +{ + "alerts": [ + { + "alert": "HighLatency", + "expr": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{service=\"payment-service\"}[5m])) > 0.5", + "for": "5m", + "labels": { + "severity": "warning", + "service": "payment-service", + "team": "payments" + }, + "annotations": { + "summary": "High request latency detected", + "description": "95th percentile latency is {{ $value }}s for payment-service", + "runbook_url": "https://runbooks.company.com/high-latency" + }, + "historical_data": { + "fires_per_day": 2.5, + "false_positive_rate": 0.15, + "average_duration_minutes": 12 + } + }, + { + "alert": "ServiceDown", + "expr": "up{service=\"payment-service\"} == 0", + "labels": { + "severity": "critical", + "service": "payment-service", + "team": "payments" + }, + 
"annotations": { + "summary": "Payment service is down", + "description": "Payment service has been down for more than 1 minute", + "runbook_url": "https://runbooks.company.com/service-down" + }, + "historical_data": { + "fires_per_day": 0.1, + "false_positive_rate": 0.05, + "average_duration_minutes": 3 + } + }, + { + "alert": "HighErrorRate", + "expr": "sum(rate(http_requests_total{service=\"payment-service\",code=~\"5..\"}[5m])) / sum(rate(http_requests_total{service=\"payment-service\"}[5m])) > 0.01", + "for": "2m", + "labels": { + "severity": "warning", + "service": "payment-service", + "team": "payments" + }, + "annotations": { + "summary": "High error rate detected", + "description": "Error rate is {{ $value | humanizePercentage }} for payment-service", + "runbook_url": "https://runbooks.company.com/high-error-rate" + }, + "historical_data": { + "fires_per_day": 1.8, + "false_positive_rate": 0.25, + "average_duration_minutes": 8 + } + }, + { + "alert": "HighCPUUsage", + "expr": "rate(process_cpu_seconds_total{service=\"payment-service\"}[5m]) * 100 > 80", + "labels": { + "severity": "warning", + "service": "payment-service", + "team": "payments" + }, + "annotations": { + "summary": "High CPU usage", + "description": "CPU usage is {{ $value }}% for payment-service" + }, + "historical_data": { + "fires_per_day": 15.2, + "false_positive_rate": 0.8, + "average_duration_minutes": 45 + } + }, + { + "alert": "HighMemoryUsage", + "expr": "process_resident_memory_bytes{service=\"payment-service\"} / process_virtual_memory_max_bytes{service=\"payment-service\"} * 100 > 85", + "labels": { + "severity": "info", + "service": "payment-service", + "team": "payments" + }, + "annotations": { + "summary": "High memory usage", + "description": "Memory usage is {{ $value }}% for payment-service" + }, + "historical_data": { + "fires_per_day": 8.5, + "false_positive_rate": 0.6, + "average_duration_minutes": 30 + } + }, + { + "alert": "DatabaseConnectionPoolExhaustion", + "expr": 
"db_connections_active{service=\"payment-service\"} / db_connections_max{service=\"payment-service\"} > 0.9", + "for": "1m", + "labels": { + "severity": "critical", + "service": "payment-service", + "team": "payments" + }, + "annotations": { + "summary": "Database connection pool near exhaustion", + "description": "Connection pool utilization is {{ $value | humanizePercentage }}", + "runbook_url": "https://runbooks.company.com/db-connections" + }, + "historical_data": { + "fires_per_day": 0.3, + "false_positive_rate": 0.1, + "average_duration_minutes": 5 + } + }, + { + "alert": "LowTraffic", + "expr": "sum(rate(http_requests_total{service=\"payment-service\"}[5m])) < 10", + "for": "10m", + "labels": { + "severity": "warning", + "service": "payment-service", + "team": "payments" + }, + "annotations": { + "summary": "Unusually low traffic", + "description": "Request rate is {{ $value }} RPS, which is unusually low" + }, + "historical_data": { + "fires_per_day": 12.0, + "false_positive_rate": 0.9, + "average_duration_minutes": 120 + } + }, + { + "alert": "HighLatencyDuplicate", + "expr": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{service=\"payment-service\"}[5m])) > 0.5", + "for": "5m", + "labels": { + "severity": "warning", + "service": "payment-service", + "team": "payments" + }, + "annotations": { + "summary": "High request latency detected (duplicate)", + "description": "95th percentile latency is {{ $value }}s for payment-service" + }, + "historical_data": { + "fires_per_day": 2.5, + "false_positive_rate": 0.15, + "average_duration_minutes": 12 + } + }, + { + "alert": "VeryLowErrorRate", + "expr": "sum(rate(http_requests_total{service=\"payment-service\",code=~\"5..\"}[5m])) / sum(rate(http_requests_total{service=\"payment-service\"}[5m])) > 0.001", + "labels": { + "severity": "info", + "service": "payment-service", + "team": "payments" + }, + "annotations": { + "summary": "Error rate above 0.1%", + "description": "Error rate is {{ $value 
| humanizePercentage }}" + }, + "historical_data": { + "fires_per_day": 25.0, + "false_positive_rate": 0.95, + "average_duration_minutes": 5 + } + }, + { + "alert": "DiskUsageHigh", + "expr": "disk_usage_percent{service=\"payment-service\"} > 85", + "labels": { + "severity": "warning", + "service": "payment-service", + "team": "payments" + }, + "annotations": { + "summary": "Disk usage high", + "description": "Disk usage is {{ $value }}%" + }, + "historical_data": { + "fires_per_day": 3.2, + "false_positive_rate": 0.4, + "average_duration_minutes": 240 + } + } + ], + "services": [ + { + "name": "payment-service", + "type": "api", + "criticality": "critical", + "team": "payments" + }, + { + "name": "user-service", + "type": "api", + "criticality": "high", + "team": "identity" + }, + { + "name": "notification-service", + "type": "api", + "criticality": "medium", + "team": "communications" + } + ], + "alert_routing": { + "routes": [ + { + "match": { + "severity": "critical" + }, + "receiver": "pager-critical", + "group_wait": "10s", + "group_interval": "1m", + "repeat_interval": "5m" + }, + { + "match": { + "severity": "warning" + }, + "receiver": "slack-warnings", + "group_wait": "30s", + "group_interval": "5m", + "repeat_interval": "1h" + }, + { + "match": { + "severity": "info" + }, + "receiver": "email-info", + "group_wait": "2m", + "group_interval": "10m", + "repeat_interval": "24h" + } + ] + }, + "receivers": [ + { + "name": "pager-critical", + "pagerduty_configs": [ + { + "routing_key": "pager-key-critical", + "description": "Critical alert: {{ range .Alerts }}{{ .Annotations.summary }}{{ end }}" + } + ] + }, + { + "name": "slack-warnings", + "slack_configs": [ + { + "api_url": "https://hooks.slack.com/services/warnings", + "channel": "#alerts-warnings", + "title": "Warning Alert", + "text": "{{ range .Alerts }}{{ .Annotations.description }}{{ end }}" + } + ] + }, + { + "name": "email-info", + "email_configs": [ + { + "to": "team-notifications@company.com", + 
"subject": "Info Alert: {{ .GroupLabels.alertname }}", + "body": "{{ range .Alerts }}{{ .Annotations.description }}{{ end }}" + } + ] + } + ] +} \ No newline at end of file diff --git a/engineering/observability-designer/assets/sample_service_api.json b/engineering/observability-designer/assets/sample_service_api.json new file mode 100644 index 0000000..0eb7c6b --- /dev/null +++ b/engineering/observability-designer/assets/sample_service_api.json @@ -0,0 +1,83 @@ +{ + "name": "payment-service", + "type": "api", + "criticality": "critical", + "user_facing": true, + "description": "Handles payment processing and transaction management", + "team": "payments", + "environment": "production", + "dependencies": [ + { + "name": "user-service", + "type": "api", + "criticality": "high" + }, + { + "name": "payment-gateway", + "type": "external", + "criticality": "critical" + }, + { + "name": "fraud-detection", + "type": "ml", + "criticality": "high" + } + ], + "endpoints": [ + { + "path": "/api/v1/payments", + "method": "POST", + "sla_latency_ms": 500, + "expected_tps": 100 + }, + { + "path": "/api/v1/payments/{id}", + "method": "GET", + "sla_latency_ms": 200, + "expected_tps": 500 + }, + { + "path": "/api/v1/payments/{id}/refund", + "method": "POST", + "sla_latency_ms": 1000, + "expected_tps": 10 + } + ], + "business_metrics": { + "revenue_per_hour": { + "metric": "sum(payment_amount * rate(payments_successful_total[1h]))", + "target": 50000, + "unit": "USD" + }, + "conversion_rate": { + "metric": "sum(rate(payments_successful_total[5m])) / sum(rate(payment_attempts_total[5m]))", + "target": 0.95, + "unit": "percentage" + } + }, + "infrastructure": { + "container_orchestrator": "kubernetes", + "replicas": 6, + "cpu_limit": "2000m", + "memory_limit": "4Gi", + "database": { + "type": "postgresql", + "connection_pool_size": 20 + }, + "cache": { + "type": "redis", + "cluster_size": 3 + } + }, + "compliance_requirements": [ + "PCI-DSS", + "SOX", + "GDPR" + ], + "tags": [ + 
"payment", + "transaction", + "critical-path", + "revenue-generating" + ] +} \ No newline at end of file diff --git a/engineering/observability-designer/assets/sample_service_web.json b/engineering/observability-designer/assets/sample_service_web.json new file mode 100644 index 0000000..affc31e --- /dev/null +++ b/engineering/observability-designer/assets/sample_service_web.json @@ -0,0 +1,113 @@ +{ + "name": "customer-portal", + "type": "web", + "criticality": "high", + "user_facing": true, + "description": "Customer-facing web application for account management and billing", + "team": "frontend", + "environment": "production", + "dependencies": [ + { + "name": "user-service", + "type": "api", + "criticality": "high" + }, + { + "name": "billing-service", + "type": "api", + "criticality": "high" + }, + { + "name": "notification-service", + "type": "api", + "criticality": "medium" + }, + { + "name": "cdn", + "type": "external", + "criticality": "medium" + } + ], + "pages": [ + { + "path": "/dashboard", + "sla_load_time_ms": 2000, + "expected_concurrent_users": 1000 + }, + { + "path": "/billing", + "sla_load_time_ms": 3000, + "expected_concurrent_users": 200 + }, + { + "path": "/settings", + "sla_load_time_ms": 1500, + "expected_concurrent_users": 100 + } + ], + "business_metrics": { + "daily_active_users": { + "metric": "count(user_sessions_started_total[1d])", + "target": 10000, + "unit": "users" + }, + "session_duration": { + "metric": "avg(user_session_duration_seconds)", + "target": 300, + "unit": "seconds" + }, + "bounce_rate": { + "metric": "sum(rate(page_views_bounced_total[1h])) / sum(rate(page_views_total[1h]))", + "target": 0.3, + "unit": "percentage" + } + }, + "infrastructure": { + "container_orchestrator": "kubernetes", + "replicas": 4, + "cpu_limit": "1000m", + "memory_limit": "2Gi", + "storage": { + "type": "nfs", + "size": "50Gi" + }, + "ingress": { + "type": "nginx", + "ssl_termination": true, + "rate_limiting": { + "requests_per_second": 100, + 
"burst": 200 + } + } + }, + "monitoring": { + "synthetic_checks": [ + { + "name": "login_flow", + "url": "/auth/login", + "frequency": "1m", + "locations": ["us-east", "eu-west", "ap-south"] + }, + { + "name": "checkout_flow", + "url": "/billing/checkout", + "frequency": "5m", + "locations": ["us-east", "eu-west"] + } + ], + "rum": { + "enabled": true, + "sampling_rate": 0.1 + } + }, + "compliance_requirements": [ + "GDPR", + "CCPA" + ], + "tags": [ + "frontend", + "customer-facing", + "billing", + "high-traffic" + ] +} \ No newline at end of file diff --git a/engineering/observability-designer/expected_outputs/sample_dashboard.json b/engineering/observability-designer/expected_outputs/sample_dashboard.json new file mode 100644 index 0000000..4069c71 --- /dev/null +++ b/engineering/observability-designer/expected_outputs/sample_dashboard.json @@ -0,0 +1,811 @@ +{ + "metadata": { + "title": "customer-portal - SRE Dashboard", + "service": { + "name": "customer-portal", + "type": "web", + "criticality": "high", + "user_facing": true, + "description": "Customer-facing web application for account management and billing", + "team": "frontend", + "environment": "production", + "dependencies": [ + { + "name": "user-service", + "type": "api", + "criticality": "high" + }, + { + "name": "billing-service", + "type": "api", + "criticality": "high" + }, + { + "name": "notification-service", + "type": "api", + "criticality": "medium" + }, + { + "name": "cdn", + "type": "external", + "criticality": "medium" + } + ], + "pages": [ + { + "path": "/dashboard", + "sla_load_time_ms": 2000, + "expected_concurrent_users": 1000 + }, + { + "path": "/billing", + "sla_load_time_ms": 3000, + "expected_concurrent_users": 200 + }, + { + "path": "/settings", + "sla_load_time_ms": 1500, + "expected_concurrent_users": 100 + } + ], + "business_metrics": { + "daily_active_users": { + "metric": "count(user_sessions_started_total[1d])", + "target": 10000, + "unit": "users" + }, + "session_duration": { 
+ "metric": "avg(user_session_duration_seconds)", + "target": 300, + "unit": "seconds" + }, + "bounce_rate": { + "metric": "sum(rate(page_views_bounced_total[1h])) / sum(rate(page_views_total[1h]))", + "target": 0.3, + "unit": "percentage" + } + }, + "infrastructure": { + "container_orchestrator": "kubernetes", + "replicas": 4, + "cpu_limit": "1000m", + "memory_limit": "2Gi", + "storage": { + "type": "nfs", + "size": "50Gi" + }, + "ingress": { + "type": "nginx", + "ssl_termination": true, + "rate_limiting": { + "requests_per_second": 100, + "burst": 200 + } + } + }, + "monitoring": { + "synthetic_checks": [ + { + "name": "login_flow", + "url": "/auth/login", + "frequency": "1m", + "locations": [ + "us-east", + "eu-west", + "ap-south" + ] + }, + { + "name": "checkout_flow", + "url": "/billing/checkout", + "frequency": "5m", + "locations": [ + "us-east", + "eu-west" + ] + } + ], + "rum": { + "enabled": true, + "sampling_rate": 0.1 + } + }, + "compliance_requirements": [ + "GDPR", + "CCPA" + ], + "tags": [ + "frontend", + "customer-facing", + "billing", + "high-traffic" + ] + }, + "target_role": "sre", + "generated_at": "2026-02-16T14:02:03.421248Z", + "version": "1.0" + }, + "configuration": { + "time_ranges": [ + "1h", + "6h", + "1d", + "7d" + ], + "default_time_range": "6h", + "refresh_interval": "30s", + "timezone": "UTC", + "theme": "dark" + }, + "layout": { + "grid_settings": { + "width": 24, + "height_unit": "px", + "cell_height": 30 + }, + "sections": [ + { + "title": "Service Overview", + "collapsed": false, + "y_position": 0, + "panels": [ + "service_status", + "slo_summary", + "error_budget" + ] + }, + { + "title": "Golden Signals", + "collapsed": false, + "y_position": 8, + "panels": [ + "latency", + "traffic", + "errors", + "saturation" + ] + }, + { + "title": "Resource Utilization", + "collapsed": false, + "y_position": 16, + "panels": [ + "cpu_usage", + "memory_usage", + "network_io", + "disk_io" + ] + }, + { + "title": "Dependencies & Downstream", + 
"collapsed": true, + "y_position": 24, + "panels": [ + "dependency_status", + "downstream_latency", + "circuit_breakers" + ] + } + ] + }, + "panels": [ + { + "id": "service_status", + "title": "Service Status", + "type": "stat", + "grid_pos": { + "x": 0, + "y": 0, + "w": 6, + "h": 4 + }, + "targets": [ + { + "expr": "up{service=\"customer-portal\"}", + "legendFormat": "Status" + } + ], + "field_config": { + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Status" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "thresholds" + } + }, + { + "id": "thresholds", + "value": { + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "green", + "value": 1 + } + ] + } + }, + { + "id": "mappings", + "value": [ + { + "options": { + "0": { + "text": "DOWN" + } + }, + "type": "value" + }, + { + "options": { + "1": { + "text": "UP" + } + }, + "type": "value" + } + ] + } + ] + } + ] + }, + "options": { + "orientation": "horizontal", + "textMode": "value_and_name" + } + }, + { + "id": "slo_summary", + "title": "SLO Achievement (30d)", + "type": "stat", + "grid_pos": { + "x": 6, + "y": 0, + "w": 9, + "h": 4 + }, + "targets": [ + { + "expr": "(1 - (increase(http_requests_total{service=\"customer-portal\",code=~\"5..\"}[30d]) / increase(http_requests_total{service=\"customer-portal\"}[30d]))) * 100", + "legendFormat": "Availability" + }, + { + "expr": "histogram_quantile(0.95, increase(http_request_duration_seconds_bucket{service=\"customer-portal\"}[30d])) * 1000", + "legendFormat": "P95 Latency (ms)" + } + ], + "field_config": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "yellow", + "value": 99.0 + }, + { + "color": "green", + "value": 99.9 + } + ] + } + } + }, + "options": { + "orientation": "horizontal", + "textMode": "value_and_name" + } + }, + { + "id": "error_budget", + "title": "Error Budget Remaining", + "type": "gauge", + 
"grid_pos": { + "x": 15, + "y": 0, + "w": 9, + "h": 4 + }, + "targets": [ + { + "expr": "(1 - (increase(http_requests_total{service=\"customer-portal\",code=~\"5..\"}[30d]) / increase(http_requests_total{service=\"customer-portal\"}[30d])) - 0.999) / 0.001 * 100", + "legendFormat": "Error Budget %" + } + ], + "field_config": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "min": 0, + "max": 100, + "thresholds": { + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "yellow", + "value": 25 + }, + { + "color": "green", + "value": 50 + } + ] + }, + "unit": "percent" + } + }, + "options": { + "showThresholdLabels": true, + "showThresholdMarkers": true + } + }, + { + "id": "latency", + "title": "Request Latency", + "type": "timeseries", + "grid_pos": { + "x": 0, + "y": 8, + "w": 12, + "h": 6 + }, + "targets": [ + { + "expr": "histogram_quantile(0.50, rate(http_request_duration_seconds_bucket{service=\"customer-portal\"}[5m])) * 1000", + "legendFormat": "P50 Latency" + }, + { + "expr": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{service=\"customer-portal\"}[5m])) * 1000", + "legendFormat": "P95 Latency" + }, + { + "expr": "histogram_quantile(0.99, rate(http_request_duration_seconds_bucket{service=\"customer-portal\"}[5m])) * 1000", + "legendFormat": "P99 Latency" + } + ], + "field_config": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "unit": "ms", + "custom": { + "drawStyle": "line", + "lineInterpolation": "linear", + "lineWidth": 1, + "fillOpacity": 10 + } + } + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "table", + "placement": "bottom" + } + } + }, + { + "id": "traffic", + "title": "Request Rate", + "type": "timeseries", + "grid_pos": { + "x": 12, + "y": 8, + "w": 12, + "h": 6 + }, + "targets": [ + { + "expr": "sum(rate(http_requests_total{service=\"customer-portal\"}[5m]))", + "legendFormat": "Total RPS" + }, + { + "expr": 
"sum(rate(http_requests_total{service=\"customer-portal\",code=~\"2..\"}[5m]))", + "legendFormat": "2xx RPS" + }, + { + "expr": "sum(rate(http_requests_total{service=\"customer-portal\",code=~\"4..\"}[5m]))", + "legendFormat": "4xx RPS" + }, + { + "expr": "sum(rate(http_requests_total{service=\"customer-portal\",code=~\"5..\"}[5m]))", + "legendFormat": "5xx RPS" + } + ], + "field_config": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "unit": "reqps", + "custom": { + "drawStyle": "line", + "lineInterpolation": "linear", + "lineWidth": 1, + "fillOpacity": 0 + } + } + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "table", + "placement": "bottom" + } + } + }, + { + "id": "errors", + "title": "Error Rate", + "type": "timeseries", + "grid_pos": { + "x": 0, + "y": 14, + "w": 12, + "h": 6 + }, + "targets": [ + { + "expr": "sum(rate(http_requests_total{service=\"customer-portal\",code=~\"5..\"}[5m])) / sum(rate(http_requests_total{service=\"customer-portal\"}[5m])) * 100", + "legendFormat": "5xx Error Rate" + }, + { + "expr": "sum(rate(http_requests_total{service=\"customer-portal\",code=~\"4..\"}[5m])) / sum(rate(http_requests_total{service=\"customer-portal\"}[5m])) * 100", + "legendFormat": "4xx Error Rate" + } + ], + "field_config": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "unit": "percent", + "custom": { + "drawStyle": "line", + "lineInterpolation": "linear", + "lineWidth": 2, + "fillOpacity": 20 + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "5xx Error Rate" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red" + } + } + ] + } + ] + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "table", + "placement": "bottom" + } + } + }, + { + "id": "saturation", + "title": "Saturation Metrics", + "type": "timeseries", + "grid_pos": { + "x": 12, + "y": 14, + "w": 12, + 
"h": 6 + }, + "targets": [ + { + "expr": "rate(process_cpu_seconds_total{service=\"customer-portal\"}[5m]) * 100", + "legendFormat": "CPU Usage %" + }, + { + "expr": "process_resident_memory_bytes{service=\"customer-portal\"} / process_virtual_memory_max_bytes{service=\"customer-portal\"} * 100", + "legendFormat": "Memory Usage %" + } + ], + "field_config": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "unit": "percent", + "max": 100, + "custom": { + "drawStyle": "line", + "lineInterpolation": "linear", + "lineWidth": 1, + "fillOpacity": 10 + } + } + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "table", + "placement": "bottom" + } + } + }, + { + "id": "cpu_usage", + "title": "CPU Usage", + "type": "gauge", + "grid_pos": { + "x": 0, + "y": 20, + "w": 6, + "h": 4 + }, + "targets": [ + { + "expr": "rate(process_cpu_seconds_total{service=\"customer-portal\"}[5m]) * 100", + "legendFormat": "CPU %" + } + ], + "field_config": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "unit": "percent", + "min": 0, + "max": 100, + "thresholds": { + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "yellow", + "value": 70 + }, + { + "color": "red", + "value": 90 + } + ] + } + } + }, + "options": { + "showThresholdLabels": true, + "showThresholdMarkers": true + } + }, + { + "id": "memory_usage", + "title": "Memory Usage", + "type": "gauge", + "grid_pos": { + "x": 6, + "y": 20, + "w": 6, + "h": 4 + }, + "targets": [ + { + "expr": "process_resident_memory_bytes{service=\"customer-portal\"} / 1024 / 1024", + "legendFormat": "Memory MB" + } + ], + "field_config": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "unit": "decbytes", + "thresholds": { + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "yellow", + "value": 512000000 + }, + { + "color": "red", + "value": 1024000000 + } + ] + } + } + } + }, + { + "id": "network_io", + "title": "Network 
I/O", + "type": "timeseries", + "grid_pos": { + "x": 12, + "y": 20, + "w": 6, + "h": 4 + }, + "targets": [ + { + "expr": "rate(process_network_receive_bytes_total{service=\"customer-portal\"}[5m])", + "legendFormat": "RX Bytes/s" + }, + { + "expr": "rate(process_network_transmit_bytes_total{service=\"customer-portal\"}[5m])", + "legendFormat": "TX Bytes/s" + } + ], + "field_config": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "unit": "binBps" + } + } + }, + { + "id": "disk_io", + "title": "Disk I/O", + "type": "timeseries", + "grid_pos": { + "x": 18, + "y": 20, + "w": 6, + "h": 4 + }, + "targets": [ + { + "expr": "rate(process_disk_read_bytes_total{service=\"customer-portal\"}[5m])", + "legendFormat": "Read Bytes/s" + }, + { + "expr": "rate(process_disk_write_bytes_total{service=\"customer-portal\"}[5m])", + "legendFormat": "Write Bytes/s" + } + ], + "field_config": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "unit": "binBps" + } + } + } + ], + "variables": [ + { + "name": "environment", + "type": "query", + "query": "label_values(environment)", + "current": { + "text": "production", + "value": "production" + }, + "includeAll": false, + "multi": false, + "refresh": "on_dashboard_load" + }, + { + "name": "instance", + "type": "query", + "query": "label_values(up{service=\"customer-portal\"}, instance)", + "current": { + "text": "All", + "value": "$__all" + }, + "includeAll": true, + "multi": true, + "refresh": "on_time_range_change" + }, + { + "name": "handler", + "type": "query", + "query": "label_values(http_requests_total{service=\"customer-portal\"}, handler)", + "current": { + "text": "All", + "value": "$__all" + }, + "includeAll": true, + "multi": true, + "refresh": "on_time_range_change" + } + ], + "alerts_integration": { + "alert_annotations": true, + "alert_rules_query": "ALERTS{service=\"customer-portal\"}", + "alert_panels": [ + { + "title": "Active Alerts", + "type": "table", + "query": 
"ALERTS{service=\"customer-portal\",alertstate=\"firing\"}", + "columns": [ + "alertname", + "severity", + "instance", + "description" + ] + } + ] + }, + "drill_down_paths": { + "service_overview": { + "from": "service_status", + "to": "detailed_health_dashboard", + "url": "/d/service-health/customer-portal-health", + "params": [ + "var-service", + "var-environment" + ] + }, + "error_investigation": { + "from": "errors", + "to": "error_details_dashboard", + "url": "/d/errors/customer-portal-errors", + "params": [ + "var-service", + "var-time_range" + ] + }, + "latency_analysis": { + "from": "latency", + "to": "trace_analysis_dashboard", + "url": "/d/traces/customer-portal-traces", + "params": [ + "var-service", + "var-handler" + ] + }, + "capacity_planning": { + "from": "saturation", + "to": "capacity_dashboard", + "url": "/d/capacity/customer-portal-capacity", + "params": [ + "var-service", + "var-time_range" + ] + } + } +} \ No newline at end of file diff --git a/engineering/observability-designer/expected_outputs/sample_slo_framework.json b/engineering/observability-designer/expected_outputs/sample_slo_framework.json new file mode 100644 index 0000000..07c9e1f --- /dev/null +++ b/engineering/observability-designer/expected_outputs/sample_slo_framework.json @@ -0,0 +1,545 @@ +{ + "metadata": { + "service": { + "name": "payment-service", + "type": "api", + "criticality": "critical", + "user_facing": true, + "description": "Handles payment processing and transaction management", + "team": "payments", + "environment": "production", + "dependencies": [ + { + "name": "user-service", + "type": "api", + "criticality": "high" + }, + { + "name": "payment-gateway", + "type": "external", + "criticality": "critical" + }, + { + "name": "fraud-detection", + "type": "ml", + "criticality": "high" + } + ], + "endpoints": [ + { + "path": "/api/v1/payments", + "method": "POST", + "sla_latency_ms": 500, + "expected_tps": 100 + }, + { + "path": "/api/v1/payments/{id}", + "method": 
"GET", + "sla_latency_ms": 200, + "expected_tps": 500 + }, + { + "path": "/api/v1/payments/{id}/refund", + "method": "POST", + "sla_latency_ms": 1000, + "expected_tps": 10 + } + ], + "business_metrics": { + "revenue_per_hour": { + "metric": "sum(payment_amount * rate(payments_successful_total[1h]))", + "target": 50000, + "unit": "USD" + }, + "conversion_rate": { + "metric": "sum(rate(payments_successful_total[5m])) / sum(rate(payment_attempts_total[5m]))", + "target": 0.95, + "unit": "percentage" + } + }, + "infrastructure": { + "container_orchestrator": "kubernetes", + "replicas": 6, + "cpu_limit": "2000m", + "memory_limit": "4Gi", + "database": { + "type": "postgresql", + "connection_pool_size": 20 + }, + "cache": { + "type": "redis", + "cluster_size": 3 + } + }, + "compliance_requirements": [ + "PCI-DSS", + "SOX", + "GDPR" + ], + "tags": [ + "payment", + "transaction", + "critical-path", + "revenue-generating" + ] + }, + "generated_at": "2026-02-16T14:01:57.572080Z", + "framework_version": "1.0" + }, + "slis": [ + { + "name": "Availability", + "description": "Percentage of successful requests", + "type": "ratio", + "good_events": "sum(rate(http_requests_total{service=\"payment-service\",code!~\"5..\"}))", + "total_events": "sum(rate(http_requests_total{service=\"payment-service\"}))", + "unit": "percentage" + }, + { + "name": "Request Latency P95", + "description": "95th percentile of request latency", + "type": "threshold", + "query": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{service=\"payment-service\"}[5m]))", + "unit": "seconds" + }, + { + "name": "Error Rate", + "description": "Rate of 5xx errors", + "type": "ratio", + "good_events": "sum(rate(http_requests_total{service=\"payment-service\",code!~\"5..\"}))", + "total_events": "sum(rate(http_requests_total{service=\"payment-service\"}))", + "unit": "percentage" + }, + { + "name": "Request Throughput", + "description": "Requests per second", + "type": "gauge", + "query": 
"sum(rate(http_requests_total{service=\"payment-service\"}[5m]))", + "unit": "requests/sec" + }, + { + "name": "User Journey Success Rate", + "description": "Percentage of successful complete user journeys", + "type": "ratio", + "good_events": "sum(rate(user_journey_total{service=\"payment-service\",status=\"success\"}[5m]))", + "total_events": "sum(rate(user_journey_total{service=\"payment-service\"}[5m]))", + "unit": "percentage" + }, + { + "name": "Feature Availability", + "description": "Percentage of time key features are available", + "type": "ratio", + "good_events": "sum(rate(feature_checks_total{service=\"payment-service\",status=\"available\"}[5m]))", + "total_events": "sum(rate(feature_checks_total{service=\"payment-service\"}[5m]))", + "unit": "percentage" + } + ], + "slos": [ + { + "name": "Availability SLO", + "description": "Service level objective for percentage of successful requests", + "sli_name": "Availability", + "target_value": 0.9999, + "target_display": "99.99%", + "operator": ">=", + "time_windows": [ + "1h", + "1d", + "7d", + "30d" + ], + "measurement_window": "30d", + "service": "payment-service", + "criticality": "critical" + }, + { + "name": "Request Latency P95 SLO", + "description": "Service level objective for 95th percentile of request latency", + "sli_name": "Request Latency P95", + "target_value": 100, + "target_display": "0.1s", + "operator": "<=", + "time_windows": [ + "1h", + "1d", + "7d", + "30d" + ], + "measurement_window": "30d", + "service": "payment-service", + "criticality": "critical" + }, + { + "name": "Error Rate SLO", + "description": "Service level objective for rate of 5xx errors", + "sli_name": "Error Rate", + "target_value": 0.001, + "target_display": "0.1%", + "operator": "<=", + "time_windows": [ + "1h", + "1d", + "7d", + "30d" + ], + "measurement_window": "30d", + "service": "payment-service", + "criticality": "critical" + }, + { + "name": "User Journey Success Rate SLO", + "description": "Service level 
objective for percentage of successful complete user journeys", + "sli_name": "User Journey Success Rate", + "target_value": 0.9999, + "target_display": "99.99%", + "operator": ">=", + "time_windows": [ + "1h", + "1d", + "7d", + "30d" + ], + "measurement_window": "30d", + "service": "payment-service", + "criticality": "critical" + }, + { + "name": "Feature Availability SLO", + "description": "Service level objective for percentage of time key features are available", + "sli_name": "Feature Availability", + "target_value": 0.9999, + "target_display": "99.99%", + "operator": ">=", + "time_windows": [ + "1h", + "1d", + "7d", + "30d" + ], + "measurement_window": "30d", + "service": "payment-service", + "criticality": "critical" + } + ], + "error_budgets": [ + { + "slo_name": "Availability SLO", + "error_budget_rate": 9.999999999998899e-05, + "error_budget_percentage": "0.010%", + "budgets_by_window": { + "1h": "0.4 seconds", + "1d": "8.6 seconds", + "7d": "1.0 minutes", + "30d": "4.3 minutes" + }, + "burn_rate_alerts": [ + { + "name": "Availability Burn Rate 2% Alert", + "description": "Alert when Availability is consuming error budget at 14.4x rate", + "severity": "critical", + "short_window": "5m", + "long_window": "1h", + "burn_rate_threshold": 14.4, + "budget_consumed": "2%", + "condition": "((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_short > 14.4) and ((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_long > 14.4)", + "annotations": { + "summary": "High burn rate detected for Availability", + "description": "Error budget consumption rate is 14.4x normal, will exhaust 2% of monthly budget" + } + }, + { + "name": "Availability Burn Rate 5% Alert", + "description": "Alert when Availability is consuming error budget at 6x rate", + "severity": "warning", + "short_window": "30m", + 
"long_window": "6h", + "burn_rate_threshold": 6, + "budget_consumed": "5%", + "condition": "((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_short > 6) and ((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_long > 6)", + "annotations": { + "summary": "High burn rate detected for Availability", + "description": "Error budget consumption rate is 6x normal, will exhaust 5% of monthly budget" + } + }, + { + "name": "Availability Burn Rate 10% Alert", + "description": "Alert when Availability is consuming error budget at 3x rate", + "severity": "info", + "short_window": "2h", + "long_window": "1d", + "burn_rate_threshold": 3, + "budget_consumed": "10%", + "condition": "((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_short > 3) and ((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_long > 3)", + "annotations": { + "summary": "High burn rate detected for Availability", + "description": "Error budget consumption rate is 3x normal, will exhaust 10% of monthly budget" + } + }, + { + "name": "Availability Burn Rate 10% Alert", + "description": "Alert when Availability is consuming error budget at 1x rate", + "severity": "info", + "short_window": "6h", + "long_window": "3d", + "burn_rate_threshold": 1, + "budget_consumed": "10%", + "condition": "((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_short > 1) and ((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_long > 1)", + "annotations": { + "summary": "High burn rate detected for Availability", + 
"description": "Error budget consumption rate is 1x normal, will exhaust 10% of monthly budget" + } + } + ] + }, + { + "slo_name": "User Journey Success Rate SLO", + "error_budget_rate": 9.999999999998899e-05, + "error_budget_percentage": "0.010%", + "budgets_by_window": { + "1h": "0.4 seconds", + "1d": "8.6 seconds", + "7d": "1.0 minutes", + "30d": "4.3 minutes" + }, + "burn_rate_alerts": [ + { + "name": "User Journey Success Rate Burn Rate 2% Alert", + "description": "Alert when User Journey Success Rate is consuming error budget at 14.4x rate", + "severity": "critical", + "short_window": "5m", + "long_window": "1h", + "burn_rate_threshold": 14.4, + "budget_consumed": "2%", + "condition": "((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_short > 14.4) and ((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_long > 14.4)", + "annotations": { + "summary": "High burn rate detected for User Journey Success Rate", + "description": "Error budget consumption rate is 14.4x normal, will exhaust 2% of monthly budget" + } + }, + { + "name": "User Journey Success Rate Burn Rate 5% Alert", + "description": "Alert when User Journey Success Rate is consuming error budget at 6x rate", + "severity": "warning", + "short_window": "30m", + "long_window": "6h", + "burn_rate_threshold": 6, + "budget_consumed": "5%", + "condition": "((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_short > 6) and ((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_long > 6)", + "annotations": { + "summary": "High burn rate detected for User Journey Success Rate", + "description": "Error budget consumption rate is 6x normal, will exhaust 5% of monthly 
budget" + } + }, + { + "name": "User Journey Success Rate Burn Rate 10% Alert", + "description": "Alert when User Journey Success Rate is consuming error budget at 3x rate", + "severity": "info", + "short_window": "2h", + "long_window": "1d", + "burn_rate_threshold": 3, + "budget_consumed": "10%", + "condition": "((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_short > 3) and ((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_long > 3)", + "annotations": { + "summary": "High burn rate detected for User Journey Success Rate", + "description": "Error budget consumption rate is 3x normal, will exhaust 10% of monthly budget" + } + }, + { + "name": "User Journey Success Rate Burn Rate 10% Alert", + "description": "Alert when User Journey Success Rate is consuming error budget at 1x rate", + "severity": "info", + "short_window": "6h", + "long_window": "3d", + "burn_rate_threshold": 1, + "budget_consumed": "10%", + "condition": "((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_short > 1) and ((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_long > 1)", + "annotations": { + "summary": "High burn rate detected for User Journey Success Rate", + "description": "Error budget consumption rate is 1x normal, will exhaust 10% of monthly budget" + } + } + ] + }, + { + "slo_name": "Feature Availability SLO", + "error_budget_rate": 9.999999999998899e-05, + "error_budget_percentage": "0.010%", + "budgets_by_window": { + "1h": "0.4 seconds", + "1d": "8.6 seconds", + "7d": "1.0 minutes", + "30d": "4.3 minutes" + }, + "burn_rate_alerts": [ + { + "name": "Feature Availability Burn Rate 2% Alert", + "description": "Alert when Feature 
Availability is consuming error budget at 14.4x rate", + "severity": "critical", + "short_window": "5m", + "long_window": "1h", + "burn_rate_threshold": 14.4, + "budget_consumed": "2%", + "condition": "((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_short > 14.4) and ((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_long > 14.4)", + "annotations": { + "summary": "High burn rate detected for Feature Availability", + "description": "Error budget consumption rate is 14.4x normal, will exhaust 2% of monthly budget" + } + }, + { + "name": "Feature Availability Burn Rate 5% Alert", + "description": "Alert when Feature Availability is consuming error budget at 6x rate", + "severity": "warning", + "short_window": "30m", + "long_window": "6h", + "burn_rate_threshold": 6, + "budget_consumed": "5%", + "condition": "((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_short > 6) and ((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_long > 6)", + "annotations": { + "summary": "High burn rate detected for Feature Availability", + "description": "Error budget consumption rate is 6x normal, will exhaust 5% of monthly budget" + } + }, + { + "name": "Feature Availability Burn Rate 10% Alert", + "description": "Alert when Feature Availability is consuming error budget at 3x rate", + "severity": "info", + "short_window": "2h", + "long_window": "1d", + "burn_rate_threshold": 3, + "budget_consumed": "10%", + "condition": "((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_short > 3) and ((1 - 
(sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_long > 3)", + "annotations": { + "summary": "High burn rate detected for Feature Availability", + "description": "Error budget consumption rate is 3x normal, will exhaust 10% of monthly budget" + } + }, + { + "name": "Feature Availability Burn Rate 10% Alert", + "description": "Alert when Feature Availability is consuming error budget at 1x rate", + "severity": "info", + "short_window": "6h", + "long_window": "3d", + "burn_rate_threshold": 1, + "budget_consumed": "10%", + "condition": "((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_short > 1) and ((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_long > 1)", + "annotations": { + "summary": "High burn rate detected for Feature Availability", + "description": "Error budget consumption rate is 1x normal, will exhaust 10% of monthly budget" + } + } + ] + } + ], + "sla_recommendations": { + "applicable": true, + "service": "payment-service", + "commitments": [ + { + "metric": "Availability", + "target": 0.9989, + "target_display": "99.89%", + "measurement_window": "monthly", + "measurement_method": "Uptime monitoring with 1-minute granularity" + }, + { + "metric": "Feature Availability", + "target": 0.9989, + "target_display": "99.89%", + "measurement_window": "monthly", + "measurement_method": "Uptime monitoring with 1-minute granularity" + } + ], + "penalties": [ + { + "breach_threshold": "< 99.99%", + "credit_percentage": 10 + }, + { + "breach_threshold": "< 99.9%", + "credit_percentage": 25 + }, + { + "breach_threshold": "< 99%", + "credit_percentage": 50 + } + ], + "measurement_methodology": "External synthetic monitoring from multiple geographic locations", + "exclusions": [ + "Planned maintenance windows 
(with 72h advance notice)", + "Customer-side network or infrastructure issues", + "Force majeure events", + "Third-party service dependencies beyond our control" + ] + }, + "monitoring_recommendations": { + "metrics": { + "collection": "Prometheus with service discovery", + "retention": "90 days for raw metrics, 1 year for aggregated", + "alerting": "Prometheus Alertmanager with multi-window burn rate alerts" + }, + "logging": { + "format": "Structured JSON logs with correlation IDs", + "aggregation": "ELK stack or equivalent with proper indexing", + "retention": "30 days for debug logs, 90 days for error logs" + }, + "tracing": { + "sampling": "Adaptive sampling with 1% base rate", + "storage": "Jaeger or Zipkin with 7-day retention", + "integration": "OpenTelemetry instrumentation" + } + }, + "implementation_guide": { + "prerequisites": [ + "Service instrumented with metrics collection (Prometheus format)", + "Structured logging with correlation IDs", + "Monitoring infrastructure (Prometheus, Grafana, Alertmanager)", + "Incident response processes and escalation policies" + ], + "implementation_steps": [ + { + "step": 1, + "title": "Instrument Service", + "description": "Add metrics collection for all defined SLIs", + "estimated_effort": "1-2 days" + }, + { + "step": 2, + "title": "Configure Recording Rules", + "description": "Set up Prometheus recording rules for SLI calculations", + "estimated_effort": "4-8 hours" + }, + { + "step": 3, + "title": "Implement Burn Rate Alerts", + "description": "Configure multi-window burn rate alerting rules", + "estimated_effort": "1 day" + }, + { + "step": 4, + "title": "Create SLO Dashboard", + "description": "Build Grafana dashboard for SLO tracking and error budget monitoring", + "estimated_effort": "4-6 hours" + }, + { + "step": 5, + "title": "Test and Validate", + "description": "Test alerting and validate SLI measurements against expectations", + "estimated_effort": "1-2 days" + }, + { + "step": 6, + "title": 
"Documentation and Training", + "description": "Document runbooks and train team on SLO monitoring", + "estimated_effort": "1 day" + } + ], + "validation_checklist": [ + "All SLIs produce expected metric values", + "Burn rate alerts fire correctly during simulated outages", + "Error budget calculations match manual verification", + "Dashboard displays accurate SLO achievement rates", + "Alert routing reaches correct escalation paths", + "Runbooks are complete and tested" + ] + } +} \ No newline at end of file diff --git a/engineering/observability-designer/references/alert_design_patterns.md b/engineering/observability-designer/references/alert_design_patterns.md new file mode 100644 index 0000000..8529996 --- /dev/null +++ b/engineering/observability-designer/references/alert_design_patterns.md @@ -0,0 +1,469 @@ +# Alert Design Patterns: A Guide to Effective Alerting + +## Introduction + +Well-designed alerts are the difference between a reliable system and 3 AM pages about non-issues. This guide provides patterns and anti-patterns for creating alerts that provide value without causing fatigue. + +## Fundamental Principles + +### The Golden Rules of Alerting + +1. **Every alert should be actionable** - If you can't do something about it, don't alert +2. **Every alert should require human intelligence** - If a script can handle it, automate the response +3. **Every alert should be novel** - Don't alert on known, ongoing issues +4. 
**Every alert should represent a user-visible impact** - Internal metrics matter only if users are affected + +### Alert Classification + +#### Critical Alerts +- Service is completely down +- Data loss is occurring +- Security breach detected +- SLO burn rate indicates imminent SLO violation + +#### Warning Alerts +- Service degradation affecting some users +- Approaching resource limits +- Dependent service issues +- Elevated error rates within SLO + +#### Info Alerts +- Deployment notifications +- Capacity planning triggers +- Configuration changes +- Maintenance windows + +## Alert Design Patterns + +### Pattern 1: Symptoms, Not Causes + +**Good**: Alert on user-visible symptoms +```yaml +- alert: HighLatency + expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 0.5 + for: 5m + annotations: + summary: "API latency is high" + description: "95th percentile latency is {{ $value }}s, above 500ms threshold" +``` + +**Bad**: Alert on internal metrics that may not affect users +```yaml +- alert: HighCPU + expr: cpu_usage > 80 + # This might not affect users at all! 
+``` + +### Pattern 2: Multi-Window Alerting + +Reduce false positives by requiring sustained problems: + +```yaml +- alert: ServiceDown + expr: ( + avg_over_time(up[2m]) == 0 # Short window: immediate detection + and + avg_over_time(up[10m]) < 0.8 # Long window: avoid flapping + ) + for: 1m +``` + +### Pattern 3: Burn Rate Alerting + +Alert based on error budget consumption rate: + +```yaml +# Fast burn: 2% of monthly budget in 1 hour +- alert: ErrorBudgetFastBurn + expr: ( + error_rate_5m > (14.4 * error_budget_slo) + and + error_rate_1h > (14.4 * error_budget_slo) + ) + for: 2m + labels: + severity: critical + +# Slow burn: 10% of monthly budget in 3 days +- alert: ErrorBudgetSlowBurn + expr: ( + error_rate_6h > (1.0 * error_budget_slo) + and + error_rate_3d > (1.0 * error_budget_slo) + ) + for: 15m + labels: + severity: warning +``` + +### Pattern 4: Hysteresis + +Use different thresholds for firing and resolving to prevent flapping: + +```yaml +- alert: HighErrorRate + # Fires at 5%; once firing, stays active until error_rate drops below 3% + expr: error_rate > 0.05 or (error_rate > 0.03 and on() ALERTS{alertname="HighErrorRate", alertstate="firing"}) + for: 5m + +# The self-reference to the ALERTS series implements true hysteresis: the alert +# resolves only when error_rate falls below 3%, not merely below the 5% firing +# threshold, which prevents flapping around a single threshold +``` + +### Pattern 5: Composite Alerts + +Alert when multiple conditions indicate a problem: + +```yaml +- alert: ServiceDegraded + expr: ( + (latency_p95 > latency_threshold) + or + (error_rate > error_threshold) + or + (availability < availability_threshold) + ) and ( + request_rate > min_request_rate # Only alert if we have traffic + ) +``` + +### Pattern 6: Contextual Alerting + +Include relevant context in alerts: + +```yaml +- alert: DatabaseConnections + expr: db_connections_active / db_connections_max > 0.8 + for: 5m + annotations: + summary: "Database connection pool nearly exhausted" + description: "{{ $labels.database }} has {{ $value | humanizePercentage }} connection utilization" + runbook_url: "https://runbooks.company.com/database-connections" + impact: "New requests may be rejected, causing 500 errors" + 
suggested_action: "Check for connection leaks or increase pool size" +``` + +## Alert Routing and Escalation + +### Routing by Impact and Urgency + +#### Critical Path Services +```yaml +route: + group_by: ['service'] + routes: + - match: + service: 'payment-api' + severity: 'critical' + receiver: 'payment-team-pager' + continue: true + - match: + service: 'payment-api' + severity: 'warning' + receiver: 'payment-team-slack' +``` + +#### Time-Based Routing +```yaml +route: + routes: + - match: + severity: 'critical' + receiver: 'oncall-pager' + - match: + severity: 'warning' + time: 'business_hours' # 9 AM - 5 PM + receiver: 'team-slack' + - match: + severity: 'warning' + time: 'after_hours' + receiver: 'team-email' # Lower urgency outside business hours +``` + +### Escalation Patterns + +#### Linear Escalation +```yaml +receivers: +- name: 'primary-oncall' + pagerduty_configs: + - escalation_policy: 'P1-Escalation' + # 0 min: Primary on-call + # 5 min: Secondary on-call + # 15 min: Engineering manager + # 30 min: Director of engineering +``` + +#### Severity-Based Escalation +```yaml +# Critical: Immediate escalation +- match: + severity: 'critical' + receiver: 'critical-escalation' + +# Warning: Team-first escalation +- match: + severity: 'warning' + receiver: 'team-escalation' +``` + +## Alert Fatigue Prevention + +### Grouping and Suppression + +#### Time-Based Grouping +```yaml +route: + group_wait: 30s # Wait 30s to group similar alerts + group_interval: 2m # Send grouped alerts every 2 minutes + repeat_interval: 1h # Re-send unresolved alerts every hour +``` + +#### Dependent Service Suppression +```yaml +- alert: ServiceDown + expr: up == 0 + +- alert: HighLatency + expr: latency_p95 > 1 + # This alert is suppressed when ServiceDown is firing + inhibit_rules: + - source_match: + alertname: 'ServiceDown' + target_match: + alertname: 'HighLatency' + equal: ['service'] +``` + +### Alert Throttling + +```yaml +# Limit to 1 alert per 10 minutes for noisy 
conditions +- alert: HighMemoryUsage + expr: memory_usage_percent > 85 + for: 10m # Longer 'for' duration reduces noise + annotations: + summary: "Memory usage has been high for 10+ minutes" +``` + +### Smart Defaults + +```yaml +# Use business logic to set intelligent thresholds +- alert: LowTraffic + expr: request_rate < ( + avg_over_time(request_rate[7d]) * 0.1 # 10% of weekly average + ) + # Only alert during business hours when low traffic is unusual + for: 30m +``` + +## Runbook Integration + +### Runbook Structure Template + +```markdown +# Alert: {{ $labels.alertname }} + +## Immediate Actions +1. Check service status dashboard +2. Verify if users are affected +3. Look at recent deployments/changes + +## Investigation Steps +1. Check logs for errors in the last 30 minutes +2. Verify dependent services are healthy +3. Check resource utilization (CPU, memory, disk) +4. Review recent alerts for patterns + +## Resolution Actions +- If deployment-related: Consider rollback +- If resource-related: Scale up or optimize queries +- If dependency-related: Engage appropriate team + +## Escalation +- Primary: @team-oncall +- Secondary: @engineering-manager +- Emergency: @site-reliability-team +``` + +### Runbook Integration in Alerts + +```yaml +annotations: + runbook_url: "https://runbooks.company.com/alerts/{{ $labels.alertname }}" + quick_debug: | + 1. curl -s https://{{ $labels.instance }}/health + 2. kubectl logs {{ $labels.pod }} --tail=50 + 3. 
Check dashboard: https://grafana.company.com/d/service-{{ $labels.service }} +``` + +## Testing and Validation + +### Alert Testing Strategies + +#### Chaos Engineering Integration +```python +# Test that alerts fire during controlled failures +def test_alert_during_cpu_spike(): + with chaos.cpu_spike(target='payment-api', duration='2m'): + assert wait_for_alert('HighCPU', timeout=180) + +def test_alert_during_network_partition(): + with chaos.network_partition(target='database'): + assert wait_for_alert('DatabaseUnreachable', timeout=60) +``` + +#### Historical Alert Analysis +```prometheus +# Query to find alerts that fired without incidents +count by (alertname) ( + ALERTS{alertstate="firing"}[30d] +) unless on (alertname) ( + count by (alertname) ( + incident_created{source="alert"}[30d] + ) +) +``` + +### Alert Quality Metrics + +#### Alert Precision +``` +Precision = True Positives / (True Positives + False Positives) +``` + +Track alerts that resulted in actual incidents vs false alarms. 
+ +#### Time to Resolution +```prometheus +# Average time from alert firing to resolution +avg_over_time( + (alert_resolved_timestamp - alert_fired_timestamp)[30d] +) by (alertname) +``` + +#### Alert Fatigue Indicators +```prometheus +# Alerts per day by team +sum by (team) ( + increase(alerts_fired_total[1d]) +) + +# Percentage of alerts acknowledged within 15 minutes +sum(alerts_acked_within_15m) / sum(alerts_fired) * 100 +``` + +## Advanced Patterns + +### Machine Learning-Enhanced Alerting + +#### Anomaly Detection +```yaml +- alert: AnomalousTraffic + expr: | + abs(request_rate - predict_linear(request_rate[1h], 300)) / + stddev_over_time(request_rate[1h]) > 3 + for: 10m + annotations: + summary: "Traffic pattern is anomalous" + description: "Current traffic deviates from predicted pattern by >3 standard deviations" +``` + +#### Dynamic Thresholds +```yaml +- alert: DynamicHighLatency + expr: | + latency_p95 > ( + quantile_over_time(0.95, latency_p95[7d]) + # Historical 95th percentile + 2 * stddev_over_time(latency_p95[7d]) # Plus 2 standard deviations + ) +``` + +### Business Hours Awareness + +```yaml +# Different thresholds for business vs off hours +- alert: HighLatencyBusinessHours + expr: latency_p95 > 0.2 # Stricter during business hours + for: 2m + # Active 9 AM - 5 PM weekdays + +- alert: HighLatencyOffHours + expr: latency_p95 > 0.5 # More lenient after hours + for: 5m + # Active nights and weekends +``` + +### Progressive Alerting + +```yaml +# Escalating alert severity based on duration +- alert: ServiceLatencyElevated + expr: latency_p95 > 0.5 + for: 5m + labels: + severity: info + +- alert: ServiceLatencyHigh + expr: latency_p95 > 0.5 + for: 15m # Same condition, longer duration + labels: + severity: warning + +- alert: ServiceLatencyCritical + expr: latency_p95 > 0.5 + for: 30m # Same condition, even longer duration + labels: + severity: critical +``` + +## Anti-Patterns to Avoid + +### Anti-Pattern 1: Alerting on Everything +**Problem**: Too 
many alerts create noise and fatigue +**Solution**: Be selective; only alert on user-impacting issues + +### Anti-Pattern 2: Vague Alert Messages +**Problem**: "Service X is down" - which instance? what's the impact? +**Solution**: Include specific details and context + +### Anti-Pattern 3: Alerts Without Runbooks +**Problem**: Alerts that don't explain what to do +**Solution**: Every alert must have an associated runbook + +### Anti-Pattern 4: Static Thresholds +**Problem**: 80% CPU might be normal during peak hours +**Solution**: Use contextual, adaptive thresholds + +### Anti-Pattern 5: Ignoring Alert Quality +**Problem**: Accepting high false positive rates +**Solution**: Regularly review and tune alert precision + +## Implementation Checklist + +### Pre-Implementation +- [ ] Define alert severity levels and escalation policies +- [ ] Create runbook templates +- [ ] Set up alert routing configuration +- [ ] Define SLOs that alerts will protect + +### Alert Development +- [ ] Each alert has clear success criteria +- [ ] Alert conditions tested against historical data +- [ ] Runbook created and accessible +- [ ] Severity and routing configured +- [ ] Context and suggested actions included + +### Post-Implementation +- [ ] Monitor alert precision and recall +- [ ] Regular review of alert fatigue metrics +- [ ] Quarterly alert effectiveness review +- [ ] Team training on alert response procedures + +### Quality Assurance +- [ ] Test alerts fire during controlled failures +- [ ] Verify alerts resolve when conditions improve +- [ ] Confirm runbooks are accurate and helpful +- [ ] Validate escalation paths work correctly + +Remember: Great alerts are invisible when things work and invaluable when things break. Focus on quality over quantity, and always optimize for the human who will respond to the alert at 3 AM. 
\ No newline at end of file diff --git a/engineering/observability-designer/references/dashboard_best_practices.md b/engineering/observability-designer/references/dashboard_best_practices.md new file mode 100644 index 0000000..7d9af4f --- /dev/null +++ b/engineering/observability-designer/references/dashboard_best_practices.md @@ -0,0 +1,571 @@ +# Dashboard Best Practices: Design for Insight and Action + +## Introduction + +A well-designed dashboard is like a good story - it guides you through the data with purpose and clarity. This guide provides practical patterns for creating dashboards that inform decisions and enable quick troubleshooting. + +## Design Principles + +### The Hierarchy of Information + +#### Primary Information (Top Third) +- Service health status +- SLO achievement +- Critical alerts +- Business KPIs + +#### Secondary Information (Middle Third) +- Golden signals (latency, traffic, errors, saturation) +- Resource utilization +- Throughput and performance metrics + +#### Tertiary Information (Bottom Third) +- Detailed breakdowns +- Historical trends +- Dependency status +- Debug information + +### Visual Design Principles + +#### Rule of 7±2 +- Maximum 7±2 panels per screen +- Group related information together +- Use sections to organize complexity + +#### Color Psychology +- **Red**: Critical issues, danger, immediate attention needed +- **Yellow/Orange**: Warnings, caution, degraded state +- **Green**: Healthy, normal operation, success +- **Blue**: Information, neutral metrics, capacity +- **Gray**: Disabled, unknown, or baseline states + +#### Chart Selection Guide +- **Line charts**: Time series, trends, comparisons over time +- **Bar charts**: Categorical comparisons, top N lists +- **Gauges**: Single value with defined good/bad ranges +- **Stat panels**: Key metrics, percentages, counts +- **Heatmaps**: Distribution data, correlation analysis +- **Tables**: Detailed breakdowns, multi-dimensional data + +## Dashboard Archetypes + +### The 
Overview Dashboard + +**Purpose**: High-level health check and business metrics +**Audience**: Executives, managers, cross-team stakeholders +**Update Frequency**: 5-15 minutes + +```yaml +sections: + - title: "Business Health" + panels: + - service_availability_summary + - revenue_per_hour + - active_users + - conversion_rate + + - title: "System Health" + panels: + - critical_alerts_count + - slo_achievement_summary + - error_budget_remaining + - deployment_status +``` + +### The SRE Operational Dashboard + +**Purpose**: Real-time monitoring and incident response +**Audience**: SRE, on-call engineers +**Update Frequency**: 15-30 seconds + +```yaml +sections: + - title: "Service Status" + panels: + - service_up_status + - active_incidents + - recent_deployments + + - title: "Golden Signals" + panels: + - latency_percentiles + - request_rate + - error_rate + - resource_saturation + + - title: "Infrastructure" + panels: + - cpu_memory_utilization + - network_io + - disk_space +``` + +### The Developer Debug Dashboard + +**Purpose**: Deep-dive troubleshooting and performance analysis +**Audience**: Development teams +**Update Frequency**: 30 seconds - 2 minutes + +```yaml +sections: + - title: "Application Performance" + panels: + - endpoint_latency_breakdown + - database_query_performance + - cache_hit_rates + - queue_depths + + - title: "Errors and Logs" + panels: + - error_rate_by_endpoint + - log_volume_by_level + - exception_types + - slow_queries +``` + +## Layout Patterns + +### The F-Pattern Layout + +Based on eye-tracking studies, users scan in an F-pattern: + +``` +[Critical Status] [SLO Summary ] [Error Budget ] +[Latency ] [Traffic ] [Errors ] +[Saturation ] [Resource Use ] [Detailed View] +[Historical ] [Dependencies ] [Debug Info ] +``` + +### The Z-Pattern Layout + +For executive dashboards, follow the Z-pattern: + +``` +[Business KPIs ] → [System Status] + ↓ ↓ +[Trend Analysis ] ← [Key Metrics ] +``` + +### Responsive Design + +#### Desktop 
(1920x1080) +- 24-column grid +- Panels can be 6, 8, 12, or 24 units wide +- 4-6 rows visible without scrolling + +#### Laptop (1366x768) +- Stack wider panels vertically +- Reduce panel heights +- Prioritize most critical information + +#### Mobile (768px width) +- Single column layout +- Simplified panels +- Touch-friendly controls + +## Effective Panel Design + +### Stat Panels + +```yaml +# Good: Clear value with context +- title: "API Availability" + type: stat + targets: + - expr: avg(up{service="api"}) * 100 + field_config: + unit: percent + thresholds: + steps: + - color: red + value: 0 + - color: yellow + value: 99 + - color: green + value: 99.9 + options: + color_mode: background + text_mode: value_and_name +``` + +### Time Series Panels + +```yaml +# Good: Multiple related metrics with clear legend +- title: "Request Latency" + type: timeseries + targets: + - expr: histogram_quantile(0.50, rate(http_duration_bucket[5m])) + legend: "P50" + - expr: histogram_quantile(0.95, rate(http_duration_bucket[5m])) + legend: "P95" + - expr: histogram_quantile(0.99, rate(http_duration_bucket[5m])) + legend: "P99" + field_config: + unit: ms + custom: + draw_style: line + fill_opacity: 10 + options: + legend: + display_mode: table + placement: bottom + values: [min, max, mean, last] +``` + +### Table Panels + +```yaml +# Good: Top N with relevant columns +- title: "Slowest Endpoints" + type: table + targets: + - expr: topk(10, histogram_quantile(0.95, sum by (handler)(rate(http_duration_bucket[5m])))) + format: table + instant: true + transformations: + - id: organize + options: + exclude_by_name: + Time: true + rename_by_name: + Value: "P95 Latency (ms)" + handler: "Endpoint" +``` + +## Color and Visualization Best Practices + +### Threshold Configuration + +```yaml +# Traffic light system with meaningful boundaries +thresholds: + steps: + - color: green # Good performance + value: null # Default + - color: yellow # Degraded performance + value: 95 # 95th percentile of 
historical normal + - color: orange # Poor performance + value: 99 # 99th percentile of historical normal + - color: red # Critical performance + value: 99.9 # Worst case scenario +``` + +### Color Blind Friendly Palettes + +```yaml +# Use patterns and shapes in addition to color +field_config: + overrides: + - matcher: + id: byName + options: "Critical" + properties: + - id: color + value: + mode: fixed + fixed_color: "#d73027" # Red-orange for protanopia + - id: custom.draw_style + value: "points" # Different shape +``` + +### Consistent Color Semantics + +- **Success/Health**: Green (#28a745) +- **Warning/Degraded**: Yellow (#ffc107) +- **Error/Critical**: Red (#dc3545) +- **Information**: Blue (#007bff) +- **Neutral**: Gray (#6c757d) + +## Time Range Strategy + +### Default Time Ranges by Dashboard Type + +#### Real-time Operational +- **Default**: Last 15 minutes +- **Quick options**: 5m, 15m, 1h, 4h +- **Auto-refresh**: 15-30 seconds + +#### Troubleshooting +- **Default**: Last 1 hour +- **Quick options**: 15m, 1h, 4h, 12h, 1d +- **Auto-refresh**: 1 minute + +#### Business Review +- **Default**: Last 24 hours +- **Quick options**: 1d, 7d, 30d, 90d +- **Auto-refresh**: 5 minutes + +#### Capacity Planning +- **Default**: Last 7 days +- **Quick options**: 7d, 30d, 90d, 1y +- **Auto-refresh**: 15 minutes + +### Time Range Annotations + +```yaml +# Add context for time-based events +annotations: + - name: "Deployments" + datasource: "Prometheus" + expr: "deployment_timestamp" + title_format: "Deploy {{ version }}" + text_format: "Deployed version {{ version }} to {{ environment }}" + + - name: "Incidents" + datasource: "Incident API" + query: "incidents.json?service={{ service }}" + color: "red" +``` + +## Interactive Features + +### Template Variables + +```yaml +# Service selector +- name: service + type: query + query: label_values(up, service) + current: + text: All + value: $__all + include_all: true + multi: true + +# Environment selector +- name: 
environment + type: query + query: label_values(up{service="$service"}, environment) + current: + text: production + value: production +``` + +### Drill-Down Links + +```yaml +# Panel-level drill-downs +- title: "Error Rate" + type: timeseries + # ... other config ... + options: + data_links: + - title: "View Error Logs" + url: "/d/logs-dashboard?var-service=${__field.labels.service}&from=${__from}&to=${__to}" + - title: "Error Traces" + url: "/d/traces-dashboard?var-service=${__field.labels.service}" +``` + +### Dynamic Panel Titles + +```yaml +- title: "${service} - Request Rate" # Uses template variable + type: timeseries + # Title updates automatically when service variable changes +``` + +## Performance Optimization + +### Query Optimization + +#### Use Recording Rules +```yaml +# Instead of complex queries in dashboards +groups: + - name: http_requests + rules: + - record: http_request_rate_5m + expr: sum(rate(http_requests_total[5m])) by (service, method, handler) + + - record: http_request_latency_p95_5m + expr: histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (service, le)) +``` + +#### Limit Data Points +```yaml +# Good: Reasonable resolution for dashboard +- expr: http_request_rate_5m[1h] + interval: 15s # One point every 15 seconds + +# Bad: Too many points for visualization +- expr: http_request_rate_1s[1h] # 3600 points! 
+``` + +### Dashboard Performance + +#### Panel Limits +- **Maximum panels per dashboard**: 20-30 +- **Maximum queries per panel**: 10 +- **Maximum time series per panel**: 50 + +#### Caching Strategy +```yaml +# Use appropriate cache headers +cache_timeout: 30 # Cache for 30 seconds on fast-changing panels +cache_timeout: 300 # Cache for 5 minutes on slow-changing panels +``` + +## Accessibility + +### Screen Reader Support + +```yaml +# Provide text alternatives for visual elements +- title: "Service Health Status" + type: stat + options: + text_mode: value_and_name # Includes both value and description + field_config: + mappings: + - options: + "1": + text: "Healthy" + color: "green" + "0": + text: "Unhealthy" + color: "red" +``` + +### Keyboard Navigation + +- Ensure all interactive elements are keyboard accessible +- Provide logical tab order +- Include skip links for complex dashboards + +### High Contrast Mode + +```yaml +# Test dashboards work in high contrast mode +theme: high_contrast +colors: + - "#000000" # Pure black + - "#ffffff" # Pure white + - "#ffff00" # Pure yellow + - "#ff0000" # Pure red +``` + +## Testing and Validation + +### Dashboard Testing Checklist + +#### Functional Testing +- [ ] All panels load without errors +- [ ] Template variables filter correctly +- [ ] Time range changes update all panels +- [ ] Drill-down links work as expected +- [ ] Auto-refresh functions properly + +#### Visual Testing +- [ ] Dashboard renders correctly on different screen sizes +- [ ] Colors are distinguishable and meaningful +- [ ] Text is readable at normal zoom levels +- [ ] Legends and labels are clear + +#### Performance Testing +- [ ] Dashboard loads in < 5 seconds +- [ ] No queries timeout under normal load +- [ ] Auto-refresh doesn't cause browser lag +- [ ] Memory usage remains reasonable + +#### Usability Testing +- [ ] New team members can understand the dashboard +- [ ] Action items are clear during incidents +- [ ] Key information is quickly 
discoverable +- [ ] Dashboard supports common troubleshooting workflows + +## Maintenance and Governance + +### Dashboard Lifecycle + +#### Creation +1. Define dashboard purpose and audience +2. Identify key metrics and success criteria +3. Design layout following established patterns +4. Implement with consistent styling +5. Test with real data and user scenarios + +#### Maintenance +- **Weekly**: Check for broken panels or queries +- **Monthly**: Review dashboard usage analytics +- **Quarterly**: Gather user feedback and iterate +- **Annually**: Major review and potential redesign + +#### Retirement +- Archive dashboards that are no longer used +- Migrate users to replacement dashboards +- Document lessons learned + +### Dashboard Standards + +```yaml +# Organization dashboard standards +standards: + naming_convention: "[Team] [Service] - [Purpose]" + tags: [team, service_type, environment, purpose] + refresh_intervals: [15s, 30s, 1m, 5m, 15m] + time_ranges: [5m, 15m, 1h, 4h, 1d, 7d, 30d] + color_scheme: "company_standard" + max_panels_per_dashboard: 25 +``` + +## Advanced Patterns + +### Composite Dashboards + +```yaml +# Dashboard that includes panels from other dashboards +- title: "Service Overview" + type: dashlist + targets: + - "service-health" + - "service-performance" + - "service-business-metrics" + options: + show_headings: true + max_items: 10 +``` + +### Dynamic Dashboard Generation + +```python +# Generate dashboards from service definitions +def generate_service_dashboard(service_config): + panels = [] + + # Always include golden signals + panels.extend(generate_golden_signals_panels(service_config)) + + # Add service-specific panels + if service_config.type == 'database': + panels.extend(generate_database_panels(service_config)) + elif service_config.type == 'queue': + panels.extend(generate_queue_panels(service_config)) + + return { + 'title': f"{service_config.name} - Operational Dashboard", + 'panels': panels, + 'variables': 
generate_variables(service_config) + } +``` + +### A/B Testing for Dashboards + +```yaml +# Test different dashboard designs with different teams +experiment: + name: "dashboard_layout_test" + variants: + - name: "traditional_layout" + weight: 50 + config: "dashboard_v1.json" + - name: "f_pattern_layout" + weight: 50 + config: "dashboard_v2.json" + success_metrics: + - "time_to_insight" + - "user_satisfaction" + - "troubleshooting_efficiency" +``` + +Remember: A dashboard should tell a story about your system's health and guide users toward the right actions. Focus on clarity over complexity, and always optimize for the person who will use it during a stressful incident. \ No newline at end of file diff --git a/engineering/observability-designer/references/slo_cookbook.md b/engineering/observability-designer/references/slo_cookbook.md new file mode 100644 index 0000000..3734a2b --- /dev/null +++ b/engineering/observability-designer/references/slo_cookbook.md @@ -0,0 +1,329 @@ +# SLO Cookbook: A Practical Guide to Service Level Objectives + +## Introduction + +Service Level Objectives (SLOs) are a key tool for managing service reliability. This cookbook provides practical guidance for implementing SLOs that actually improve system reliability rather than just creating meaningless metrics. + +## Fundamentals + +### The SLI/SLO/SLA Hierarchy + +- **SLI (Service Level Indicator)**: A quantifiable measure of service quality +- **SLO (Service Level Objective)**: A target range of values for an SLI +- **SLA (Service Level Agreement)**: A business agreement with consequences for missing SLO targets + +### Golden Rule of SLOs + +**Start simple, iterate based on learning.** Your first SLOs won't be perfect, and that's okay. + +## Choosing Good SLIs + +### The Four Golden Signals + +1. **Latency**: How long requests take to complete +2. **Traffic**: How many requests are coming in +3. **Errors**: How many requests are failing +4. 
**Saturation**: How "full" your service is
+
+### SLI Selection Criteria
+
+A good SLI should be:
+- **Measurable**: You can collect data for it
+- **Meaningful**: It reflects user experience
+- **Controllable**: You can take action to improve it
+- **Proportional**: Changes in the SLI reflect changes in user happiness
+
+### Service Type Specific SLIs
+
+#### HTTP APIs
+- **Request latency**: P95 or P99 response time
+- **Availability**: Proportion of successful requests (non-5xx)
+- **Throughput**: Requests per second capacity
+
+```prometheus
+# Availability SLI
+sum(rate(http_requests_total{code!~"5.."}[5m])) / sum(rate(http_requests_total[5m]))
+
+# Latency SLI
+histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))
+```
+
+#### Batch Jobs
+- **Freshness**: Age of the last successful run
+- **Correctness**: Proportion of jobs completing successfully
+- **Throughput**: Items processed per unit time
+
+#### Data Pipelines
+- **Data freshness**: Time since last successful update
+- **Data quality**: Proportion of records passing validation
+- **Processing latency**: Time from ingestion to availability
+
+### Anti-Patterns in SLI Selection
+
+❌ **Don't use**: CPU usage, memory usage, disk space as primary SLIs
+- These are symptoms, not user-facing impacts
+
+❌ **Don't use**: Counts instead of rates or proportions
+- "Number of errors" vs "Error rate"
+
+❌ **Don't use**: Internal metrics that users don't care about
+- Queue depth, cache hit rate (unless they directly impact user experience)
+
+## Setting SLO Targets
+
+### The Art of Target Setting
+
+Setting SLO targets is a balancing act between:
+- **User happiness**: Targets should reflect acceptable user experience
+- **Business value**: Tighter SLOs cost more to maintain
+- **Current performance**: Targets should be achievable but aspirational
+
+### Target Setting Strategies
+
+#### Historical Performance Method
+1. Collect 4-6 weeks of historical data
+2. Calculate the worst user-visible performance in that period
+3. Set your SLO slightly better than the worst acceptable performance
+
+#### User Journey Mapping
+1. Map critical user journeys
+2. Identify acceptable performance for each step
+3. Work backwards to component SLOs
+
+#### Error Budget Approach
+1. Decide how much unreliability you can afford
+2. Set SLO targets based on acceptable error budget consumption
+3. Example: 99.9% availability = 43.8 minutes downtime per month
+
+### SLO Target Examples by Service Criticality
+
+#### Critical Services (Revenue Impact)
+- **Availability**: 99.95% - 99.99%
+- **Latency (P95)**: 100-200ms
+- **Error Rate**: < 0.1%
+
+#### High Priority Services
+- **Availability**: 99.9% - 99.95%
+- **Latency (P95)**: 200-500ms
+- **Error Rate**: < 0.5%
+
+#### Standard Services
+- **Availability**: 99.5% - 99.9%
+- **Latency (P95)**: 500ms - 1s
+- **Error Rate**: < 1%
+
+## Error Budget Management
+
+### What is an Error Budget?
+
+Your error budget is the maximum amount of unreliability you can accumulate while still meeting your SLO. It's calculated as:
+
+```
+Error Budget = (1 - SLO) × Time Window
+```
+
+For a 99.9% availability SLO over 30 days:
+```
+Error Budget = (1 - 0.999) × 30 days = 0.001 × 30 days = 43.2 minutes
+```
+
+### Error Budget Policies
+
+Define what happens when you consume your error budget:
+
+#### Conservative Policy (High-Risk Services)
+- **> 50% consumed**: Freeze non-critical feature releases
+- **> 75% consumed**: Focus entirely on reliability improvements
+- **> 90% consumed**: Consider emergency measures (traffic shaping, etc.)
+ +#### Balanced Policy (Standard Services) +- **> 75% consumed**: Increase focus on reliability work +- **> 90% consumed**: Pause feature work, focus on reliability + +#### Aggressive Policy (Early Stage Services) +- **> 90% consumed**: Review but continue normal operations +- **100% consumed**: Evaluate SLO appropriateness + +### Burn Rate Alerting + +Multi-window burn rate alerts help you catch SLO violations before they become critical: + +```yaml +# Fast burn: 2% budget consumed in 1 hour +- alert: FastBurnSLOViolation + expr: ( + (1 - (sum(rate(http_requests_total{code!~"5.."}[5m])) / sum(rate(http_requests_total[5m])))) > (14.4 * 0.001) + and + (1 - (sum(rate(http_requests_total{code!~"5.."}[1h])) / sum(rate(http_requests_total[1h])))) > (14.4 * 0.001) + ) + for: 2m + +# Slow burn: 10% budget consumed in 3 days +- alert: SlowBurnSLOViolation + expr: ( + (1 - (sum(rate(http_requests_total{code!~"5.."}[6h])) / sum(rate(http_requests_total[6h])))) > (1.0 * 0.001) + and + (1 - (sum(rate(http_requests_total{code!~"5.."}[3d])) / sum(rate(http_requests_total[3d])))) > (1.0 * 0.001) + ) + for: 15m +``` + +## Implementation Patterns + +### The SLO Implementation Ladder + +#### Level 1: Basic SLOs +- Choose 1-2 SLIs that matter most to users +- Set aspirational but achievable targets +- Implement basic alerting when SLOs are missed + +#### Level 2: Operational SLOs +- Add burn rate alerting +- Create error budget dashboards +- Establish error budget policies +- Regular SLO review meetings + +#### Level 3: Advanced SLOs +- Multi-window burn rate alerts +- Automated error budget policy enforcement +- SLO-driven incident prioritization +- Integration with CI/CD for deployment decisions + +### SLO Measurement Architecture + +#### Push vs Pull Metrics +- **Pull** (Prometheus): Good for infrastructure metrics, real-time alerting +- **Push** (StatsD): Good for application metrics, business events + +#### Measurement Points +- **Server-side**: More reliable, easier to 
implement +- **Client-side**: Better reflects user experience +- **Synthetic**: Consistent, predictable, may not reflect real user experience + +### SLO Dashboard Design + +Essential elements for SLO dashboards: + +1. **Current SLO Achievement**: Large, prominent display +2. **Error Budget Remaining**: Visual indicator (gauge, progress bar) +3. **Burn Rate**: Time series showing error budget consumption rate +4. **Historical Trends**: 4-week view of SLO achievement +5. **Alerts**: Current and recent SLO-related alerts + +## Advanced Topics + +### Dependency SLOs + +For services with dependencies: + +``` +SLO_service ≤ min(SLO_inherent, ∏SLO_dependencies) +``` + +If your service depends on 3 other services each with 99.9% SLO: +``` +Maximum_SLO = 0.999³ = 0.997 = 99.7% +``` + +### User Journey SLOs + +Track end-to-end user experiences: + +```prometheus +# Registration success rate +sum(rate(user_registration_success_total[5m])) / sum(rate(user_registration_attempts_total[5m])) + +# Purchase completion latency +histogram_quantile(0.95, rate(purchase_completion_duration_seconds_bucket[5m])) +``` + +### SLOs for Batch Systems + +Special considerations for non-request/response systems: + +#### Freshness SLO +```prometheus +# Data should be no more than 4 hours old +(time() - last_successful_update_timestamp) < (4 * 3600) +``` + +#### Throughput SLO +```prometheus +# Should process at least 1000 items per hour +rate(items_processed_total[1h]) >= 1000 +``` + +#### Quality SLO +```prometheus +# At least 99.5% of records should pass validation +sum(rate(records_valid_total[5m])) / sum(rate(records_processed_total[5m])) >= 0.995 +``` + +## Common Mistakes and How to Avoid Them + +### Mistake 1: Too Many SLOs +**Problem**: Drowning in metrics, losing focus +**Solution**: Start with 1-2 SLOs per service, add more only when needed + +### Mistake 2: Internal Metrics as SLIs +**Problem**: Optimizing for metrics that don't impact users +**Solution**: Always ask "If this metric 
changes, do users notice?" + +### Mistake 3: Perfectionist SLOs +**Problem**: 99.99% SLO when 99.9% would be fine +**Solution**: Higher SLOs cost exponentially more; pick the minimum acceptable level + +### Mistake 4: Ignoring Error Budgets +**Problem**: Treating any SLO miss as an emergency +**Solution**: Error budgets exist to be spent; use them to balance feature velocity and reliability + +### Mistake 5: Static SLOs +**Problem**: Setting SLOs once and never updating them +**Solution**: Review SLOs quarterly; adjust based on user feedback and business changes + +## SLO Review Process + +### Monthly SLO Review Agenda + +1. **SLO Achievement Review**: Did we meet our SLOs? +2. **Error Budget Analysis**: How did we spend our error budget? +3. **Incident Correlation**: Which incidents impacted our SLOs? +4. **SLI Quality Assessment**: Are our SLIs still meaningful? +5. **Target Adjustment**: Should we change any targets? + +### Quarterly SLO Health Check + +1. **User Impact Validation**: Survey users about acceptable performance +2. **Business Alignment**: Do SLOs still reflect business priorities? +3. **Measurement Quality**: Are we measuring the right things? +4. **Cost/Benefit Analysis**: Are tighter SLOs worth the investment? + +## Tooling and Automation + +### Essential Tools + +1. **Metrics Collection**: Prometheus, InfluxDB, CloudWatch +2. **Alerting**: Alertmanager, PagerDuty, OpsGenie +3. **Dashboards**: Grafana, DataDog, New Relic +4. 
**SLO Platforms**: Sloth, Pyrra, Service Level Blue + +### Automation Opportunities + +- **Burn rate alert generation** from SLO definitions +- **Dashboard creation** from SLO specifications +- **Error budget calculation** and tracking +- **Release blocking** based on error budget consumption + +## Getting Started Checklist + +- [ ] Identify your service's critical user journeys +- [ ] Choose 1-2 SLIs that best reflect user experience +- [ ] Collect 4-6 weeks of baseline data +- [ ] Set initial SLO targets based on historical performance +- [ ] Implement basic SLO monitoring and alerting +- [ ] Create an SLO dashboard +- [ ] Define error budget policies +- [ ] Schedule monthly SLO reviews +- [ ] Plan for quarterly SLO health checks + +Remember: SLOs are a journey, not a destination. Start simple, learn from experience, and iterate toward better reliability management. \ No newline at end of file diff --git a/engineering/observability-designer/scripts/alert_optimizer.py b/engineering/observability-designer/scripts/alert_optimizer.py new file mode 100644 index 0000000..cbb0ef4 --- /dev/null +++ b/engineering/observability-designer/scripts/alert_optimizer.py @@ -0,0 +1,1059 @@ +#!/usr/bin/env python3 +""" +Alert Optimizer - Analyze and optimize alert configurations + +This script analyzes existing alert configurations and identifies optimization opportunities: +- Noisy alerts with high false positive rates +- Missing coverage gaps in monitoring +- Duplicate or redundant alerts +- Poor threshold settings and alert fatigue risks +- Missing runbooks and documentation +- Routing and escalation policy improvements + +Usage: + python alert_optimizer.py --input alert_config.json --output optimized_config.json + python alert_optimizer.py --input alerts.json --analyze-only --report report.html +""" + +import json +import argparse +import sys +import re +import math +from typing import Dict, List, Any, Tuple, Set +from datetime import datetime, timedelta +from collections 
import defaultdict, Counter + + +class AlertOptimizer: + """Analyze and optimize alert configurations.""" + + # Alert severity priority mapping + SEVERITY_PRIORITY = { + 'critical': 1, + 'high': 2, + 'warning': 3, + 'info': 4 + } + + # Common noisy alert patterns + NOISY_PATTERNS = [ + r'disk.*usage.*>.*[89]\d%', # Disk usage > 80% often noisy + r'memory.*>.*[89]\d%', # Memory > 80% often noisy + r'cpu.*>.*[789]\d%', # CPU > 70% can be noisy + r'response.*time.*>.*\d+ms', # Low latency thresholds + r'error.*rate.*>.*0\.[01]%' # Very low error rate thresholds + ] + + # Essential monitoring categories + COVERAGE_CATEGORIES = [ + 'availability', + 'latency', + 'error_rate', + 'resource_utilization', + 'security', + 'business_metrics' + ] + + # Golden signals that should always be monitored + GOLDEN_SIGNALS = [ + 'latency', + 'traffic', + 'errors', + 'saturation' + ] + + def __init__(self): + """Initialize the Alert Optimizer.""" + self.alert_config = {} + self.optimization_results = {} + self.alert_analysis = {} + + def load_alert_config(self, file_path: str) -> Dict[str, Any]: + """Load alert configuration from JSON file.""" + try: + with open(file_path, 'r') as f: + return json.load(f) + except FileNotFoundError: + raise ValueError(f"Alert configuration file not found: {file_path}") + except json.JSONDecodeError as e: + raise ValueError(f"Invalid JSON in alert configuration: {e}") + + def analyze_alert_noise(self, alerts: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Identify potentially noisy alerts.""" + noisy_alerts = [] + + for alert in alerts: + noise_score = 0 + noise_reasons = [] + + alert_rule = alert.get('expr', alert.get('condition', '')) + alert_name = alert.get('alert', alert.get('name', 'Unknown')) + + # Check for common noisy patterns + for pattern in self.NOISY_PATTERNS: + if re.search(pattern, alert_rule, re.IGNORECASE): + noise_score += 3 + noise_reasons.append(f"Matches noisy pattern: {pattern}") + + # Check for very frequent evaluation 
intervals + evaluation_interval = alert.get('for', '0s') + if self._parse_duration(evaluation_interval) < 60: # Less than 1 minute + noise_score += 2 + noise_reasons.append("Very short evaluation interval") + + # Check for lack of 'for' clause + if not alert.get('for') or alert.get('for') == '0s': + noise_score += 2 + noise_reasons.append("No 'for' clause - may cause alert flapping") + + # Check for overly sensitive thresholds + if self._has_sensitive_threshold(alert_rule): + noise_score += 2 + noise_reasons.append("Potentially sensitive threshold") + + # Check historical firing rate if available + historical_data = alert.get('historical_data', {}) + if historical_data: + firing_rate = historical_data.get('fires_per_day', 0) + if firing_rate > 10: # More than 10 fires per day + noise_score += 3 + noise_reasons.append(f"High firing rate: {firing_rate} times/day") + + false_positive_rate = historical_data.get('false_positive_rate', 0) + if false_positive_rate > 0.3: # > 30% false positives + noise_score += 4 + noise_reasons.append(f"High false positive rate: {false_positive_rate*100:.1f}%") + + if noise_score >= 3: # Threshold for considering an alert noisy + noisy_alert = { + 'alert_name': alert_name, + 'noise_score': noise_score, + 'reasons': noise_reasons, + 'current_rule': alert_rule, + 'recommendations': self._generate_noise_reduction_recommendations(alert, noise_reasons) + } + noisy_alerts.append(noisy_alert) + + return sorted(noisy_alerts, key=lambda x: x['noise_score'], reverse=True) + + def _parse_duration(self, duration_str: str) -> int: + """Parse duration string to seconds.""" + if not duration_str or duration_str == '0s': + return 0 + + duration_map = {'s': 1, 'm': 60, 'h': 3600, 'd': 86400} + match = re.match(r'(\d+)([smhd])', duration_str) + if match: + value, unit = match.groups() + return int(value) * duration_map.get(unit, 1) + return 0 + + def _has_sensitive_threshold(self, rule: str) -> bool: + """Check if alert rule has potentially sensitive 
thresholds.""" + # Look for very low error rates or very tight latency thresholds + sensitive_patterns = [ + r'error.*rate.*>.*0\.0[01]', # Error rate > 0.01% or 0.001% + r'latency.*>.*[12]\d\d?ms', # Latency > 100-299ms + r'response.*time.*>.*0\.[12]', # Response time > 0.1-0.2s + r'cpu.*>.*[456]\d%' # CPU > 40-69% (too sensitive for most cases) + ] + + for pattern in sensitive_patterns: + if re.search(pattern, rule, re.IGNORECASE): + return True + return False + + def _generate_noise_reduction_recommendations(self, alert: Dict[str, Any], + reasons: List[str]) -> List[str]: + """Generate recommendations to reduce alert noise.""" + recommendations = [] + + if "No 'for' clause" in str(reasons): + recommendations.append("Add 'for: 5m' clause to prevent flapping") + + if "Very short evaluation interval" in str(reasons): + recommendations.append("Increase evaluation interval to at least 1 minute") + + if "sensitive threshold" in str(reasons): + recommendations.append("Review and increase threshold based on historical data") + + if "High firing rate" in str(reasons): + recommendations.append("Analyze historical firing patterns and adjust thresholds") + + if "High false positive rate" in str(reasons): + recommendations.append("Implement more specific conditions to reduce false positives") + + if "noisy pattern" in str(reasons): + recommendations.append("Consider using percentile-based thresholds instead of absolute values") + + return recommendations + + def identify_coverage_gaps(self, alerts: List[Dict[str, Any]], + services: List[Dict[str, Any]] = None) -> Dict[str, Any]: + """Identify gaps in monitoring coverage.""" + coverage_analysis = { + 'missing_categories': [], + 'missing_golden_signals': [], + 'service_coverage_gaps': [], + 'critical_gaps': [], + 'recommendations': [] + } + + # Analyze coverage by category + covered_categories = set() + alert_categories = [] + + for alert in alerts: + alert_rule = alert.get('expr', alert.get('condition', '')) + alert_name = 
alert.get('alert', alert.get('name', '')) + + category = self._classify_alert_category(alert_rule, alert_name) + if category: + covered_categories.add(category) + alert_categories.append(category) + + # Check for missing essential categories + missing_categories = set(self.COVERAGE_CATEGORIES) - covered_categories + coverage_analysis['missing_categories'] = list(missing_categories) + + # Check for missing golden signals + covered_signals = set() + for alert in alerts: + alert_rule = alert.get('expr', alert.get('condition', '')) + signal = self._identify_golden_signal(alert_rule) + if signal: + covered_signals.add(signal) + + missing_signals = set(self.GOLDEN_SIGNALS) - covered_signals + coverage_analysis['missing_golden_signals'] = list(missing_signals) + + # Analyze service-specific coverage if service list provided + if services: + service_coverage = self._analyze_service_coverage(alerts, services) + coverage_analysis['service_coverage_gaps'] = service_coverage + + # Identify critical gaps + critical_gaps = [] + if 'availability' in missing_categories: + critical_gaps.append("Missing availability monitoring") + if 'error_rate' in missing_categories: + critical_gaps.append("Missing error rate monitoring") + if 'errors' in missing_signals: + critical_gaps.append("Missing error signal monitoring") + + coverage_analysis['critical_gaps'] = critical_gaps + + # Generate recommendations + recommendations = self._generate_coverage_recommendations(coverage_analysis) + coverage_analysis['recommendations'] = recommendations + + return coverage_analysis + + def _classify_alert_category(self, rule: str, alert_name: str) -> str: + """Classify alert into monitoring category.""" + rule_lower = rule.lower() + name_lower = alert_name.lower() + + if any(keyword in rule_lower or keyword in name_lower + for keyword in ['up', 'down', 'available', 'reachable']): + return 'availability' + + if any(keyword in rule_lower or keyword in name_lower + for keyword in ['latency', 
'response_time', 'duration']): + return 'latency' + + if any(keyword in rule_lower or keyword in name_lower + for keyword in ['error', 'fail', '5xx', '4xx']): + return 'error_rate' + + if any(keyword in rule_lower or keyword in name_lower + for keyword in ['cpu', 'memory', 'disk', 'network', 'utilization']): + return 'resource_utilization' + + if any(keyword in rule_lower or keyword in name_lower + for keyword in ['security', 'auth', 'login', 'breach']): + return 'security' + + if any(keyword in rule_lower or keyword in name_lower + for keyword in ['revenue', 'conversion', 'user', 'business']): + return 'business_metrics' + + return 'other' + + def _identify_golden_signal(self, rule: str) -> str: + """Identify which golden signal an alert covers.""" + rule_lower = rule.lower() + + if any(keyword in rule_lower for keyword in ['latency', 'response_time', 'duration']): + return 'latency' + + if any(keyword in rule_lower for keyword in ['rate', 'rps', 'qps', 'throughput']): + return 'traffic' + + if any(keyword in rule_lower for keyword in ['error', 'fail', '5xx']): + return 'errors' + + if any(keyword in rule_lower for keyword in ['cpu', 'memory', 'disk', 'utilization']): + return 'saturation' + + return None + + def _analyze_service_coverage(self, alerts: List[Dict[str, Any]], + services: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Analyze monitoring coverage per service.""" + service_coverage = [] + + for service in services: + service_name = service.get('name', '') + service_alerts = [alert for alert in alerts + if service_name in alert.get('expr', '') or + service_name in alert.get('labels', {}).get('service', '')] + + covered_signals = set() + for alert in service_alerts: + signal = self._identify_golden_signal(alert.get('expr', '')) + if signal: + covered_signals.add(signal) + + missing_signals = set(self.GOLDEN_SIGNALS) - covered_signals + + if missing_signals or len(service_alerts) < 3: # Less than 3 alerts per service + coverage_gap = { + 'service': 
service_name, + 'alert_count': len(service_alerts), + 'covered_signals': list(covered_signals), + 'missing_signals': list(missing_signals), + 'criticality': service.get('criticality', 'medium'), + 'recommendations': [] + } + + if len(service_alerts) == 0: + coverage_gap['recommendations'].append("Add basic availability monitoring") + if 'errors' in missing_signals: + coverage_gap['recommendations'].append("Add error rate monitoring") + if 'latency' in missing_signals: + coverage_gap['recommendations'].append("Add latency monitoring") + + service_coverage.append(coverage_gap) + + return service_coverage + + def _generate_coverage_recommendations(self, coverage_analysis: Dict[str, Any]) -> List[str]: + """Generate recommendations to improve monitoring coverage.""" + recommendations = [] + + for missing_category in coverage_analysis['missing_categories']: + if missing_category == 'availability': + recommendations.append("Add service availability/uptime monitoring") + elif missing_category == 'latency': + recommendations.append("Add response time and latency monitoring") + elif missing_category == 'error_rate': + recommendations.append("Add error rate and HTTP status code monitoring") + elif missing_category == 'resource_utilization': + recommendations.append("Add CPU, memory, and disk utilization monitoring") + elif missing_category == 'security': + recommendations.append("Add security monitoring (auth failures, suspicious activity)") + elif missing_category == 'business_metrics': + recommendations.append("Add business KPI monitoring") + + for missing_signal in coverage_analysis['missing_golden_signals']: + recommendations.append(f"Implement {missing_signal} monitoring (Golden Signal)") + + if coverage_analysis['critical_gaps']: + recommendations.append("Address critical monitoring gaps as highest priority") + + return recommendations + + def find_duplicate_alerts(self, alerts: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Identify duplicate or redundant 
alerts.""" + duplicates = [] + alert_signatures = defaultdict(list) + + # Group alerts by signature + for i, alert in enumerate(alerts): + signature = self._generate_alert_signature(alert) + alert_signatures[signature].append((i, alert)) + + # Find exact duplicates + for signature, alert_group in alert_signatures.items(): + if len(alert_group) > 1: + duplicate_group = { + 'type': 'exact_duplicate', + 'signature': signature, + 'alerts': [{'index': i, 'name': alert.get('alert', alert.get('name', f'Alert_{i}'))} + for i, alert in alert_group], + 'recommendation': 'Remove duplicate alerts, keep the most comprehensive one' + } + duplicates.append(duplicate_group) + + # Find semantic duplicates (similar but not identical) + semantic_duplicates = self._find_semantic_duplicates(alerts) + duplicates.extend(semantic_duplicates) + + return duplicates + + def _generate_alert_signature(self, alert: Dict[str, Any]) -> str: + """Generate a signature for alert comparison.""" + expr = alert.get('expr', alert.get('condition', '')) + labels = alert.get('labels', {}) + + # Normalize the expression by removing whitespace and standardizing + normalized_expr = re.sub(r'\s+', ' ', expr).strip() + + # Create signature from expression and key labels + key_labels = {k: v for k, v in labels.items() + if k in ['service', 'severity', 'team']} + + return f"{normalized_expr}::{json.dumps(key_labels, sort_keys=True)}" + + def _find_semantic_duplicates(self, alerts: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Find semantically similar alerts.""" + semantic_duplicates = [] + + # Group alerts by service and metric type + service_groups = defaultdict(list) + + for i, alert in enumerate(alerts): + service = self._extract_service_from_alert(alert) + metric_type = self._extract_metric_type_from_alert(alert) + key = f"{service}::{metric_type}" + service_groups[key].append((i, alert)) + + # Look for similar alerts within each group + for key, alert_group in service_groups.items(): + if 
len(alert_group) > 1: + similar_alerts = self._identify_similar_alerts(alert_group) + if similar_alerts: + semantic_duplicates.extend(similar_alerts) + + return semantic_duplicates + + def _extract_service_from_alert(self, alert: Dict[str, Any]) -> str: + """Extract service name from alert.""" + labels = alert.get('labels', {}) + if 'service' in labels: + return labels['service'] + + expr = alert.get('expr', alert.get('condition', '')) + # Try to extract service from metric labels + service_match = re.search(r'service="([^"]+)"', expr) + if service_match: + return service_match.group(1) + + return 'unknown' + + def _extract_metric_type_from_alert(self, alert: Dict[str, Any]) -> str: + """Extract metric type from alert.""" + expr = alert.get('expr', alert.get('condition', '')) + + # Common metric patterns + if 'up' in expr.lower(): + return 'availability' + elif any(keyword in expr.lower() for keyword in ['latency', 'duration', 'response_time']): + return 'latency' + elif any(keyword in expr.lower() for keyword in ['error', 'fail', '5xx']): + return 'error_rate' + elif any(keyword in expr.lower() for keyword in ['cpu', 'memory', 'disk']): + return 'resource' + + return 'other' + + def _identify_similar_alerts(self, alert_group: List[Tuple[int, Dict[str, Any]]]) -> List[Dict[str, Any]]: + """Identify similar alerts within a group.""" + similar_groups = [] + + # Simple similarity check based on threshold values and conditions + threshold_groups = defaultdict(list) + + for index, alert in alert_group: + expr = alert.get('expr', alert.get('condition', '')) + threshold = self._extract_threshold_from_expression(expr) + severity = alert.get('labels', {}).get('severity', 'unknown') + + similarity_key = f"{threshold}::{severity}" + threshold_groups[similarity_key].append((index, alert)) + + # If multiple alerts have very similar thresholds, they might be redundant + for similarity_key, similar_alerts in threshold_groups.items(): + if len(similar_alerts) > 1: + similar_group 
= { + 'type': 'semantic_duplicate', + 'similarity_key': similarity_key, + 'alerts': [{'index': i, 'name': alert.get('alert', alert.get('name', f'Alert_{i}'))} + for i, alert in similar_alerts], + 'recommendation': 'Review for potential consolidation - similar thresholds and conditions' + } + similar_groups.append(similar_group) + + return similar_groups + + def _extract_threshold_from_expression(self, expr: str) -> str: + """Extract threshold value from alert expression.""" + # Look for common threshold patterns + threshold_patterns = [ + r'>[\s]*([0-9.]+)', + r'<[\s]*([0-9.]+)', + r'>=[\s]*([0-9.]+)', + r'<=[\s]*([0-9.]+)', + r'==[\s]*([0-9.]+)' + ] + + for pattern in threshold_patterns: + match = re.search(pattern, expr) + if match: + return match.group(1) + + return 'unknown' + + def analyze_thresholds(self, alerts: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Analyze alert thresholds for optimization opportunities.""" + threshold_analysis = [] + + for alert in alerts: + alert_name = alert.get('alert', alert.get('name', 'Unknown')) + expr = alert.get('expr', alert.get('condition', '')) + + analysis = { + 'alert_name': alert_name, + 'current_expression': expr, + 'threshold_issues': [], + 'recommendations': [] + } + + # Check for hard-coded thresholds + if re.search(r'[><=]\s*[0-9.]+', expr): + analysis['threshold_issues'].append('Hard-coded threshold value') + analysis['recommendations'].append('Consider parameterizing thresholds') + + # Check for percentage-based thresholds that might be too strict + percentage_match = re.search(r'([><=])\s*0?\.\d+', expr) + if percentage_match: + operator = percentage_match.group(1) + if operator in ['>', '>='] and 'error' in expr.lower(): + analysis['threshold_issues'].append('Very low error rate threshold') + analysis['recommendations'].append('Consider increasing error rate threshold based on SLO') + + # Check for missing hysteresis + if '>' in expr and 'for:' not in str(alert): + 
analysis['threshold_issues'].append('No hysteresis (for clause)') + analysis['recommendations'].append('Add "for" clause to prevent alert flapping') + + # Check for resource utilization thresholds + if any(resource in expr.lower() for resource in ['cpu', 'memory', 'disk']): + threshold_value = self._extract_threshold_from_expression(expr) + if threshold_value and threshold_value.replace('.', '').isdigit(): + threshold_num = float(threshold_value) + if threshold_num < 0.7: # Less than 70% + analysis['threshold_issues'].append('Low resource utilization threshold') + analysis['recommendations'].append('Consider increasing threshold to reduce noise') + + # Add historical data analysis if available + historical_data = alert.get('historical_data', {}) + if historical_data: + false_positive_rate = historical_data.get('false_positive_rate', 0) + if false_positive_rate > 0.2: + analysis['threshold_issues'].append(f'High false positive rate: {false_positive_rate*100:.1f}%') + analysis['recommendations'].append('Analyze historical data and adjust threshold') + + if analysis['threshold_issues']: + threshold_analysis.append(analysis) + + return threshold_analysis + + def assess_alert_fatigue_risk(self, alerts: List[Dict[str, Any]]) -> Dict[str, Any]: + """Assess risk of alert fatigue.""" + fatigue_assessment = { + 'total_alerts': len(alerts), + 'risk_level': 'low', + 'risk_factors': [], + 'metrics': {}, + 'recommendations': [] + } + + # Count alerts by severity + severity_counts = Counter() + for alert in alerts: + severity = alert.get('labels', {}).get('severity', 'unknown') + severity_counts[severity] += 1 + + fatigue_assessment['metrics']['severity_distribution'] = dict(severity_counts) + + # Calculate risk factors + critical_count = severity_counts.get('critical', 0) + warning_count = severity_counts.get('warning', 0) + severity_counts.get('high', 0) + total_high_priority = critical_count + warning_count + + # Too many high-priority alerts + if total_high_priority > 50: + 
def _estimate_daily_alert_volume(self, alerts: List[Dict[str, Any]]) -> int:
    """Estimate how many times per day this alert set fires in total.

    Prefers recorded 'fires_per_day' history; otherwise falls back to a
    heuristic based on severity and the kind of expression.
    """
    # Heuristic baseline fires/day by severity (critical should rarely fire).
    base_by_severity = {'critical': 0.1, 'high': 0.5, 'warning': 2, 'info': 5}
    total = 0.0

    for alert in alerts:
        # Recorded history wins over any heuristic.
        history = alert.get('historical_data', {})
        if history and 'fires_per_day' in history:
            total += history['fires_per_day']
            continue

        expr = alert.get('expr', alert.get('condition', '')).lower()
        severity = alert.get('labels', {}).get('severity', 'warning')
        per_day = base_by_severity.get(severity, 1)

        # Error-rate alerts tend to fire more often; availability/up
        # alerts should be rare.
        if 'error_rate' in expr:
            per_day *= 1.5
        elif 'availability' in expr or 'up' in expr:
            per_day *= 0.5

        total += per_day

    return int(total)
def generate_optimized_config(self, alerts: List[Dict[str, Any]],
                              analysis_results: Dict[str, Any]) -> Dict[str, Any]:
    """Produce an optimized alert configuration from analysis results.

    Applies noise-reduction and threshold fixes per alert, normalizes
    metadata, drops exact duplicates and appends alerts for any coverage
    gaps, then wraps everything with optimization metadata.
    """
    noise_findings = analysis_results.get('noisy_alerts', [])
    threshold_findings = analysis_results.get('threshold_analysis', [])

    tuned: List[Dict[str, Any]] = []
    for index, alert in enumerate(alerts):
        name = alert.get('alert', alert.get('name', f'Alert_{index}'))
        candidate = alert.copy()

        # First matching noise finding (if any) drives noise reduction.
        noise_info = next((n for n in noise_findings if n['alert_name'] == name), None)
        if noise_info is not None:
            candidate = self._apply_noise_reduction(candidate, noise_info)

        # Same for threshold findings.
        threshold_info = next((t for t in threshold_findings if t['alert_name'] == name), None)
        if threshold_info is not None:
            candidate = self._apply_threshold_optimization(candidate, threshold_info)

        tuned.append(self._ensure_alert_metadata(candidate))

    if 'duplicate_alerts' in analysis_results:
        tuned = self._remove_duplicate_alerts(tuned, analysis_results['duplicate_alerts'])

    if 'coverage_gaps' in analysis_results:
        tuned.extend(self._generate_missing_alerts(analysis_results['coverage_gaps']))

    return {
        'alerts': tuned,
        'optimization_metadata': {
            'optimized_at': datetime.utcnow().isoformat() + 'Z',
            'original_count': len(alerts),
            'optimized_count': len(tuned),
            'changes_applied': analysis_results.get('optimizations_applied', []),
        },
    }

def _apply_noise_reduction(self, alert: Dict[str, Any],
                           noise_analysis: Dict[str, Any]) -> Dict[str, Any]:
    """Apply noise-reduction hints from a noise finding to one alert."""
    result = dict(alert)
    for hint in noise_analysis['recommendations']:
        if 'for:' in hint and not alert.get('for'):
            # Add a hold-off period to stop flapping.
            result['for'] = '5m'
        elif 'threshold' in hint.lower():
            # Threshold tuning needs human judgement - flag for review.
            result.setdefault('annotations', {})['optimization_note'] = 'Review threshold - potentially too sensitive'
    return result

def _apply_threshold_optimization(self, alert: Dict[str, Any],
                                  threshold_analysis: Dict[str, Any]) -> Dict[str, Any]:
    """Apply threshold findings (hysteresis + review notes) to one alert."""
    result = dict(alert)

    # Add a 'for' clause when the analysis flagged missing hysteresis.
    if 'No hysteresis' in str(threshold_analysis['threshold_issues']) and not alert.get('for'):
        result['for'] = '5m'

    recs = threshold_analysis['recommendations']
    if recs:
        result.setdefault('annotations', {})['threshold_recommendations'] = '; '.join(recs)

    return result

def _ensure_alert_metadata(self, alert: Dict[str, Any]) -> Dict[str, Any]:
    """Guarantee every alert has annotations (summary/description) and a severity label."""
    result = dict(alert)

    annotations = result.setdefault('annotations', {})
    if 'summary' not in annotations:
        annotations['summary'] = f"Alert: {alert.get('alert', alert.get('name', 'Alert'))}"
    if 'description' not in annotations:
        annotations['description'] = 'This alert requires a description. Please update with specific details about the condition and impact.'

    labels = result.setdefault('labels', {})
    if 'severity' not in labels:
        labels['severity'] = 'warning'

    return result
metadata.""" + optimized_alert = alert.copy() + + # Ensure annotations exist + if 'annotations' not in optimized_alert: + optimized_alert['annotations'] = {} + + # Add summary if missing + if 'summary' not in optimized_alert['annotations']: + alert_name = alert.get('alert', alert.get('name', 'Alert')) + optimized_alert['annotations']['summary'] = f"Alert: {alert_name}" + + # Add description if missing + if 'description' not in optimized_alert['annotations']: + optimized_alert['annotations']['description'] = 'This alert requires a description. Please update with specific details about the condition and impact.' + + # Ensure proper labels + if 'labels' not in optimized_alert: + optimized_alert['labels'] = {} + + if 'severity' not in optimized_alert['labels']: + optimized_alert['labels']['severity'] = 'warning' + + return optimized_alert + + def _remove_duplicate_alerts(self, alerts: List[Dict[str, Any]], + duplicates: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Remove duplicate alerts from the list.""" + indices_to_remove = set() + + for duplicate_group in duplicates: + if duplicate_group['type'] == 'exact_duplicate': + # Keep the first alert, remove the rest + alert_indices = [alert_info['index'] for alert_info in duplicate_group['alerts']] + indices_to_remove.update(alert_indices[1:]) # Remove all but first + + return [alert for i, alert in enumerate(alerts) if i not in indices_to_remove] + + def _generate_missing_alerts(self, coverage_gaps: Dict[str, Any]) -> List[Dict[str, Any]]: + """Generate alerts for missing coverage.""" + new_alerts = [] + + for missing_signal in coverage_gaps.get('missing_golden_signals', []): + if missing_signal == 'latency': + new_alert = { + 'alert': 'HighLatency', + 'expr': 'histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 0.5', + 'for': '5m', + 'labels': { + 'severity': 'warning' + }, + 'annotations': { + 'summary': 'High request latency detected', + 'description': 'The 95th percentile latency is above 
def analyze_configuration(self, alert_config: Dict[str, Any]) -> Dict[str, Any]:
    """Run every analysis pass over an alert configuration.

    Accepts either an 'alerts' or a 'rules' list, plus an optional
    'services' list for coverage analysis, and returns one combined
    results dict including overall recommendations.
    """
    rules = alert_config.get('alerts', alert_config.get('rules', []))
    services = alert_config.get('services', [])

    results = {
        'summary': {
            'total_alerts': len(rules),
            'analysis_timestamp': datetime.utcnow().isoformat() + 'Z',
        },
        'noisy_alerts': self.analyze_alert_noise(rules),
        'coverage_gaps': self.identify_coverage_gaps(rules, services),
        'duplicate_alerts': self.find_duplicate_alerts(rules),
        'threshold_analysis': self.analyze_thresholds(rules),
        'alert_fatigue_assessment': self.assess_alert_fatigue_risk(rules),
    }

    # Roll the individual findings up into prioritized advice.
    results['overall_recommendations'] = self._generate_overall_recommendations(results)
    return results

def _generate_overall_recommendations(self, analysis_results: Dict[str, Any]) -> List[str]:
    """Condense the full analysis into an ordered recommendation list."""
    recs: List[str] = []

    # High-priority items first.
    if analysis_results['alert_fatigue_assessment']['risk_level'] == 'high':
        recs.append("HIGH PRIORITY: Address alert fatigue risk by reducing alert volume")
    if analysis_results['coverage_gaps']['critical_gaps']:
        recs.append("HIGH PRIORITY: Address critical monitoring gaps")

    # Medium-priority, data-driven items.
    noisy_total = len(analysis_results['noisy_alerts'])
    if noisy_total:
        recs.append(f"Optimize {noisy_total} noisy alerts to reduce false positives")
    duplicate_total = len(analysis_results['duplicate_alerts'])
    if duplicate_total:
        recs.append(f"Remove or consolidate {duplicate_total} duplicate alert groups")

    # Always-applicable hygiene items.
    recs.extend([
        "Implement proper alert routing and escalation policies",
        "Create runbooks for all production alerts",
        "Set up alert effectiveness monitoring and regular reviews",
    ])
    return recs
def export_analysis(self, analysis_results: Dict[str, Any], output_file: str,
                    format_type: str = 'json'):
    """Write *analysis_results* to *output_file*.

    format_type is 'json' (pretty-printed) or 'html' (report page);
    any other value raises ValueError.
    """
    if format_type.lower() == 'json':
        with open(output_file, 'w') as f:
            json.dump(analysis_results, f, indent=2)
    elif format_type.lower() == 'html':
        self._export_html_report(analysis_results, output_file)
    else:
        raise ValueError(f"Unsupported format: {format_type}")

def _export_html_report(self, analysis_results: Dict[str, Any], output_file: str):
    """Render the analysis as HTML and write it to *output_file*."""
    html_content = self._generate_html_report(analysis_results)
    with open(output_file, 'w') as f:
        f.write(html_content)

def _generate_html_report(self, analysis_results: Dict[str, Any]) -> str:
    """Generate an HTML report of the analysis results.

    BUG FIX: alert names, noise reasons, risk factors and recommendations
    come straight from user-supplied alert configurations; they are now
    passed through html.escape() so markup-like characters ('<', '&', …)
    cannot break the report or inject script into it.
    """
    from html import escape  # local import keeps module-level deps unchanged

    summary = analysis_results['summary']
    fatigue = analysis_results['alert_fatigue_assessment']
    gaps = analysis_results['coverage_gaps']
    noisy = analysis_results['noisy_alerts']

    def bullets(items):
        # Escaped <li> list for arbitrary user-supplied strings.
        return '<ul>' + ''.join(f'<li>{escape(str(item))}</li>' for item in items) + '</ul>'

    # Show at most the five noisiest alerts, mirroring print_summary's cap.
    noisy_blocks = ''.join(
        f"<div class='alert'><h3>{escape(str(entry['alert_name']))} "
        f"(Score: {escape(str(entry['noise_score']))})</h3>"
        f"{bullets(entry['reasons'])}</div>"
        for entry in noisy[:5]
    )

    missing_categories = ', '.join(gaps['missing_categories']) or 'None'
    missing_signals = ', '.join(gaps['missing_golden_signals']) or 'None'

    html = f"""<!DOCTYPE html>
<html>
<head>
  <title>Alert Configuration Analysis Report</title>
</head>
<body>
  <h1>Alert Configuration Analysis Report</h1>
  <p>Generated: {escape(str(summary['analysis_timestamp']))}</p>
  <p>Total Alerts Analyzed: {summary['total_alerts']}</p>

  <h2>Overall Recommendations</h2>
  {bullets(analysis_results['overall_recommendations'])}

  <h2>Alert Fatigue Assessment</h2>
  <p>Risk Level: {escape(fatigue['risk_level'].upper())}</p>
  <p>Risk Factors:</p>
  {bullets(fatigue['risk_factors'])}

  <h2>Noisy Alerts ({len(noisy)})</h2>
  {noisy_blocks}

  <h2>Coverage Gaps</h2>
  <p>Missing Categories: {escape(missing_categories)}</p>
  <p>Missing Golden Signals: {escape(missing_signals)}</p>
  <p>Critical Gaps: {len(gaps['critical_gaps'])}</p>
</body>
</html>
"""
    return html
def print_summary(self, analysis_results: Dict[str, Any]):
    """Print a human-readable summary of *analysis_results* to stdout."""
    banner = '=' * 60
    print(f"\n{banner}")
    print("ALERT CONFIGURATION ANALYSIS SUMMARY")
    print(banner)

    # Headline numbers.
    summary = analysis_results['summary']
    print("\nOverall Statistics:")
    print(f"  Total Alerts: {summary['total_alerts']}")
    print(f"  Analysis Date: {summary['analysis_timestamp']}")

    # Fatigue assessment with its contributing factors.
    fatigue = analysis_results['alert_fatigue_assessment']
    print(f"\nAlert Fatigue Risk: {fatigue['risk_level'].upper()}")
    if fatigue['risk_factors']:
        print("  Risk Factors:")
        for factor in fatigue['risk_factors']:
            print(f"    • {factor}")

    # Noisiest alerts (top three only).
    noisy = analysis_results['noisy_alerts']
    print(f"\nNoisy Alerts: {len(noisy)}")
    if noisy:
        print("  Top 3 Noisiest:")
        for entry in noisy[:3]:
            print(f"    • {entry['alert_name']} (Score: {entry['noise_score']})")

    # Coverage gap counts.
    gaps = analysis_results['coverage_gaps']
    print("\nMonitoring Coverage:")
    print(f"  Missing Categories: {len(gaps['missing_categories'])}")
    print(f"  Missing Golden Signals: {len(gaps['missing_golden_signals'])}")
    print(f"  Critical Gaps: {len(gaps['critical_gaps'])}")

    print(f"\nDuplicate Alerts: {len(analysis_results['duplicate_alerts'])} groups")

    # Up to five numbered recommendations.
    print("\nTop Recommendations:")
    for position, recommendation in enumerate(analysis_results['overall_recommendations'][:5], 1):
        print(f"  {position}. {recommendation}")

    print(f"\n{banner}\n")
{rec}") + + print(f"\n{'='*60}\n") + + +def main(): + """Main function for CLI usage.""" + parser = argparse.ArgumentParser( + description='Analyze and optimize alert configurations', + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Analyze alert configuration + python alert_optimizer.py --input alerts.json --analyze-only + + # Generate optimized configuration + python alert_optimizer.py --input alerts.json --output optimized_alerts.json + + # Generate HTML report + python alert_optimizer.py --input alerts.json --report report.html --format html + """ + ) + + parser.add_argument('--input', '-i', required=True, + help='Input alert configuration JSON file') + parser.add_argument('--output', '-o', + help='Output optimized configuration JSON file') + parser.add_argument('--report', '-r', + help='Generate analysis report file') + parser.add_argument('--format', choices=['json', 'html'], default='json', + help='Report format (json or html)') + parser.add_argument('--analyze-only', action='store_true', + help='Only perform analysis, do not generate optimized config') + + args = parser.parse_args() + + optimizer = AlertOptimizer() + + try: + # Load alert configuration + alert_config = optimizer.load_alert_config(args.input) + + # Perform analysis + analysis_results = optimizer.analyze_configuration(alert_config) + + # Generate optimized configuration if requested + if not args.analyze_only: + optimized_config = optimizer.generate_optimized_config( + alert_config.get('alerts', alert_config.get('rules', [])), + analysis_results + ) + + output_file = args.output or 'optimized_alerts.json' + optimizer.export_analysis(optimized_config, output_file, 'json') + print(f"Optimized configuration saved to: {output_file}") + + # Generate report if requested + if args.report: + optimizer.export_analysis(analysis_results, args.report, args.format) + print(f"Analysis report saved to: {args.report}") + + # Always show summary + 
optimizer.print_summary(analysis_results) + + except Exception as e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(1) + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/engineering/observability-designer/scripts/dashboard_generator.py b/engineering/observability-designer/scripts/dashboard_generator.py new file mode 100644 index 0000000..a07b077 --- /dev/null +++ b/engineering/observability-designer/scripts/dashboard_generator.py @@ -0,0 +1,1219 @@ +#!/usr/bin/env python3 +""" +Dashboard Generator - Generate comprehensive dashboard specifications + +This script generates dashboard specifications based on service/system descriptions: +- Panel layout optimized for different screen sizes and roles +- Metric queries (Prometheus-style) for comprehensive monitoring +- Visualization types appropriate for different metric types +- Drill-down paths for effective troubleshooting workflows +- Golden signals coverage (latency, traffic, errors, saturation) +- RED/USE method implementation +- Business metrics integration + +Usage: + python dashboard_generator.py --input service_definition.json --output dashboard_spec.json + python dashboard_generator.py --service-type api --name "Payment Service" --output payment_dashboard.json +""" + +import json +import argparse +import sys +import math +from typing import Dict, List, Any, Tuple +from datetime import datetime, timedelta + + +class DashboardGenerator: + """Generate comprehensive dashboard specifications.""" + + # Dashboard layout templates by role + ROLE_LAYOUTS = { + 'sre': { + 'primary_focus': ['availability', 'latency', 'errors', 'resource_utilization'], + 'secondary_focus': ['throughput', 'capacity', 'dependencies'], + 'time_ranges': ['1h', '6h', '1d', '7d'], + 'default_refresh': '30s' + }, + 'developer': { + 'primary_focus': ['latency', 'errors', 'throughput', 'business_metrics'], + 'secondary_focus': ['resource_utilization', 'dependencies'], + 'time_ranges': ['15m', '1h', '6h', '1d'], + 
'default_refresh': '1m' + }, + 'executive': { + 'primary_focus': ['availability', 'business_metrics', 'user_experience'], + 'secondary_focus': ['cost', 'capacity_trends'], + 'time_ranges': ['1d', '7d', '30d'], + 'default_refresh': '5m' + }, + 'ops': { + 'primary_focus': ['resource_utilization', 'capacity', 'alerts', 'deployments'], + 'secondary_focus': ['throughput', 'latency'], + 'time_ranges': ['5m', '30m', '2h', '1d'], + 'default_refresh': '15s' + } + } + + # Service type specific metric configurations + SERVICE_METRICS = { + 'api': { + 'golden_signals': ['latency', 'traffic', 'errors', 'saturation'], + 'key_metrics': [ + 'http_requests_total', + 'http_request_duration_seconds', + 'http_request_size_bytes', + 'http_response_size_bytes' + ], + 'resource_metrics': ['cpu_usage', 'memory_usage', 'goroutines'] + }, + 'web': { + 'golden_signals': ['latency', 'traffic', 'errors', 'saturation'], + 'key_metrics': [ + 'http_requests_total', + 'http_request_duration_seconds', + 'page_load_time', + 'user_sessions' + ], + 'resource_metrics': ['cpu_usage', 'memory_usage', 'connections'] + }, + 'database': { + 'golden_signals': ['latency', 'traffic', 'errors', 'saturation'], + 'key_metrics': [ + 'db_connections_active', + 'db_query_duration_seconds', + 'db_queries_total', + 'db_slow_queries_total' + ], + 'resource_metrics': ['cpu_usage', 'memory_usage', 'disk_io', 'connections'] + }, + 'queue': { + 'golden_signals': ['latency', 'traffic', 'errors', 'saturation'], + 'key_metrics': [ + 'queue_depth', + 'message_processing_duration', + 'messages_published_total', + 'messages_consumed_total' + ], + 'resource_metrics': ['cpu_usage', 'memory_usage', 'disk_usage'] + } + } + + # Visualization type recommendations + VISUALIZATION_TYPES = { + 'latency': 'line_chart', + 'throughput': 'line_chart', + 'error_rate': 'line_chart', + 'success_rate': 'stat', + 'resource_utilization': 'gauge', + 'queue_depth': 'bar_chart', + 'status': 'stat', + 'distribution': 'heatmap', + 'alerts': 'table', + 
def __init__(self):
    """Start with empty service and dashboard state."""
    self.service_config = {}
    self.dashboard_spec = {}

def load_service_definition(self, file_path: str) -> Dict[str, Any]:
    """Load a service definition from a JSON file.

    Raises ValueError when the file is missing or contains invalid JSON.
    """
    try:
        with open(file_path, 'r') as handle:
            return json.load(handle)
    except FileNotFoundError:
        raise ValueError(f"Service definition file not found: {file_path}")
    except json.JSONDecodeError as exc:
        raise ValueError(f"Invalid JSON in service definition: {exc}")

def create_service_definition(self, service_type: str, name: str,
                              criticality: str = 'medium') -> Dict[str, Any]:
    """Build a minimal in-memory service definition from CLI parameters."""
    description = f'{name} - A {criticality} criticality {service_type} service'
    return {
        'name': name,
        'type': service_type,
        'criticality': criticality,
        'description': description,
        'team': 'platform',
        'environment': 'production',
        'dependencies': [],
        'tags': []
    }

def generate_dashboard_specification(self, service_def: Dict[str, Any],
                                     target_role: str = 'sre') -> Dict[str, Any]:
    """Assemble the complete dashboard spec for *service_def* and *target_role*.

    Unknown roles fall back to the 'sre' layout.
    """
    service_name = service_def.get('name', 'Service')
    role_config = self.ROLE_LAYOUTS.get(target_role, self.ROLE_LAYOUTS['sre'])
    time_ranges = role_config['time_ranges']

    return {
        'metadata': {
            'title': f"{service_name} - {target_role.upper()} Dashboard",
            'service': service_def,
            'target_role': target_role,
            'generated_at': datetime.utcnow().isoformat() + 'Z',
            'version': '1.0'
        },
        'configuration': {
            'time_ranges': time_ranges,
            'default_time_range': time_ranges[1],  # second option is the default
            'refresh_interval': role_config['default_refresh'],
            'timezone': 'UTC',
            'theme': 'dark'
        },
        'layout': self._generate_dashboard_layout(service_def, role_config),
        'panels': self._generate_panels(service_def, role_config),
        'variables': self._generate_template_variables(service_def),
        'alerts_integration': self._generate_alerts_integration(service_def),
        'drill_down_paths': self._generate_drill_down_paths(service_def)
    }
def _generate_dashboard_layout(self, service_def: Dict[str, Any],
                               role_config: Dict[str, Any]) -> Dict[str, Any]:
    """Describe the grid settings and the four standard dashboard sections."""
    sections = [
        {'title': 'Service Overview', 'collapsed': False, 'y_position': 0,
         'panels': ['service_status', 'slo_summary', 'error_budget']},
        {'title': 'Golden Signals', 'collapsed': False, 'y_position': 8,
         'panels': ['latency', 'traffic', 'errors', 'saturation']},
        {'title': 'Resource Utilization', 'collapsed': False, 'y_position': 16,
         'panels': ['cpu_usage', 'memory_usage', 'network_io', 'disk_io']},
        # Dependencies start collapsed - they are drill-down material.
        {'title': 'Dependencies & Downstream', 'collapsed': True, 'y_position': 24,
         'panels': ['dependency_status', 'downstream_latency', 'circuit_breakers']},
    ]
    return {
        'grid_settings': {
            'width': 24,  # Grafana-style 24-column grid
            'height_unit': 'px',
            'cell_height': 30
        },
        'sections': sections,
    }

def _generate_panels(self, service_def: Dict[str, Any],
                     role_config: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Collect all panels: common, service-type-specific and role-specific."""
    service_type = service_def.get('type', 'api')
    panels: List[Dict[str, Any]] = []

    # Panels every dashboard gets.
    panels += self._create_overview_panels(service_def)
    panels += self._create_golden_signals_panels(service_def)
    panels += self._create_resource_panels(service_def)

    # Panels keyed off the service type.
    if service_type == 'api':
        panels += self._create_api_specific_panels(service_def)
    elif service_type == 'database':
        panels += self._create_database_specific_panels(service_def)
    elif service_type == 'queue':
        panels += self._create_queue_specific_panels(service_def)

    # Panels keyed off the target role's primary focus.
    focus = role_config['primary_focus']
    if 'business_metrics' in focus:
        panels += self._create_business_metrics_panels(service_def)
    if 'capacity' in focus:
        panels += self._create_capacity_panels(service_def)

    return panels
def _create_overview_panels(self, service_def: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Build the three overview panels: status, SLO summary and error budget."""
    service_name = service_def.get('name', 'service')

    status_panel = {
        'id': 'service_status',
        'title': 'Service Status',
        'type': 'stat',
        'grid_pos': {'x': 0, 'y': 0, 'w': 6, 'h': 4},
        'targets': [
            {'expr': f'up{{service="{service_name}"}}', 'legendFormat': 'Status'}
        ],
        'field_config': {
            'overrides': [
                {
                    'matcher': {'id': 'byName', 'options': 'Status'},
                    'properties': [
                        {'id': 'color', 'value': {'mode': 'thresholds'}},
                        {'id': 'thresholds', 'value': {
                            'steps': [
                                {'color': 'red', 'value': 0},
                                {'color': 'green', 'value': 1}
                            ]
                        }},
                        # Map the numeric up-metric to readable text.
                        {'id': 'mappings', 'value': [
                            {'options': {'0': {'text': 'DOWN'}}, 'type': 'value'},
                            {'options': {'1': {'text': 'UP'}}, 'type': 'value'}
                        ]}
                    ]
                }
            ]
        },
        'options': {'orientation': 'horizontal', 'textMode': 'value_and_name'}
    }

    slo_panel = {
        'id': 'slo_summary',
        'title': 'SLO Achievement (30d)',
        'type': 'stat',
        'grid_pos': {'x': 6, 'y': 0, 'w': 9, 'h': 4},
        'targets': [
            {
                'expr': f'(1 - (increase(http_requests_total{{service="{service_name}",code=~"5.."}}[30d]) / increase(http_requests_total{{service="{service_name}"}}[30d]))) * 100',
                'legendFormat': 'Availability'
            },
            {
                'expr': f'histogram_quantile(0.95, increase(http_request_duration_seconds_bucket{{service="{service_name}"}}[30d])) * 1000',
                'legendFormat': 'P95 Latency (ms)'
            }
        ],
        'field_config': {
            'defaults': {
                'color': {'mode': 'thresholds'},
                'thresholds': {
                    'steps': [
                        {'color': 'red', 'value': 0},
                        {'color': 'yellow', 'value': 99.0},
                        {'color': 'green', 'value': 99.9}
                    ]
                }
            }
        },
        'options': {'orientation': 'horizontal', 'textMode': 'value_and_name'}
    }

    budget_panel = {
        'id': 'error_budget',
        'title': 'Error Budget Remaining',
        'type': 'gauge',
        'grid_pos': {'x': 15, 'y': 0, 'w': 9, 'h': 4},
        'targets': [
            {
                # Remaining fraction of a 99.9% availability budget, as a percentage.
                'expr': f'(1 - (increase(http_requests_total{{service="{service_name}",code=~"5.."}}[30d]) / increase(http_requests_total{{service="{service_name}"}}[30d])) - 0.999) / 0.001 * 100',
                'legendFormat': 'Error Budget %'
            }
        ],
        'field_config': {
            'defaults': {
                'color': {'mode': 'thresholds'},
                'min': 0,
                'max': 100,
                'thresholds': {
                    'steps': [
                        {'color': 'red', 'value': 0},
                        {'color': 'yellow', 'value': 25},
                        {'color': 'green', 'value': 50}
                    ]
                },
                'unit': 'percent'
            }
        },
        'options': {'showThresholdLabels': True, 'showThresholdMarkers': True}
    }

    return [status_panel, slo_panel, budget_panel]

def _create_golden_signals_panels(self, service_def: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Build the four golden-signal panels: latency, traffic, errors, saturation."""
    service_name = service_def.get('name', 'service')

    def hover_and_legend():
        # Fresh dict per panel so an edit to one panel cannot leak into another.
        return {
            'tooltip': {'mode': 'multi', 'sort': 'desc'},
            'legend': {'displayMode': 'table', 'placement': 'bottom'},
        }

    def quantile_target(quantile, label):
        return {
            'expr': f'histogram_quantile({quantile}, rate(http_request_duration_seconds_bucket{{service="{service_name}"}}[5m])) * 1000',
            'legendFormat': label,
        }

    def rps_target(selector, label):
        return {
            'expr': f'sum(rate(http_requests_total{{service="{service_name}"{selector}}}[5m]))',
            'legendFormat': label,
        }

    def error_ratio_target(code_class, label):
        return {
            'expr': (
                f'sum(rate(http_requests_total{{service="{service_name}",code=~"{code_class}.."}}[5m])) / '
                f'sum(rate(http_requests_total{{service="{service_name}"}}[5m])) * 100'
            ),
            'legendFormat': label,
        }

    latency_panel = {
        'id': 'latency',
        'title': 'Request Latency',
        'type': 'timeseries',
        'grid_pos': {'x': 0, 'y': 8, 'w': 12, 'h': 6},
        'targets': [
            quantile_target('0.50', 'P50 Latency'),
            quantile_target('0.95', 'P95 Latency'),
            quantile_target('0.99', 'P99 Latency'),
        ],
        'field_config': {
            'defaults': {
                'color': {'mode': 'palette-classic'},
                'unit': 'ms',
                'custom': {
                    'drawStyle': 'line',
                    'lineInterpolation': 'linear',
                    'lineWidth': 1,
                    'fillOpacity': 10
                }
            }
        },
        'options': hover_and_legend(),
    }

    traffic_panel = {
        'id': 'traffic',
        'title': 'Request Rate',
        'type': 'timeseries',
        'grid_pos': {'x': 12, 'y': 8, 'w': 12, 'h': 6},
        'targets': [
            rps_target('', 'Total RPS'),
            rps_target(',code=~"2.."', '2xx RPS'),
            rps_target(',code=~"4.."', '4xx RPS'),
            rps_target(',code=~"5.."', '5xx RPS'),
        ],
        'field_config': {
            'defaults': {
                'color': {'mode': 'palette-classic'},
                'unit': 'reqps',
                'custom': {
                    'drawStyle': 'line',
                    'lineInterpolation': 'linear',
                    'lineWidth': 1,
                    'fillOpacity': 0
                }
            }
        },
        'options': hover_and_legend(),
    }

    errors_panel = {
        'id': 'errors',
        'title': 'Error Rate',
        'type': 'timeseries',
        'grid_pos': {'x': 0, 'y': 14, 'w': 12, 'h': 6},
        'targets': [
            error_ratio_target('5', '5xx Error Rate'),
            error_ratio_target('4', '4xx Error Rate'),
        ],
        'field_config': {
            'defaults': {
                'color': {'mode': 'palette-classic'},
                'unit': 'percent',
                'custom': {
                    'drawStyle': 'line',
                    'lineInterpolation': 'linear',
                    'lineWidth': 2,
                    'fillOpacity': 20
                }
            },
            'overrides': [
                {
                    'matcher': {'id': 'byName', 'options': '5xx Error Rate'},
                    'properties': [{'id': 'color', 'value': {'fixedColor': 'red'}}]
                }
            ]
        },
        'options': hover_and_legend(),
    }

    saturation_panel = {
        'id': 'saturation',
        'title': 'Saturation Metrics',
        'type': 'timeseries',
        'grid_pos': {'x': 12, 'y': 14, 'w': 12, 'h': 6},
        'targets': [
            {
                'expr': f'rate(process_cpu_seconds_total{{service="{service_name}"}}[5m]) * 100',
                'legendFormat': 'CPU Usage %'
            },
            {
                'expr': f'process_resident_memory_bytes{{service="{service_name}"}} / process_virtual_memory_max_bytes{{service="{service_name}"}} * 100',
                'legendFormat': 'Memory Usage %'
            }
        ],
        'field_config': {
            'defaults': {
                'color': {'mode': 'palette-classic'},
                'unit': 'percent',
                'max': 100,
                'custom': {
                    'drawStyle': 'line',
                    'lineInterpolation': 'linear',
                    'lineWidth': 1,
                    'fillOpacity': 10
                }
            }
        },
        'options': hover_and_legend(),
    }

    return [latency_panel, traffic_panel, errors_panel, saturation_panel]
def _create_resource_panels(self, service_def: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Build the resource row: CPU and memory gauges plus network/disk I/O."""
    service_name = service_def.get('name', 'service')

    def rate_target(metric, label):
        # Per-second rate of a counter scoped to this service.
        return {
            'expr': f'rate({metric}{{service="{service_name}"}}[5m])',
            'legendFormat': label,
        }

    cpu_panel = {
        'id': 'cpu_usage',
        'title': 'CPU Usage',
        'type': 'gauge',
        'grid_pos': {'x': 0, 'y': 20, 'w': 6, 'h': 4},
        'targets': [
            {
                'expr': f'rate(process_cpu_seconds_total{{service="{service_name}"}}[5m]) * 100',
                'legendFormat': 'CPU %'
            }
        ],
        'field_config': {
            'defaults': {
                'color': {'mode': 'thresholds'},
                'unit': 'percent',
                'min': 0,
                'max': 100,
                'thresholds': {
                    'steps': [
                        {'color': 'green', 'value': 0},
                        {'color': 'yellow', 'value': 70},
                        {'color': 'red', 'value': 90}
                    ]
                }
            }
        },
        'options': {'showThresholdLabels': True, 'showThresholdMarkers': True}
    }

    memory_panel = {
        'id': 'memory_usage',
        'title': 'Memory Usage',
        'type': 'gauge',
        'grid_pos': {'x': 6, 'y': 20, 'w': 6, 'h': 4},
        'targets': [
            {
                'expr': f'process_resident_memory_bytes{{service="{service_name}"}} / 1024 / 1024',
                'legendFormat': 'Memory MB'
            }
        ],
        'field_config': {
            'defaults': {
                'color': {'mode': 'thresholds'},
                'unit': 'decbytes',
                'thresholds': {
                    'steps': [
                        {'color': 'green', 'value': 0},
                        {'color': 'yellow', 'value': 512000000},  # 512MB
                        {'color': 'red', 'value': 1024000000}     # 1GB
                    ]
                }
            }
        }
    }

    network_panel = {
        'id': 'network_io',
        'title': 'Network I/O',
        'type': 'timeseries',
        'grid_pos': {'x': 12, 'y': 20, 'w': 6, 'h': 4},
        'targets': [
            rate_target('process_network_receive_bytes_total', 'RX Bytes/s'),
            rate_target('process_network_transmit_bytes_total', 'TX Bytes/s'),
        ],
        'field_config': {
            'defaults': {'color': {'mode': 'palette-classic'}, 'unit': 'binBps'}
        }
    }

    disk_panel = {
        'id': 'disk_io',
        'title': 'Disk I/O',
        'type': 'timeseries',
        'grid_pos': {'x': 18, 'y': 20, 'w': 6, 'h': 4},
        'targets': [
            rate_target('process_disk_read_bytes_total', 'Read Bytes/s'),
            rate_target('process_disk_write_bytes_total', 'Write Bytes/s'),
        ],
        'field_config': {
            'defaults': {'color': {'mode': 'palette-classic'}, 'unit': 'binBps'}
        }
    }

    return [cpu_panel, memory_panel, network_panel, disk_panel]

def _create_api_specific_panels(self, service_def: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Build API-only panels: slowest-endpoints table and request-size heatmap."""
    service_name = service_def.get('name', 'service')

    slow_endpoints = {
        'id': 'endpoint_latency',
        'title': 'Top Slowest Endpoints',
        'type': 'table',
        'grid_pos': {'x': 0, 'y': 24, 'w': 12, 'h': 6},
        'targets': [
            {
                'expr': f'topk(10, histogram_quantile(0.95, sum by (handler) (rate(http_request_duration_seconds_bucket{{service="{service_name}"}}[5m])))) * 1000',
                'legendFormat': '{{handler}}',
                'format': 'table',
                'instant': True
            }
        ],
        'transformations': [
            {
                'id': 'organize',
                'options': {
                    'excludeByName': {'Time': True},
                    'renameByName': {'Value': 'P95 Latency (ms)'}
                }
            }
        ],
        'field_config': {
            'overrides': [
                {
                    'matcher': {'id': 'byName', 'options': 'P95 Latency (ms)'},
                    'properties': [
                        {'id': 'color', 'value': {'mode': 'thresholds'}},
                        {'id': 'thresholds', 'value': {
                            'steps': [
                                {'color': 'green', 'value': 0},
                                {'color': 'yellow', 'value': 100},
                                {'color': 'red', 'value': 500}
                            ]
                        }}
                    ]
                }
            ]
        }
    }

    size_heatmap = {
        'id': 'request_size_distribution',
        'title': 'Request Size Distribution',
        'type': 'heatmap',
        'grid_pos': {'x': 12, 'y': 24, 'w': 12, 'h': 6},
        'targets': [
            {
                'expr': f'sum by (le) (rate(http_request_size_bytes_bucket{{service="{service_name}"}}[5m]))',
                'legendFormat': '{{le}}'
            }
        ],
        'options': {
            'calculate': True,
            'yAxis': {'unit': 'bytes'},
            'color': {'scheme': 'Spectral'}
        }
    }

    return [slow_endpoints, size_heatmap]
'grid_pos': {'x': 16, 'y': 24, 'w': 8, 'h': 6}, + 'targets': [ + { + 'expr': f'db_locks_waiting{{service="{service_name}"}}', + 'legendFormat': 'Waiting Locks' + } + ], + 'field_config': { + 'defaults': { + 'color': {'mode': 'thresholds'}, + 'thresholds': { + 'steps': [ + {'color': 'green', 'value': 0}, + {'color': 'yellow', 'value': 1}, + {'color': 'red', 'value': 5} + ] + } + } + } + } + ] + + def _create_queue_specific_panels(self, service_def: Dict[str, Any]) -> List[Dict[str, Any]]: + """Create queue-specific panels.""" + service_name = service_def.get('name', 'service') + + return [ + { + 'id': 'queue_depth', + 'title': 'Queue Depth', + 'type': 'timeseries', + 'grid_pos': {'x': 0, 'y': 24, 'w': 12, 'h': 6}, + 'targets': [ + { + 'expr': f'queue_depth{{service="{service_name}"}}', + 'legendFormat': 'Messages in Queue' + } + ] + }, + { + 'id': 'message_throughput', + 'title': 'Message Throughput', + 'type': 'timeseries', + 'grid_pos': {'x': 12, 'y': 24, 'w': 12, 'h': 6}, + 'targets': [ + { + 'expr': f'rate(messages_published_total{{service="{service_name}"}}[5m])', + 'legendFormat': 'Published/sec' + }, + { + 'expr': f'rate(messages_consumed_total{{service="{service_name}"}}[5m])', + 'legendFormat': 'Consumed/sec' + } + ] + } + ] + + def _create_business_metrics_panels(self, service_def: Dict[str, Any]) -> List[Dict[str, Any]]: + """Create business metrics panels.""" + service_name = service_def.get('name', 'service') + + return [ + { + 'id': 'business_kpis', + 'title': 'Business KPIs', + 'type': 'stat', + 'grid_pos': {'x': 0, 'y': 30, 'w': 24, 'h': 4}, + 'targets': [ + { + 'expr': f'rate(business_transactions_total{{service="{service_name}"}}[1h])', + 'legendFormat': 'Transactions/hour' + }, + { + 'expr': f'avg(business_transaction_value{{service="{service_name}"}}) * rate(business_transactions_total{{service="{service_name}"}}[1h])', + 'legendFormat': 'Revenue/hour' + }, + { + 'expr': f'rate(user_registrations_total{{service="{service_name}"}}[1h])', + 
'legendFormat': 'New Users/hour' + } + ], + 'field_config': { + 'defaults': { + 'color': {'mode': 'palette-classic'}, + 'custom': { + 'displayMode': 'basic' + } + } + }, + 'options': { + 'orientation': 'horizontal', + 'textMode': 'value_and_name' + } + } + ] + + def _create_capacity_panels(self, service_def: Dict[str, Any]) -> List[Dict[str, Any]]: + """Create capacity planning panels.""" + service_name = service_def.get('name', 'service') + + return [ + { + 'id': 'capacity_trends', + 'title': 'Capacity Trends (7d)', + 'type': 'timeseries', + 'grid_pos': {'x': 0, 'y': 34, 'w': 24, 'h': 6}, + 'targets': [ + { + 'expr': f'predict_linear(avg_over_time(rate(http_requests_total{{service="{service_name}"}}[5m])[7d:1h]), 7*24*3600)', + 'legendFormat': 'Predicted Traffic (7d)' + }, + { + 'expr': f'predict_linear(avg_over_time(process_resident_memory_bytes{{service="{service_name}"}}[7d:1h]), 7*24*3600)', + 'legendFormat': 'Predicted Memory Usage (7d)' + } + ], + 'field_config': { + 'defaults': { + 'color': {'mode': 'palette-classic'}, + 'custom': { + 'drawStyle': 'line', + 'lineStyle': {'dash': [10, 10]} + } + } + } + } + ] + + def _generate_template_variables(self, service_def: Dict[str, Any]) -> List[Dict[str, Any]]: + """Generate template variables for dynamic dashboard filtering.""" + service_name = service_def.get('name', 'service') + + return [ + { + 'name': 'environment', + 'type': 'query', + 'query': 'label_values(environment)', + 'current': {'text': 'production', 'value': 'production'}, + 'includeAll': False, + 'multi': False, + 'refresh': 'on_dashboard_load' + }, + { + 'name': 'instance', + 'type': 'query', + 'query': f'label_values(up{{service="{service_name}"}}, instance)', + 'current': {'text': 'All', 'value': '$__all'}, + 'includeAll': True, + 'multi': True, + 'refresh': 'on_time_range_change' + }, + { + 'name': 'handler', + 'type': 'query', + 'query': f'label_values(http_requests_total{{service="{service_name}"}}, handler)', + 'current': {'text': 'All', 
'value': '$__all'}, + 'includeAll': True, + 'multi': True, + 'refresh': 'on_time_range_change' + } + ] + + def _generate_alerts_integration(self, service_def: Dict[str, Any]) -> Dict[str, Any]: + """Generate alerts integration configuration.""" + service_name = service_def.get('name', 'service') + + return { + 'alert_annotations': True, + 'alert_rules_query': f'ALERTS{{service="{service_name}"}}', + 'alert_panels': [ + { + 'title': 'Active Alerts', + 'type': 'table', + 'query': f'ALERTS{{service="{service_name}",alertstate="firing"}}', + 'columns': ['alertname', 'severity', 'instance', 'description'] + } + ] + } + + def _generate_drill_down_paths(self, service_def: Dict[str, Any]) -> Dict[str, Any]: + """Generate drill-down navigation paths.""" + service_name = service_def.get('name', 'service') + + return { + 'service_overview': { + 'from': 'service_status', + 'to': 'detailed_health_dashboard', + 'url': f'/d/service-health/{service_name}-health', + 'params': ['var-service', 'var-environment'] + }, + 'error_investigation': { + 'from': 'errors', + 'to': 'error_details_dashboard', + 'url': f'/d/errors/{service_name}-errors', + 'params': ['var-service', 'var-time_range'] + }, + 'latency_analysis': { + 'from': 'latency', + 'to': 'trace_analysis_dashboard', + 'url': f'/d/traces/{service_name}-traces', + 'params': ['var-service', 'var-handler'] + }, + 'capacity_planning': { + 'from': 'saturation', + 'to': 'capacity_dashboard', + 'url': f'/d/capacity/{service_name}-capacity', + 'params': ['var-service', 'var-time_range'] + } + } + + def generate_grafana_json(self, dashboard_spec: Dict[str, Any]) -> Dict[str, Any]: + """Convert dashboard specification to Grafana JSON format.""" + metadata = dashboard_spec['metadata'] + config = dashboard_spec['configuration'] + + grafana_json = { + 'dashboard': { + 'id': None, + 'title': metadata['title'], + 'tags': [metadata['service']['type'], metadata['target_role'], 'generated'], + 'timezone': config['timezone'], + 'refresh': 
config['refresh_interval'], + 'time': { + 'from': 'now-1h', + 'to': 'now' + }, + 'templating': { + 'list': dashboard_spec['variables'] + }, + 'panels': self._convert_panels_to_grafana_format(dashboard_spec['panels']), + 'version': 1, + 'schemaVersion': 30 + }, + 'overwrite': True + } + + return grafana_json + + def _convert_panels_to_grafana_format(self, panels: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Convert panel specifications to Grafana format.""" + grafana_panels = [] + + for panel in panels: + grafana_panel = { + 'id': hash(panel['id']) % 1000, # Generate numeric ID + 'title': panel['title'], + 'type': panel['type'], + 'gridPos': panel['grid_pos'], + 'targets': panel['targets'], + 'fieldConfig': panel.get('field_config', {}), + 'options': panel.get('options', {}), + 'transformations': panel.get('transformations', []) + } + grafana_panels.append(grafana_panel) + + return grafana_panels + + def generate_documentation(self, dashboard_spec: Dict[str, Any]) -> str: + """Generate documentation for the dashboard.""" + metadata = dashboard_spec['metadata'] + service = metadata['service'] + + doc_content = f"""# {metadata['title']} Documentation + +## Overview +This dashboard provides comprehensive monitoring for {service['name']}, a {service['type']} service with {service['criticality']} criticality. 
+ +**Target Audience:** {metadata['target_role'].upper()} teams +**Generated:** {metadata['generated_at']} + +## Dashboard Sections + +### Service Overview +- **Service Status**: Real-time availability status +- **SLO Achievement**: 30-day SLO compliance metrics +- **Error Budget**: Remaining error budget visualization + +### Golden Signals Monitoring +- **Latency**: P50, P95, P99 response times +- **Traffic**: Request rate by status code +- **Errors**: Error rates for 4xx and 5xx responses +- **Saturation**: CPU and memory utilization + +### Resource Utilization +- **CPU Usage**: Process CPU consumption +- **Memory Usage**: Memory utilization tracking +- **Network I/O**: Network throughput metrics +- **Disk I/O**: Disk read/write operations + +## Key Metrics + +### SLIs Tracked +""" + + # Add service-type specific metrics + service_type = service.get('type', 'api') + if service_type in self.SERVICE_METRICS: + metrics = self.SERVICE_METRICS[service_type]['key_metrics'] + for metric in metrics: + doc_content += f"- `{metric}`: Core service metric\n" + + doc_content += f""" +## Alert Integration +- Active alerts are displayed in context with relevant panels +- Alert annotations show on time series charts +- Click-through to alert management system available + +## Drill-Down Paths +""" + + drill_downs = dashboard_spec.get('drill_down_paths', {}) + for path_name, path_config in drill_downs.items(): + doc_content += f"- **{path_name}**: From {path_config['from']} → {path_config['to']}\n" + + doc_content += f""" +## Usage Guidelines + +### Time Ranges +Use appropriate time ranges for different investigation types: +- **Real-time monitoring**: 15m - 1h +- **Recent incident investigation**: 1h - 6h +- **Trend analysis**: 1d - 7d +- **Capacity planning**: 7d - 30d + +### Variables +- **environment**: Filter by deployment environment +- **instance**: Focus on specific service instances +- **handler**: Filter by API endpoint or handler + +### Performance Optimization +- Use 
longer time ranges for capacity planning +- Refresh intervals are optimized per role: + - SRE: 30s for operational awareness + - Developer: 1m for troubleshooting + - Executive: 5m for high-level monitoring + +## Maintenance +- Dashboard panels automatically adapt to service changes +- Template variables refresh based on actual metric labels +- Review and update business metrics quarterly +""" + + return doc_content + + def export_specification(self, dashboard_spec: Dict[str, Any], output_file: str, + format_type: str = 'json'): + """Export dashboard specification.""" + if format_type.lower() == 'json': + with open(output_file, 'w') as f: + json.dump(dashboard_spec, f, indent=2) + elif format_type.lower() == 'grafana': + grafana_json = self.generate_grafana_json(dashboard_spec) + with open(output_file, 'w') as f: + json.dump(grafana_json, f, indent=2) + else: + raise ValueError(f"Unsupported format: {format_type}") + + def print_summary(self, dashboard_spec: Dict[str, Any]): + """Print human-readable summary of dashboard specification.""" + metadata = dashboard_spec['metadata'] + service = metadata['service'] + config = dashboard_spec['configuration'] + panels = dashboard_spec['panels'] + + print(f"\n{'='*60}") + print(f"DASHBOARD SPECIFICATION SUMMARY") + print(f"{'='*60}") + + print(f"\nDashboard Details:") + print(f" Title: {metadata['title']}") + print(f" Target Role: {metadata['target_role'].upper()}") + print(f" Service: {service['name']} ({service['type']})") + print(f" Criticality: {service['criticality']}") + print(f" Generated: {metadata['generated_at']}") + + print(f"\nConfiguration:") + print(f" Default Time Range: {config['default_time_range']}") + print(f" Refresh Interval: {config['refresh_interval']}") + print(f" Available Time Ranges: {', '.join(config['time_ranges'])}") + + print(f"\nPanels ({len(panels)}):") + panel_types = {} + for panel in panels: + panel_type = panel['type'] + panel_types[panel_type] = panel_types.get(panel_type, 0) + 1 + + 
for panel_type, count in panel_types.items(): + print(f" {panel_type}: {count}") + + variables = dashboard_spec.get('variables', []) + print(f"\nTemplate Variables ({len(variables)}):") + for var in variables: + print(f" {var['name']} ({var['type']})") + + drill_downs = dashboard_spec.get('drill_down_paths', {}) + print(f"\nDrill-down Paths: {len(drill_downs)}") + + print(f"\nKey Features:") + print(f" • Golden Signals monitoring") + print(f" • Resource utilization tracking") + print(f" • Alert integration") + print(f" • Role-optimized layout") + print(f" • Service-type specific panels") + + print(f"\n{'='*60}\n") + + +def main(): + """Main function for CLI usage.""" + parser = argparse.ArgumentParser( + description='Generate comprehensive dashboard specifications', + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Generate from service definition file + python dashboard_generator.py --input service.json --output dashboard.json + + # Generate from command line parameters + python dashboard_generator.py --service-type api --name "Payment Service" --output payment_dashboard.json + + # Generate Grafana-compatible JSON + python dashboard_generator.py --input service.json --output dashboard.json --format grafana + + # Generate with specific role focus + python dashboard_generator.py --service-type web --name "Frontend" --role developer --output frontend_dev.json + """ + ) + + parser.add_argument('--input', '-i', + help='Input service definition JSON file') + parser.add_argument('--output', '-o', + help='Output dashboard specification file') + parser.add_argument('--service-type', + choices=['api', 'web', 'database', 'queue', 'batch', 'ml'], + help='Service type') + parser.add_argument('--name', + help='Service name') + parser.add_argument('--criticality', + choices=['critical', 'high', 'medium', 'low'], + default='medium', + help='Service criticality level') + parser.add_argument('--role', + choices=['sre', 'developer', 'executive', 
'ops'], + default='sre', + help='Target role for dashboard optimization') + parser.add_argument('--format', + choices=['json', 'grafana'], + default='json', + help='Output format (json specification or grafana compatible)') + parser.add_argument('--doc-output', + help='Generate documentation file') + parser.add_argument('--summary-only', action='store_true', + help='Only display summary, do not save files') + + args = parser.parse_args() + + if not args.input and not (args.service_type and args.name): + parser.error("Must provide either --input file or --service-type and --name") + + generator = DashboardGenerator() + + try: + # Load or create service definition + if args.input: + service_def = generator.load_service_definition(args.input) + else: + service_def = generator.create_service_definition( + args.service_type, args.name, args.criticality + ) + + # Generate dashboard specification + dashboard_spec = generator.generate_dashboard_specification(service_def, args.role) + + # Output results + if not args.summary_only: + output_file = args.output or f"{service_def['name'].replace(' ', '_').lower()}_dashboard.json" + generator.export_specification(dashboard_spec, output_file, args.format) + print(f"Dashboard specification saved to: {output_file}") + + # Generate documentation if requested + if args.doc_output: + documentation = generator.generate_documentation(dashboard_spec) + with open(args.doc_output, 'w') as f: + f.write(documentation) + print(f"Documentation saved to: {args.doc_output}") + + # Always show summary + generator.print_summary(dashboard_spec) + + except Exception as e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(1) + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/engineering/observability-designer/scripts/slo_designer.py b/engineering/observability-designer/scripts/slo_designer.py new file mode 100644 index 0000000..69459a5 --- /dev/null +++ b/engineering/observability-designer/scripts/slo_designer.py @@ 
-0,0 +1,670 @@ +#!/usr/bin/env python3 +""" +SLO Designer - Generate comprehensive SLI/SLO frameworks for services + +This script analyzes service descriptions and generates complete SLO frameworks including: +- SLI definitions based on service characteristics +- SLO targets based on criticality and user impact +- Error budget calculations and policies +- Multi-window burn rate alerts +- SLA recommendations for customer-facing services + +Usage: + python slo_designer.py --input service_definition.json --output slo_framework.json + python slo_designer.py --service-type api --criticality high --user-facing true +""" + +import json +import argparse +import sys +import math +from typing import Dict, List, Any, Tuple +from datetime import datetime, timedelta + + +class SLODesigner: + """Design and generate SLO frameworks for services.""" + + # SLO target recommendations based on service criticality + SLO_TARGETS = { + 'critical': { + 'availability': 0.9999, # 99.99% - 4.38 minutes downtime/month + 'latency_p95': 100, # 95th percentile latency in ms + 'latency_p99': 500, # 99th percentile latency in ms + 'error_rate': 0.001 # 0.1% error rate + }, + 'high': { + 'availability': 0.999, # 99.9% - 43.8 minutes downtime/month + 'latency_p95': 200, # 95th percentile latency in ms + 'latency_p99': 1000, # 99th percentile latency in ms + 'error_rate': 0.005 # 0.5% error rate + }, + 'medium': { + 'availability': 0.995, # 99.5% - 3.65 hours downtime/month + 'latency_p95': 500, # 95th percentile latency in ms + 'latency_p99': 2000, # 99th percentile latency in ms + 'error_rate': 0.01 # 1% error rate + }, + 'low': { + 'availability': 0.99, # 99% - 7.3 hours downtime/month + 'latency_p95': 1000, # 95th percentile latency in ms + 'latency_p99': 5000, # 99th percentile latency in ms + 'error_rate': 0.02 # 2% error rate + } + } + + # Burn rate windows for multi-window alerting + BURN_RATE_WINDOWS = [ + {'short': '5m', 'long': '1h', 'burn_rate': 14.4, 'budget_consumed': '2%'}, + {'short': 
'30m', 'long': '6h', 'burn_rate': 6, 'budget_consumed': '5%'}, + {'short': '2h', 'long': '1d', 'burn_rate': 3, 'budget_consumed': '10%'}, + {'short': '6h', 'long': '3d', 'burn_rate': 1, 'budget_consumed': '10%'} + ] + + # Service type specific SLI recommendations + SERVICE_TYPE_SLIS = { + 'api': ['availability', 'latency', 'error_rate', 'throughput'], + 'web': ['availability', 'latency', 'error_rate', 'page_load_time'], + 'database': ['availability', 'query_latency', 'connection_success_rate', 'replication_lag'], + 'queue': ['availability', 'message_processing_time', 'queue_depth', 'message_loss_rate'], + 'batch': ['job_success_rate', 'job_duration', 'data_freshness', 'resource_utilization'], + 'ml': ['model_accuracy', 'prediction_latency', 'training_success_rate', 'feature_freshness'] + } + + def __init__(self): + """Initialize the SLO Designer.""" + self.service_config = {} + self.slo_framework = {} + + def load_service_definition(self, file_path: str) -> Dict[str, Any]: + """Load service definition from JSON file.""" + try: + with open(file_path, 'r') as f: + return json.load(f) + except FileNotFoundError: + raise ValueError(f"Service definition file not found: {file_path}") + except json.JSONDecodeError as e: + raise ValueError(f"Invalid JSON in service definition: {e}") + + def create_service_definition(self, service_type: str, criticality: str, + user_facing: bool, name: str = None) -> Dict[str, Any]: + """Create a service definition from parameters.""" + return { + 'name': name or f'{service_type}_service', + 'type': service_type, + 'criticality': criticality, + 'user_facing': user_facing, + 'description': f'A {criticality} criticality {service_type} service', + 'dependencies': [], + 'team': 'platform', + 'environment': 'production' + } + + def generate_slis(self, service_def: Dict[str, Any]) -> List[Dict[str, Any]]: + """Generate Service Level Indicators based on service characteristics.""" + service_type = service_def.get('type', 'api') + base_slis = 
self.SERVICE_TYPE_SLIS.get(service_type, ['availability', 'latency', 'error_rate']) + + slis = [] + + for sli_name in base_slis: + sli = self._create_sli_definition(sli_name, service_def) + if sli: + slis.append(sli) + + # Add user-facing specific SLIs + if service_def.get('user_facing', False): + user_slis = self._generate_user_facing_slis(service_def) + slis.extend(user_slis) + + return slis + + def _create_sli_definition(self, sli_name: str, service_def: Dict[str, Any]) -> Dict[str, Any]: + """Create detailed SLI definition.""" + service_name = service_def.get('name', 'service') + + sli_definitions = { + 'availability': { + 'name': 'Availability', + 'description': 'Percentage of successful requests', + 'type': 'ratio', + 'good_events': f'sum(rate(http_requests_total{{service="{service_name}",code!~"5.."}}))', + 'total_events': f'sum(rate(http_requests_total{{service="{service_name}"}}))', + 'unit': 'percentage' + }, + 'latency': { + 'name': 'Request Latency P95', + 'description': '95th percentile of request latency', + 'type': 'threshold', + 'query': f'histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{{service="{service_name}"}}[5m]))', + 'unit': 'seconds' + }, + 'error_rate': { + 'name': 'Error Rate', + 'description': 'Rate of 5xx errors', + 'type': 'ratio', + 'good_events': f'sum(rate(http_requests_total{{service="{service_name}",code!~"5.."}}))', + 'total_events': f'sum(rate(http_requests_total{{service="{service_name}"}}))', + 'unit': 'percentage' + }, + 'throughput': { + 'name': 'Request Throughput', + 'description': 'Requests per second', + 'type': 'gauge', + 'query': f'sum(rate(http_requests_total{{service="{service_name}"}}[5m]))', + 'unit': 'requests/sec' + }, + 'page_load_time': { + 'name': 'Page Load Time P95', + 'description': '95th percentile of page load time', + 'type': 'threshold', + 'query': f'histogram_quantile(0.95, rate(page_load_duration_seconds_bucket{{service="{service_name}"}}[5m]))', + 'unit': 'seconds' + }, + 
'query_latency': { + 'name': 'Database Query Latency P95', + 'description': '95th percentile of database query latency', + 'type': 'threshold', + 'query': f'histogram_quantile(0.95, rate(db_query_duration_seconds_bucket{{service="{service_name}"}}[5m]))', + 'unit': 'seconds' + }, + 'connection_success_rate': { + 'name': 'Database Connection Success Rate', + 'description': 'Percentage of successful database connections', + 'type': 'ratio', + 'good_events': f'sum(rate(db_connections_total{{service="{service_name}",status="success"}}[5m]))', + 'total_events': f'sum(rate(db_connections_total{{service="{service_name}"}}[5m]))', + 'unit': 'percentage' + } + } + + return sli_definitions.get(sli_name) + + def _generate_user_facing_slis(self, service_def: Dict[str, Any]) -> List[Dict[str, Any]]: + """Generate additional SLIs for user-facing services.""" + service_name = service_def.get('name', 'service') + + return [ + { + 'name': 'User Journey Success Rate', + 'description': 'Percentage of successful complete user journeys', + 'type': 'ratio', + 'good_events': f'sum(rate(user_journey_total{{service="{service_name}",status="success"}}[5m]))', + 'total_events': f'sum(rate(user_journey_total{{service="{service_name}"}}[5m]))', + 'unit': 'percentage' + }, + { + 'name': 'Feature Availability', + 'description': 'Percentage of time key features are available', + 'type': 'ratio', + 'good_events': f'sum(rate(feature_checks_total{{service="{service_name}",status="available"}}[5m]))', + 'total_events': f'sum(rate(feature_checks_total{{service="{service_name}"}}[5m]))', + 'unit': 'percentage' + } + ] + + def generate_slos(self, service_def: Dict[str, Any], slis: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Generate Service Level Objectives based on service criticality.""" + criticality = service_def.get('criticality', 'medium') + targets = self.SLO_TARGETS.get(criticality, self.SLO_TARGETS['medium']) + + slos = [] + + for sli in slis: + slo = self._create_slo_from_sli(sli, 
targets, service_def) + if slo: + slos.append(slo) + + return slos + + def _create_slo_from_sli(self, sli: Dict[str, Any], targets: Dict[str, float], + service_def: Dict[str, Any]) -> Dict[str, Any]: + """Create SLO definition from SLI.""" + sli_name = sli['name'].lower().replace(' ', '_') + + # Map SLI names to target keys + target_mapping = { + 'availability': 'availability', + 'request_latency_p95': 'latency_p95', + 'error_rate': 'error_rate', + 'user_journey_success_rate': 'availability', + 'feature_availability': 'availability', + 'page_load_time_p95': 'latency_p95', + 'database_query_latency_p95': 'latency_p95', + 'database_connection_success_rate': 'availability' + } + + target_key = target_mapping.get(sli_name) + if not target_key: + return None + + target_value = targets.get(target_key) + if target_value is None: + return None + + # Determine comparison operator and format target + if 'latency' in sli_name or 'duration' in sli_name: + operator = '<=' + target_display = f"{target_value}ms" if target_value < 10 else f"{target_value/1000}s" + elif 'rate' in sli_name and 'error' in sli_name: + operator = '<=' + target_display = f"{target_value * 100}%" + target_value = target_value # Keep as decimal + else: + operator = '>=' + target_display = f"{target_value * 100}%" + + # Calculate time windows + time_windows = ['1h', '1d', '7d', '30d'] + + slo = { + 'name': f"{sli['name']} SLO", + 'description': f"Service level objective for {sli['description'].lower()}", + 'sli_name': sli['name'], + 'target_value': target_value, + 'target_display': target_display, + 'operator': operator, + 'time_windows': time_windows, + 'measurement_window': '30d', + 'service': service_def.get('name', 'service'), + 'criticality': service_def.get('criticality', 'medium') + } + + return slo + + def calculate_error_budgets(self, slos: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Calculate error budgets for SLOs.""" + error_budgets = [] + + for slo in slos: + if slo['operator'] == 
'>=': # Availability-type SLOs: the error budget is the tolerated failure fraction (1 - target)
                target = slo['target_value']
                error_budget_rate = 1 - target

                # Translate the budget rate into allowed "bad" time over a
                # fixed set of standard windows (label -> window in seconds).
                time_windows = {
                    '1h': 3600,
                    '1d': 86400,
                    '7d': 604800,
                    '30d': 2592000
                }

                # Render each window's budget in the most readable unit
                # (seconds / minutes / hours depending on magnitude).
                budgets = {}
                for window, seconds in time_windows.items():
                    budget_seconds = seconds * error_budget_rate
                    if budget_seconds < 60:
                        budgets[window] = f"{budget_seconds:.1f} seconds"
                    elif budget_seconds < 3600:
                        budgets[window] = f"{budget_seconds/60:.1f} minutes"
                    else:
                        budgets[window] = f"{budget_seconds/3600:.1f} hours"

                error_budget = {
                    'slo_name': slo['name'],
                    'error_budget_rate': error_budget_rate,
                    'error_budget_percentage': f"{error_budget_rate * 100:.3f}%",
                    'budgets_by_window': budgets,
                    'burn_rate_alerts': self._generate_burn_rate_alerts(slo, error_budget_rate)
                }

                error_budgets.append(error_budget)

        # Only '>='-operator (availability-style) SLOs contribute budgets;
        # other operators fall through without an entry.
        return error_budgets

    def _generate_burn_rate_alerts(self, slo: Dict[str, Any], error_budget_rate: float) -> List[Dict[str, Any]]:
        """Generate multi-window burn rate alerts for one SLO.

        Emits one alert dict per entry in ``self.BURN_RATE_WINDOWS``
        (defined elsewhere in this class); each entry is expected to carry
        'short', 'long', 'burn_rate' and a 'budget_consumed' percentage
        string such as '2%' -- TODO confirm against the class constant.
        ``error_budget_rate`` is currently not used directly here.
        """
        alerts = []
        # NOTE(review): service_name is assigned but never used in this
        # method -- candidate for removal.
        service_name = slo['service']
        sli_query = self._get_sli_query_for_burn_rate(slo)

        for window_config in self.BURN_RATE_WINDOWS:
            alert = {
                'name': f"{slo['sli_name']} Burn Rate {window_config['budget_consumed']} Alert",
                'description': f"Alert when {slo['sli_name']} is consuming error budget at {window_config['burn_rate']}x rate",
                # Severity is keyed off the budget-consumed percentage
                # (e.g. '2%' -> 2.0) rather than the burn-rate multiplier.
                'severity': self._determine_alert_severity(float(window_config['budget_consumed'].rstrip('%'))),
                'short_window': window_config['short'],
                'long_window': window_config['long'],
                'burn_rate_threshold': window_config['burn_rate'],
                'budget_consumed': window_config['budget_consumed'],
                # NOTE(review): appending '_short'/'_long' to a full PromQL
                # expression does not produce valid PromQL; presumably a
                # later templating step substitutes the real range windows
                # -- confirm before shipping these rules.
                'condition': f"({sli_query}_short > {window_config['burn_rate']}) and ({sli_query}_long > {window_config['burn_rate']})",
                'annotations': {
                    'summary': f"High burn rate detected for {slo['sli_name']}",
                    'description': f"Error budget consumption rate is {window_config['burn_rate']}x normal, will exhaust {window_config['budget_consumed']} of monthly budget"
                }
            }
            alerts.append(alert)

        return alerts

    def _get_sli_query_for_burn_rate(self, slo: Dict[str, Any]) -> str:
        """Generate SLI query fragment for burn rate calculation.

        Picks a Prometheus-style expression by keyword-matching the SLI
        name: availability/success SLIs get a failure-ratio expression,
        error SLIs get an error-ratio expression, anything else falls back
        to a synthetic recording-rule name.

        NOTE(review): the rate() calls carry no range selector (e.g.
        ``[5m]``), so these fragments are templates, not runnable PromQL
        -- confirm how they are consumed downstream.
        """
        service_name = slo['service']
        sli_name = slo['sli_name'].lower().replace(' ', '_')

        if 'availability' in sli_name or 'success' in sli_name:
            # Failure ratio: 1 - (non-5xx requests / all requests).
            return f"(1 - (sum(rate(http_requests_total{{service='{service_name}',code!~'5..'}})) / sum(rate(http_requests_total{{service='{service_name}'}}))))"
        elif 'error' in sli_name:
            # Error ratio: 5xx requests / all requests.
            return f"(sum(rate(http_requests_total{{service='{service_name}',code=~'5..'}})) / sum(rate(http_requests_total{{service='{service_name}'}})))"
        else:
            # Fallback: assume a recording rule named after the SLI exists.
            return f"sli_burn_rate_{sli_name}"

    def _determine_alert_severity(self, budget_consumed_percent: float) -> str:
        """Determine alert severity based on budget consumption rate.

        A smaller budget-consumed percentage paired with (presumably) a
        shorter window means a faster burn, hence the higher severity for
        the lower thresholds -- consistent with the multi-window burn-rate
        convention, but verify against BURN_RATE_WINDOWS.
        """
        if budget_consumed_percent <= 2:
            return 'critical'
        elif budget_consumed_percent <= 5:
            return 'warning'
        else:
            return 'info'

    def generate_sla_recommendations(self, service_def: Dict[str, Any],
                                   slos: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Generate SLA recommendations for customer-facing services.

        Returns ``{'applicable': False, ...}`` for non-user-facing
        services; otherwise builds commitments (only for availability-style
        '>=' SLOs) plus a criticality-based penalty structure.
        """
        if not service_def.get('user_facing', False):
            return {
                'applicable': False,
                'reason': 'SLA not recommended for non-user-facing services'
            }

        criticality = service_def.get('criticality', 'medium')

        # SLA targets should be more conservative than SLO targets.
        # This is an absolute buffer of 0.1 percentage points (0.001 on a
        # 0..1 target scale), not a relative 0.1%.
        sla_buffer = 0.001 # 0.1% buffer below SLO

        sla_recommendations = {
            'applicable': True,
            'service': service_def.get('name'),
            'commitments': [],
            'penalties': self._generate_penalty_structure(criticality),
            'measurement_methodology': 'External synthetic monitoring from multiple geographic locations',
            'exclusions': [
                'Planned maintenance windows (with 72h advance notice)',
                'Customer-side network or infrastructure issues',
                'Force majeure events',
                'Third-party service dependencies beyond our control'
            ]
        }

        for slo in slos:
            # Only availability-style objectives become contractual
            # commitments; latency/other SLOs are intentionally excluded.
            if slo['operator'] == '>=' and 'availability' in slo['sli_name'].lower():
                # Never recommend committing below a 90% floor.
                sla_target = max(0.9, slo['target_value'] - sla_buffer)
                commitment = {
                    'metric': slo['sli_name'],
                    'target': sla_target,
                    'target_display': f"{sla_target * 100:.2f}%",
                    'measurement_window': 'monthly',
                    'measurement_method': 'Uptime monitoring with 1-minute granularity'
                }
                sla_recommendations['commitments'].append(commitment)

        return sla_recommendations

    def _generate_penalty_structure(self, criticality: str) -> List[Dict[str, Any]]:
        """Generate penalty structure based on service criticality.

        Returns a list of breach tiers (threshold + service-credit
        percentage); 'low' criticality and unknown values yield no tiers.
        """
        penalty_structures = {
            'critical': [
                {'breach_threshold': '< 99.99%', 'credit_percentage': 10},
                {'breach_threshold': '< 99.9%', 'credit_percentage': 25},
                {'breach_threshold': '< 99%', 'credit_percentage': 50}
            ],
            'high': [
                {'breach_threshold': '< 99.9%', 'credit_percentage': 10},
                {'breach_threshold': '< 99.5%', 'credit_percentage': 25}
            ],
            'medium': [
                {'breach_threshold': '< 99.5%', 'credit_percentage': 10}
            ],
            'low': []
        }

        return penalty_structures.get(criticality, [])

    def generate_framework(self, service_def: Dict[str, Any]) -> Dict[str, Any]:
        """Generate complete SLO framework.

        Orchestrates the full pipeline: SLIs -> SLOs -> error budgets ->
        SLA recommendations, then bundles everything with monitoring
        recommendations and an implementation guide.
        """
        # Generate SLIs
        slis = self.generate_slis(service_def)

        # Generate SLOs
        slos = self.generate_slos(service_def, slis)

        # Calculate error budgets
        error_budgets = self.calculate_error_budgets(slos)

        # Generate SLA recommendations
        sla_recommendations = self.generate_sla_recommendations(service_def, slos)

        # Create comprehensive framework
        framework = {
            'metadata': {
                'service': service_def,
                # TODO(review): datetime.utcnow() is deprecated since
                # Python 3.12; prefer datetime.now(timezone.utc).
                'generated_at': datetime.utcnow().isoformat() + 'Z',
                'framework_version': '1.0'
            },
            'slis': slis,
            'slos': slos,
            'error_budgets': error_budgets,
            'sla_recommendations': sla_recommendations,
            'monitoring_recommendations': self._generate_monitoring_recommendations(service_def),
            'implementation_guide': self._generate_implementation_guide(service_def, slis, slos)
        }

        return framework

    def _generate_monitoring_recommendations(self, service_def: Dict[str, Any]) -> Dict[str, Any]:
        """Generate monitoring tool recommendations.

        Base recommendations cover metrics, logging and tracing; 'web'
        services additionally get a synthetic-monitoring section.
        """
        service_type = service_def.get('type', 'api')

        recommendations = {
            'metrics': {
                'collection': 'Prometheus with service discovery',
                'retention': '90 days for raw metrics, 1 year for aggregated',
                'alerting': 'Prometheus Alertmanager with multi-window burn rate alerts'
            },
            'logging': {
                'format': 'Structured JSON logs with correlation IDs',
                'aggregation': 'ELK stack or equivalent with proper indexing',
                'retention': '30 days for debug logs, 90 days for error logs'
            },
            'tracing': {
                'sampling': 'Adaptive sampling with 1% base rate',
                'storage': 'Jaeger or Zipkin with 7-day retention',
                'integration': 'OpenTelemetry instrumentation'
            }
        }

        if service_type == 'web':
            recommendations['synthetic_monitoring'] = {
                'frequency': 'Every 1 minute from 3+ geographic locations',
                'checks': 'Full user journey simulation',
                'tools': 'Pingdom, DataDog Synthetics, or equivalent'
            }

        return recommendations

    def _generate_implementation_guide(self, service_def: Dict[str, Any],
                                     slis: List[Dict[str, Any]],
                                     slos: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Generate implementation guide for the SLO framework.

        NOTE: the returned guide is currently static -- the service_def,
        slis and slos arguments are accepted but not used.
        """
        return {
            'prerequisites': [
                'Service instrumented with metrics collection (Prometheus format)',
                'Structured logging with correlation IDs',
                'Monitoring infrastructure (Prometheus, Grafana, Alertmanager)',
                'Incident response processes and escalation policies'
            ],
            'implementation_steps': [
                {
                    'step': 1,
                    'title': 'Instrument Service',
                    'description': 'Add metrics collection for all defined SLIs',
                    'estimated_effort': '1-2 days'
                },
                {
                    'step': 2,
                    'title': 'Configure Recording Rules',
                    'description': 'Set up Prometheus recording rules for SLI calculations',
                    'estimated_effort': '4-8 hours'
                },
                {
                    'step': 3,
                    'title': 'Implement Burn Rate Alerts',
                    'description': 'Configure multi-window burn rate alerting rules',
                    'estimated_effort': '1 day'
                },
                {
                    'step': 4,
                    'title': 'Create SLO Dashboard',
                    'description': 'Build Grafana dashboard for SLO tracking and error budget monitoring',
                    'estimated_effort': '4-6 hours'
                },
                {
                    'step': 5,
                    'title': 'Test and Validate',
                    'description': 'Test alerting and validate SLI measurements against expectations',
                    'estimated_effort': '1-2 days'
                },
                {
                    'step': 6,
                    'title': 'Documentation and Training',
                    'description': 'Document runbooks and train team on SLO monitoring',
                    'estimated_effort': '1 day'
                }
            ],
            'validation_checklist': [
                'All SLIs produce expected metric values',
                'Burn rate alerts fire correctly during simulated outages',
                'Error budget calculations match manual verification',
                'Dashboard displays accurate SLO achievement rates',
                'Alert routing reaches correct escalation paths',
                'Runbooks are complete and tested'
            ]
        }

    def export_json(self, framework: Dict[str, Any], output_file: str) -> None:
        """Export framework as JSON (overwrites output_file if it exists)."""
        with open(output_file, 'w') as f:
            json.dump(framework, f, indent=2)

    def print_summary(self, framework: Dict[str, Any]) -> None:
        """Print human-readable summary of the SLO framework to stdout."""
        service = framework['metadata']['service']
        slis = framework['slis']
        slos = framework['slos']
        error_budgets = framework['error_budgets']

        print(f"\n{'='*60}")
        print(f"SLO FRAMEWORK SUMMARY FOR {service['name'].upper()}")
        print(f"{'='*60}")

        print(f"\nService Details:")
        print(f"  Type: {service['type']}")
        print(f"  Criticality: {service['criticality']}")
        print(f"  User Facing: {'Yes' if service.get('user_facing') else 'No'}")
        print(f"  Team: {service.get('team', 'Unknown')}")

        print(f"\nService Level Indicators ({len(slis)}):")
        for i, sli in enumerate(slis, 1):
            print(f"  {i}. {sli['name']}")
            print(f"     Description: {sli['description']}")
            print(f"     Type: {sli['type']}")
            print()

        print(f"Service Level Objectives ({len(slos)}):")
        for i, slo in enumerate(slos, 1):
            print(f"  {i}. {slo['name']}")
            print(f"     Target: {slo['target_display']}")
            print(f"     Measurement Window: {slo['measurement_window']}")
            print()

        print(f"Error Budget Summary:")
        for budget in error_budgets:
            print(f"  {budget['slo_name']}:")
            print(f"    Monthly Budget: {budget['error_budget_percentage']}")
            print(f"    Burn Rate Alerts: {len(budget['burn_rate_alerts'])}")
            print()

        sla = framework['sla_recommendations']
        if sla['applicable']:
            print(f"SLA Recommendations:")
            print(f"  Commitments: {len(sla['commitments'])}")
            print(f"  Penalty Tiers: {len(sla['penalties'])}")
        else:
            print(f"SLA Recommendations: {sla['reason']}")

        print(f"\nImplementation Timeline: 1-2 weeks")
        print(f"Framework generated at: {framework['metadata']['generated_at']}")
        print(f"{'='*60}\n")


def main() -> None:
    """Main function for CLI usage.

    Accepts either an --input service-definition file or the trio of
    --service-type / --criticality / --user-facing flags; generates the
    framework, optionally saves it as JSON, and always prints a summary.
    Exits with status 1 on any error.
    """
    parser = argparse.ArgumentParser(
        description='Generate comprehensive SLO frameworks for services',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Generate from service definition file
  python slo_designer.py --input service.json --output framework.json

  # Generate from command line parameters
  python slo_designer.py --service-type api --criticality high --user-facing true --output framework.json

  # Generate and display summary only
  python slo_designer.py --service-type web --criticality critical --user-facing true --summary-only
        """
    )

    parser.add_argument('--input', '-i',
                       help='Input service definition JSON file')
    parser.add_argument('--output', '-o',
                       help='Output framework JSON file')
    parser.add_argument('--service-type',
                       choices=['api', 'web', 'database', 'queue', 'batch', 'ml'],
                       help='Service type')
    parser.add_argument('--criticality',
                       choices=['critical', 'high', 'medium', 'low'],
                       help='Service criticality level')
    # String choices (not store_true) so the flag can be passed explicitly
    # as 'true'/'false'; converted to bool below.
    parser.add_argument('--user-facing',
                       choices=['true', 'false'],
                       help='Whether service is user-facing')
    parser.add_argument('--service-name',
                       help='Service name')
    parser.add_argument('--summary-only', action='store_true',
                       help='Only display summary, do not save JSON')

    args = parser.parse_args()

    # Either a definition file or the full CLI trio is required.
    if not args.input and not (args.service_type and args.criticality and args.user_facing):
        parser.error("Must provide either --input file or --service-type, --criticality, and --user-facing")

    designer = SLODesigner()

    try:
        # Load or create service definition
        if args.input:
            service_def = designer.load_service_definition(args.input)
        else:
            user_facing = args.user_facing.lower() == 'true'
            service_def = designer.create_service_definition(
                args.service_type, args.criticality, user_facing, args.service_name
            )

        # Generate framework
        framework = designer.generate_framework(service_def)

        # Output results
        if not args.summary_only:
            # Default output filename is derived from the service name.
            output_file = args.output or f"{service_def['name']}_slo_framework.json"
            designer.export_json(framework, output_file)
            print(f"SLO framework saved to: {output_file}")

        # Always show summary
        designer.print_summary(framework)

    except Exception as e:
        # Broad catch is acceptable at this top-level CLI boundary: report
        # and exit non-zero rather than dump a traceback at the user.
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == '__main__':
    main()