From 52732f7e2bbd58b2b98d8c418f42697a20c92190 Mon Sep 17 00:00:00 2001 From: Leo Date: Mon, 16 Feb 2026 14:03:12 +0000 Subject: [PATCH] feat: add observability-designer POWERFUL-tier skill - SLO Designer: generates comprehensive SLI/SLO frameworks with error budgets and burn rate alerts - Alert Optimizer: analyzes and optimizes alert configurations to reduce noise and improve effectiveness - Dashboard Generator: creates role-based dashboard specifications with golden signals coverage Includes comprehensive documentation, sample data, and expected outputs for testing. --- engineering/observability-designer/README.md | 384 ++++++ engineering/observability-designer/SKILL.md | 264 ++++ .../assets/sample_alerts.json | 276 ++++ .../assets/sample_service_api.json | 83 ++ .../assets/sample_service_web.json | 113 ++ .../expected_outputs/sample_dashboard.json | 811 +++++++++++ .../sample_slo_framework.json | 545 ++++++++ .../references/alert_design_patterns.md | 469 +++++++ .../references/dashboard_best_practices.md | 571 ++++++++ .../references/slo_cookbook.md | 329 +++++ .../scripts/alert_optimizer.py | 1059 ++++++++++++++ .../scripts/dashboard_generator.py | 1219 +++++++++++++++++ .../scripts/slo_designer.py | 670 +++++++++ 13 files changed, 6793 insertions(+) create mode 100644 engineering/observability-designer/README.md create mode 100644 engineering/observability-designer/SKILL.md create mode 100644 engineering/observability-designer/assets/sample_alerts.json create mode 100644 engineering/observability-designer/assets/sample_service_api.json create mode 100644 engineering/observability-designer/assets/sample_service_web.json create mode 100644 engineering/observability-designer/expected_outputs/sample_dashboard.json create mode 100644 engineering/observability-designer/expected_outputs/sample_slo_framework.json create mode 100644 engineering/observability-designer/references/alert_design_patterns.md create mode 100644 
engineering/observability-designer/references/dashboard_best_practices.md create mode 100644 engineering/observability-designer/references/slo_cookbook.md create mode 100644 engineering/observability-designer/scripts/alert_optimizer.py create mode 100644 engineering/observability-designer/scripts/dashboard_generator.py create mode 100644 engineering/observability-designer/scripts/slo_designer.py diff --git a/engineering/observability-designer/README.md b/engineering/observability-designer/README.md new file mode 100644 index 0000000..d6a1753 --- /dev/null +++ b/engineering/observability-designer/README.md @@ -0,0 +1,384 @@ +# Observability Designer + +A comprehensive toolkit for designing production-ready observability strategies including SLI/SLO frameworks, alert optimization, and dashboard generation. + +## Overview + +The Observability Designer skill provides three powerful Python scripts that help you create, optimize, and maintain observability systems: + +- **SLO Designer**: Generate complete SLI/SLO frameworks with error budgets and burn rate alerts +- **Alert Optimizer**: Analyze and optimize existing alert configurations to reduce noise and improve effectiveness +- **Dashboard Generator**: Create comprehensive dashboard specifications with role-based layouts and drill-down paths + +## Quick Start + +### Prerequisites + +- Python 3.7+ +- No external dependencies required (uses Python standard library only) + +### Basic Usage + +```bash +# Generate SLO framework for a service +python3 scripts/slo_designer.py --service-type api --criticality critical --user-facing true --service-name payment-service + +# Optimize existing alerts +python3 scripts/alert_optimizer.py --input assets/sample_alerts.json --analyze-only + +# Generate a dashboard specification +python3 scripts/dashboard_generator.py --service-type web --name "Customer Portal" --role sre +``` + +## Scripts Documentation + +### SLO Designer (`slo_designer.py`) + +Generates comprehensive SLO frameworks 
based on service characteristics. + +#### Features +- **Automatic SLI Selection**: Recommends appropriate SLIs based on service type +- **Target Setting**: Suggests SLO targets based on service criticality +- **Error Budget Calculation**: Computes error budgets and burn rate thresholds +- **Multi-Window Burn Rate Alerts**: Generates 4-window burn rate alerting rules +- **SLA Recommendations**: Provides customer-facing SLA guidance + +#### Usage Examples + +```bash +# From service definition file +python3 scripts/slo_designer.py --input assets/sample_service_api.json --output slo_framework.json + +# From command line parameters +python3 scripts/slo_designer.py \ + --service-type api \ + --criticality critical \ + --user-facing true \ + --service-name payment-service \ + --output payment_slos.json + +# Generate and display summary only +python3 scripts/slo_designer.py --input assets/sample_service_web.json --summary-only +``` + +#### Service Definition Format + +```json +{ + "name": "payment-service", + "type": "api", + "criticality": "critical", + "user_facing": true, + "description": "Handles payment processing", + "team": "payments", + "environment": "production", + "dependencies": [ + { + "name": "user-service", + "type": "api", + "criticality": "high" + } + ] +} +``` + +#### Supported Service Types +- **api**: REST APIs, GraphQL services +- **web**: Web applications, SPAs +- **database**: Database services, data stores +- **queue**: Message queues, event streams +- **batch**: Batch processing jobs +- **ml**: Machine learning services + +#### Criticality Levels +- **critical**: 99.99% availability, <100ms P95 latency, <0.1% error rate +- **high**: 99.9% availability, <200ms P95 latency, <0.5% error rate +- **medium**: 99.5% availability, <500ms P95 latency, <1% error rate +- **low**: 99% availability, <1s P95 latency, <2% error rate + +### Alert Optimizer (`alert_optimizer.py`) + +Analyzes existing alert configurations and provides optimization recommendations. 
+ +#### Features +- **Noise Detection**: Identifies alerts with high false positive rates +- **Coverage Analysis**: Finds gaps in monitoring coverage +- **Duplicate Detection**: Locates redundant or overlapping alerts +- **Threshold Analysis**: Reviews alert thresholds for appropriateness +- **Fatigue Assessment**: Evaluates alert volume and routing + +#### Usage Examples + +```bash +# Analyze existing alerts +python3 scripts/alert_optimizer.py --input assets/sample_alerts.json --analyze-only + +# Generate optimized configuration +python3 scripts/alert_optimizer.py \ + --input assets/sample_alerts.json \ + --output optimized_alerts.json + +# Generate HTML report +python3 scripts/alert_optimizer.py \ + --input assets/sample_alerts.json \ + --report alert_analysis.html \ + --format html +``` + +#### Alert Configuration Format + +```json +{ + "alerts": [ + { + "alert": "HighLatency", + "expr": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 0.5", + "for": "5m", + "labels": { + "severity": "warning", + "service": "payment-service" + }, + "annotations": { + "summary": "High request latency detected", + "runbook_url": "https://runbooks.company.com/high-latency" + }, + "historical_data": { + "fires_per_day": 2.5, + "false_positive_rate": 0.15 + } + } + ], + "services": [ + { + "name": "payment-service", + "criticality": "critical" + } + ] +} +``` + +#### Analysis Categories +- **Golden Signals**: Latency, traffic, errors, saturation +- **Resource Utilization**: CPU, memory, disk, network +- **Business Metrics**: Revenue, conversion, user engagement +- **Security**: Auth failures, suspicious activity +- **Availability**: Uptime, health checks + +### Dashboard Generator (`dashboard_generator.py`) + +Creates comprehensive dashboard specifications with role-based optimization. 
+ +#### Features +- **Role-Based Layouts**: Optimized for SRE, Developer, Executive, and Ops personas +- **Golden Signals Coverage**: Automatic inclusion of key monitoring metrics +- **Service-Type Specific Panels**: Tailored panels based on service characteristics +- **Interactive Elements**: Template variables, drill-down paths, time range controls +- **Grafana Compatibility**: Generates Grafana-compatible JSON + +#### Usage Examples + +```bash +# From service definition +python3 scripts/dashboard_generator.py \ + --input assets/sample_service_web.json \ + --output dashboard.json + +# With specific role optimization +python3 scripts/dashboard_generator.py \ + --service-type api \ + --name "Payment Service" \ + --role developer \ + --output payment_dev_dashboard.json + +# Generate Grafana-compatible JSON +python3 scripts/dashboard_generator.py \ + --input assets/sample_service_api.json \ + --output dashboard.json \ + --format grafana + +# With documentation +python3 scripts/dashboard_generator.py \ + --service-type web \ + --name "Customer Portal" \ + --output portal_dashboard.json \ + --doc-output portal_docs.md +``` + +#### Target Roles + +- **sre**: Focus on availability, latency, errors, resource utilization +- **developer**: Emphasize latency, errors, throughput, business metrics +- **executive**: Highlight availability, business metrics, user experience +- **ops**: Priority on resource utilization, capacity, alerts, deployments + +#### Panel Types +- **Stat**: Single value displays with thresholds +- **Gauge**: Resource utilization and capacity metrics +- **Timeseries**: Trend analysis and historical data +- **Table**: Top N lists and detailed breakdowns +- **Heatmap**: Distribution and correlation analysis + +## Sample Data + +The `assets/` directory contains sample configurations for testing: + +- `sample_service_api.json`: Critical API service definition +- `sample_service_web.json`: High-priority web application definition +- `sample_alerts.json`: Alert 
configuration with optimization opportunities + +The `expected_outputs/` directory shows example outputs from each script: + +- `sample_slo_framework.json`: Complete SLO framework for API service +- `optimized_alerts.json`: Optimized alert configuration +- `sample_dashboard.json`: SRE dashboard specification + +## Best Practices + +### SLO Design +- Start with 1-2 SLOs per service and iterate +- Choose SLIs that directly impact user experience +- Set targets based on user needs, not technical capabilities +- Use error budgets to balance reliability and velocity + +### Alert Optimization +- Every alert must be actionable +- Alert on symptoms, not causes +- Use multi-window burn rate alerts for SLO protection +- Implement proper escalation and routing policies + +### Dashboard Design +- Follow the F-pattern for visual hierarchy +- Use consistent color semantics across dashboards +- Include drill-down paths for effective troubleshooting +- Optimize for the target role's specific needs + +## Integration Patterns + +### CI/CD Integration +```bash +# Generate SLOs during service onboarding +python3 scripts/slo_designer.py --input service-config.json --output slos.json + +# Validate alert configurations in pipeline +python3 scripts/alert_optimizer.py --input alerts.json --analyze-only --report validation.html + +# Auto-generate dashboards for new services +python3 scripts/dashboard_generator.py --input service-config.json --format grafana --output dashboard.json +``` + +### Monitoring Stack Integration +- **Prometheus**: Generated alert rules and recording rules +- **Grafana**: Dashboard JSON for direct import +- **Alertmanager**: Routing and escalation policies +- **PagerDuty**: Escalation configuration + +### GitOps Workflow +1. Store service definitions in version control +2. Generate observability configurations in CI/CD +3. Deploy configurations via GitOps +4. 
Monitor effectiveness and iterate + +## Advanced Usage + +### Custom SLO Targets +Override default targets by including them in service definitions: + +```json +{ + "name": "special-service", + "type": "api", + "criticality": "high", + "custom_slos": { + "availability_target": 0.9995, + "latency_p95_target_ms": 150, + "error_rate_target": 0.002 + } +} +``` + +### Alert Rule Templates +Use template variables for reusable alert rules: + +```yaml +# Generated Prometheus alert rule +- alert: {{ service_name }}_HighLatency + expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{service="{{ service_name }}"}[5m])) > {{ latency_threshold }} + for: 5m + labels: + severity: warning + service: "{{ service_name }}" +``` + +### Dashboard Variants +Generate multiple dashboard variants for different use cases: + +```bash +# SRE operational dashboard +python3 scripts/dashboard_generator.py --input service.json --role sre --output sre-dashboard.json + +# Developer debugging dashboard +python3 scripts/dashboard_generator.py --input service.json --role developer --output dev-dashboard.json + +# Executive business dashboard +python3 scripts/dashboard_generator.py --input service.json --role executive --output exec-dashboard.json +``` + +## Troubleshooting + +### Common Issues + +#### Script Execution Errors +- Ensure Python 3.7+ is installed +- Check file paths and permissions +- Validate JSON syntax in input files + +#### Invalid Service Definitions +- Required fields: `name`, `type`, `criticality` +- Valid service types: `api`, `web`, `database`, `queue`, `batch`, `ml` +- Valid criticality levels: `critical`, `high`, `medium`, `low` + +#### Missing Historical Data +- Alert historical data is optional but improves analysis +- Include `fires_per_day` and `false_positive_rate` when available +- Use monitoring system APIs to populate historical metrics + +### Debug Mode +Enable verbose logging by setting environment variable: + +```bash +export DEBUG=1 +python3 
scripts/slo_designer.py --input service.json +``` + +## Contributing + +### Development Setup +```bash +# Clone the repository +git clone <repository-url> +cd engineering/observability-designer + +# Run tests +python3 -m pytest tests/ + +# Lint code +python3 -m flake8 scripts/ +``` + +### Adding New Features +1. Follow existing code patterns and error handling +2. Include comprehensive docstrings and type hints +3. Add test cases for new functionality +4. Update documentation and examples + +## Support + +For questions, issues, or feature requests: +- Check existing documentation and examples +- Review the reference materials in `references/` +- Open an issue with detailed reproduction steps +- Include sample configurations when reporting bugs + +--- + +*This skill is part of the Claude Skills marketplace. For more information about observability best practices, see the reference documentation in the `references/` directory.* \ No newline at end of file diff --git a/engineering/observability-designer/SKILL.md b/engineering/observability-designer/SKILL.md new file mode 100644 index 0000000..1cb45d5 --- /dev/null +++ b/engineering/observability-designer/SKILL.md @@ -0,0 +1,264 @@ +# Observability Designer (POWERFUL) + +**Category:** Engineering +**Tier:** POWERFUL +**Description:** Design comprehensive observability strategies for production systems including SLI/SLO frameworks, alerting optimization, and dashboard generation. + +## Overview + +Observability Designer enables you to create production-ready observability strategies that provide deep insights into system behavior, performance, and reliability. This skill combines the three pillars of observability (metrics, logs, traces) with proven frameworks like SLI/SLO design, golden signals monitoring, and alert optimization to create comprehensive observability solutions.
+ +## Core Competencies + +### SLI/SLO/SLA Framework Design +- **Service Level Indicators (SLI):** Define measurable signals that indicate service health +- **Service Level Objectives (SLO):** Set reliability targets based on user experience +- **Service Level Agreements (SLA):** Establish customer-facing commitments with consequences +- **Error Budget Management:** Calculate and track error budget consumption +- **Burn Rate Alerting:** Multi-window burn rate alerts for proactive SLO protection + +### Three Pillars of Observability + +#### Metrics +- **Golden Signals:** Latency, traffic, errors, and saturation monitoring +- **RED Method:** Rate, Errors, and Duration for request-driven services +- **USE Method:** Utilization, Saturation, and Errors for resource monitoring +- **Business Metrics:** Revenue, user engagement, and feature adoption tracking +- **Infrastructure Metrics:** CPU, memory, disk, network, and custom resource metrics + +#### Logs +- **Structured Logging:** JSON-based log formats with consistent fields +- **Log Aggregation:** Centralized log collection and indexing strategies +- **Log Levels:** Appropriate use of DEBUG, INFO, WARN, ERROR, FATAL levels +- **Correlation IDs:** Request tracing through distributed systems +- **Log Sampling:** Volume management for high-throughput systems + +#### Traces +- **Distributed Tracing:** End-to-end request flow visualization +- **Span Design:** Meaningful span boundaries and metadata +- **Trace Sampling:** Intelligent sampling strategies for performance and cost +- **Service Maps:** Automatic dependency discovery through traces +- **Root Cause Analysis:** Trace-driven debugging workflows + +### Dashboard Design Principles + +#### Information Architecture +- **Hierarchy:** Overview → Service → Component → Instance drill-down paths +- **Golden Ratio:** 80% operational metrics, 20% exploratory metrics +- **Cognitive Load:** Maximum 7±2 panels per dashboard screen +- **User Journey:** Role-based dashboard 
personas (SRE, Developer, Executive) + +#### Visualization Best Practices +- **Chart Selection:** Time series for trends, heatmaps for distributions, gauges for status +- **Color Theory:** Red for critical, amber for warning, green for healthy states +- **Reference Lines:** SLO targets, capacity thresholds, and historical baselines +- **Time Ranges:** Default to meaningful windows (4h for incidents, 7d for trends) + +#### Panel Design +- **Metric Queries:** Efficient Prometheus/InfluxDB queries with proper aggregation +- **Alerting Integration:** Visual alert state indicators on relevant panels +- **Interactive Elements:** Template variables, drill-down links, and annotation overlays +- **Performance:** Sub-second render times through query optimization + +### Alert Design and Optimization + +#### Alert Classification +- **Severity Levels:** + - **Critical:** Service down, SLO burn rate high + - **Warning:** Approaching thresholds, non-user-facing issues + - **Info:** Deployment notifications, capacity planning alerts +- **Actionability:** Every alert must have a clear response action +- **Alert Routing:** Escalation policies based on severity and team ownership + +#### Alert Fatigue Prevention +- **Signal vs Noise:** High precision (few false positives) over high recall +- **Hysteresis:** Different thresholds for firing and resolving alerts +- **Suppression:** Dependent alert suppression during known outages +- **Grouping:** Related alerts grouped into single notifications + +#### Alert Rule Design +- **Threshold Selection:** Statistical methods for threshold determination +- **Window Functions:** Appropriate averaging windows and percentile calculations +- **Alert Lifecycle:** Clear firing conditions and automatic resolution criteria +- **Testing:** Alert rule validation against historical data + +### Runbook Generation and Incident Response + +#### Runbook Structure +- **Alert Context:** What the alert means and why it fired +- **Impact Assessment:** User-facing 
vs internal impact evaluation +- **Investigation Steps:** Ordered troubleshooting procedures with time estimates +- **Resolution Actions:** Common fixes and escalation procedures +- **Post-Incident:** Follow-up tasks and prevention measures + +#### Incident Detection Patterns +- **Anomaly Detection:** Statistical methods for detecting unusual patterns +- **Composite Alerts:** Multi-signal alerts for complex failure modes +- **Predictive Alerts:** Capacity and trend-based forward-looking alerts +- **Canary Monitoring:** Early detection through progressive deployment monitoring + +### Golden Signals Framework + +#### Latency Monitoring +- **Request Latency:** P50, P95, P99 response time tracking +- **Queue Latency:** Time spent waiting in processing queues +- **Network Latency:** Inter-service communication delays +- **Database Latency:** Query execution and connection pool metrics + +#### Traffic Monitoring +- **Request Rate:** Requests per second with burst detection +- **Bandwidth Usage:** Network throughput and capacity utilization +- **User Sessions:** Active user tracking and session duration +- **Feature Usage:** API endpoint and feature adoption metrics + +#### Error Monitoring +- **Error Rate:** 4xx and 5xx HTTP response code tracking +- **Error Budget:** SLO-based error rate targets and consumption +- **Error Distribution:** Error type classification and trending +- **Silent Failures:** Detection of processing failures without HTTP errors + +#### Saturation Monitoring +- **Resource Utilization:** CPU, memory, disk, and network usage +- **Queue Depth:** Processing queue length and wait times +- **Connection Pools:** Database and service connection saturation +- **Rate Limiting:** API throttling and quota exhaustion tracking + +### Distributed Tracing Strategies + +#### Trace Architecture +- **Sampling Strategy:** Head-based, tail-based, and adaptive sampling +- **Trace Propagation:** Context propagation across service boundaries +- **Span Correlation:** 
Parent-child relationship modeling +- **Trace Storage:** Retention policies and storage optimization + +#### Service Instrumentation +- **Auto-Instrumentation:** Framework-based automatic trace generation +- **Manual Instrumentation:** Custom span creation for business logic +- **Baggage Handling:** Cross-cutting concern propagation +- **Performance Impact:** Instrumentation overhead measurement and optimization + +### Log Aggregation Patterns + +#### Collection Architecture +- **Agent Deployment:** Log shipping agent strategies (push vs pull) +- **Log Routing:** Topic-based routing and filtering +- **Parsing Strategies:** Structured vs unstructured log handling +- **Schema Evolution:** Log format versioning and migration + +#### Storage and Indexing +- **Index Design:** Optimized field indexing for common query patterns +- **Retention Policies:** Time and volume-based log retention +- **Compression:** Log data compression and archival strategies +- **Search Performance:** Query optimization and result caching + +### Cost Optimization for Observability + +#### Data Management +- **Metric Retention:** Tiered retention based on metric importance +- **Log Sampling:** Intelligent sampling to reduce ingestion costs +- **Trace Sampling:** Cost-effective trace collection strategies +- **Data Archival:** Cold storage for historical observability data + +#### Resource Optimization +- **Query Efficiency:** Optimized metric and log queries +- **Storage Costs:** Appropriate storage tiers for different data types +- **Ingestion Rate Limiting:** Controlled data ingestion to manage costs +- **Cardinality Management:** High-cardinality metric detection and mitigation + +## Scripts Overview + +This skill includes three powerful Python scripts for comprehensive observability design: + +### 1. 
SLO Designer (`slo_designer.py`) +Generates complete SLI/SLO frameworks based on service characteristics: +- **Input:** Service description JSON (type, criticality, dependencies) +- **Output:** SLI definitions, SLO targets, error budgets, burn rate alerts, SLA recommendations +- **Features:** Multi-window burn rate calculations, error budget policies, alert rule generation + +### 2. Alert Optimizer (`alert_optimizer.py`) +Analyzes and optimizes existing alert configurations: +- **Input:** Alert configuration JSON with rules, thresholds, and routing +- **Output:** Optimization report and improved alert configuration +- **Features:** Noise detection, coverage gaps, duplicate identification, threshold optimization + +### 3. Dashboard Generator (`dashboard_generator.py`) +Creates comprehensive dashboard specifications: +- **Input:** Service/system description JSON +- **Output:** Grafana-compatible dashboard JSON and documentation +- **Features:** Golden signals coverage, RED/USE methods, drill-down paths, role-based views + +## Integration Patterns + +### Monitoring Stack Integration +- **Prometheus:** Metric collection and alerting rule generation +- **Grafana:** Dashboard creation and visualization configuration +- **Elasticsearch/Kibana:** Log analysis and dashboard integration +- **Jaeger/Zipkin:** Distributed tracing configuration and analysis + +### CI/CD Integration +- **Pipeline Monitoring:** Build, test, and deployment observability +- **Deployment Correlation:** Release impact tracking and rollback triggers +- **Feature Flag Monitoring:** A/B test and feature rollout observability +- **Performance Regression:** Automated performance monitoring in pipelines + +### Incident Management Integration +- **PagerDuty/VictorOps:** Alert routing and escalation policies +- **Slack/Teams:** Notification and collaboration integration +- **JIRA/ServiceNow:** Incident tracking and resolution workflows +- **Post-Mortem:** Automated incident analysis and improvement tracking 
+ +## Advanced Patterns + +### Multi-Cloud Observability +- **Cross-Cloud Metrics:** Unified metrics across AWS, GCP, Azure +- **Network Observability:** Inter-cloud connectivity monitoring +- **Cost Attribution:** Cloud resource cost tracking and optimization +- **Compliance Monitoring:** Security and compliance posture tracking + +### Microservices Observability +- **Service Mesh Integration:** Istio/Linkerd observability configuration +- **API Gateway Monitoring:** Request routing and rate limiting observability +- **Container Orchestration:** Kubernetes cluster and workload monitoring +- **Service Discovery:** Dynamic service monitoring and health checks + +### Machine Learning Observability +- **Model Performance:** Accuracy, drift, and bias monitoring +- **Feature Store Monitoring:** Feature quality and freshness tracking +- **Pipeline Observability:** ML pipeline execution and performance monitoring +- **A/B Test Analysis:** Statistical significance and business impact measurement + +## Best Practices + +### Organizational Alignment +- **SLO Setting:** Collaborative target setting between product and engineering +- **Alert Ownership:** Clear escalation paths and team responsibilities +- **Dashboard Governance:** Centralized dashboard management and standards +- **Training Programs:** Team education on observability tools and practices + +### Technical Excellence +- **Infrastructure as Code:** Observability configuration version control +- **Testing Strategy:** Alert rule testing and dashboard validation +- **Performance Monitoring:** Observability system performance tracking +- **Security Considerations:** Access control and data privacy in observability + +### Continuous Improvement +- **Metrics Review:** Regular SLI/SLO effectiveness assessment +- **Alert Tuning:** Ongoing alert threshold and routing optimization +- **Dashboard Evolution:** User feedback-driven dashboard improvements +- **Tool Evaluation:** Regular assessment of observability tool 
effectiveness + +## Success Metrics + +### Operational Metrics +- **Mean Time to Detection (MTTD):** How quickly issues are identified +- **Mean Time to Resolution (MTTR):** Time from detection to resolution +- **Alert Precision:** Percentage of actionable alerts +- **SLO Achievement:** Percentage of SLO targets met consistently + +### Business Metrics +- **System Reliability:** Overall uptime and user experience quality +- **Engineering Velocity:** Development team productivity and deployment frequency +- **Cost Efficiency:** Observability cost as percentage of infrastructure spend +- **Customer Satisfaction:** User-reported reliability and performance satisfaction + +This comprehensive observability design skill enables organizations to build robust, scalable monitoring and alerting systems that provide actionable insights while maintaining cost efficiency and operational excellence. \ No newline at end of file diff --git a/engineering/observability-designer/assets/sample_alerts.json b/engineering/observability-designer/assets/sample_alerts.json new file mode 100644 index 0000000..14e1cc1 --- /dev/null +++ b/engineering/observability-designer/assets/sample_alerts.json @@ -0,0 +1,276 @@ +{ + "alerts": [ + { + "alert": "HighLatency", + "expr": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{service=\"payment-service\"}[5m])) > 0.5", + "for": "5m", + "labels": { + "severity": "warning", + "service": "payment-service", + "team": "payments" + }, + "annotations": { + "summary": "High request latency detected", + "description": "95th percentile latency is {{ $value }}s for payment-service", + "runbook_url": "https://runbooks.company.com/high-latency" + }, + "historical_data": { + "fires_per_day": 2.5, + "false_positive_rate": 0.15, + "average_duration_minutes": 12 + } + }, + { + "alert": "ServiceDown", + "expr": "up{service=\"payment-service\"} == 0", + "labels": { + "severity": "critical", + "service": "payment-service", + "team": "payments" + }, + 
"annotations": { + "summary": "Payment service is down", + "description": "Payment service has been down for more than 1 minute", + "runbook_url": "https://runbooks.company.com/service-down" + }, + "historical_data": { + "fires_per_day": 0.1, + "false_positive_rate": 0.05, + "average_duration_minutes": 3 + } + }, + { + "alert": "HighErrorRate", + "expr": "sum(rate(http_requests_total{service=\"payment-service\",code=~\"5..\"}[5m])) / sum(rate(http_requests_total{service=\"payment-service\"}[5m])) > 0.01", + "for": "2m", + "labels": { + "severity": "warning", + "service": "payment-service", + "team": "payments" + }, + "annotations": { + "summary": "High error rate detected", + "description": "Error rate is {{ $value | humanizePercentage }} for payment-service", + "runbook_url": "https://runbooks.company.com/high-error-rate" + }, + "historical_data": { + "fires_per_day": 1.8, + "false_positive_rate": 0.25, + "average_duration_minutes": 8 + } + }, + { + "alert": "HighCPUUsage", + "expr": "rate(process_cpu_seconds_total{service=\"payment-service\"}[5m]) * 100 > 80", + "labels": { + "severity": "warning", + "service": "payment-service", + "team": "payments" + }, + "annotations": { + "summary": "High CPU usage", + "description": "CPU usage is {{ $value }}% for payment-service" + }, + "historical_data": { + "fires_per_day": 15.2, + "false_positive_rate": 0.8, + "average_duration_minutes": 45 + } + }, + { + "alert": "HighMemoryUsage", + "expr": "process_resident_memory_bytes{service=\"payment-service\"} / process_virtual_memory_max_bytes{service=\"payment-service\"} * 100 > 85", + "labels": { + "severity": "info", + "service": "payment-service", + "team": "payments" + }, + "annotations": { + "summary": "High memory usage", + "description": "Memory usage is {{ $value }}% for payment-service" + }, + "historical_data": { + "fires_per_day": 8.5, + "false_positive_rate": 0.6, + "average_duration_minutes": 30 + } + }, + { + "alert": "DatabaseConnectionPoolExhaustion", + "expr": 
"db_connections_active{service=\"payment-service\"} / db_connections_max{service=\"payment-service\"} > 0.9", + "for": "1m", + "labels": { + "severity": "critical", + "service": "payment-service", + "team": "payments" + }, + "annotations": { + "summary": "Database connection pool near exhaustion", + "description": "Connection pool utilization is {{ $value | humanizePercentage }}", + "runbook_url": "https://runbooks.company.com/db-connections" + }, + "historical_data": { + "fires_per_day": 0.3, + "false_positive_rate": 0.1, + "average_duration_minutes": 5 + } + }, + { + "alert": "LowTraffic", + "expr": "sum(rate(http_requests_total{service=\"payment-service\"}[5m])) < 10", + "for": "10m", + "labels": { + "severity": "warning", + "service": "payment-service", + "team": "payments" + }, + "annotations": { + "summary": "Unusually low traffic", + "description": "Request rate is {{ $value }} RPS, which is unusually low" + }, + "historical_data": { + "fires_per_day": 12.0, + "false_positive_rate": 0.9, + "average_duration_minutes": 120 + } + }, + { + "alert": "HighLatencyDuplicate", + "expr": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{service=\"payment-service\"}[5m])) > 0.5", + "for": "5m", + "labels": { + "severity": "warning", + "service": "payment-service", + "team": "payments" + }, + "annotations": { + "summary": "High request latency detected (duplicate)", + "description": "95th percentile latency is {{ $value }}s for payment-service" + }, + "historical_data": { + "fires_per_day": 2.5, + "false_positive_rate": 0.15, + "average_duration_minutes": 12 + } + }, + { + "alert": "VeryLowErrorRate", + "expr": "sum(rate(http_requests_total{service=\"payment-service\",code=~\"5..\"}[5m])) / sum(rate(http_requests_total{service=\"payment-service\"}[5m])) > 0.001", + "labels": { + "severity": "info", + "service": "payment-service", + "team": "payments" + }, + "annotations": { + "summary": "Error rate above 0.1%", + "description": "Error rate is {{ $value 
| humanizePercentage }}" + }, + "historical_data": { + "fires_per_day": 25.0, + "false_positive_rate": 0.95, + "average_duration_minutes": 5 + } + }, + { + "alert": "DiskUsageHigh", + "expr": "disk_usage_percent{service=\"payment-service\"} > 85", + "labels": { + "severity": "warning", + "service": "payment-service", + "team": "payments" + }, + "annotations": { + "summary": "Disk usage high", + "description": "Disk usage is {{ $value }}%" + }, + "historical_data": { + "fires_per_day": 3.2, + "false_positive_rate": 0.4, + "average_duration_minutes": 240 + } + } + ], + "services": [ + { + "name": "payment-service", + "type": "api", + "criticality": "critical", + "team": "payments" + }, + { + "name": "user-service", + "type": "api", + "criticality": "high", + "team": "identity" + }, + { + "name": "notification-service", + "type": "api", + "criticality": "medium", + "team": "communications" + } + ], + "alert_routing": { + "routes": [ + { + "match": { + "severity": "critical" + }, + "receiver": "pager-critical", + "group_wait": "10s", + "group_interval": "1m", + "repeat_interval": "5m" + }, + { + "match": { + "severity": "warning" + }, + "receiver": "slack-warnings", + "group_wait": "30s", + "group_interval": "5m", + "repeat_interval": "1h" + }, + { + "match": { + "severity": "info" + }, + "receiver": "email-info", + "group_wait": "2m", + "group_interval": "10m", + "repeat_interval": "24h" + } + ] + }, + "receivers": [ + { + "name": "pager-critical", + "pagerduty_configs": [ + { + "routing_key": "pager-key-critical", + "description": "Critical alert: {{ range .Alerts }}{{ .Annotations.summary }}{{ end }}" + } + ] + }, + { + "name": "slack-warnings", + "slack_configs": [ + { + "api_url": "https://hooks.slack.com/services/warnings", + "channel": "#alerts-warnings", + "title": "Warning Alert", + "text": "{{ range .Alerts }}{{ .Annotations.description }}{{ end }}" + } + ] + }, + { + "name": "email-info", + "email_configs": [ + { + "to": "team-notifications@company.com", + 
"subject": "Info Alert: {{ .GroupLabels.alertname }}", + "body": "{{ range .Alerts }}{{ .Annotations.description }}{{ end }}" + } + ] + } + ] +} \ No newline at end of file diff --git a/engineering/observability-designer/assets/sample_service_api.json b/engineering/observability-designer/assets/sample_service_api.json new file mode 100644 index 0000000..0eb7c6b --- /dev/null +++ b/engineering/observability-designer/assets/sample_service_api.json @@ -0,0 +1,83 @@ +{ + "name": "payment-service", + "type": "api", + "criticality": "critical", + "user_facing": true, + "description": "Handles payment processing and transaction management", + "team": "payments", + "environment": "production", + "dependencies": [ + { + "name": "user-service", + "type": "api", + "criticality": "high" + }, + { + "name": "payment-gateway", + "type": "external", + "criticality": "critical" + }, + { + "name": "fraud-detection", + "type": "ml", + "criticality": "high" + } + ], + "endpoints": [ + { + "path": "/api/v1/payments", + "method": "POST", + "sla_latency_ms": 500, + "expected_tps": 100 + }, + { + "path": "/api/v1/payments/{id}", + "method": "GET", + "sla_latency_ms": 200, + "expected_tps": 500 + }, + { + "path": "/api/v1/payments/{id}/refund", + "method": "POST", + "sla_latency_ms": 1000, + "expected_tps": 10 + } + ], + "business_metrics": { + "revenue_per_hour": { + "metric": "sum(payment_amount * rate(payments_successful_total[1h]))", + "target": 50000, + "unit": "USD" + }, + "conversion_rate": { + "metric": "sum(rate(payments_successful_total[5m])) / sum(rate(payment_attempts_total[5m]))", + "target": 0.95, + "unit": "percentage" + } + }, + "infrastructure": { + "container_orchestrator": "kubernetes", + "replicas": 6, + "cpu_limit": "2000m", + "memory_limit": "4Gi", + "database": { + "type": "postgresql", + "connection_pool_size": 20 + }, + "cache": { + "type": "redis", + "cluster_size": 3 + } + }, + "compliance_requirements": [ + "PCI-DSS", + "SOX", + "GDPR" + ], + "tags": [ + 
"payment", + "transaction", + "critical-path", + "revenue-generating" + ] +} \ No newline at end of file diff --git a/engineering/observability-designer/assets/sample_service_web.json b/engineering/observability-designer/assets/sample_service_web.json new file mode 100644 index 0000000..affc31e --- /dev/null +++ b/engineering/observability-designer/assets/sample_service_web.json @@ -0,0 +1,113 @@ +{ + "name": "customer-portal", + "type": "web", + "criticality": "high", + "user_facing": true, + "description": "Customer-facing web application for account management and billing", + "team": "frontend", + "environment": "production", + "dependencies": [ + { + "name": "user-service", + "type": "api", + "criticality": "high" + }, + { + "name": "billing-service", + "type": "api", + "criticality": "high" + }, + { + "name": "notification-service", + "type": "api", + "criticality": "medium" + }, + { + "name": "cdn", + "type": "external", + "criticality": "medium" + } + ], + "pages": [ + { + "path": "/dashboard", + "sla_load_time_ms": 2000, + "expected_concurrent_users": 1000 + }, + { + "path": "/billing", + "sla_load_time_ms": 3000, + "expected_concurrent_users": 200 + }, + { + "path": "/settings", + "sla_load_time_ms": 1500, + "expected_concurrent_users": 100 + } + ], + "business_metrics": { + "daily_active_users": { + "metric": "count(user_sessions_started_total[1d])", + "target": 10000, + "unit": "users" + }, + "session_duration": { + "metric": "avg(user_session_duration_seconds)", + "target": 300, + "unit": "seconds" + }, + "bounce_rate": { + "metric": "sum(rate(page_views_bounced_total[1h])) / sum(rate(page_views_total[1h]))", + "target": 0.3, + "unit": "percentage" + } + }, + "infrastructure": { + "container_orchestrator": "kubernetes", + "replicas": 4, + "cpu_limit": "1000m", + "memory_limit": "2Gi", + "storage": { + "type": "nfs", + "size": "50Gi" + }, + "ingress": { + "type": "nginx", + "ssl_termination": true, + "rate_limiting": { + "requests_per_second": 100, + 
"burst": 200 + } + } + }, + "monitoring": { + "synthetic_checks": [ + { + "name": "login_flow", + "url": "/auth/login", + "frequency": "1m", + "locations": ["us-east", "eu-west", "ap-south"] + }, + { + "name": "checkout_flow", + "url": "/billing/checkout", + "frequency": "5m", + "locations": ["us-east", "eu-west"] + } + ], + "rum": { + "enabled": true, + "sampling_rate": 0.1 + } + }, + "compliance_requirements": [ + "GDPR", + "CCPA" + ], + "tags": [ + "frontend", + "customer-facing", + "billing", + "high-traffic" + ] +} \ No newline at end of file diff --git a/engineering/observability-designer/expected_outputs/sample_dashboard.json b/engineering/observability-designer/expected_outputs/sample_dashboard.json new file mode 100644 index 0000000..4069c71 --- /dev/null +++ b/engineering/observability-designer/expected_outputs/sample_dashboard.json @@ -0,0 +1,811 @@ +{ + "metadata": { + "title": "customer-portal - SRE Dashboard", + "service": { + "name": "customer-portal", + "type": "web", + "criticality": "high", + "user_facing": true, + "description": "Customer-facing web application for account management and billing", + "team": "frontend", + "environment": "production", + "dependencies": [ + { + "name": "user-service", + "type": "api", + "criticality": "high" + }, + { + "name": "billing-service", + "type": "api", + "criticality": "high" + }, + { + "name": "notification-service", + "type": "api", + "criticality": "medium" + }, + { + "name": "cdn", + "type": "external", + "criticality": "medium" + } + ], + "pages": [ + { + "path": "/dashboard", + "sla_load_time_ms": 2000, + "expected_concurrent_users": 1000 + }, + { + "path": "/billing", + "sla_load_time_ms": 3000, + "expected_concurrent_users": 200 + }, + { + "path": "/settings", + "sla_load_time_ms": 1500, + "expected_concurrent_users": 100 + } + ], + "business_metrics": { + "daily_active_users": { + "metric": "count(user_sessions_started_total[1d])", + "target": 10000, + "unit": "users" + }, + "session_duration": { 
+ "metric": "avg(user_session_duration_seconds)", + "target": 300, + "unit": "seconds" + }, + "bounce_rate": { + "metric": "sum(rate(page_views_bounced_total[1h])) / sum(rate(page_views_total[1h]))", + "target": 0.3, + "unit": "percentage" + } + }, + "infrastructure": { + "container_orchestrator": "kubernetes", + "replicas": 4, + "cpu_limit": "1000m", + "memory_limit": "2Gi", + "storage": { + "type": "nfs", + "size": "50Gi" + }, + "ingress": { + "type": "nginx", + "ssl_termination": true, + "rate_limiting": { + "requests_per_second": 100, + "burst": 200 + } + } + }, + "monitoring": { + "synthetic_checks": [ + { + "name": "login_flow", + "url": "/auth/login", + "frequency": "1m", + "locations": [ + "us-east", + "eu-west", + "ap-south" + ] + }, + { + "name": "checkout_flow", + "url": "/billing/checkout", + "frequency": "5m", + "locations": [ + "us-east", + "eu-west" + ] + } + ], + "rum": { + "enabled": true, + "sampling_rate": 0.1 + } + }, + "compliance_requirements": [ + "GDPR", + "CCPA" + ], + "tags": [ + "frontend", + "customer-facing", + "billing", + "high-traffic" + ] + }, + "target_role": "sre", + "generated_at": "2026-02-16T14:02:03.421248Z", + "version": "1.0" + }, + "configuration": { + "time_ranges": [ + "1h", + "6h", + "1d", + "7d" + ], + "default_time_range": "6h", + "refresh_interval": "30s", + "timezone": "UTC", + "theme": "dark" + }, + "layout": { + "grid_settings": { + "width": 24, + "height_unit": "px", + "cell_height": 30 + }, + "sections": [ + { + "title": "Service Overview", + "collapsed": false, + "y_position": 0, + "panels": [ + "service_status", + "slo_summary", + "error_budget" + ] + }, + { + "title": "Golden Signals", + "collapsed": false, + "y_position": 8, + "panels": [ + "latency", + "traffic", + "errors", + "saturation" + ] + }, + { + "title": "Resource Utilization", + "collapsed": false, + "y_position": 16, + "panels": [ + "cpu_usage", + "memory_usage", + "network_io", + "disk_io" + ] + }, + { + "title": "Dependencies & Downstream", + 
"collapsed": true, + "y_position": 24, + "panels": [ + "dependency_status", + "downstream_latency", + "circuit_breakers" + ] + } + ] + }, + "panels": [ + { + "id": "service_status", + "title": "Service Status", + "type": "stat", + "grid_pos": { + "x": 0, + "y": 0, + "w": 6, + "h": 4 + }, + "targets": [ + { + "expr": "up{service=\"customer-portal\"}", + "legendFormat": "Status" + } + ], + "field_config": { + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Status" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "thresholds" + } + }, + { + "id": "thresholds", + "value": { + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "green", + "value": 1 + } + ] + } + }, + { + "id": "mappings", + "value": [ + { + "options": { + "0": { + "text": "DOWN" + } + }, + "type": "value" + }, + { + "options": { + "1": { + "text": "UP" + } + }, + "type": "value" + } + ] + } + ] + } + ] + }, + "options": { + "orientation": "horizontal", + "textMode": "value_and_name" + } + }, + { + "id": "slo_summary", + "title": "SLO Achievement (30d)", + "type": "stat", + "grid_pos": { + "x": 6, + "y": 0, + "w": 9, + "h": 4 + }, + "targets": [ + { + "expr": "(1 - (increase(http_requests_total{service=\"customer-portal\",code=~\"5..\"}[30d]) / increase(http_requests_total{service=\"customer-portal\"}[30d]))) * 100", + "legendFormat": "Availability" + }, + { + "expr": "histogram_quantile(0.95, increase(http_request_duration_seconds_bucket{service=\"customer-portal\"}[30d])) * 1000", + "legendFormat": "P95 Latency (ms)" + } + ], + "field_config": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "yellow", + "value": 99.0 + }, + { + "color": "green", + "value": 99.9 + } + ] + } + } + }, + "options": { + "orientation": "horizontal", + "textMode": "value_and_name" + } + }, + { + "id": "error_budget", + "title": "Error Budget Remaining", + "type": "gauge", + 
"grid_pos": { + "x": 15, + "y": 0, + "w": 9, + "h": 4 + }, + "targets": [ + { + "expr": "(1 - (increase(http_requests_total{service=\"customer-portal\",code=~\"5..\"}[30d]) / increase(http_requests_total{service=\"customer-portal\"}[30d])) - 0.999) / 0.001 * 100", + "legendFormat": "Error Budget %" + } + ], + "field_config": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "min": 0, + "max": 100, + "thresholds": { + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "yellow", + "value": 25 + }, + { + "color": "green", + "value": 50 + } + ] + }, + "unit": "percent" + } + }, + "options": { + "showThresholdLabels": true, + "showThresholdMarkers": true + } + }, + { + "id": "latency", + "title": "Request Latency", + "type": "timeseries", + "grid_pos": { + "x": 0, + "y": 8, + "w": 12, + "h": 6 + }, + "targets": [ + { + "expr": "histogram_quantile(0.50, rate(http_request_duration_seconds_bucket{service=\"customer-portal\"}[5m])) * 1000", + "legendFormat": "P50 Latency" + }, + { + "expr": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{service=\"customer-portal\"}[5m])) * 1000", + "legendFormat": "P95 Latency" + }, + { + "expr": "histogram_quantile(0.99, rate(http_request_duration_seconds_bucket{service=\"customer-portal\"}[5m])) * 1000", + "legendFormat": "P99 Latency" + } + ], + "field_config": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "unit": "ms", + "custom": { + "drawStyle": "line", + "lineInterpolation": "linear", + "lineWidth": 1, + "fillOpacity": 10 + } + } + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "table", + "placement": "bottom" + } + } + }, + { + "id": "traffic", + "title": "Request Rate", + "type": "timeseries", + "grid_pos": { + "x": 12, + "y": 8, + "w": 12, + "h": 6 + }, + "targets": [ + { + "expr": "sum(rate(http_requests_total{service=\"customer-portal\"}[5m]))", + "legendFormat": "Total RPS" + }, + { + "expr": 
"sum(rate(http_requests_total{service=\"customer-portal\",code=~\"2..\"}[5m]))", + "legendFormat": "2xx RPS" + }, + { + "expr": "sum(rate(http_requests_total{service=\"customer-portal\",code=~\"4..\"}[5m]))", + "legendFormat": "4xx RPS" + }, + { + "expr": "sum(rate(http_requests_total{service=\"customer-portal\",code=~\"5..\"}[5m]))", + "legendFormat": "5xx RPS" + } + ], + "field_config": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "unit": "reqps", + "custom": { + "drawStyle": "line", + "lineInterpolation": "linear", + "lineWidth": 1, + "fillOpacity": 0 + } + } + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "table", + "placement": "bottom" + } + } + }, + { + "id": "errors", + "title": "Error Rate", + "type": "timeseries", + "grid_pos": { + "x": 0, + "y": 14, + "w": 12, + "h": 6 + }, + "targets": [ + { + "expr": "sum(rate(http_requests_total{service=\"customer-portal\",code=~\"5..\"}[5m])) / sum(rate(http_requests_total{service=\"customer-portal\"}[5m])) * 100", + "legendFormat": "5xx Error Rate" + }, + { + "expr": "sum(rate(http_requests_total{service=\"customer-portal\",code=~\"4..\"}[5m])) / sum(rate(http_requests_total{service=\"customer-portal\"}[5m])) * 100", + "legendFormat": "4xx Error Rate" + } + ], + "field_config": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "unit": "percent", + "custom": { + "drawStyle": "line", + "lineInterpolation": "linear", + "lineWidth": 2, + "fillOpacity": 20 + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "5xx Error Rate" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red" + } + } + ] + } + ] + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "table", + "placement": "bottom" + } + } + }, + { + "id": "saturation", + "title": "Saturation Metrics", + "type": "timeseries", + "grid_pos": { + "x": 12, + "y": 14, + "w": 12, + 
"h": 6 + }, + "targets": [ + { + "expr": "rate(process_cpu_seconds_total{service=\"customer-portal\"}[5m]) * 100", + "legendFormat": "CPU Usage %" + }, + { + "expr": "process_resident_memory_bytes{service=\"customer-portal\"} / process_virtual_memory_max_bytes{service=\"customer-portal\"} * 100", + "legendFormat": "Memory Usage %" + } + ], + "field_config": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "unit": "percent", + "max": 100, + "custom": { + "drawStyle": "line", + "lineInterpolation": "linear", + "lineWidth": 1, + "fillOpacity": 10 + } + } + }, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + }, + "legend": { + "displayMode": "table", + "placement": "bottom" + } + } + }, + { + "id": "cpu_usage", + "title": "CPU Usage", + "type": "gauge", + "grid_pos": { + "x": 0, + "y": 20, + "w": 6, + "h": 4 + }, + "targets": [ + { + "expr": "rate(process_cpu_seconds_total{service=\"customer-portal\"}[5m]) * 100", + "legendFormat": "CPU %" + } + ], + "field_config": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "unit": "percent", + "min": 0, + "max": 100, + "thresholds": { + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "yellow", + "value": 70 + }, + { + "color": "red", + "value": 90 + } + ] + } + } + }, + "options": { + "showThresholdLabels": true, + "showThresholdMarkers": true + } + }, + { + "id": "memory_usage", + "title": "Memory Usage", + "type": "gauge", + "grid_pos": { + "x": 6, + "y": 20, + "w": 6, + "h": 4 + }, + "targets": [ + { + "expr": "process_resident_memory_bytes{service=\"customer-portal\"} / 1024 / 1024", + "legendFormat": "Memory MB" + } + ], + "field_config": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "unit": "decbytes", + "thresholds": { + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "yellow", + "value": 512000000 + }, + { + "color": "red", + "value": 1024000000 + } + ] + } + } + } + }, + { + "id": "network_io", + "title": "Network 
I/O", + "type": "timeseries", + "grid_pos": { + "x": 12, + "y": 20, + "w": 6, + "h": 4 + }, + "targets": [ + { + "expr": "rate(process_network_receive_bytes_total{service=\"customer-portal\"}[5m])", + "legendFormat": "RX Bytes/s" + }, + { + "expr": "rate(process_network_transmit_bytes_total{service=\"customer-portal\"}[5m])", + "legendFormat": "TX Bytes/s" + } + ], + "field_config": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "unit": "binBps" + } + } + }, + { + "id": "disk_io", + "title": "Disk I/O", + "type": "timeseries", + "grid_pos": { + "x": 18, + "y": 20, + "w": 6, + "h": 4 + }, + "targets": [ + { + "expr": "rate(process_disk_read_bytes_total{service=\"customer-portal\"}[5m])", + "legendFormat": "Read Bytes/s" + }, + { + "expr": "rate(process_disk_write_bytes_total{service=\"customer-portal\"}[5m])", + "legendFormat": "Write Bytes/s" + } + ], + "field_config": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "unit": "binBps" + } + } + } + ], + "variables": [ + { + "name": "environment", + "type": "query", + "query": "label_values(environment)", + "current": { + "text": "production", + "value": "production" + }, + "includeAll": false, + "multi": false, + "refresh": "on_dashboard_load" + }, + { + "name": "instance", + "type": "query", + "query": "label_values(up{service=\"customer-portal\"}, instance)", + "current": { + "text": "All", + "value": "$__all" + }, + "includeAll": true, + "multi": true, + "refresh": "on_time_range_change" + }, + { + "name": "handler", + "type": "query", + "query": "label_values(http_requests_total{service=\"customer-portal\"}, handler)", + "current": { + "text": "All", + "value": "$__all" + }, + "includeAll": true, + "multi": true, + "refresh": "on_time_range_change" + } + ], + "alerts_integration": { + "alert_annotations": true, + "alert_rules_query": "ALERTS{service=\"customer-portal\"}", + "alert_panels": [ + { + "title": "Active Alerts", + "type": "table", + "query": 
"ALERTS{service=\"customer-portal\",alertstate=\"firing\"}", + "columns": [ + "alertname", + "severity", + "instance", + "description" + ] + } + ] + }, + "drill_down_paths": { + "service_overview": { + "from": "service_status", + "to": "detailed_health_dashboard", + "url": "/d/service-health/customer-portal-health", + "params": [ + "var-service", + "var-environment" + ] + }, + "error_investigation": { + "from": "errors", + "to": "error_details_dashboard", + "url": "/d/errors/customer-portal-errors", + "params": [ + "var-service", + "var-time_range" + ] + }, + "latency_analysis": { + "from": "latency", + "to": "trace_analysis_dashboard", + "url": "/d/traces/customer-portal-traces", + "params": [ + "var-service", + "var-handler" + ] + }, + "capacity_planning": { + "from": "saturation", + "to": "capacity_dashboard", + "url": "/d/capacity/customer-portal-capacity", + "params": [ + "var-service", + "var-time_range" + ] + } + } +} \ No newline at end of file diff --git a/engineering/observability-designer/expected_outputs/sample_slo_framework.json b/engineering/observability-designer/expected_outputs/sample_slo_framework.json new file mode 100644 index 0000000..07c9e1f --- /dev/null +++ b/engineering/observability-designer/expected_outputs/sample_slo_framework.json @@ -0,0 +1,545 @@ +{ + "metadata": { + "service": { + "name": "payment-service", + "type": "api", + "criticality": "critical", + "user_facing": true, + "description": "Handles payment processing and transaction management", + "team": "payments", + "environment": "production", + "dependencies": [ + { + "name": "user-service", + "type": "api", + "criticality": "high" + }, + { + "name": "payment-gateway", + "type": "external", + "criticality": "critical" + }, + { + "name": "fraud-detection", + "type": "ml", + "criticality": "high" + } + ], + "endpoints": [ + { + "path": "/api/v1/payments", + "method": "POST", + "sla_latency_ms": 500, + "expected_tps": 100 + }, + { + "path": "/api/v1/payments/{id}", + "method": 
"GET", + "sla_latency_ms": 200, + "expected_tps": 500 + }, + { + "path": "/api/v1/payments/{id}/refund", + "method": "POST", + "sla_latency_ms": 1000, + "expected_tps": 10 + } + ], + "business_metrics": { + "revenue_per_hour": { + "metric": "sum(payment_amount * rate(payments_successful_total[1h]))", + "target": 50000, + "unit": "USD" + }, + "conversion_rate": { + "metric": "sum(rate(payments_successful_total[5m])) / sum(rate(payment_attempts_total[5m]))", + "target": 0.95, + "unit": "percentage" + } + }, + "infrastructure": { + "container_orchestrator": "kubernetes", + "replicas": 6, + "cpu_limit": "2000m", + "memory_limit": "4Gi", + "database": { + "type": "postgresql", + "connection_pool_size": 20 + }, + "cache": { + "type": "redis", + "cluster_size": 3 + } + }, + "compliance_requirements": [ + "PCI-DSS", + "SOX", + "GDPR" + ], + "tags": [ + "payment", + "transaction", + "critical-path", + "revenue-generating" + ] + }, + "generated_at": "2026-02-16T14:01:57.572080Z", + "framework_version": "1.0" + }, + "slis": [ + { + "name": "Availability", + "description": "Percentage of successful requests", + "type": "ratio", + "good_events": "sum(rate(http_requests_total{service=\"payment-service\",code!~\"5..\"}))", + "total_events": "sum(rate(http_requests_total{service=\"payment-service\"}))", + "unit": "percentage" + }, + { + "name": "Request Latency P95", + "description": "95th percentile of request latency", + "type": "threshold", + "query": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{service=\"payment-service\"}[5m]))", + "unit": "seconds" + }, + { + "name": "Error Rate", + "description": "Rate of 5xx errors", + "type": "ratio", + "good_events": "sum(rate(http_requests_total{service=\"payment-service\",code!~\"5..\"}))", + "total_events": "sum(rate(http_requests_total{service=\"payment-service\"}))", + "unit": "percentage" + }, + { + "name": "Request Throughput", + "description": "Requests per second", + "type": "gauge", + "query": 
"sum(rate(http_requests_total{service=\"payment-service\"}[5m]))", + "unit": "requests/sec" + }, + { + "name": "User Journey Success Rate", + "description": "Percentage of successful complete user journeys", + "type": "ratio", + "good_events": "sum(rate(user_journey_total{service=\"payment-service\",status=\"success\"}[5m]))", + "total_events": "sum(rate(user_journey_total{service=\"payment-service\"}[5m]))", + "unit": "percentage" + }, + { + "name": "Feature Availability", + "description": "Percentage of time key features are available", + "type": "ratio", + "good_events": "sum(rate(feature_checks_total{service=\"payment-service\",status=\"available\"}[5m]))", + "total_events": "sum(rate(feature_checks_total{service=\"payment-service\"}[5m]))", + "unit": "percentage" + } + ], + "slos": [ + { + "name": "Availability SLO", + "description": "Service level objective for percentage of successful requests", + "sli_name": "Availability", + "target_value": 0.9999, + "target_display": "99.99%", + "operator": ">=", + "time_windows": [ + "1h", + "1d", + "7d", + "30d" + ], + "measurement_window": "30d", + "service": "payment-service", + "criticality": "critical" + }, + { + "name": "Request Latency P95 SLO", + "description": "Service level objective for 95th percentile of request latency", + "sli_name": "Request Latency P95", + "target_value": 100, + "target_display": "0.1s", + "operator": "<=", + "time_windows": [ + "1h", + "1d", + "7d", + "30d" + ], + "measurement_window": "30d", + "service": "payment-service", + "criticality": "critical" + }, + { + "name": "Error Rate SLO", + "description": "Service level objective for rate of 5xx errors", + "sli_name": "Error Rate", + "target_value": 0.001, + "target_display": "0.1%", + "operator": "<=", + "time_windows": [ + "1h", + "1d", + "7d", + "30d" + ], + "measurement_window": "30d", + "service": "payment-service", + "criticality": "critical" + }, + { + "name": "User Journey Success Rate SLO", + "description": "Service level 
objective for percentage of successful complete user journeys", + "sli_name": "User Journey Success Rate", + "target_value": 0.9999, + "target_display": "99.99%", + "operator": ">=", + "time_windows": [ + "1h", + "1d", + "7d", + "30d" + ], + "measurement_window": "30d", + "service": "payment-service", + "criticality": "critical" + }, + { + "name": "Feature Availability SLO", + "description": "Service level objective for percentage of time key features are available", + "sli_name": "Feature Availability", + "target_value": 0.9999, + "target_display": "99.99%", + "operator": ">=", + "time_windows": [ + "1h", + "1d", + "7d", + "30d" + ], + "measurement_window": "30d", + "service": "payment-service", + "criticality": "critical" + } + ], + "error_budgets": [ + { + "slo_name": "Availability SLO", + "error_budget_rate": 9.999999999998899e-05, + "error_budget_percentage": "0.010%", + "budgets_by_window": { + "1h": "0.4 seconds", + "1d": "8.6 seconds", + "7d": "1.0 minutes", + "30d": "4.3 minutes" + }, + "burn_rate_alerts": [ + { + "name": "Availability Burn Rate 2% Alert", + "description": "Alert when Availability is consuming error budget at 14.4x rate", + "severity": "critical", + "short_window": "5m", + "long_window": "1h", + "burn_rate_threshold": 14.4, + "budget_consumed": "2%", + "condition": "((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_short > 14.4) and ((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_long > 14.4)", + "annotations": { + "summary": "High burn rate detected for Availability", + "description": "Error budget consumption rate is 14.4x normal, will exhaust 2% of monthly budget" + } + }, + { + "name": "Availability Burn Rate 5% Alert", + "description": "Alert when Availability is consuming error budget at 6x rate", + "severity": "warning", + "short_window": "30m", + 
"long_window": "6h", + "burn_rate_threshold": 6, + "budget_consumed": "5%", + "condition": "((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_short > 6) and ((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_long > 6)", + "annotations": { + "summary": "High burn rate detected for Availability", + "description": "Error budget consumption rate is 6x normal, will exhaust 5% of monthly budget" + } + }, + { + "name": "Availability Burn Rate 10% Alert", + "description": "Alert when Availability is consuming error budget at 3x rate", + "severity": "info", + "short_window": "2h", + "long_window": "1d", + "burn_rate_threshold": 3, + "budget_consumed": "10%", + "condition": "((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_short > 3) and ((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_long > 3)", + "annotations": { + "summary": "High burn rate detected for Availability", + "description": "Error budget consumption rate is 3x normal, will exhaust 10% of monthly budget" + } + }, + { + "name": "Availability Burn Rate 10% Alert", + "description": "Alert when Availability is consuming error budget at 1x rate", + "severity": "info", + "short_window": "6h", + "long_window": "3d", + "burn_rate_threshold": 1, + "budget_consumed": "10%", + "condition": "((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_short > 1) and ((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_long > 1)", + "annotations": { + "summary": "High burn rate detected for Availability", + 
"description": "Error budget consumption rate is 1x normal, will exhaust 10% of monthly budget" + } + } + ] + }, + { + "slo_name": "User Journey Success Rate SLO", + "error_budget_rate": 9.999999999998899e-05, + "error_budget_percentage": "0.010%", + "budgets_by_window": { + "1h": "0.4 seconds", + "1d": "8.6 seconds", + "7d": "1.0 minutes", + "30d": "4.3 minutes" + }, + "burn_rate_alerts": [ + { + "name": "User Journey Success Rate Burn Rate 2% Alert", + "description": "Alert when User Journey Success Rate is consuming error budget at 14.4x rate", + "severity": "critical", + "short_window": "5m", + "long_window": "1h", + "burn_rate_threshold": 14.4, + "budget_consumed": "2%", + "condition": "((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_short > 14.4) and ((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_long > 14.4)", + "annotations": { + "summary": "High burn rate detected for User Journey Success Rate", + "description": "Error budget consumption rate is 14.4x normal, will exhaust 2% of monthly budget" + } + }, + { + "name": "User Journey Success Rate Burn Rate 5% Alert", + "description": "Alert when User Journey Success Rate is consuming error budget at 6x rate", + "severity": "warning", + "short_window": "30m", + "long_window": "6h", + "burn_rate_threshold": 6, + "budget_consumed": "5%", + "condition": "((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_short > 6) and ((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_long > 6)", + "annotations": { + "summary": "High burn rate detected for User Journey Success Rate", + "description": "Error budget consumption rate is 6x normal, will exhaust 5% of monthly 
budget" + } + }, + { + "name": "User Journey Success Rate Burn Rate 10% Alert", + "description": "Alert when User Journey Success Rate is consuming error budget at 3x rate", + "severity": "info", + "short_window": "2h", + "long_window": "1d", + "burn_rate_threshold": 3, + "budget_consumed": "10%", + "condition": "((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_short > 3) and ((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_long > 3)", + "annotations": { + "summary": "High burn rate detected for User Journey Success Rate", + "description": "Error budget consumption rate is 3x normal, will exhaust 10% of monthly budget" + } + }, + { + "name": "User Journey Success Rate Burn Rate 10% Alert", + "description": "Alert when User Journey Success Rate is consuming error budget at 1x rate", + "severity": "info", + "short_window": "6h", + "long_window": "3d", + "burn_rate_threshold": 1, + "budget_consumed": "10%", + "condition": "((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_short > 1) and ((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_long > 1)", + "annotations": { + "summary": "High burn rate detected for User Journey Success Rate", + "description": "Error budget consumption rate is 1x normal, will exhaust 10% of monthly budget" + } + } + ] + }, + { + "slo_name": "Feature Availability SLO", + "error_budget_rate": 9.999999999998899e-05, + "error_budget_percentage": "0.010%", + "budgets_by_window": { + "1h": "0.4 seconds", + "1d": "8.6 seconds", + "7d": "1.0 minutes", + "30d": "4.3 minutes" + }, + "burn_rate_alerts": [ + { + "name": "Feature Availability Burn Rate 2% Alert", + "description": "Alert when Feature 
Availability is consuming error budget at 14.4x rate", + "severity": "critical", + "short_window": "5m", + "long_window": "1h", + "burn_rate_threshold": 14.4, + "budget_consumed": "2%", + "condition": "((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_short > 14.4) and ((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_long > 14.4)", + "annotations": { + "summary": "High burn rate detected for Feature Availability", + "description": "Error budget consumption rate is 14.4x normal, will exhaust 2% of monthly budget" + } + }, + { + "name": "Feature Availability Burn Rate 5% Alert", + "description": "Alert when Feature Availability is consuming error budget at 6x rate", + "severity": "warning", + "short_window": "30m", + "long_window": "6h", + "burn_rate_threshold": 6, + "budget_consumed": "5%", + "condition": "((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_short > 6) and ((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_long > 6)", + "annotations": { + "summary": "High burn rate detected for Feature Availability", + "description": "Error budget consumption rate is 6x normal, will exhaust 5% of monthly budget" + } + }, + { + "name": "Feature Availability Burn Rate 10% Alert", + "description": "Alert when Feature Availability is consuming error budget at 3x rate", + "severity": "info", + "short_window": "2h", + "long_window": "1d", + "burn_rate_threshold": 3, + "budget_consumed": "10%", + "condition": "((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_short > 3) and ((1 - 
(sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_long > 3)", + "annotations": { + "summary": "High burn rate detected for Feature Availability", + "description": "Error budget consumption rate is 3x normal, will exhaust 10% of monthly budget" + } + }, + { + "name": "Feature Availability Burn Rate 10% Alert", + "description": "Alert when Feature Availability is consuming error budget at 1x rate", + "severity": "info", + "short_window": "6h", + "long_window": "3d", + "burn_rate_threshold": 1, + "budget_consumed": "10%", + "condition": "((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_short > 1) and ((1 - (sum(rate(http_requests_total{service='payment-service',code!~'5..'})) / sum(rate(http_requests_total{service='payment-service'}))))_long > 1)", + "annotations": { + "summary": "High burn rate detected for Feature Availability", + "description": "Error budget consumption rate is 1x normal, will exhaust 10% of monthly budget" + } + } + ] + } + ], + "sla_recommendations": { + "applicable": true, + "service": "payment-service", + "commitments": [ + { + "metric": "Availability", + "target": 0.9989, + "target_display": "99.89%", + "measurement_window": "monthly", + "measurement_method": "Uptime monitoring with 1-minute granularity" + }, + { + "metric": "Feature Availability", + "target": 0.9989, + "target_display": "99.89%", + "measurement_window": "monthly", + "measurement_method": "Uptime monitoring with 1-minute granularity" + } + ], + "penalties": [ + { + "breach_threshold": "< 99.99%", + "credit_percentage": 10 + }, + { + "breach_threshold": "< 99.9%", + "credit_percentage": 25 + }, + { + "breach_threshold": "< 99%", + "credit_percentage": 50 + } + ], + "measurement_methodology": "External synthetic monitoring from multiple geographic locations", + "exclusions": [ + "Planned maintenance windows 
(with 72h advance notice)", + "Customer-side network or infrastructure issues", + "Force majeure events", + "Third-party service dependencies beyond our control" + ] + }, + "monitoring_recommendations": { + "metrics": { + "collection": "Prometheus with service discovery", + "retention": "90 days for raw metrics, 1 year for aggregated", + "alerting": "Prometheus Alertmanager with multi-window burn rate alerts" + }, + "logging": { + "format": "Structured JSON logs with correlation IDs", + "aggregation": "ELK stack or equivalent with proper indexing", + "retention": "30 days for debug logs, 90 days for error logs" + }, + "tracing": { + "sampling": "Adaptive sampling with 1% base rate", + "storage": "Jaeger or Zipkin with 7-day retention", + "integration": "OpenTelemetry instrumentation" + } + }, + "implementation_guide": { + "prerequisites": [ + "Service instrumented with metrics collection (Prometheus format)", + "Structured logging with correlation IDs", + "Monitoring infrastructure (Prometheus, Grafana, Alertmanager)", + "Incident response processes and escalation policies" + ], + "implementation_steps": [ + { + "step": 1, + "title": "Instrument Service", + "description": "Add metrics collection for all defined SLIs", + "estimated_effort": "1-2 days" + }, + { + "step": 2, + "title": "Configure Recording Rules", + "description": "Set up Prometheus recording rules for SLI calculations", + "estimated_effort": "4-8 hours" + }, + { + "step": 3, + "title": "Implement Burn Rate Alerts", + "description": "Configure multi-window burn rate alerting rules", + "estimated_effort": "1 day" + }, + { + "step": 4, + "title": "Create SLO Dashboard", + "description": "Build Grafana dashboard for SLO tracking and error budget monitoring", + "estimated_effort": "4-6 hours" + }, + { + "step": 5, + "title": "Test and Validate", + "description": "Test alerting and validate SLI measurements against expectations", + "estimated_effort": "1-2 days" + }, + { + "step": 6, + "title": 
"Documentation and Training", + "description": "Document runbooks and train team on SLO monitoring", + "estimated_effort": "1 day" + } + ], + "validation_checklist": [ + "All SLIs produce expected metric values", + "Burn rate alerts fire correctly during simulated outages", + "Error budget calculations match manual verification", + "Dashboard displays accurate SLO achievement rates", + "Alert routing reaches correct escalation paths", + "Runbooks are complete and tested" + ] + } +} \ No newline at end of file diff --git a/engineering/observability-designer/references/alert_design_patterns.md b/engineering/observability-designer/references/alert_design_patterns.md new file mode 100644 index 0000000..8529996 --- /dev/null +++ b/engineering/observability-designer/references/alert_design_patterns.md @@ -0,0 +1,469 @@ +# Alert Design Patterns: A Guide to Effective Alerting + +## Introduction + +Well-designed alerts are the difference between a reliable system and 3 AM pages about non-issues. This guide provides patterns and anti-patterns for creating alerts that provide value without causing fatigue. + +## Fundamental Principles + +### The Golden Rules of Alerting + +1. **Every alert should be actionable** - If you can't do something about it, don't alert +2. **Every alert should require human intelligence** - If a script can handle it, automate the response +3. **Every alert should be novel** - Don't alert on known, ongoing issues +4. 
**Every alert should represent a user-visible impact** - Internal metrics matter only if users are affected + +### Alert Classification + +#### Critical Alerts +- Service is completely down +- Data loss is occurring +- Security breach detected +- SLO burn rate indicates imminent SLO violation + +#### Warning Alerts +- Service degradation affecting some users +- Approaching resource limits +- Dependent service issues +- Elevated error rates within SLO + +#### Info Alerts +- Deployment notifications +- Capacity planning triggers +- Configuration changes +- Maintenance windows + +## Alert Design Patterns + +### Pattern 1: Symptoms, Not Causes + +**Good**: Alert on user-visible symptoms +```yaml +- alert: HighLatency + expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 0.5 + for: 5m + annotations: + summary: "API latency is high" + description: "95th percentile latency is {{ $value }}s, above 500ms threshold" +``` + +**Bad**: Alert on internal metrics that may not affect users +```yaml +- alert: HighCPU + expr: cpu_usage > 80 + # This might not affect users at all! 
+``` + +### Pattern 2: Multi-Window Alerting + +Reduce false positives by requiring sustained problems: + +```yaml +- alert: ServiceDown + expr: ( + avg_over_time(up[2m]) == 0 # Short window: immediate detection + and + avg_over_time(up[10m]) < 0.8 # Long window: avoid flapping + ) + for: 1m +``` + +### Pattern 3: Burn Rate Alerting + +Alert based on error budget consumption rate: + +```yaml +# Fast burn: 2% of monthly budget in 1 hour +- alert: ErrorBudgetFastBurn + expr: ( + error_rate_5m > (14.4 * error_budget_slo) + and + error_rate_1h > (14.4 * error_budget_slo) + ) + for: 2m + labels: + severity: critical + +# Slow burn: 10% of monthly budget in 3 days +- alert: ErrorBudgetSlowBurn + expr: ( + error_rate_6h > (1.0 * error_budget_slo) + and + error_rate_3d > (1.0 * error_budget_slo) + ) + for: 15m + labels: + severity: warning +``` + +### Pattern 4: Hysteresis + +Use different thresholds for firing and resolving to prevent flapping: + +```yaml +- alert: HighErrorRate + # Fires at 5%; once firing, stays active until error_rate drops below 3% + expr: error_rate > 0.05 or (error_rate > 0.03 and on() ALERTS{alertname="HighErrorRate", alertstate="firing"}) + for: 5m + +# The self-reference to the ALERTS series implements true hysteresis: the alert +# resolves only when error_rate falls below 3%, not merely below the 5% firing +# threshold, which prevents flapping around a single threshold +``` + +### Pattern 5: Composite Alerts + +Alert when multiple conditions indicate a problem: + +```yaml +- alert: ServiceDegraded + expr: ( + (latency_p95 > latency_threshold) + or + (error_rate > error_threshold) + or + (availability < availability_threshold) + ) and ( + request_rate > min_request_rate # Only alert if we have traffic + ) +``` + +### Pattern 6: Contextual Alerting + +Include relevant context in alerts: + +```yaml +- alert: DatabaseConnections + expr: db_connections_active / db_connections_max > 0.8 + for: 5m + annotations: + summary: "Database connection pool nearly exhausted" + description: "{{ $labels.database }} has {{ $value | humanizePercentage }} connection utilization" + runbook_url: "https://runbooks.company.com/database-connections" + impact: "New requests may be rejected, causing 500 errors" + 
suggested_action: "Check for connection leaks or increase pool size" +``` + +## Alert Routing and Escalation + +### Routing by Impact and Urgency + +#### Critical Path Services +```yaml +route: + group_by: ['service'] + routes: + - match: + service: 'payment-api' + severity: 'critical' + receiver: 'payment-team-pager' + continue: true + - match: + service: 'payment-api' + severity: 'warning' + receiver: 'payment-team-slack' +``` + +#### Time-Based Routing +```yaml +route: + routes: + - match: + severity: 'critical' + receiver: 'oncall-pager' + - match: + severity: 'warning' + time: 'business_hours' # 9 AM - 5 PM + receiver: 'team-slack' + - match: + severity: 'warning' + time: 'after_hours' + receiver: 'team-email' # Lower urgency outside business hours +``` + +### Escalation Patterns + +#### Linear Escalation +```yaml +receivers: +- name: 'primary-oncall' + pagerduty_configs: + - escalation_policy: 'P1-Escalation' + # 0 min: Primary on-call + # 5 min: Secondary on-call + # 15 min: Engineering manager + # 30 min: Director of engineering +``` + +#### Severity-Based Escalation +```yaml +# Critical: Immediate escalation +- match: + severity: 'critical' + receiver: 'critical-escalation' + +# Warning: Team-first escalation +- match: + severity: 'warning' + receiver: 'team-escalation' +``` + +## Alert Fatigue Prevention + +### Grouping and Suppression + +#### Time-Based Grouping +```yaml +route: + group_wait: 30s # Wait 30s to group similar alerts + group_interval: 2m # Send grouped alerts every 2 minutes + repeat_interval: 1h # Re-send unresolved alerts every hour +``` + +#### Dependent Service Suppression +```yaml +- alert: ServiceDown + expr: up == 0 + +- alert: HighLatency + expr: latency_p95 > 1 + # This alert is suppressed when ServiceDown is firing + inhibit_rules: + - source_match: + alertname: 'ServiceDown' + target_match: + alertname: 'HighLatency' + equal: ['service'] +``` + +### Alert Throttling + +```yaml +# Limit to 1 alert per 10 minutes for noisy 
conditions +- alert: HighMemoryUsage + expr: memory_usage_percent > 85 + for: 10m # Longer 'for' duration reduces noise + annotations: + summary: "Memory usage has been high for 10+ minutes" +``` + +### Smart Defaults + +```yaml +# Use business logic to set intelligent thresholds +- alert: LowTraffic + expr: request_rate < ( + avg_over_time(request_rate[7d]) * 0.1 # 10% of weekly average + ) + # Only alert during business hours when low traffic is unusual + for: 30m +``` + +## Runbook Integration + +### Runbook Structure Template + +```markdown +# Alert: {{ $labels.alertname }} + +## Immediate Actions +1. Check service status dashboard +2. Verify if users are affected +3. Look at recent deployments/changes + +## Investigation Steps +1. Check logs for errors in the last 30 minutes +2. Verify dependent services are healthy +3. Check resource utilization (CPU, memory, disk) +4. Review recent alerts for patterns + +## Resolution Actions +- If deployment-related: Consider rollback +- If resource-related: Scale up or optimize queries +- If dependency-related: Engage appropriate team + +## Escalation +- Primary: @team-oncall +- Secondary: @engineering-manager +- Emergency: @site-reliability-team +``` + +### Runbook Integration in Alerts + +```yaml +annotations: + runbook_url: "https://runbooks.company.com/alerts/{{ $labels.alertname }}" + quick_debug: | + 1. curl -s https://{{ $labels.instance }}/health + 2. kubectl logs {{ $labels.pod }} --tail=50 + 3. 
Check dashboard: https://grafana.company.com/d/service-{{ $labels.service }} +``` + +## Testing and Validation + +### Alert Testing Strategies + +#### Chaos Engineering Integration +```python +# Test that alerts fire during controlled failures +def test_alert_during_cpu_spike(): + with chaos.cpu_spike(target='payment-api', duration='2m'): + assert wait_for_alert('HighCPU', timeout=180) + +def test_alert_during_network_partition(): + with chaos.network_partition(target='database'): + assert wait_for_alert('DatabaseUnreachable', timeout=60) +``` + +#### Historical Alert Analysis +```prometheus +# Query to find alerts that fired without incidents +count by (alertname) ( + ALERTS{alertstate="firing"}[30d] +) unless on (alertname) ( + count by (alertname) ( + incident_created{source="alert"}[30d] + ) +) +``` + +### Alert Quality Metrics + +#### Alert Precision +``` +Precision = True Positives / (True Positives + False Positives) +``` + +Track alerts that resulted in actual incidents vs false alarms. 
+ +#### Time to Resolution +```prometheus +# Average time from alert firing to resolution +avg_over_time( + (alert_resolved_timestamp - alert_fired_timestamp)[30d] +) by (alertname) +``` + +#### Alert Fatigue Indicators +```prometheus +# Alerts per day by team +sum by (team) ( + increase(alerts_fired_total[1d]) +) + +# Percentage of alerts acknowledged within 15 minutes +sum(alerts_acked_within_15m) / sum(alerts_fired) * 100 +``` + +## Advanced Patterns + +### Machine Learning-Enhanced Alerting + +#### Anomaly Detection +```yaml +- alert: AnomalousTraffic + expr: | + abs(request_rate - predict_linear(request_rate[1h], 300)) / + stddev_over_time(request_rate[1h]) > 3 + for: 10m + annotations: + summary: "Traffic pattern is anomalous" + description: "Current traffic deviates from predicted pattern by >3 standard deviations" +``` + +#### Dynamic Thresholds +```yaml +- alert: DynamicHighLatency + expr: | + latency_p95 > ( + quantile_over_time(0.95, latency_p95[7d]) + # Historical 95th percentile + 2 * stddev_over_time(latency_p95[7d]) # Plus 2 standard deviations + ) +``` + +### Business Hours Awareness + +```yaml +# Different thresholds for business vs off hours +- alert: HighLatencyBusinessHours + expr: latency_p95 > 0.2 # Stricter during business hours + for: 2m + # Active 9 AM - 5 PM weekdays + +- alert: HighLatencyOffHours + expr: latency_p95 > 0.5 # More lenient after hours + for: 5m + # Active nights and weekends +``` + +### Progressive Alerting + +```yaml +# Escalating alert severity based on duration +- alert: ServiceLatencyElevated + expr: latency_p95 > 0.5 + for: 5m + labels: + severity: info + +- alert: ServiceLatencyHigh + expr: latency_p95 > 0.5 + for: 15m # Same condition, longer duration + labels: + severity: warning + +- alert: ServiceLatencyCritical + expr: latency_p95 > 0.5 + for: 30m # Same condition, even longer duration + labels: + severity: critical +``` + +## Anti-Patterns to Avoid + +### Anti-Pattern 1: Alerting on Everything +**Problem**: Too 
many alerts create noise and fatigue +**Solution**: Be selective; only alert on user-impacting issues + +### Anti-Pattern 2: Vague Alert Messages +**Problem**: "Service X is down" - which instance? what's the impact? +**Solution**: Include specific details and context + +### Anti-Pattern 3: Alerts Without Runbooks +**Problem**: Alerts that don't explain what to do +**Solution**: Every alert must have an associated runbook + +### Anti-Pattern 4: Static Thresholds +**Problem**: 80% CPU might be normal during peak hours +**Solution**: Use contextual, adaptive thresholds + +### Anti-Pattern 5: Ignoring Alert Quality +**Problem**: Accepting high false positive rates +**Solution**: Regularly review and tune alert precision + +## Implementation Checklist + +### Pre-Implementation +- [ ] Define alert severity levels and escalation policies +- [ ] Create runbook templates +- [ ] Set up alert routing configuration +- [ ] Define SLOs that alerts will protect + +### Alert Development +- [ ] Each alert has clear success criteria +- [ ] Alert conditions tested against historical data +- [ ] Runbook created and accessible +- [ ] Severity and routing configured +- [ ] Context and suggested actions included + +### Post-Implementation +- [ ] Monitor alert precision and recall +- [ ] Regular review of alert fatigue metrics +- [ ] Quarterly alert effectiveness review +- [ ] Team training on alert response procedures + +### Quality Assurance +- [ ] Test alerts fire during controlled failures +- [ ] Verify alerts resolve when conditions improve +- [ ] Confirm runbooks are accurate and helpful +- [ ] Validate escalation paths work correctly + +Remember: Great alerts are invisible when things work and invaluable when things break. Focus on quality over quantity, and always optimize for the human who will respond to the alert at 3 AM. 
\ No newline at end of file diff --git a/engineering/observability-designer/references/dashboard_best_practices.md b/engineering/observability-designer/references/dashboard_best_practices.md new file mode 100644 index 0000000..7d9af4f --- /dev/null +++ b/engineering/observability-designer/references/dashboard_best_practices.md @@ -0,0 +1,571 @@ +# Dashboard Best Practices: Design for Insight and Action + +## Introduction + +A well-designed dashboard is like a good story - it guides you through the data with purpose and clarity. This guide provides practical patterns for creating dashboards that inform decisions and enable quick troubleshooting. + +## Design Principles + +### The Hierarchy of Information + +#### Primary Information (Top Third) +- Service health status +- SLO achievement +- Critical alerts +- Business KPIs + +#### Secondary Information (Middle Third) +- Golden signals (latency, traffic, errors, saturation) +- Resource utilization +- Throughput and performance metrics + +#### Tertiary Information (Bottom Third) +- Detailed breakdowns +- Historical trends +- Dependency status +- Debug information + +### Visual Design Principles + +#### Rule of 7±2 +- Maximum 7±2 panels per screen +- Group related information together +- Use sections to organize complexity + +#### Color Psychology +- **Red**: Critical issues, danger, immediate attention needed +- **Yellow/Orange**: Warnings, caution, degraded state +- **Green**: Healthy, normal operation, success +- **Blue**: Information, neutral metrics, capacity +- **Gray**: Disabled, unknown, or baseline states + +#### Chart Selection Guide +- **Line charts**: Time series, trends, comparisons over time +- **Bar charts**: Categorical comparisons, top N lists +- **Gauges**: Single value with defined good/bad ranges +- **Stat panels**: Key metrics, percentages, counts +- **Heatmaps**: Distribution data, correlation analysis +- **Tables**: Detailed breakdowns, multi-dimensional data + +## Dashboard Archetypes + +### The 
Overview Dashboard + +**Purpose**: High-level health check and business metrics +**Audience**: Executives, managers, cross-team stakeholders +**Update Frequency**: 5-15 minutes + +```yaml +sections: + - title: "Business Health" + panels: + - service_availability_summary + - revenue_per_hour + - active_users + - conversion_rate + + - title: "System Health" + panels: + - critical_alerts_count + - slo_achievement_summary + - error_budget_remaining + - deployment_status +``` + +### The SRE Operational Dashboard + +**Purpose**: Real-time monitoring and incident response +**Audience**: SRE, on-call engineers +**Update Frequency**: 15-30 seconds + +```yaml +sections: + - title: "Service Status" + panels: + - service_up_status + - active_incidents + - recent_deployments + + - title: "Golden Signals" + panels: + - latency_percentiles + - request_rate + - error_rate + - resource_saturation + + - title: "Infrastructure" + panels: + - cpu_memory_utilization + - network_io + - disk_space +``` + +### The Developer Debug Dashboard + +**Purpose**: Deep-dive troubleshooting and performance analysis +**Audience**: Development teams +**Update Frequency**: 30 seconds - 2 minutes + +```yaml +sections: + - title: "Application Performance" + panels: + - endpoint_latency_breakdown + - database_query_performance + - cache_hit_rates + - queue_depths + + - title: "Errors and Logs" + panels: + - error_rate_by_endpoint + - log_volume_by_level + - exception_types + - slow_queries +``` + +## Layout Patterns + +### The F-Pattern Layout + +Based on eye-tracking studies, users scan in an F-pattern: + +``` +[Critical Status] [SLO Summary ] [Error Budget ] +[Latency ] [Traffic ] [Errors ] +[Saturation ] [Resource Use ] [Detailed View] +[Historical ] [Dependencies ] [Debug Info ] +``` + +### The Z-Pattern Layout + +For executive dashboards, follow the Z-pattern: + +``` +[Business KPIs ] → [System Status] + ↓ ↓ +[Trend Analysis ] ← [Key Metrics ] +``` + +### Responsive Design + +#### Desktop 
(1920x1080) +- 24-column grid +- Panels can be 6, 8, 12, or 24 units wide +- 4-6 rows visible without scrolling + +#### Laptop (1366x768) +- Stack wider panels vertically +- Reduce panel heights +- Prioritize most critical information + +#### Mobile (768px width) +- Single column layout +- Simplified panels +- Touch-friendly controls + +## Effective Panel Design + +### Stat Panels + +```yaml +# Good: Clear value with context +- title: "API Availability" + type: stat + targets: + - expr: avg(up{service="api"}) * 100 + field_config: + unit: percent + thresholds: + steps: + - color: red + value: 0 + - color: yellow + value: 99 + - color: green + value: 99.9 + options: + color_mode: background + text_mode: value_and_name +``` + +### Time Series Panels + +```yaml +# Good: Multiple related metrics with clear legend +- title: "Request Latency" + type: timeseries + targets: + - expr: histogram_quantile(0.50, rate(http_duration_bucket[5m])) + legend: "P50" + - expr: histogram_quantile(0.95, rate(http_duration_bucket[5m])) + legend: "P95" + - expr: histogram_quantile(0.99, rate(http_duration_bucket[5m])) + legend: "P99" + field_config: + unit: ms + custom: + draw_style: line + fill_opacity: 10 + options: + legend: + display_mode: table + placement: bottom + values: [min, max, mean, last] +``` + +### Table Panels + +```yaml +# Good: Top N with relevant columns +- title: "Slowest Endpoints" + type: table + targets: + - expr: topk(10, histogram_quantile(0.95, sum by (handler)(rate(http_duration_bucket[5m])))) + format: table + instant: true + transformations: + - id: organize + options: + exclude_by_name: + Time: true + rename_by_name: + Value: "P95 Latency (ms)" + handler: "Endpoint" +``` + +## Color and Visualization Best Practices + +### Threshold Configuration + +```yaml +# Traffic light system with meaningful boundaries +thresholds: + steps: + - color: green # Good performance + value: null # Default + - color: yellow # Degraded performance + value: 95 # 95th percentile of 
historical normal + - color: orange # Poor performance + value: 99 # 99th percentile of historical normal + - color: red # Critical performance + value: 99.9 # Worst case scenario +``` + +### Color Blind Friendly Palettes + +```yaml +# Use patterns and shapes in addition to color +field_config: + overrides: + - matcher: + id: byName + options: "Critical" + properties: + - id: color + value: + mode: fixed + fixed_color: "#d73027" # Red-orange for protanopia + - id: custom.draw_style + value: "points" # Different shape +``` + +### Consistent Color Semantics + +- **Success/Health**: Green (#28a745) +- **Warning/Degraded**: Yellow (#ffc107) +- **Error/Critical**: Red (#dc3545) +- **Information**: Blue (#007bff) +- **Neutral**: Gray (#6c757d) + +## Time Range Strategy + +### Default Time Ranges by Dashboard Type + +#### Real-time Operational +- **Default**: Last 15 minutes +- **Quick options**: 5m, 15m, 1h, 4h +- **Auto-refresh**: 15-30 seconds + +#### Troubleshooting +- **Default**: Last 1 hour +- **Quick options**: 15m, 1h, 4h, 12h, 1d +- **Auto-refresh**: 1 minute + +#### Business Review +- **Default**: Last 24 hours +- **Quick options**: 1d, 7d, 30d, 90d +- **Auto-refresh**: 5 minutes + +#### Capacity Planning +- **Default**: Last 7 days +- **Quick options**: 7d, 30d, 90d, 1y +- **Auto-refresh**: 15 minutes + +### Time Range Annotations + +```yaml +# Add context for time-based events +annotations: + - name: "Deployments" + datasource: "Prometheus" + expr: "deployment_timestamp" + title_format: "Deploy {{ version }}" + text_format: "Deployed version {{ version }} to {{ environment }}" + + - name: "Incidents" + datasource: "Incident API" + query: "incidents.json?service={{ service }}" + color: "red" +``` + +## Interactive Features + +### Template Variables + +```yaml +# Service selector +- name: service + type: query + query: label_values(up, service) + current: + text: All + value: $__all + include_all: true + multi: true + +# Environment selector +- name: 
environment + type: query + query: label_values(up{service="$service"}, environment) + current: + text: production + value: production +``` + +### Drill-Down Links + +```yaml +# Panel-level drill-downs +- title: "Error Rate" + type: timeseries + # ... other config ... + options: + data_links: + - title: "View Error Logs" + url: "/d/logs-dashboard?var-service=${__field.labels.service}&from=${__from}&to=${__to}" + - title: "Error Traces" + url: "/d/traces-dashboard?var-service=${__field.labels.service}" +``` + +### Dynamic Panel Titles + +```yaml +- title: "${service} - Request Rate" # Uses template variable + type: timeseries + # Title updates automatically when service variable changes +``` + +## Performance Optimization + +### Query Optimization + +#### Use Recording Rules +```yaml +# Instead of complex queries in dashboards +groups: + - name: http_requests + rules: + - record: http_request_rate_5m + expr: sum(rate(http_requests_total[5m])) by (service, method, handler) + + - record: http_request_latency_p95_5m + expr: histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (service, le)) +``` + +#### Limit Data Points +```yaml +# Good: Reasonable resolution for dashboard +- expr: http_request_rate_5m[1h] + interval: 15s # One point every 15 seconds + +# Bad: Too many points for visualization +- expr: http_request_rate_1s[1h] # 3600 points! 
+``` + +### Dashboard Performance + +#### Panel Limits +- **Maximum panels per dashboard**: 20-30 +- **Maximum queries per panel**: 10 +- **Maximum time series per panel**: 50 + +#### Caching Strategy +```yaml +# Use appropriate cache headers +cache_timeout: 30 # Cache for 30 seconds on fast-changing panels +cache_timeout: 300 # Cache for 5 minutes on slow-changing panels +``` + +## Accessibility + +### Screen Reader Support + +```yaml +# Provide text alternatives for visual elements +- title: "Service Health Status" + type: stat + options: + text_mode: value_and_name # Includes both value and description + field_config: + mappings: + - options: + "1": + text: "Healthy" + color: "green" + "0": + text: "Unhealthy" + color: "red" +``` + +### Keyboard Navigation + +- Ensure all interactive elements are keyboard accessible +- Provide logical tab order +- Include skip links for complex dashboards + +### High Contrast Mode + +```yaml +# Test dashboards work in high contrast mode +theme: high_contrast +colors: + - "#000000" # Pure black + - "#ffffff" # Pure white + - "#ffff00" # Pure yellow + - "#ff0000" # Pure red +``` + +## Testing and Validation + +### Dashboard Testing Checklist + +#### Functional Testing +- [ ] All panels load without errors +- [ ] Template variables filter correctly +- [ ] Time range changes update all panels +- [ ] Drill-down links work as expected +- [ ] Auto-refresh functions properly + +#### Visual Testing +- [ ] Dashboard renders correctly on different screen sizes +- [ ] Colors are distinguishable and meaningful +- [ ] Text is readable at normal zoom levels +- [ ] Legends and labels are clear + +#### Performance Testing +- [ ] Dashboard loads in < 5 seconds +- [ ] No queries timeout under normal load +- [ ] Auto-refresh doesn't cause browser lag +- [ ] Memory usage remains reasonable + +#### Usability Testing +- [ ] New team members can understand the dashboard +- [ ] Action items are clear during incidents +- [ ] Key information is quickly 
discoverable +- [ ] Dashboard supports common troubleshooting workflows + +## Maintenance and Governance + +### Dashboard Lifecycle + +#### Creation +1. Define dashboard purpose and audience +2. Identify key metrics and success criteria +3. Design layout following established patterns +4. Implement with consistent styling +5. Test with real data and user scenarios + +#### Maintenance +- **Weekly**: Check for broken panels or queries +- **Monthly**: Review dashboard usage analytics +- **Quarterly**: Gather user feedback and iterate +- **Annually**: Major review and potential redesign + +#### Retirement +- Archive dashboards that are no longer used +- Migrate users to replacement dashboards +- Document lessons learned + +### Dashboard Standards + +```yaml +# Organization dashboard standards +standards: + naming_convention: "[Team] [Service] - [Purpose]" + tags: [team, service_type, environment, purpose] + refresh_intervals: [15s, 30s, 1m, 5m, 15m] + time_ranges: [5m, 15m, 1h, 4h, 1d, 7d, 30d] + color_scheme: "company_standard" + max_panels_per_dashboard: 25 +``` + +## Advanced Patterns + +### Composite Dashboards + +```yaml +# Dashboard that includes panels from other dashboards +- title: "Service Overview" + type: dashlist + targets: + - "service-health" + - "service-performance" + - "service-business-metrics" + options: + show_headings: true + max_items: 10 +``` + +### Dynamic Dashboard Generation + +```python +# Generate dashboards from service definitions +def generate_service_dashboard(service_config): + panels = [] + + # Always include golden signals + panels.extend(generate_golden_signals_panels(service_config)) + + # Add service-specific panels + if service_config.type == 'database': + panels.extend(generate_database_panels(service_config)) + elif service_config.type == 'queue': + panels.extend(generate_queue_panels(service_config)) + + return { + 'title': f"{service_config.name} - Operational Dashboard", + 'panels': panels, + 'variables': 
generate_variables(service_config) + } +``` + +### A/B Testing for Dashboards + +```yaml +# Test different dashboard designs with different teams +experiment: + name: "dashboard_layout_test" + variants: + - name: "traditional_layout" + weight: 50 + config: "dashboard_v1.json" + - name: "f_pattern_layout" + weight: 50 + config: "dashboard_v2.json" + success_metrics: + - "time_to_insight" + - "user_satisfaction" + - "troubleshooting_efficiency" +``` + +Remember: A dashboard should tell a story about your system's health and guide users toward the right actions. Focus on clarity over complexity, and always optimize for the person who will use it during a stressful incident. \ No newline at end of file diff --git a/engineering/observability-designer/references/slo_cookbook.md b/engineering/observability-designer/references/slo_cookbook.md new file mode 100644 index 0000000..3734a2b --- /dev/null +++ b/engineering/observability-designer/references/slo_cookbook.md @@ -0,0 +1,329 @@ +# SLO Cookbook: A Practical Guide to Service Level Objectives + +## Introduction + +Service Level Objectives (SLOs) are a key tool for managing service reliability. This cookbook provides practical guidance for implementing SLOs that actually improve system reliability rather than just creating meaningless metrics. + +## Fundamentals + +### The SLI/SLO/SLA Hierarchy + +- **SLI (Service Level Indicator)**: A quantifiable measure of service quality +- **SLO (Service Level Objective)**: A target range of values for an SLI +- **SLA (Service Level Agreement)**: A business agreement with consequences for missing SLO targets + +### Golden Rule of SLOs + +**Start simple, iterate based on learning.** Your first SLOs won't be perfect, and that's okay. + +## Choosing Good SLIs + +### The Four Golden Signals + +1. **Latency**: How long requests take to complete +2. **Traffic**: How many requests are coming in +3. **Errors**: How many requests are failing +4. 
**Saturation**: How "full" your service is
+
+### SLI Selection Criteria
+
+A good SLI should be:
+- **Measurable**: You can collect data for it
+- **Meaningful**: It reflects user experience
+- **Controllable**: You can take action to improve it
+- **Proportional**: Changes in the SLI reflect changes in user happiness
+
+### Service Type Specific SLIs
+
+#### HTTP APIs
+- **Request latency**: P95 or P99 response time
+- **Availability**: Proportion of successful requests (non-5xx)
+- **Throughput**: Requests per second capacity
+
+```prometheus
+# Availability SLI
+sum(rate(http_requests_total{code!~"5.."}[5m])) / sum(rate(http_requests_total[5m]))
+
+# Latency SLI
+histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))
+```
+
+#### Batch Jobs
+- **Freshness**: Age of the last successful run
+- **Correctness**: Proportion of jobs completing successfully
+- **Throughput**: Items processed per unit time
+
+#### Data Pipelines
+- **Data freshness**: Time since last successful update
+- **Data quality**: Proportion of records passing validation
+- **Processing latency**: Time from ingestion to availability
+
+### Anti-Patterns in SLI Selection
+
+❌ **Don't use**: CPU usage, memory usage, disk space as primary SLIs
+- These are symptoms, not user-facing impacts
+
+❌ **Don't use**: Counts instead of rates or proportions
+- "Number of errors" vs "Error rate"
+
+❌ **Don't use**: Internal metrics that users don't care about
+- Queue depth, cache hit rate (unless they directly impact user experience)
+
+## Setting SLO Targets
+
+### The Art of Target Setting
+
+Setting SLO targets is a balancing act between:
+- **User happiness**: Targets should reflect acceptable user experience
+- **Business value**: Tighter SLOs cost more to maintain
+- **Current performance**: Targets should be achievable but aspirational
+
+### Target Setting Strategies
+
+#### Historical Performance Method
+1. Collect 4-6 weeks of historical data
+2. Calculate the worst user-visible performance in that period
+3. Set your SLO slightly better than the worst acceptable performance
+
+#### User Journey Mapping
+1. Map critical user journeys
+2. Identify acceptable performance for each step
+3. Work backwards to component SLOs
+
+#### Error Budget Approach
+1. Decide how much unreliability you can afford
+2. Set SLO targets based on acceptable error budget consumption
+3. Example: 99.9% availability = 43.8 minutes downtime per month
+
+### SLO Target Examples by Service Criticality
+
+#### Critical Services (Revenue Impact)
+- **Availability**: 99.95% - 99.99%
+- **Latency (P95)**: 100-200ms
+- **Error Rate**: < 0.1%
+
+#### High Priority Services
+- **Availability**: 99.9% - 99.95%
+- **Latency (P95)**: 200-500ms
+- **Error Rate**: < 0.5%
+
+#### Standard Services
+- **Availability**: 99.5% - 99.9%
+- **Latency (P95)**: 500ms - 1s
+- **Error Rate**: < 1%
+
+## Error Budget Management
+
+### What is an Error Budget?
+
+Your error budget is the maximum amount of unreliability you can accumulate while still meeting your SLO. It's calculated as:
+
+```
+Error Budget = (1 - SLO) × Time Window
+```
+
+For a 99.9% availability SLO over 30 days:
+```
+Error Budget = (1 - 0.999) × 30 days = 0.001 × 30 days = 43.2 minutes
+```
+
+### Error Budget Policies
+
+Define what happens when you consume your error budget:
+
+#### Conservative Policy (High-Risk Services)
+- **> 50% consumed**: Freeze non-critical feature releases
+- **> 75% consumed**: Focus entirely on reliability improvements
+- **> 90% consumed**: Consider emergency measures (traffic shaping, etc.)
+ +#### Balanced Policy (Standard Services) +- **> 75% consumed**: Increase focus on reliability work +- **> 90% consumed**: Pause feature work, focus on reliability + +#### Aggressive Policy (Early Stage Services) +- **> 90% consumed**: Review but continue normal operations +- **100% consumed**: Evaluate SLO appropriateness + +### Burn Rate Alerting + +Multi-window burn rate alerts help you catch SLO violations before they become critical: + +```yaml +# Fast burn: 2% budget consumed in 1 hour +- alert: FastBurnSLOViolation + expr: ( + (1 - (sum(rate(http_requests_total{code!~"5.."}[5m])) / sum(rate(http_requests_total[5m])))) > (14.4 * 0.001) + and + (1 - (sum(rate(http_requests_total{code!~"5.."}[1h])) / sum(rate(http_requests_total[1h])))) > (14.4 * 0.001) + ) + for: 2m + +# Slow burn: 10% budget consumed in 3 days +- alert: SlowBurnSLOViolation + expr: ( + (1 - (sum(rate(http_requests_total{code!~"5.."}[6h])) / sum(rate(http_requests_total[6h])))) > (1.0 * 0.001) + and + (1 - (sum(rate(http_requests_total{code!~"5.."}[3d])) / sum(rate(http_requests_total[3d])))) > (1.0 * 0.001) + ) + for: 15m +``` + +## Implementation Patterns + +### The SLO Implementation Ladder + +#### Level 1: Basic SLOs +- Choose 1-2 SLIs that matter most to users +- Set aspirational but achievable targets +- Implement basic alerting when SLOs are missed + +#### Level 2: Operational SLOs +- Add burn rate alerting +- Create error budget dashboards +- Establish error budget policies +- Regular SLO review meetings + +#### Level 3: Advanced SLOs +- Multi-window burn rate alerts +- Automated error budget policy enforcement +- SLO-driven incident prioritization +- Integration with CI/CD for deployment decisions + +### SLO Measurement Architecture + +#### Push vs Pull Metrics +- **Pull** (Prometheus): Good for infrastructure metrics, real-time alerting +- **Push** (StatsD): Good for application metrics, business events + +#### Measurement Points +- **Server-side**: More reliable, easier to 
implement +- **Client-side**: Better reflects user experience +- **Synthetic**: Consistent, predictable, may not reflect real user experience + +### SLO Dashboard Design + +Essential elements for SLO dashboards: + +1. **Current SLO Achievement**: Large, prominent display +2. **Error Budget Remaining**: Visual indicator (gauge, progress bar) +3. **Burn Rate**: Time series showing error budget consumption rate +4. **Historical Trends**: 4-week view of SLO achievement +5. **Alerts**: Current and recent SLO-related alerts + +## Advanced Topics + +### Dependency SLOs + +For services with dependencies: + +``` +SLO_service ≤ min(SLO_inherent, ∏SLO_dependencies) +``` + +If your service depends on 3 other services each with 99.9% SLO: +``` +Maximum_SLO = 0.999³ = 0.997 = 99.7% +``` + +### User Journey SLOs + +Track end-to-end user experiences: + +```prometheus +# Registration success rate +sum(rate(user_registration_success_total[5m])) / sum(rate(user_registration_attempts_total[5m])) + +# Purchase completion latency +histogram_quantile(0.95, rate(purchase_completion_duration_seconds_bucket[5m])) +``` + +### SLOs for Batch Systems + +Special considerations for non-request/response systems: + +#### Freshness SLO +```prometheus +# Data should be no more than 4 hours old +(time() - last_successful_update_timestamp) < (4 * 3600) +``` + +#### Throughput SLO +```prometheus +# Should process at least 1000 items per hour +rate(items_processed_total[1h]) >= 1000 +``` + +#### Quality SLO +```prometheus +# At least 99.5% of records should pass validation +sum(rate(records_valid_total[5m])) / sum(rate(records_processed_total[5m])) >= 0.995 +``` + +## Common Mistakes and How to Avoid Them + +### Mistake 1: Too Many SLOs +**Problem**: Drowning in metrics, losing focus +**Solution**: Start with 1-2 SLOs per service, add more only when needed + +### Mistake 2: Internal Metrics as SLIs +**Problem**: Optimizing for metrics that don't impact users +**Solution**: Always ask "If this metric 
changes, do users notice?" + +### Mistake 3: Perfectionist SLOs +**Problem**: 99.99% SLO when 99.9% would be fine +**Solution**: Higher SLOs cost exponentially more; pick the minimum acceptable level + +### Mistake 4: Ignoring Error Budgets +**Problem**: Treating any SLO miss as an emergency +**Solution**: Error budgets exist to be spent; use them to balance feature velocity and reliability + +### Mistake 5: Static SLOs +**Problem**: Setting SLOs once and never updating them +**Solution**: Review SLOs quarterly; adjust based on user feedback and business changes + +## SLO Review Process + +### Monthly SLO Review Agenda + +1. **SLO Achievement Review**: Did we meet our SLOs? +2. **Error Budget Analysis**: How did we spend our error budget? +3. **Incident Correlation**: Which incidents impacted our SLOs? +4. **SLI Quality Assessment**: Are our SLIs still meaningful? +5. **Target Adjustment**: Should we change any targets? + +### Quarterly SLO Health Check + +1. **User Impact Validation**: Survey users about acceptable performance +2. **Business Alignment**: Do SLOs still reflect business priorities? +3. **Measurement Quality**: Are we measuring the right things? +4. **Cost/Benefit Analysis**: Are tighter SLOs worth the investment? + +## Tooling and Automation + +### Essential Tools + +1. **Metrics Collection**: Prometheus, InfluxDB, CloudWatch +2. **Alerting**: Alertmanager, PagerDuty, OpsGenie +3. **Dashboards**: Grafana, DataDog, New Relic +4. 
**SLO Platforms**: Sloth, Pyrra, Service Level Blue + +### Automation Opportunities + +- **Burn rate alert generation** from SLO definitions +- **Dashboard creation** from SLO specifications +- **Error budget calculation** and tracking +- **Release blocking** based on error budget consumption + +## Getting Started Checklist + +- [ ] Identify your service's critical user journeys +- [ ] Choose 1-2 SLIs that best reflect user experience +- [ ] Collect 4-6 weeks of baseline data +- [ ] Set initial SLO targets based on historical performance +- [ ] Implement basic SLO monitoring and alerting +- [ ] Create an SLO dashboard +- [ ] Define error budget policies +- [ ] Schedule monthly SLO reviews +- [ ] Plan for quarterly SLO health checks + +Remember: SLOs are a journey, not a destination. Start simple, learn from experience, and iterate toward better reliability management. \ No newline at end of file diff --git a/engineering/observability-designer/scripts/alert_optimizer.py b/engineering/observability-designer/scripts/alert_optimizer.py new file mode 100644 index 0000000..cbb0ef4 --- /dev/null +++ b/engineering/observability-designer/scripts/alert_optimizer.py @@ -0,0 +1,1059 @@ +#!/usr/bin/env python3 +""" +Alert Optimizer - Analyze and optimize alert configurations + +This script analyzes existing alert configurations and identifies optimization opportunities: +- Noisy alerts with high false positive rates +- Missing coverage gaps in monitoring +- Duplicate or redundant alerts +- Poor threshold settings and alert fatigue risks +- Missing runbooks and documentation +- Routing and escalation policy improvements + +Usage: + python alert_optimizer.py --input alert_config.json --output optimized_config.json + python alert_optimizer.py --input alerts.json --analyze-only --report report.html +""" + +import json +import argparse +import sys +import re +import math +from typing import Dict, List, Any, Tuple, Set +from datetime import datetime, timedelta +from collections 
import defaultdict, Counter + + +class AlertOptimizer: + """Analyze and optimize alert configurations.""" + + # Alert severity priority mapping + SEVERITY_PRIORITY = { + 'critical': 1, + 'high': 2, + 'warning': 3, + 'info': 4 + } + + # Common noisy alert patterns + NOISY_PATTERNS = [ + r'disk.*usage.*>.*[89]\d%', # Disk usage > 80% often noisy + r'memory.*>.*[89]\d%', # Memory > 80% often noisy + r'cpu.*>.*[789]\d%', # CPU > 70% can be noisy + r'response.*time.*>.*\d+ms', # Low latency thresholds + r'error.*rate.*>.*0\.[01]%' # Very low error rate thresholds + ] + + # Essential monitoring categories + COVERAGE_CATEGORIES = [ + 'availability', + 'latency', + 'error_rate', + 'resource_utilization', + 'security', + 'business_metrics' + ] + + # Golden signals that should always be monitored + GOLDEN_SIGNALS = [ + 'latency', + 'traffic', + 'errors', + 'saturation' + ] + + def __init__(self): + """Initialize the Alert Optimizer.""" + self.alert_config = {} + self.optimization_results = {} + self.alert_analysis = {} + + def load_alert_config(self, file_path: str) -> Dict[str, Any]: + """Load alert configuration from JSON file.""" + try: + with open(file_path, 'r') as f: + return json.load(f) + except FileNotFoundError: + raise ValueError(f"Alert configuration file not found: {file_path}") + except json.JSONDecodeError as e: + raise ValueError(f"Invalid JSON in alert configuration: {e}") + + def analyze_alert_noise(self, alerts: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Identify potentially noisy alerts.""" + noisy_alerts = [] + + for alert in alerts: + noise_score = 0 + noise_reasons = [] + + alert_rule = alert.get('expr', alert.get('condition', '')) + alert_name = alert.get('alert', alert.get('name', 'Unknown')) + + # Check for common noisy patterns + for pattern in self.NOISY_PATTERNS: + if re.search(pattern, alert_rule, re.IGNORECASE): + noise_score += 3 + noise_reasons.append(f"Matches noisy pattern: {pattern}") + + # Check for very frequent evaluation 
intervals + evaluation_interval = alert.get('for', '0s') + if self._parse_duration(evaluation_interval) < 60: # Less than 1 minute + noise_score += 2 + noise_reasons.append("Very short evaluation interval") + + # Check for lack of 'for' clause + if not alert.get('for') or alert.get('for') == '0s': + noise_score += 2 + noise_reasons.append("No 'for' clause - may cause alert flapping") + + # Check for overly sensitive thresholds + if self._has_sensitive_threshold(alert_rule): + noise_score += 2 + noise_reasons.append("Potentially sensitive threshold") + + # Check historical firing rate if available + historical_data = alert.get('historical_data', {}) + if historical_data: + firing_rate = historical_data.get('fires_per_day', 0) + if firing_rate > 10: # More than 10 fires per day + noise_score += 3 + noise_reasons.append(f"High firing rate: {firing_rate} times/day") + + false_positive_rate = historical_data.get('false_positive_rate', 0) + if false_positive_rate > 0.3: # > 30% false positives + noise_score += 4 + noise_reasons.append(f"High false positive rate: {false_positive_rate*100:.1f}%") + + if noise_score >= 3: # Threshold for considering an alert noisy + noisy_alert = { + 'alert_name': alert_name, + 'noise_score': noise_score, + 'reasons': noise_reasons, + 'current_rule': alert_rule, + 'recommendations': self._generate_noise_reduction_recommendations(alert, noise_reasons) + } + noisy_alerts.append(noisy_alert) + + return sorted(noisy_alerts, key=lambda x: x['noise_score'], reverse=True) + + def _parse_duration(self, duration_str: str) -> int: + """Parse duration string to seconds.""" + if not duration_str or duration_str == '0s': + return 0 + + duration_map = {'s': 1, 'm': 60, 'h': 3600, 'd': 86400} + match = re.match(r'(\d+)([smhd])', duration_str) + if match: + value, unit = match.groups() + return int(value) * duration_map.get(unit, 1) + return 0 + + def _has_sensitive_threshold(self, rule: str) -> bool: + """Check if alert rule has potentially sensitive 
thresholds.""" + # Look for very low error rates or very tight latency thresholds + sensitive_patterns = [ + r'error.*rate.*>.*0\.0[01]', # Error rate > 0.01% or 0.001% + r'latency.*>.*[12]\d\d?ms', # Latency > 100-299ms + r'response.*time.*>.*0\.[12]', # Response time > 0.1-0.2s + r'cpu.*>.*[456]\d%' # CPU > 40-69% (too sensitive for most cases) + ] + + for pattern in sensitive_patterns: + if re.search(pattern, rule, re.IGNORECASE): + return True + return False + + def _generate_noise_reduction_recommendations(self, alert: Dict[str, Any], + reasons: List[str]) -> List[str]: + """Generate recommendations to reduce alert noise.""" + recommendations = [] + + if "No 'for' clause" in str(reasons): + recommendations.append("Add 'for: 5m' clause to prevent flapping") + + if "Very short evaluation interval" in str(reasons): + recommendations.append("Increase evaluation interval to at least 1 minute") + + if "sensitive threshold" in str(reasons): + recommendations.append("Review and increase threshold based on historical data") + + if "High firing rate" in str(reasons): + recommendations.append("Analyze historical firing patterns and adjust thresholds") + + if "High false positive rate" in str(reasons): + recommendations.append("Implement more specific conditions to reduce false positives") + + if "noisy pattern" in str(reasons): + recommendations.append("Consider using percentile-based thresholds instead of absolute values") + + return recommendations + + def identify_coverage_gaps(self, alerts: List[Dict[str, Any]], + services: List[Dict[str, Any]] = None) -> Dict[str, Any]: + """Identify gaps in monitoring coverage.""" + coverage_analysis = { + 'missing_categories': [], + 'missing_golden_signals': [], + 'service_coverage_gaps': [], + 'critical_gaps': [], + 'recommendations': [] + } + + # Analyze coverage by category + covered_categories = set() + alert_categories = [] + + for alert in alerts: + alert_rule = alert.get('expr', alert.get('condition', '')) + alert_name = 
alert.get('alert', alert.get('name', '')) + + category = self._classify_alert_category(alert_rule, alert_name) + if category: + covered_categories.add(category) + alert_categories.append(category) + + # Check for missing essential categories + missing_categories = set(self.COVERAGE_CATEGORIES) - covered_categories + coverage_analysis['missing_categories'] = list(missing_categories) + + # Check for missing golden signals + covered_signals = set() + for alert in alerts: + alert_rule = alert.get('expr', alert.get('condition', '')) + signal = self._identify_golden_signal(alert_rule) + if signal: + covered_signals.add(signal) + + missing_signals = set(self.GOLDEN_SIGNALS) - covered_signals + coverage_analysis['missing_golden_signals'] = list(missing_signals) + + # Analyze service-specific coverage if service list provided + if services: + service_coverage = self._analyze_service_coverage(alerts, services) + coverage_analysis['service_coverage_gaps'] = service_coverage + + # Identify critical gaps + critical_gaps = [] + if 'availability' in missing_categories: + critical_gaps.append("Missing availability monitoring") + if 'error_rate' in missing_categories: + critical_gaps.append("Missing error rate monitoring") + if 'errors' in missing_signals: + critical_gaps.append("Missing error signal monitoring") + + coverage_analysis['critical_gaps'] = critical_gaps + + # Generate recommendations + recommendations = self._generate_coverage_recommendations(coverage_analysis) + coverage_analysis['recommendations'] = recommendations + + return coverage_analysis + + def _classify_alert_category(self, rule: str, alert_name: str) -> str: + """Classify alert into monitoring category.""" + rule_lower = rule.lower() + name_lower = alert_name.lower() + + if any(keyword in rule_lower or keyword in name_lower + for keyword in ['up', 'down', 'available', 'reachable']): + return 'availability' + + if any(keyword in rule_lower or keyword in name_lower + for keyword in ['latency', 
'response_time', 'duration']): + return 'latency' + + if any(keyword in rule_lower or keyword in name_lower + for keyword in ['error', 'fail', '5xx', '4xx']): + return 'error_rate' + + if any(keyword in rule_lower or keyword in name_lower + for keyword in ['cpu', 'memory', 'disk', 'network', 'utilization']): + return 'resource_utilization' + + if any(keyword in rule_lower or keyword in name_lower + for keyword in ['security', 'auth', 'login', 'breach']): + return 'security' + + if any(keyword in rule_lower or keyword in name_lower + for keyword in ['revenue', 'conversion', 'user', 'business']): + return 'business_metrics' + + return 'other' + + def _identify_golden_signal(self, rule: str) -> str: + """Identify which golden signal an alert covers.""" + rule_lower = rule.lower() + + if any(keyword in rule_lower for keyword in ['latency', 'response_time', 'duration']): + return 'latency' + + if any(keyword in rule_lower for keyword in ['rate', 'rps', 'qps', 'throughput']): + return 'traffic' + + if any(keyword in rule_lower for keyword in ['error', 'fail', '5xx']): + return 'errors' + + if any(keyword in rule_lower for keyword in ['cpu', 'memory', 'disk', 'utilization']): + return 'saturation' + + return None + + def _analyze_service_coverage(self, alerts: List[Dict[str, Any]], + services: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Analyze monitoring coverage per service.""" + service_coverage = [] + + for service in services: + service_name = service.get('name', '') + service_alerts = [alert for alert in alerts + if service_name in alert.get('expr', '') or + service_name in alert.get('labels', {}).get('service', '')] + + covered_signals = set() + for alert in service_alerts: + signal = self._identify_golden_signal(alert.get('expr', '')) + if signal: + covered_signals.add(signal) + + missing_signals = set(self.GOLDEN_SIGNALS) - covered_signals + + if missing_signals or len(service_alerts) < 3: # Less than 3 alerts per service + coverage_gap = { + 'service': 
service_name, + 'alert_count': len(service_alerts), + 'covered_signals': list(covered_signals), + 'missing_signals': list(missing_signals), + 'criticality': service.get('criticality', 'medium'), + 'recommendations': [] + } + + if len(service_alerts) == 0: + coverage_gap['recommendations'].append("Add basic availability monitoring") + if 'errors' in missing_signals: + coverage_gap['recommendations'].append("Add error rate monitoring") + if 'latency' in missing_signals: + coverage_gap['recommendations'].append("Add latency monitoring") + + service_coverage.append(coverage_gap) + + return service_coverage + + def _generate_coverage_recommendations(self, coverage_analysis: Dict[str, Any]) -> List[str]: + """Generate recommendations to improve monitoring coverage.""" + recommendations = [] + + for missing_category in coverage_analysis['missing_categories']: + if missing_category == 'availability': + recommendations.append("Add service availability/uptime monitoring") + elif missing_category == 'latency': + recommendations.append("Add response time and latency monitoring") + elif missing_category == 'error_rate': + recommendations.append("Add error rate and HTTP status code monitoring") + elif missing_category == 'resource_utilization': + recommendations.append("Add CPU, memory, and disk utilization monitoring") + elif missing_category == 'security': + recommendations.append("Add security monitoring (auth failures, suspicious activity)") + elif missing_category == 'business_metrics': + recommendations.append("Add business KPI monitoring") + + for missing_signal in coverage_analysis['missing_golden_signals']: + recommendations.append(f"Implement {missing_signal} monitoring (Golden Signal)") + + if coverage_analysis['critical_gaps']: + recommendations.append("Address critical monitoring gaps as highest priority") + + return recommendations + + def find_duplicate_alerts(self, alerts: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Identify duplicate or redundant 
alerts.""" + duplicates = [] + alert_signatures = defaultdict(list) + + # Group alerts by signature + for i, alert in enumerate(alerts): + signature = self._generate_alert_signature(alert) + alert_signatures[signature].append((i, alert)) + + # Find exact duplicates + for signature, alert_group in alert_signatures.items(): + if len(alert_group) > 1: + duplicate_group = { + 'type': 'exact_duplicate', + 'signature': signature, + 'alerts': [{'index': i, 'name': alert.get('alert', alert.get('name', f'Alert_{i}'))} + for i, alert in alert_group], + 'recommendation': 'Remove duplicate alerts, keep the most comprehensive one' + } + duplicates.append(duplicate_group) + + # Find semantic duplicates (similar but not identical) + semantic_duplicates = self._find_semantic_duplicates(alerts) + duplicates.extend(semantic_duplicates) + + return duplicates + + def _generate_alert_signature(self, alert: Dict[str, Any]) -> str: + """Generate a signature for alert comparison.""" + expr = alert.get('expr', alert.get('condition', '')) + labels = alert.get('labels', {}) + + # Normalize the expression by removing whitespace and standardizing + normalized_expr = re.sub(r'\s+', ' ', expr).strip() + + # Create signature from expression and key labels + key_labels = {k: v for k, v in labels.items() + if k in ['service', 'severity', 'team']} + + return f"{normalized_expr}::{json.dumps(key_labels, sort_keys=True)}" + + def _find_semantic_duplicates(self, alerts: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Find semantically similar alerts.""" + semantic_duplicates = [] + + # Group alerts by service and metric type + service_groups = defaultdict(list) + + for i, alert in enumerate(alerts): + service = self._extract_service_from_alert(alert) + metric_type = self._extract_metric_type_from_alert(alert) + key = f"{service}::{metric_type}" + service_groups[key].append((i, alert)) + + # Look for similar alerts within each group + for key, alert_group in service_groups.items(): + if 
len(alert_group) > 1: + similar_alerts = self._identify_similar_alerts(alert_group) + if similar_alerts: + semantic_duplicates.extend(similar_alerts) + + return semantic_duplicates + + def _extract_service_from_alert(self, alert: Dict[str, Any]) -> str: + """Extract service name from alert.""" + labels = alert.get('labels', {}) + if 'service' in labels: + return labels['service'] + + expr = alert.get('expr', alert.get('condition', '')) + # Try to extract service from metric labels + service_match = re.search(r'service="([^"]+)"', expr) + if service_match: + return service_match.group(1) + + return 'unknown' + + def _extract_metric_type_from_alert(self, alert: Dict[str, Any]) -> str: + """Extract metric type from alert.""" + expr = alert.get('expr', alert.get('condition', '')) + + # Common metric patterns + if 'up' in expr.lower(): + return 'availability' + elif any(keyword in expr.lower() for keyword in ['latency', 'duration', 'response_time']): + return 'latency' + elif any(keyword in expr.lower() for keyword in ['error', 'fail', '5xx']): + return 'error_rate' + elif any(keyword in expr.lower() for keyword in ['cpu', 'memory', 'disk']): + return 'resource' + + return 'other' + + def _identify_similar_alerts(self, alert_group: List[Tuple[int, Dict[str, Any]]]) -> List[Dict[str, Any]]: + """Identify similar alerts within a group.""" + similar_groups = [] + + # Simple similarity check based on threshold values and conditions + threshold_groups = defaultdict(list) + + for index, alert in alert_group: + expr = alert.get('expr', alert.get('condition', '')) + threshold = self._extract_threshold_from_expression(expr) + severity = alert.get('labels', {}).get('severity', 'unknown') + + similarity_key = f"{threshold}::{severity}" + threshold_groups[similarity_key].append((index, alert)) + + # If multiple alerts have very similar thresholds, they might be redundant + for similarity_key, similar_alerts in threshold_groups.items(): + if len(similar_alerts) > 1: + similar_group 
= { + 'type': 'semantic_duplicate', + 'similarity_key': similarity_key, + 'alerts': [{'index': i, 'name': alert.get('alert', alert.get('name', f'Alert_{i}'))} + for i, alert in similar_alerts], + 'recommendation': 'Review for potential consolidation - similar thresholds and conditions' + } + similar_groups.append(similar_group) + + return similar_groups + + def _extract_threshold_from_expression(self, expr: str) -> str: + """Extract threshold value from alert expression.""" + # Look for common threshold patterns + threshold_patterns = [ + r'>[\s]*([0-9.]+)', + r'<[\s]*([0-9.]+)', + r'>=[\s]*([0-9.]+)', + r'<=[\s]*([0-9.]+)', + r'==[\s]*([0-9.]+)' + ] + + for pattern in threshold_patterns: + match = re.search(pattern, expr) + if match: + return match.group(1) + + return 'unknown' + + def analyze_thresholds(self, alerts: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Analyze alert thresholds for optimization opportunities.""" + threshold_analysis = [] + + for alert in alerts: + alert_name = alert.get('alert', alert.get('name', 'Unknown')) + expr = alert.get('expr', alert.get('condition', '')) + + analysis = { + 'alert_name': alert_name, + 'current_expression': expr, + 'threshold_issues': [], + 'recommendations': [] + } + + # Check for hard-coded thresholds + if re.search(r'[><=]\s*[0-9.]+', expr): + analysis['threshold_issues'].append('Hard-coded threshold value') + analysis['recommendations'].append('Consider parameterizing thresholds') + + # Check for percentage-based thresholds that might be too strict + percentage_match = re.search(r'([><=])\s*0?\.\d+', expr) + if percentage_match: + operator = percentage_match.group(1) + if operator in ['>', '>='] and 'error' in expr.lower(): + analysis['threshold_issues'].append('Very low error rate threshold') + analysis['recommendations'].append('Consider increasing error rate threshold based on SLO') + + # Check for missing hysteresis + if '>' in expr and 'for:' not in str(alert): + 
analysis['threshold_issues'].append('No hysteresis (for clause)') + analysis['recommendations'].append('Add "for" clause to prevent alert flapping') + + # Check for resource utilization thresholds + if any(resource in expr.lower() for resource in ['cpu', 'memory', 'disk']): + threshold_value = self._extract_threshold_from_expression(expr) + if threshold_value and threshold_value.replace('.', '').isdigit(): + threshold_num = float(threshold_value) + if threshold_num < 0.7: # Less than 70% + analysis['threshold_issues'].append('Low resource utilization threshold') + analysis['recommendations'].append('Consider increasing threshold to reduce noise') + + # Add historical data analysis if available + historical_data = alert.get('historical_data', {}) + if historical_data: + false_positive_rate = historical_data.get('false_positive_rate', 0) + if false_positive_rate > 0.2: + analysis['threshold_issues'].append(f'High false positive rate: {false_positive_rate*100:.1f}%') + analysis['recommendations'].append('Analyze historical data and adjust threshold') + + if analysis['threshold_issues']: + threshold_analysis.append(analysis) + + return threshold_analysis + + def assess_alert_fatigue_risk(self, alerts: List[Dict[str, Any]]) -> Dict[str, Any]: + """Assess risk of alert fatigue.""" + fatigue_assessment = { + 'total_alerts': len(alerts), + 'risk_level': 'low', + 'risk_factors': [], + 'metrics': {}, + 'recommendations': [] + } + + # Count alerts by severity + severity_counts = Counter() + for alert in alerts: + severity = alert.get('labels', {}).get('severity', 'unknown') + severity_counts[severity] += 1 + + fatigue_assessment['metrics']['severity_distribution'] = dict(severity_counts) + + # Calculate risk factors + critical_count = severity_counts.get('critical', 0) + warning_count = severity_counts.get('warning', 0) + severity_counts.get('high', 0) + total_high_priority = critical_count + warning_count + + # Too many high-priority alerts + if total_high_priority > 50: + 
def _estimate_daily_alert_volume(self, alerts: List[Dict[str, Any]]) -> int:
    """Estimate how many times per day this alert set fires in total.

    Prefers recorded 'fires_per_day' history; otherwise falls back to a
    heuristic based on severity and the kind of expression.
    """
    # Heuristic baseline fires/day by severity (critical should rarely fire).
    base_by_severity = {'critical': 0.1, 'high': 0.5, 'warning': 2, 'info': 5}
    total = 0.0

    for alert in alerts:
        # Recorded history wins over any heuristic.
        history = alert.get('historical_data', {})
        if history and 'fires_per_day' in history:
            total += history['fires_per_day']
            continue

        expr = alert.get('expr', alert.get('condition', '')).lower()
        severity = alert.get('labels', {}).get('severity', 'warning')
        per_day = base_by_severity.get(severity, 1)

        # Error-rate alerts tend to fire more often; availability/up
        # alerts should be rare.
        if 'error_rate' in expr:
            per_day *= 1.5
        elif 'availability' in expr or 'up' in expr:
            per_day *= 0.5

        total += per_day

    return int(total)
def generate_optimized_config(self, alerts: List[Dict[str, Any]],
                              analysis_results: Dict[str, Any]) -> Dict[str, Any]:
    """Produce an optimized alert configuration from analysis results.

    Applies noise-reduction and threshold fixes per alert, normalizes
    metadata, drops exact duplicates and appends alerts for any coverage
    gaps, then wraps everything with optimization metadata.
    """
    noise_findings = analysis_results.get('noisy_alerts', [])
    threshold_findings = analysis_results.get('threshold_analysis', [])

    tuned: List[Dict[str, Any]] = []
    for index, alert in enumerate(alerts):
        name = alert.get('alert', alert.get('name', f'Alert_{index}'))
        candidate = alert.copy()

        # First matching noise finding (if any) drives noise reduction.
        noise_info = next((n for n in noise_findings if n['alert_name'] == name), None)
        if noise_info is not None:
            candidate = self._apply_noise_reduction(candidate, noise_info)

        # Same for threshold findings.
        threshold_info = next((t for t in threshold_findings if t['alert_name'] == name), None)
        if threshold_info is not None:
            candidate = self._apply_threshold_optimization(candidate, threshold_info)

        tuned.append(self._ensure_alert_metadata(candidate))

    if 'duplicate_alerts' in analysis_results:
        tuned = self._remove_duplicate_alerts(tuned, analysis_results['duplicate_alerts'])

    if 'coverage_gaps' in analysis_results:
        tuned.extend(self._generate_missing_alerts(analysis_results['coverage_gaps']))

    return {
        'alerts': tuned,
        'optimization_metadata': {
            'optimized_at': datetime.utcnow().isoformat() + 'Z',
            'original_count': len(alerts),
            'optimized_count': len(tuned),
            'changes_applied': analysis_results.get('optimizations_applied', []),
        },
    }

def _apply_noise_reduction(self, alert: Dict[str, Any],
                           noise_analysis: Dict[str, Any]) -> Dict[str, Any]:
    """Apply noise-reduction hints from a noise finding to one alert."""
    result = dict(alert)
    for hint in noise_analysis['recommendations']:
        if 'for:' in hint and not alert.get('for'):
            # Add a hold-off period to stop flapping.
            result['for'] = '5m'
        elif 'threshold' in hint.lower():
            # Threshold tuning needs human judgement - flag for review.
            result.setdefault('annotations', {})['optimization_note'] = 'Review threshold - potentially too sensitive'
    return result

def _apply_threshold_optimization(self, alert: Dict[str, Any],
                                  threshold_analysis: Dict[str, Any]) -> Dict[str, Any]:
    """Apply threshold findings (hysteresis + review notes) to one alert."""
    result = dict(alert)

    # Add a 'for' clause when the analysis flagged missing hysteresis.
    if 'No hysteresis' in str(threshold_analysis['threshold_issues']) and not alert.get('for'):
        result['for'] = '5m'

    recs = threshold_analysis['recommendations']
    if recs:
        result.setdefault('annotations', {})['threshold_recommendations'] = '; '.join(recs)

    return result

def _ensure_alert_metadata(self, alert: Dict[str, Any]) -> Dict[str, Any]:
    """Guarantee every alert has annotations (summary/description) and a severity label."""
    result = dict(alert)

    annotations = result.setdefault('annotations', {})
    if 'summary' not in annotations:
        annotations['summary'] = f"Alert: {alert.get('alert', alert.get('name', 'Alert'))}"
    if 'description' not in annotations:
        annotations['description'] = 'This alert requires a description. Please update with specific details about the condition and impact.'

    labels = result.setdefault('labels', {})
    if 'severity' not in labels:
        labels['severity'] = 'warning'

    return result
metadata.""" + optimized_alert = alert.copy() + + # Ensure annotations exist + if 'annotations' not in optimized_alert: + optimized_alert['annotations'] = {} + + # Add summary if missing + if 'summary' not in optimized_alert['annotations']: + alert_name = alert.get('alert', alert.get('name', 'Alert')) + optimized_alert['annotations']['summary'] = f"Alert: {alert_name}" + + # Add description if missing + if 'description' not in optimized_alert['annotations']: + optimized_alert['annotations']['description'] = 'This alert requires a description. Please update with specific details about the condition and impact.' + + # Ensure proper labels + if 'labels' not in optimized_alert: + optimized_alert['labels'] = {} + + if 'severity' not in optimized_alert['labels']: + optimized_alert['labels']['severity'] = 'warning' + + return optimized_alert + + def _remove_duplicate_alerts(self, alerts: List[Dict[str, Any]], + duplicates: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Remove duplicate alerts from the list.""" + indices_to_remove = set() + + for duplicate_group in duplicates: + if duplicate_group['type'] == 'exact_duplicate': + # Keep the first alert, remove the rest + alert_indices = [alert_info['index'] for alert_info in duplicate_group['alerts']] + indices_to_remove.update(alert_indices[1:]) # Remove all but first + + return [alert for i, alert in enumerate(alerts) if i not in indices_to_remove] + + def _generate_missing_alerts(self, coverage_gaps: Dict[str, Any]) -> List[Dict[str, Any]]: + """Generate alerts for missing coverage.""" + new_alerts = [] + + for missing_signal in coverage_gaps.get('missing_golden_signals', []): + if missing_signal == 'latency': + new_alert = { + 'alert': 'HighLatency', + 'expr': 'histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 0.5', + 'for': '5m', + 'labels': { + 'severity': 'warning' + }, + 'annotations': { + 'summary': 'High request latency detected', + 'description': 'The 95th percentile latency is above 
def analyze_configuration(self, alert_config: Dict[str, Any]) -> Dict[str, Any]:
    """Run every analysis pass over an alert configuration.

    Accepts either an 'alerts' or a 'rules' list, plus an optional
    'services' list for coverage analysis, and returns one combined
    results dict including overall recommendations.
    """
    rules = alert_config.get('alerts', alert_config.get('rules', []))
    services = alert_config.get('services', [])

    results = {
        'summary': {
            'total_alerts': len(rules),
            'analysis_timestamp': datetime.utcnow().isoformat() + 'Z',
        },
        'noisy_alerts': self.analyze_alert_noise(rules),
        'coverage_gaps': self.identify_coverage_gaps(rules, services),
        'duplicate_alerts': self.find_duplicate_alerts(rules),
        'threshold_analysis': self.analyze_thresholds(rules),
        'alert_fatigue_assessment': self.assess_alert_fatigue_risk(rules),
    }

    # Roll the individual findings up into prioritized advice.
    results['overall_recommendations'] = self._generate_overall_recommendations(results)
    return results

def _generate_overall_recommendations(self, analysis_results: Dict[str, Any]) -> List[str]:
    """Condense the full analysis into an ordered recommendation list."""
    recs: List[str] = []

    # High-priority items first.
    if analysis_results['alert_fatigue_assessment']['risk_level'] == 'high':
        recs.append("HIGH PRIORITY: Address alert fatigue risk by reducing alert volume")
    if analysis_results['coverage_gaps']['critical_gaps']:
        recs.append("HIGH PRIORITY: Address critical monitoring gaps")

    # Medium-priority, data-driven items.
    noisy_total = len(analysis_results['noisy_alerts'])
    if noisy_total:
        recs.append(f"Optimize {noisy_total} noisy alerts to reduce false positives")
    duplicate_total = len(analysis_results['duplicate_alerts'])
    if duplicate_total:
        recs.append(f"Remove or consolidate {duplicate_total} duplicate alert groups")

    # Always-applicable hygiene items.
    recs.extend([
        "Implement proper alert routing and escalation policies",
        "Create runbooks for all production alerts",
        "Set up alert effectiveness monitoring and regular reviews",
    ])
    return recs
def export_analysis(self, analysis_results: Dict[str, Any], output_file: str,
                    format_type: str = 'json'):
    """Write *analysis_results* to *output_file*.

    format_type is 'json' (pretty-printed) or 'html' (report page);
    any other value raises ValueError.
    """
    if format_type.lower() == 'json':
        with open(output_file, 'w') as f:
            json.dump(analysis_results, f, indent=2)
    elif format_type.lower() == 'html':
        self._export_html_report(analysis_results, output_file)
    else:
        raise ValueError(f"Unsupported format: {format_type}")

def _export_html_report(self, analysis_results: Dict[str, Any], output_file: str):
    """Render the analysis as HTML and write it to *output_file*."""
    html_content = self._generate_html_report(analysis_results)
    with open(output_file, 'w') as f:
        f.write(html_content)

def _generate_html_report(self, analysis_results: Dict[str, Any]) -> str:
    """Generate an HTML report of the analysis results.

    BUG FIX: alert names, noise reasons, risk factors and recommendations
    come straight from user-supplied alert configurations; they are now
    passed through html.escape() so markup-like characters ('<', '&', …)
    cannot break the report or inject script into it.
    """
    from html import escape  # local import keeps module-level deps unchanged

    summary = analysis_results['summary']
    fatigue = analysis_results['alert_fatigue_assessment']
    gaps = analysis_results['coverage_gaps']
    noisy = analysis_results['noisy_alerts']

    def bullets(items):
        # Escaped <li> list for arbitrary user-supplied strings.
        return '<ul>' + ''.join(f'<li>{escape(str(item))}</li>' for item in items) + '</ul>'

    # Show at most the five noisiest alerts, mirroring print_summary's cap.
    noisy_blocks = ''.join(
        f"<div class='alert'><h3>{escape(str(entry['alert_name']))} "
        f"(Score: {escape(str(entry['noise_score']))})</h3>"
        f"{bullets(entry['reasons'])}</div>"
        for entry in noisy[:5]
    )

    missing_categories = ', '.join(gaps['missing_categories']) or 'None'
    missing_signals = ', '.join(gaps['missing_golden_signals']) or 'None'

    html = f"""<!DOCTYPE html>
<html>
<head>
  <title>Alert Configuration Analysis Report</title>
</head>
<body>
  <h1>Alert Configuration Analysis Report</h1>
  <p>Generated: {escape(str(summary['analysis_timestamp']))}</p>
  <p>Total Alerts Analyzed: {summary['total_alerts']}</p>

  <h2>Overall Recommendations</h2>
  {bullets(analysis_results['overall_recommendations'])}

  <h2>Alert Fatigue Assessment</h2>
  <p>Risk Level: {escape(fatigue['risk_level'].upper())}</p>
  <p>Risk Factors:</p>
  {bullets(fatigue['risk_factors'])}

  <h2>Noisy Alerts ({len(noisy)})</h2>
  {noisy_blocks}

  <h2>Coverage Gaps</h2>
  <p>Missing Categories: {escape(missing_categories)}</p>
  <p>Missing Golden Signals: {escape(missing_signals)}</p>
  <p>Critical Gaps: {len(gaps['critical_gaps'])}</p>
</body>
</html>
"""
    return html
def print_summary(self, analysis_results: Dict[str, Any]):
    """Print a human-readable summary of *analysis_results* to stdout."""
    banner = '=' * 60
    print(f"\n{banner}")
    print("ALERT CONFIGURATION ANALYSIS SUMMARY")
    print(banner)

    # Headline numbers.
    summary = analysis_results['summary']
    print("\nOverall Statistics:")
    print(f"  Total Alerts: {summary['total_alerts']}")
    print(f"  Analysis Date: {summary['analysis_timestamp']}")

    # Fatigue assessment with its contributing factors.
    fatigue = analysis_results['alert_fatigue_assessment']
    print(f"\nAlert Fatigue Risk: {fatigue['risk_level'].upper()}")
    if fatigue['risk_factors']:
        print("  Risk Factors:")
        for factor in fatigue['risk_factors']:
            print(f"    • {factor}")

    # Noisiest alerts (top three only).
    noisy = analysis_results['noisy_alerts']
    print(f"\nNoisy Alerts: {len(noisy)}")
    if noisy:
        print("  Top 3 Noisiest:")
        for entry in noisy[:3]:
            print(f"    • {entry['alert_name']} (Score: {entry['noise_score']})")

    # Coverage gap counts.
    gaps = analysis_results['coverage_gaps']
    print("\nMonitoring Coverage:")
    print(f"  Missing Categories: {len(gaps['missing_categories'])}")
    print(f"  Missing Golden Signals: {len(gaps['missing_golden_signals'])}")
    print(f"  Critical Gaps: {len(gaps['critical_gaps'])}")

    print(f"\nDuplicate Alerts: {len(analysis_results['duplicate_alerts'])} groups")

    # Up to five numbered recommendations.
    print("\nTop Recommendations:")
    for position, recommendation in enumerate(analysis_results['overall_recommendations'][:5], 1):
        print(f"  {position}. {recommendation}")

    print(f"\n{banner}\n")
{rec}") + + print(f"\n{'='*60}\n") + + +def main(): + """Main function for CLI usage.""" + parser = argparse.ArgumentParser( + description='Analyze and optimize alert configurations', + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Analyze alert configuration + python alert_optimizer.py --input alerts.json --analyze-only + + # Generate optimized configuration + python alert_optimizer.py --input alerts.json --output optimized_alerts.json + + # Generate HTML report + python alert_optimizer.py --input alerts.json --report report.html --format html + """ + ) + + parser.add_argument('--input', '-i', required=True, + help='Input alert configuration JSON file') + parser.add_argument('--output', '-o', + help='Output optimized configuration JSON file') + parser.add_argument('--report', '-r', + help='Generate analysis report file') + parser.add_argument('--format', choices=['json', 'html'], default='json', + help='Report format (json or html)') + parser.add_argument('--analyze-only', action='store_true', + help='Only perform analysis, do not generate optimized config') + + args = parser.parse_args() + + optimizer = AlertOptimizer() + + try: + # Load alert configuration + alert_config = optimizer.load_alert_config(args.input) + + # Perform analysis + analysis_results = optimizer.analyze_configuration(alert_config) + + # Generate optimized configuration if requested + if not args.analyze_only: + optimized_config = optimizer.generate_optimized_config( + alert_config.get('alerts', alert_config.get('rules', [])), + analysis_results + ) + + output_file = args.output or 'optimized_alerts.json' + optimizer.export_analysis(optimized_config, output_file, 'json') + print(f"Optimized configuration saved to: {output_file}") + + # Generate report if requested + if args.report: + optimizer.export_analysis(analysis_results, args.report, args.format) + print(f"Analysis report saved to: {args.report}") + + # Always show summary + 
optimizer.print_summary(analysis_results) + + except Exception as e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(1) + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/engineering/observability-designer/scripts/dashboard_generator.py b/engineering/observability-designer/scripts/dashboard_generator.py new file mode 100644 index 0000000..a07b077 --- /dev/null +++ b/engineering/observability-designer/scripts/dashboard_generator.py @@ -0,0 +1,1219 @@ +#!/usr/bin/env python3 +""" +Dashboard Generator - Generate comprehensive dashboard specifications + +This script generates dashboard specifications based on service/system descriptions: +- Panel layout optimized for different screen sizes and roles +- Metric queries (Prometheus-style) for comprehensive monitoring +- Visualization types appropriate for different metric types +- Drill-down paths for effective troubleshooting workflows +- Golden signals coverage (latency, traffic, errors, saturation) +- RED/USE method implementation +- Business metrics integration + +Usage: + python dashboard_generator.py --input service_definition.json --output dashboard_spec.json + python dashboard_generator.py --service-type api --name "Payment Service" --output payment_dashboard.json +""" + +import json +import argparse +import sys +import math +from typing import Dict, List, Any, Tuple +from datetime import datetime, timedelta + + +class DashboardGenerator: + """Generate comprehensive dashboard specifications.""" + + # Dashboard layout templates by role + ROLE_LAYOUTS = { + 'sre': { + 'primary_focus': ['availability', 'latency', 'errors', 'resource_utilization'], + 'secondary_focus': ['throughput', 'capacity', 'dependencies'], + 'time_ranges': ['1h', '6h', '1d', '7d'], + 'default_refresh': '30s' + }, + 'developer': { + 'primary_focus': ['latency', 'errors', 'throughput', 'business_metrics'], + 'secondary_focus': ['resource_utilization', 'dependencies'], + 'time_ranges': ['15m', '1h', '6h', '1d'], + 
'default_refresh': '1m' + }, + 'executive': { + 'primary_focus': ['availability', 'business_metrics', 'user_experience'], + 'secondary_focus': ['cost', 'capacity_trends'], + 'time_ranges': ['1d', '7d', '30d'], + 'default_refresh': '5m' + }, + 'ops': { + 'primary_focus': ['resource_utilization', 'capacity', 'alerts', 'deployments'], + 'secondary_focus': ['throughput', 'latency'], + 'time_ranges': ['5m', '30m', '2h', '1d'], + 'default_refresh': '15s' + } + } + + # Service type specific metric configurations + SERVICE_METRICS = { + 'api': { + 'golden_signals': ['latency', 'traffic', 'errors', 'saturation'], + 'key_metrics': [ + 'http_requests_total', + 'http_request_duration_seconds', + 'http_request_size_bytes', + 'http_response_size_bytes' + ], + 'resource_metrics': ['cpu_usage', 'memory_usage', 'goroutines'] + }, + 'web': { + 'golden_signals': ['latency', 'traffic', 'errors', 'saturation'], + 'key_metrics': [ + 'http_requests_total', + 'http_request_duration_seconds', + 'page_load_time', + 'user_sessions' + ], + 'resource_metrics': ['cpu_usage', 'memory_usage', 'connections'] + }, + 'database': { + 'golden_signals': ['latency', 'traffic', 'errors', 'saturation'], + 'key_metrics': [ + 'db_connections_active', + 'db_query_duration_seconds', + 'db_queries_total', + 'db_slow_queries_total' + ], + 'resource_metrics': ['cpu_usage', 'memory_usage', 'disk_io', 'connections'] + }, + 'queue': { + 'golden_signals': ['latency', 'traffic', 'errors', 'saturation'], + 'key_metrics': [ + 'queue_depth', + 'message_processing_duration', + 'messages_published_total', + 'messages_consumed_total' + ], + 'resource_metrics': ['cpu_usage', 'memory_usage', 'disk_usage'] + } + } + + # Visualization type recommendations + VISUALIZATION_TYPES = { + 'latency': 'line_chart', + 'throughput': 'line_chart', + 'error_rate': 'line_chart', + 'success_rate': 'stat', + 'resource_utilization': 'gauge', + 'queue_depth': 'bar_chart', + 'status': 'stat', + 'distribution': 'heatmap', + 'alerts': 'table', + 
def __init__(self):
    """Start with empty service and dashboard state."""
    self.service_config = {}
    self.dashboard_spec = {}

def load_service_definition(self, file_path: str) -> Dict[str, Any]:
    """Load a service definition from a JSON file.

    Raises ValueError when the file is missing or contains invalid JSON.
    """
    try:
        with open(file_path, 'r') as handle:
            return json.load(handle)
    except FileNotFoundError:
        raise ValueError(f"Service definition file not found: {file_path}")
    except json.JSONDecodeError as exc:
        raise ValueError(f"Invalid JSON in service definition: {exc}")

def create_service_definition(self, service_type: str, name: str,
                              criticality: str = 'medium') -> Dict[str, Any]:
    """Build a minimal in-memory service definition from CLI parameters."""
    description = f'{name} - A {criticality} criticality {service_type} service'
    return {
        'name': name,
        'type': service_type,
        'criticality': criticality,
        'description': description,
        'team': 'platform',
        'environment': 'production',
        'dependencies': [],
        'tags': []
    }

def generate_dashboard_specification(self, service_def: Dict[str, Any],
                                     target_role: str = 'sre') -> Dict[str, Any]:
    """Assemble the complete dashboard spec for *service_def* and *target_role*.

    Unknown roles fall back to the 'sre' layout.
    """
    service_name = service_def.get('name', 'Service')
    role_config = self.ROLE_LAYOUTS.get(target_role, self.ROLE_LAYOUTS['sre'])
    time_ranges = role_config['time_ranges']

    return {
        'metadata': {
            'title': f"{service_name} - {target_role.upper()} Dashboard",
            'service': service_def,
            'target_role': target_role,
            'generated_at': datetime.utcnow().isoformat() + 'Z',
            'version': '1.0'
        },
        'configuration': {
            'time_ranges': time_ranges,
            'default_time_range': time_ranges[1],  # second option is the default
            'refresh_interval': role_config['default_refresh'],
            'timezone': 'UTC',
            'theme': 'dark'
        },
        'layout': self._generate_dashboard_layout(service_def, role_config),
        'panels': self._generate_panels(service_def, role_config),
        'variables': self._generate_template_variables(service_def),
        'alerts_integration': self._generate_alerts_integration(service_def),
        'drill_down_paths': self._generate_drill_down_paths(service_def)
    }
def _generate_dashboard_layout(self, service_def: Dict[str, Any],
                               role_config: Dict[str, Any]) -> Dict[str, Any]:
    """Describe the grid settings and the four standard dashboard sections."""
    sections = [
        {'title': 'Service Overview', 'collapsed': False, 'y_position': 0,
         'panels': ['service_status', 'slo_summary', 'error_budget']},
        {'title': 'Golden Signals', 'collapsed': False, 'y_position': 8,
         'panels': ['latency', 'traffic', 'errors', 'saturation']},
        {'title': 'Resource Utilization', 'collapsed': False, 'y_position': 16,
         'panels': ['cpu_usage', 'memory_usage', 'network_io', 'disk_io']},
        # Dependencies start collapsed - they are drill-down material.
        {'title': 'Dependencies & Downstream', 'collapsed': True, 'y_position': 24,
         'panels': ['dependency_status', 'downstream_latency', 'circuit_breakers']},
    ]
    return {
        'grid_settings': {
            'width': 24,  # Grafana-style 24-column grid
            'height_unit': 'px',
            'cell_height': 30
        },
        'sections': sections,
    }

def _generate_panels(self, service_def: Dict[str, Any],
                     role_config: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Collect all panels: common, service-type-specific and role-specific."""
    service_type = service_def.get('type', 'api')
    panels: List[Dict[str, Any]] = []

    # Panels every dashboard gets.
    panels += self._create_overview_panels(service_def)
    panels += self._create_golden_signals_panels(service_def)
    panels += self._create_resource_panels(service_def)

    # Panels keyed off the service type.
    if service_type == 'api':
        panels += self._create_api_specific_panels(service_def)
    elif service_type == 'database':
        panels += self._create_database_specific_panels(service_def)
    elif service_type == 'queue':
        panels += self._create_queue_specific_panels(service_def)

    # Panels keyed off the target role's primary focus.
    focus = role_config['primary_focus']
    if 'business_metrics' in focus:
        panels += self._create_business_metrics_panels(service_def)
    if 'capacity' in focus:
        panels += self._create_capacity_panels(service_def)

    return panels
def _create_overview_panels(self, service_def: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Build the three overview panels: status, SLO summary and error budget."""
    service_name = service_def.get('name', 'service')

    status_panel = {
        'id': 'service_status',
        'title': 'Service Status',
        'type': 'stat',
        'grid_pos': {'x': 0, 'y': 0, 'w': 6, 'h': 4},
        'targets': [
            {'expr': f'up{{service="{service_name}"}}', 'legendFormat': 'Status'}
        ],
        'field_config': {
            'overrides': [
                {
                    'matcher': {'id': 'byName', 'options': 'Status'},
                    'properties': [
                        {'id': 'color', 'value': {'mode': 'thresholds'}},
                        {'id': 'thresholds', 'value': {
                            'steps': [
                                {'color': 'red', 'value': 0},
                                {'color': 'green', 'value': 1}
                            ]
                        }},
                        # Map the numeric up-metric to readable text.
                        {'id': 'mappings', 'value': [
                            {'options': {'0': {'text': 'DOWN'}}, 'type': 'value'},
                            {'options': {'1': {'text': 'UP'}}, 'type': 'value'}
                        ]}
                    ]
                }
            ]
        },
        'options': {'orientation': 'horizontal', 'textMode': 'value_and_name'}
    }

    slo_panel = {
        'id': 'slo_summary',
        'title': 'SLO Achievement (30d)',
        'type': 'stat',
        'grid_pos': {'x': 6, 'y': 0, 'w': 9, 'h': 4},
        'targets': [
            {
                'expr': f'(1 - (increase(http_requests_total{{service="{service_name}",code=~"5.."}}[30d]) / increase(http_requests_total{{service="{service_name}"}}[30d]))) * 100',
                'legendFormat': 'Availability'
            },
            {
                'expr': f'histogram_quantile(0.95, increase(http_request_duration_seconds_bucket{{service="{service_name}"}}[30d])) * 1000',
                'legendFormat': 'P95 Latency (ms)'
            }
        ],
        'field_config': {
            'defaults': {
                'color': {'mode': 'thresholds'},
                'thresholds': {
                    'steps': [
                        {'color': 'red', 'value': 0},
                        {'color': 'yellow', 'value': 99.0},
                        {'color': 'green', 'value': 99.9}
                    ]
                }
            }
        },
        'options': {'orientation': 'horizontal', 'textMode': 'value_and_name'}
    }

    budget_panel = {
        'id': 'error_budget',
        'title': 'Error Budget Remaining',
        'type': 'gauge',
        'grid_pos': {'x': 15, 'y': 0, 'w': 9, 'h': 4},
        'targets': [
            {
                # Remaining fraction of a 99.9% availability budget, as a percentage.
                'expr': f'(1 - (increase(http_requests_total{{service="{service_name}",code=~"5.."}}[30d]) / increase(http_requests_total{{service="{service_name}"}}[30d])) - 0.999) / 0.001 * 100',
                'legendFormat': 'Error Budget %'
            }
        ],
        'field_config': {
            'defaults': {
                'color': {'mode': 'thresholds'},
                'min': 0,
                'max': 100,
                'thresholds': {
                    'steps': [
                        {'color': 'red', 'value': 0},
                        {'color': 'yellow', 'value': 25},
                        {'color': 'green', 'value': 50}
                    ]
                },
                'unit': 'percent'
            }
        },
        'options': {'showThresholdLabels': True, 'showThresholdMarkers': True}
    }

    return [status_panel, slo_panel, budget_panel]

def _create_golden_signals_panels(self, service_def: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Build the four golden-signal panels: latency, traffic, errors, saturation."""
    service_name = service_def.get('name', 'service')

    def hover_and_legend():
        # Fresh dict per panel so an edit to one panel cannot leak into another.
        return {
            'tooltip': {'mode': 'multi', 'sort': 'desc'},
            'legend': {'displayMode': 'table', 'placement': 'bottom'},
        }

    def quantile_target(quantile, label):
        return {
            'expr': f'histogram_quantile({quantile}, rate(http_request_duration_seconds_bucket{{service="{service_name}"}}[5m])) * 1000',
            'legendFormat': label,
        }

    def rps_target(selector, label):
        return {
            'expr': f'sum(rate(http_requests_total{{service="{service_name}"{selector}}}[5m]))',
            'legendFormat': label,
        }

    def error_ratio_target(code_class, label):
        return {
            'expr': (
                f'sum(rate(http_requests_total{{service="{service_name}",code=~"{code_class}.."}}[5m])) / '
                f'sum(rate(http_requests_total{{service="{service_name}"}}[5m])) * 100'
            ),
            'legendFormat': label,
        }

    latency_panel = {
        'id': 'latency',
        'title': 'Request Latency',
        'type': 'timeseries',
        'grid_pos': {'x': 0, 'y': 8, 'w': 12, 'h': 6},
        'targets': [
            quantile_target('0.50', 'P50 Latency'),
            quantile_target('0.95', 'P95 Latency'),
            quantile_target('0.99', 'P99 Latency'),
        ],
        'field_config': {
            'defaults': {
                'color': {'mode': 'palette-classic'},
                'unit': 'ms',
                'custom': {
                    'drawStyle': 'line',
                    'lineInterpolation': 'linear',
                    'lineWidth': 1,
                    'fillOpacity': 10
                }
            }
        },
        'options': hover_and_legend(),
    }

    traffic_panel = {
        'id': 'traffic',
        'title': 'Request Rate',
        'type': 'timeseries',
        'grid_pos': {'x': 12, 'y': 8, 'w': 12, 'h': 6},
        'targets': [
            rps_target('', 'Total RPS'),
            rps_target(',code=~"2.."', '2xx RPS'),
            rps_target(',code=~"4.."', '4xx RPS'),
            rps_target(',code=~"5.."', '5xx RPS'),
        ],
        'field_config': {
            'defaults': {
                'color': {'mode': 'palette-classic'},
                'unit': 'reqps',
                'custom': {
                    'drawStyle': 'line',
                    'lineInterpolation': 'linear',
                    'lineWidth': 1,
                    'fillOpacity': 0
                }
            }
        },
        'options': hover_and_legend(),
    }

    errors_panel = {
        'id': 'errors',
        'title': 'Error Rate',
        'type': 'timeseries',
        'grid_pos': {'x': 0, 'y': 14, 'w': 12, 'h': 6},
        'targets': [
            error_ratio_target('5', '5xx Error Rate'),
            error_ratio_target('4', '4xx Error Rate'),
        ],
        'field_config': {
            'defaults': {
                'color': {'mode': 'palette-classic'},
                'unit': 'percent',
                'custom': {
                    'drawStyle': 'line',
                    'lineInterpolation': 'linear',
                    'lineWidth': 2,
                    'fillOpacity': 20
                }
            },
            'overrides': [
                {
                    'matcher': {'id': 'byName', 'options': '5xx Error Rate'},
                    'properties': [{'id': 'color', 'value': {'fixedColor': 'red'}}]
                }
            ]
        },
        'options': hover_and_legend(),
    }

    saturation_panel = {
        'id': 'saturation',
        'title': 'Saturation Metrics',
        'type': 'timeseries',
        'grid_pos': {'x': 12, 'y': 14, 'w': 12, 'h': 6},
        'targets': [
            {
                'expr': f'rate(process_cpu_seconds_total{{service="{service_name}"}}[5m]) * 100',
                'legendFormat': 'CPU Usage %'
            },
            {
                'expr': f'process_resident_memory_bytes{{service="{service_name}"}} / process_virtual_memory_max_bytes{{service="{service_name}"}} * 100',
                'legendFormat': 'Memory Usage %'
            }
        ],
        'field_config': {
            'defaults': {
                'color': {'mode': 'palette-classic'},
                'unit': 'percent',
                'max': 100,
                'custom': {
                    'drawStyle': 'line',
                    'lineInterpolation': 'linear',
                    'lineWidth': 1,
                    'fillOpacity': 10
                }
            }
        },
        'options': hover_and_legend(),
    }

    return [latency_panel, traffic_panel, errors_panel, saturation_panel]
def _create_resource_panels(self, service_def: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Build the resource row: CPU and memory gauges plus network/disk I/O."""
    service_name = service_def.get('name', 'service')

    def rate_target(metric, label):
        # Per-second rate of a counter scoped to this service.
        return {
            'expr': f'rate({metric}{{service="{service_name}"}}[5m])',
            'legendFormat': label,
        }

    cpu_panel = {
        'id': 'cpu_usage',
        'title': 'CPU Usage',
        'type': 'gauge',
        'grid_pos': {'x': 0, 'y': 20, 'w': 6, 'h': 4},
        'targets': [
            {
                'expr': f'rate(process_cpu_seconds_total{{service="{service_name}"}}[5m]) * 100',
                'legendFormat': 'CPU %'
            }
        ],
        'field_config': {
            'defaults': {
                'color': {'mode': 'thresholds'},
                'unit': 'percent',
                'min': 0,
                'max': 100,
                'thresholds': {
                    'steps': [
                        {'color': 'green', 'value': 0},
                        {'color': 'yellow', 'value': 70},
                        {'color': 'red', 'value': 90}
                    ]
                }
            }
        },
        'options': {'showThresholdLabels': True, 'showThresholdMarkers': True}
    }

    memory_panel = {
        'id': 'memory_usage',
        'title': 'Memory Usage',
        'type': 'gauge',
        'grid_pos': {'x': 6, 'y': 20, 'w': 6, 'h': 4},
        'targets': [
            {
                'expr': f'process_resident_memory_bytes{{service="{service_name}"}} / 1024 / 1024',
                'legendFormat': 'Memory MB'
            }
        ],
        'field_config': {
            'defaults': {
                'color': {'mode': 'thresholds'},
                'unit': 'decbytes',
                'thresholds': {
                    'steps': [
                        {'color': 'green', 'value': 0},
                        {'color': 'yellow', 'value': 512000000},  # 512MB
                        {'color': 'red', 'value': 1024000000}     # 1GB
                    ]
                }
            }
        }
    }

    network_panel = {
        'id': 'network_io',
        'title': 'Network I/O',
        'type': 'timeseries',
        'grid_pos': {'x': 12, 'y': 20, 'w': 6, 'h': 4},
        'targets': [
            rate_target('process_network_receive_bytes_total', 'RX Bytes/s'),
            rate_target('process_network_transmit_bytes_total', 'TX Bytes/s'),
        ],
        'field_config': {
            'defaults': {'color': {'mode': 'palette-classic'}, 'unit': 'binBps'}
        }
    }

    disk_panel = {
        'id': 'disk_io',
        'title': 'Disk I/O',
        'type': 'timeseries',
        'grid_pos': {'x': 18, 'y': 20, 'w': 6, 'h': 4},
        'targets': [
            rate_target('process_disk_read_bytes_total', 'Read Bytes/s'),
            rate_target('process_disk_write_bytes_total', 'Write Bytes/s'),
        ],
        'field_config': {
            'defaults': {'color': {'mode': 'palette-classic'}, 'unit': 'binBps'}
        }
    }

    return [cpu_panel, memory_panel, network_panel, disk_panel]

def _create_api_specific_panels(self, service_def: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Build API-only panels: slowest-endpoints table and request-size heatmap."""
    service_name = service_def.get('name', 'service')

    slow_endpoints = {
        'id': 'endpoint_latency',
        'title': 'Top Slowest Endpoints',
        'type': 'table',
        'grid_pos': {'x': 0, 'y': 24, 'w': 12, 'h': 6},
        'targets': [
            {
                'expr': f'topk(10, histogram_quantile(0.95, sum by (handler) (rate(http_request_duration_seconds_bucket{{service="{service_name}"}}[5m])))) * 1000',
                'legendFormat': '{{handler}}',
                'format': 'table',
                'instant': True
            }
        ],
        'transformations': [
            {
                'id': 'organize',
                'options': {
                    'excludeByName': {'Time': True},
                    'renameByName': {'Value': 'P95 Latency (ms)'}
                }
            }
        ],
        'field_config': {
            'overrides': [
                {
                    'matcher': {'id': 'byName', 'options': 'P95 Latency (ms)'},
                    'properties': [
                        {'id': 'color', 'value': {'mode': 'thresholds'}},
                        {'id': 'thresholds', 'value': {
                            'steps': [
                                {'color': 'green', 'value': 0},
                                {'color': 'yellow', 'value': 100},
                                {'color': 'red', 'value': 500}
                            ]
                        }}
                    ]
                }
            ]
        }
    }

    size_heatmap = {
        'id': 'request_size_distribution',
        'title': 'Request Size Distribution',
        'type': 'heatmap',
        'grid_pos': {'x': 12, 'y': 24, 'w': 12, 'h': 6},
        'targets': [
            {
                'expr': f'sum by (le) (rate(http_request_size_bytes_bucket{{service="{service_name}"}}[5m]))',
                'legendFormat': '{{le}}'
            }
        ],
        'options': {
            'calculate': True,
            'yAxis': {'unit': 'bytes'},
            'color': {'scheme': 'Spectral'}
        }
    }

    return [slow_endpoints, size_heatmap]
'grid_pos': {'x': 16, 'y': 24, 'w': 8, 'h': 6}, + 'targets': [ + { + 'expr': f'db_locks_waiting{{service="{service_name}"}}', + 'legendFormat': 'Waiting Locks' + } + ], + 'field_config': { + 'defaults': { + 'color': {'mode': 'thresholds'}, + 'thresholds': { + 'steps': [ + {'color': 'green', 'value': 0}, + {'color': 'yellow', 'value': 1}, + {'color': 'red', 'value': 5} + ] + } + } + } + } + ] + + def _create_queue_specific_panels(self, service_def: Dict[str, Any]) -> List[Dict[str, Any]]: + """Create queue-specific panels.""" + service_name = service_def.get('name', 'service') + + return [ + { + 'id': 'queue_depth', + 'title': 'Queue Depth', + 'type': 'timeseries', + 'grid_pos': {'x': 0, 'y': 24, 'w': 12, 'h': 6}, + 'targets': [ + { + 'expr': f'queue_depth{{service="{service_name}"}}', + 'legendFormat': 'Messages in Queue' + } + ] + }, + { + 'id': 'message_throughput', + 'title': 'Message Throughput', + 'type': 'timeseries', + 'grid_pos': {'x': 12, 'y': 24, 'w': 12, 'h': 6}, + 'targets': [ + { + 'expr': f'rate(messages_published_total{{service="{service_name}"}}[5m])', + 'legendFormat': 'Published/sec' + }, + { + 'expr': f'rate(messages_consumed_total{{service="{service_name}"}}[5m])', + 'legendFormat': 'Consumed/sec' + } + ] + } + ] + + def _create_business_metrics_panels(self, service_def: Dict[str, Any]) -> List[Dict[str, Any]]: + """Create business metrics panels.""" + service_name = service_def.get('name', 'service') + + return [ + { + 'id': 'business_kpis', + 'title': 'Business KPIs', + 'type': 'stat', + 'grid_pos': {'x': 0, 'y': 30, 'w': 24, 'h': 4}, + 'targets': [ + { + 'expr': f'rate(business_transactions_total{{service="{service_name}"}}[1h])', + 'legendFormat': 'Transactions/hour' + }, + { + 'expr': f'avg(business_transaction_value{{service="{service_name}"}}) * rate(business_transactions_total{{service="{service_name}"}}[1h])', + 'legendFormat': 'Revenue/hour' + }, + { + 'expr': f'rate(user_registrations_total{{service="{service_name}"}}[1h])', + 
'legendFormat': 'New Users/hour' + } + ], + 'field_config': { + 'defaults': { + 'color': {'mode': 'palette-classic'}, + 'custom': { + 'displayMode': 'basic' + } + } + }, + 'options': { + 'orientation': 'horizontal', + 'textMode': 'value_and_name' + } + } + ] + + def _create_capacity_panels(self, service_def: Dict[str, Any]) -> List[Dict[str, Any]]: + """Create capacity planning panels.""" + service_name = service_def.get('name', 'service') + + return [ + { + 'id': 'capacity_trends', + 'title': 'Capacity Trends (7d)', + 'type': 'timeseries', + 'grid_pos': {'x': 0, 'y': 34, 'w': 24, 'h': 6}, + 'targets': [ + { + 'expr': f'predict_linear(avg_over_time(rate(http_requests_total{{service="{service_name}"}}[5m])[7d:1h]), 7*24*3600)', + 'legendFormat': 'Predicted Traffic (7d)' + }, + { + 'expr': f'predict_linear(avg_over_time(process_resident_memory_bytes{{service="{service_name}"}}[7d:1h]), 7*24*3600)', + 'legendFormat': 'Predicted Memory Usage (7d)' + } + ], + 'field_config': { + 'defaults': { + 'color': {'mode': 'palette-classic'}, + 'custom': { + 'drawStyle': 'line', + 'lineStyle': {'dash': [10, 10]} + } + } + } + } + ] + + def _generate_template_variables(self, service_def: Dict[str, Any]) -> List[Dict[str, Any]]: + """Generate template variables for dynamic dashboard filtering.""" + service_name = service_def.get('name', 'service') + + return [ + { + 'name': 'environment', + 'type': 'query', + 'query': 'label_values(environment)', + 'current': {'text': 'production', 'value': 'production'}, + 'includeAll': False, + 'multi': False, + 'refresh': 'on_dashboard_load' + }, + { + 'name': 'instance', + 'type': 'query', + 'query': f'label_values(up{{service="{service_name}"}}, instance)', + 'current': {'text': 'All', 'value': '$__all'}, + 'includeAll': True, + 'multi': True, + 'refresh': 'on_time_range_change' + }, + { + 'name': 'handler', + 'type': 'query', + 'query': f'label_values(http_requests_total{{service="{service_name}"}}, handler)', + 'current': {'text': 'All', 
'value': '$__all'}, + 'includeAll': True, + 'multi': True, + 'refresh': 'on_time_range_change' + } + ] + + def _generate_alerts_integration(self, service_def: Dict[str, Any]) -> Dict[str, Any]: + """Generate alerts integration configuration.""" + service_name = service_def.get('name', 'service') + + return { + 'alert_annotations': True, + 'alert_rules_query': f'ALERTS{{service="{service_name}"}}', + 'alert_panels': [ + { + 'title': 'Active Alerts', + 'type': 'table', + 'query': f'ALERTS{{service="{service_name}",alertstate="firing"}}', + 'columns': ['alertname', 'severity', 'instance', 'description'] + } + ] + } + + def _generate_drill_down_paths(self, service_def: Dict[str, Any]) -> Dict[str, Any]: + """Generate drill-down navigation paths.""" + service_name = service_def.get('name', 'service') + + return { + 'service_overview': { + 'from': 'service_status', + 'to': 'detailed_health_dashboard', + 'url': f'/d/service-health/{service_name}-health', + 'params': ['var-service', 'var-environment'] + }, + 'error_investigation': { + 'from': 'errors', + 'to': 'error_details_dashboard', + 'url': f'/d/errors/{service_name}-errors', + 'params': ['var-service', 'var-time_range'] + }, + 'latency_analysis': { + 'from': 'latency', + 'to': 'trace_analysis_dashboard', + 'url': f'/d/traces/{service_name}-traces', + 'params': ['var-service', 'var-handler'] + }, + 'capacity_planning': { + 'from': 'saturation', + 'to': 'capacity_dashboard', + 'url': f'/d/capacity/{service_name}-capacity', + 'params': ['var-service', 'var-time_range'] + } + } + + def generate_grafana_json(self, dashboard_spec: Dict[str, Any]) -> Dict[str, Any]: + """Convert dashboard specification to Grafana JSON format.""" + metadata = dashboard_spec['metadata'] + config = dashboard_spec['configuration'] + + grafana_json = { + 'dashboard': { + 'id': None, + 'title': metadata['title'], + 'tags': [metadata['service']['type'], metadata['target_role'], 'generated'], + 'timezone': config['timezone'], + 'refresh': 
config['refresh_interval'], + 'time': { + 'from': 'now-1h', + 'to': 'now' + }, + 'templating': { + 'list': dashboard_spec['variables'] + }, + 'panels': self._convert_panels_to_grafana_format(dashboard_spec['panels']), + 'version': 1, + 'schemaVersion': 30 + }, + 'overwrite': True + } + + return grafana_json + + def _convert_panels_to_grafana_format(self, panels: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Convert panel specifications to Grafana format.""" + grafana_panels = [] + + for panel in panels: + grafana_panel = { + 'id': hash(panel['id']) % 1000, # Generate numeric ID + 'title': panel['title'], + 'type': panel['type'], + 'gridPos': panel['grid_pos'], + 'targets': panel['targets'], + 'fieldConfig': panel.get('field_config', {}), + 'options': panel.get('options', {}), + 'transformations': panel.get('transformations', []) + } + grafana_panels.append(grafana_panel) + + return grafana_panels + + def generate_documentation(self, dashboard_spec: Dict[str, Any]) -> str: + """Generate documentation for the dashboard.""" + metadata = dashboard_spec['metadata'] + service = metadata['service'] + + doc_content = f"""# {metadata['title']} Documentation + +## Overview +This dashboard provides comprehensive monitoring for {service['name']}, a {service['type']} service with {service['criticality']} criticality. 
+ +**Target Audience:** {metadata['target_role'].upper()} teams +**Generated:** {metadata['generated_at']} + +## Dashboard Sections + +### Service Overview +- **Service Status**: Real-time availability status +- **SLO Achievement**: 30-day SLO compliance metrics +- **Error Budget**: Remaining error budget visualization + +### Golden Signals Monitoring +- **Latency**: P50, P95, P99 response times +- **Traffic**: Request rate by status code +- **Errors**: Error rates for 4xx and 5xx responses +- **Saturation**: CPU and memory utilization + +### Resource Utilization +- **CPU Usage**: Process CPU consumption +- **Memory Usage**: Memory utilization tracking +- **Network I/O**: Network throughput metrics +- **Disk I/O**: Disk read/write operations + +## Key Metrics + +### SLIs Tracked +""" + + # Add service-type specific metrics + service_type = service.get('type', 'api') + if service_type in self.SERVICE_METRICS: + metrics = self.SERVICE_METRICS[service_type]['key_metrics'] + for metric in metrics: + doc_content += f"- `{metric}`: Core service metric\n" + + doc_content += f""" +## Alert Integration +- Active alerts are displayed in context with relevant panels +- Alert annotations show on time series charts +- Click-through to alert management system available + +## Drill-Down Paths +""" + + drill_downs = dashboard_spec.get('drill_down_paths', {}) + for path_name, path_config in drill_downs.items(): + doc_content += f"- **{path_name}**: From {path_config['from']} → {path_config['to']}\n" + + doc_content += f""" +## Usage Guidelines + +### Time Ranges +Use appropriate time ranges for different investigation types: +- **Real-time monitoring**: 15m - 1h +- **Recent incident investigation**: 1h - 6h +- **Trend analysis**: 1d - 7d +- **Capacity planning**: 7d - 30d + +### Variables +- **environment**: Filter by deployment environment +- **instance**: Focus on specific service instances +- **handler**: Filter by API endpoint or handler + +### Performance Optimization +- Use 
longer time ranges for capacity planning +- Refresh intervals are optimized per role: + - SRE: 30s for operational awareness + - Developer: 1m for troubleshooting + - Executive: 5m for high-level monitoring + +## Maintenance +- Dashboard panels automatically adapt to service changes +- Template variables refresh based on actual metric labels +- Review and update business metrics quarterly +""" + + return doc_content + + def export_specification(self, dashboard_spec: Dict[str, Any], output_file: str, + format_type: str = 'json'): + """Export dashboard specification.""" + if format_type.lower() == 'json': + with open(output_file, 'w') as f: + json.dump(dashboard_spec, f, indent=2) + elif format_type.lower() == 'grafana': + grafana_json = self.generate_grafana_json(dashboard_spec) + with open(output_file, 'w') as f: + json.dump(grafana_json, f, indent=2) + else: + raise ValueError(f"Unsupported format: {format_type}") + + def print_summary(self, dashboard_spec: Dict[str, Any]): + """Print human-readable summary of dashboard specification.""" + metadata = dashboard_spec['metadata'] + service = metadata['service'] + config = dashboard_spec['configuration'] + panels = dashboard_spec['panels'] + + print(f"\n{'='*60}") + print(f"DASHBOARD SPECIFICATION SUMMARY") + print(f"{'='*60}") + + print(f"\nDashboard Details:") + print(f" Title: {metadata['title']}") + print(f" Target Role: {metadata['target_role'].upper()}") + print(f" Service: {service['name']} ({service['type']})") + print(f" Criticality: {service['criticality']}") + print(f" Generated: {metadata['generated_at']}") + + print(f"\nConfiguration:") + print(f" Default Time Range: {config['default_time_range']}") + print(f" Refresh Interval: {config['refresh_interval']}") + print(f" Available Time Ranges: {', '.join(config['time_ranges'])}") + + print(f"\nPanels ({len(panels)}):") + panel_types = {} + for panel in panels: + panel_type = panel['type'] + panel_types[panel_type] = panel_types.get(panel_type, 0) + 1 + + 
for panel_type, count in panel_types.items(): + print(f" {panel_type}: {count}") + + variables = dashboard_spec.get('variables', []) + print(f"\nTemplate Variables ({len(variables)}):") + for var in variables: + print(f" {var['name']} ({var['type']})") + + drill_downs = dashboard_spec.get('drill_down_paths', {}) + print(f"\nDrill-down Paths: {len(drill_downs)}") + + print(f"\nKey Features:") + print(f" • Golden Signals monitoring") + print(f" • Resource utilization tracking") + print(f" • Alert integration") + print(f" • Role-optimized layout") + print(f" • Service-type specific panels") + + print(f"\n{'='*60}\n") + + +def main(): + """Main function for CLI usage.""" + parser = argparse.ArgumentParser( + description='Generate comprehensive dashboard specifications', + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Generate from service definition file + python dashboard_generator.py --input service.json --output dashboard.json + + # Generate from command line parameters + python dashboard_generator.py --service-type api --name "Payment Service" --output payment_dashboard.json + + # Generate Grafana-compatible JSON + python dashboard_generator.py --input service.json --output dashboard.json --format grafana + + # Generate with specific role focus + python dashboard_generator.py --service-type web --name "Frontend" --role developer --output frontend_dev.json + """ + ) + + parser.add_argument('--input', '-i', + help='Input service definition JSON file') + parser.add_argument('--output', '-o', + help='Output dashboard specification file') + parser.add_argument('--service-type', + choices=['api', 'web', 'database', 'queue', 'batch', 'ml'], + help='Service type') + parser.add_argument('--name', + help='Service name') + parser.add_argument('--criticality', + choices=['critical', 'high', 'medium', 'low'], + default='medium', + help='Service criticality level') + parser.add_argument('--role', + choices=['sre', 'developer', 'executive', 
'ops'], + default='sre', + help='Target role for dashboard optimization') + parser.add_argument('--format', + choices=['json', 'grafana'], + default='json', + help='Output format (json specification or grafana compatible)') + parser.add_argument('--doc-output', + help='Generate documentation file') + parser.add_argument('--summary-only', action='store_true', + help='Only display summary, do not save files') + + args = parser.parse_args() + + if not args.input and not (args.service_type and args.name): + parser.error("Must provide either --input file or --service-type and --name") + + generator = DashboardGenerator() + + try: + # Load or create service definition + if args.input: + service_def = generator.load_service_definition(args.input) + else: + service_def = generator.create_service_definition( + args.service_type, args.name, args.criticality + ) + + # Generate dashboard specification + dashboard_spec = generator.generate_dashboard_specification(service_def, args.role) + + # Output results + if not args.summary_only: + output_file = args.output or f"{service_def['name'].replace(' ', '_').lower()}_dashboard.json" + generator.export_specification(dashboard_spec, output_file, args.format) + print(f"Dashboard specification saved to: {output_file}") + + # Generate documentation if requested + if args.doc_output: + documentation = generator.generate_documentation(dashboard_spec) + with open(args.doc_output, 'w') as f: + f.write(documentation) + print(f"Documentation saved to: {args.doc_output}") + + # Always show summary + generator.print_summary(dashboard_spec) + + except Exception as e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(1) + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/engineering/observability-designer/scripts/slo_designer.py b/engineering/observability-designer/scripts/slo_designer.py new file mode 100644 index 0000000..69459a5 --- /dev/null +++ b/engineering/observability-designer/scripts/slo_designer.py @@ 
-0,0 +1,670 @@ +#!/usr/bin/env python3 +""" +SLO Designer - Generate comprehensive SLI/SLO frameworks for services + +This script analyzes service descriptions and generates complete SLO frameworks including: +- SLI definitions based on service characteristics +- SLO targets based on criticality and user impact +- Error budget calculations and policies +- Multi-window burn rate alerts +- SLA recommendations for customer-facing services + +Usage: + python slo_designer.py --input service_definition.json --output slo_framework.json + python slo_designer.py --service-type api --criticality high --user-facing true +""" + +import json +import argparse +import sys +import math +from typing import Dict, List, Any, Tuple +from datetime import datetime, timedelta + + +class SLODesigner: + """Design and generate SLO frameworks for services.""" + + # SLO target recommendations based on service criticality + SLO_TARGETS = { + 'critical': { + 'availability': 0.9999, # 99.99% - 4.38 minutes downtime/month + 'latency_p95': 100, # 95th percentile latency in ms + 'latency_p99': 500, # 99th percentile latency in ms + 'error_rate': 0.001 # 0.1% error rate + }, + 'high': { + 'availability': 0.999, # 99.9% - 43.8 minutes downtime/month + 'latency_p95': 200, # 95th percentile latency in ms + 'latency_p99': 1000, # 99th percentile latency in ms + 'error_rate': 0.005 # 0.5% error rate + }, + 'medium': { + 'availability': 0.995, # 99.5% - 3.65 hours downtime/month + 'latency_p95': 500, # 95th percentile latency in ms + 'latency_p99': 2000, # 99th percentile latency in ms + 'error_rate': 0.01 # 1% error rate + }, + 'low': { + 'availability': 0.99, # 99% - 7.3 hours downtime/month + 'latency_p95': 1000, # 95th percentile latency in ms + 'latency_p99': 5000, # 99th percentile latency in ms + 'error_rate': 0.02 # 2% error rate + } + } + + # Burn rate windows for multi-window alerting + BURN_RATE_WINDOWS = [ + {'short': '5m', 'long': '1h', 'burn_rate': 14.4, 'budget_consumed': '2%'}, + {'short': 
'30m', 'long': '6h', 'burn_rate': 6, 'budget_consumed': '5%'}, + {'short': '2h', 'long': '1d', 'burn_rate': 3, 'budget_consumed': '10%'}, + {'short': '6h', 'long': '3d', 'burn_rate': 1, 'budget_consumed': '10%'} + ] + + # Service type specific SLI recommendations + SERVICE_TYPE_SLIS = { + 'api': ['availability', 'latency', 'error_rate', 'throughput'], + 'web': ['availability', 'latency', 'error_rate', 'page_load_time'], + 'database': ['availability', 'query_latency', 'connection_success_rate', 'replication_lag'], + 'queue': ['availability', 'message_processing_time', 'queue_depth', 'message_loss_rate'], + 'batch': ['job_success_rate', 'job_duration', 'data_freshness', 'resource_utilization'], + 'ml': ['model_accuracy', 'prediction_latency', 'training_success_rate', 'feature_freshness'] + } + + def __init__(self): + """Initialize the SLO Designer.""" + self.service_config = {} + self.slo_framework = {} + + def load_service_definition(self, file_path: str) -> Dict[str, Any]: + """Load service definition from JSON file.""" + try: + with open(file_path, 'r') as f: + return json.load(f) + except FileNotFoundError: + raise ValueError(f"Service definition file not found: {file_path}") + except json.JSONDecodeError as e: + raise ValueError(f"Invalid JSON in service definition: {e}") + + def create_service_definition(self, service_type: str, criticality: str, + user_facing: bool, name: str = None) -> Dict[str, Any]: + """Create a service definition from parameters.""" + return { + 'name': name or f'{service_type}_service', + 'type': service_type, + 'criticality': criticality, + 'user_facing': user_facing, + 'description': f'A {criticality} criticality {service_type} service', + 'dependencies': [], + 'team': 'platform', + 'environment': 'production' + } + + def generate_slis(self, service_def: Dict[str, Any]) -> List[Dict[str, Any]]: + """Generate Service Level Indicators based on service characteristics.""" + service_type = service_def.get('type', 'api') + base_slis = 
self.SERVICE_TYPE_SLIS.get(service_type, ['availability', 'latency', 'error_rate']) + + slis = [] + + for sli_name in base_slis: + sli = self._create_sli_definition(sli_name, service_def) + if sli: + slis.append(sli) + + # Add user-facing specific SLIs + if service_def.get('user_facing', False): + user_slis = self._generate_user_facing_slis(service_def) + slis.extend(user_slis) + + return slis + + def _create_sli_definition(self, sli_name: str, service_def: Dict[str, Any]) -> Dict[str, Any]: + """Create detailed SLI definition.""" + service_name = service_def.get('name', 'service') + + sli_definitions = { + 'availability': { + 'name': 'Availability', + 'description': 'Percentage of successful requests', + 'type': 'ratio', + 'good_events': f'sum(rate(http_requests_total{{service="{service_name}",code!~"5.."}}))', + 'total_events': f'sum(rate(http_requests_total{{service="{service_name}"}}))', + 'unit': 'percentage' + }, + 'latency': { + 'name': 'Request Latency P95', + 'description': '95th percentile of request latency', + 'type': 'threshold', + 'query': f'histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{{service="{service_name}"}}[5m]))', + 'unit': 'seconds' + }, + 'error_rate': { + 'name': 'Error Rate', + 'description': 'Rate of 5xx errors', + 'type': 'ratio', + 'good_events': f'sum(rate(http_requests_total{{service="{service_name}",code!~"5.."}}))', + 'total_events': f'sum(rate(http_requests_total{{service="{service_name}"}}))', + 'unit': 'percentage' + }, + 'throughput': { + 'name': 'Request Throughput', + 'description': 'Requests per second', + 'type': 'gauge', + 'query': f'sum(rate(http_requests_total{{service="{service_name}"}}[5m]))', + 'unit': 'requests/sec' + }, + 'page_load_time': { + 'name': 'Page Load Time P95', + 'description': '95th percentile of page load time', + 'type': 'threshold', + 'query': f'histogram_quantile(0.95, rate(page_load_duration_seconds_bucket{{service="{service_name}"}}[5m]))', + 'unit': 'seconds' + }, + 
'query_latency': { + 'name': 'Database Query Latency P95', + 'description': '95th percentile of database query latency', + 'type': 'threshold', + 'query': f'histogram_quantile(0.95, rate(db_query_duration_seconds_bucket{{service="{service_name}"}}[5m]))', + 'unit': 'seconds' + }, + 'connection_success_rate': { + 'name': 'Database Connection Success Rate', + 'description': 'Percentage of successful database connections', + 'type': 'ratio', + 'good_events': f'sum(rate(db_connections_total{{service="{service_name}",status="success"}}[5m]))', + 'total_events': f'sum(rate(db_connections_total{{service="{service_name}"}}[5m]))', + 'unit': 'percentage' + } + } + + return sli_definitions.get(sli_name) + + def _generate_user_facing_slis(self, service_def: Dict[str, Any]) -> List[Dict[str, Any]]: + """Generate additional SLIs for user-facing services.""" + service_name = service_def.get('name', 'service') + + return [ + { + 'name': 'User Journey Success Rate', + 'description': 'Percentage of successful complete user journeys', + 'type': 'ratio', + 'good_events': f'sum(rate(user_journey_total{{service="{service_name}",status="success"}}[5m]))', + 'total_events': f'sum(rate(user_journey_total{{service="{service_name}"}}[5m]))', + 'unit': 'percentage' + }, + { + 'name': 'Feature Availability', + 'description': 'Percentage of time key features are available', + 'type': 'ratio', + 'good_events': f'sum(rate(feature_checks_total{{service="{service_name}",status="available"}}[5m]))', + 'total_events': f'sum(rate(feature_checks_total{{service="{service_name}"}}[5m]))', + 'unit': 'percentage' + } + ] + + def generate_slos(self, service_def: Dict[str, Any], slis: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Generate Service Level Objectives based on service criticality.""" + criticality = service_def.get('criticality', 'medium') + targets = self.SLO_TARGETS.get(criticality, self.SLO_TARGETS['medium']) + + slos = [] + + for sli in slis: + slo = self._create_slo_from_sli(sli, 
targets, service_def) + if slo: + slos.append(slo) + + return slos + + def _create_slo_from_sli(self, sli: Dict[str, Any], targets: Dict[str, float], + service_def: Dict[str, Any]) -> Dict[str, Any]: + """Create SLO definition from SLI.""" + sli_name = sli['name'].lower().replace(' ', '_') + + # Map SLI names to target keys + target_mapping = { + 'availability': 'availability', + 'request_latency_p95': 'latency_p95', + 'error_rate': 'error_rate', + 'user_journey_success_rate': 'availability', + 'feature_availability': 'availability', + 'page_load_time_p95': 'latency_p95', + 'database_query_latency_p95': 'latency_p95', + 'database_connection_success_rate': 'availability' + } + + target_key = target_mapping.get(sli_name) + if not target_key: + return None + + target_value = targets.get(target_key) + if target_value is None: + return None + + # Determine comparison operator and format target + if 'latency' in sli_name or 'duration' in sli_name: + operator = '<=' + target_display = f"{target_value}ms" if target_value < 10 else f"{target_value/1000}s" + elif 'rate' in sli_name and 'error' in sli_name: + operator = '<=' + target_display = f"{target_value * 100}%" + target_value = target_value # Keep as decimal + else: + operator = '>=' + target_display = f"{target_value * 100}%" + + # Calculate time windows + time_windows = ['1h', '1d', '7d', '30d'] + + slo = { + 'name': f"{sli['name']} SLO", + 'description': f"Service level objective for {sli['description'].lower()}", + 'sli_name': sli['name'], + 'target_value': target_value, + 'target_display': target_display, + 'operator': operator, + 'time_windows': time_windows, + 'measurement_window': '30d', + 'service': service_def.get('name', 'service'), + 'criticality': service_def.get('criticality', 'medium') + } + + return slo + + def calculate_error_budgets(self, slos: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Calculate error budgets for SLOs.""" + error_budgets = [] + + for slo in slos: + if slo['operator'] == 
'>=': # Availability-type SLOs: the error budget is the tolerated failure fraction (1 - target)
                target = slo['target_value']
                error_budget_rate = 1 - target

                # Translate the budget rate into allowed "bad" time over a
                # fixed set of standard windows (label -> window in seconds).
                time_windows = {
                    '1h': 3600,
                    '1d': 86400,
                    '7d': 604800,
                    '30d': 2592000
                }

                # Render each window's budget in the most readable unit
                # (seconds / minutes / hours depending on magnitude).
                budgets = {}
                for window, seconds in time_windows.items():
                    budget_seconds = seconds * error_budget_rate
                    if budget_seconds < 60:
                        budgets[window] = f"{budget_seconds:.1f} seconds"
                    elif budget_seconds < 3600:
                        budgets[window] = f"{budget_seconds/60:.1f} minutes"
                    else:
                        budgets[window] = f"{budget_seconds/3600:.1f} hours"

                error_budget = {
                    'slo_name': slo['name'],
                    'error_budget_rate': error_budget_rate,
                    'error_budget_percentage': f"{error_budget_rate * 100:.3f}%",
                    'budgets_by_window': budgets,
                    'burn_rate_alerts': self._generate_burn_rate_alerts(slo, error_budget_rate)
                }

                error_budgets.append(error_budget)

        # Only '>='-operator (availability-style) SLOs contribute budgets;
        # other operators fall through without an entry.
        return error_budgets

    def _generate_burn_rate_alerts(self, slo: Dict[str, Any], error_budget_rate: float) -> List[Dict[str, Any]]:
        """Generate multi-window burn rate alerts for one SLO.

        Emits one alert dict per entry in ``self.BURN_RATE_WINDOWS``
        (defined elsewhere in this class); each entry is expected to carry
        'short', 'long', 'burn_rate' and a 'budget_consumed' percentage
        string such as '2%' -- TODO confirm against the class constant.
        ``error_budget_rate`` is currently not used directly here.
        """
        alerts = []
        # NOTE(review): service_name is assigned but never used in this
        # method -- candidate for removal.
        service_name = slo['service']
        sli_query = self._get_sli_query_for_burn_rate(slo)

        for window_config in self.BURN_RATE_WINDOWS:
            alert = {
                'name': f"{slo['sli_name']} Burn Rate {window_config['budget_consumed']} Alert",
                'description': f"Alert when {slo['sli_name']} is consuming error budget at {window_config['burn_rate']}x rate",
                # Severity is keyed off the budget-consumed percentage
                # (e.g. '2%' -> 2.0) rather than the burn-rate multiplier.
                'severity': self._determine_alert_severity(float(window_config['budget_consumed'].rstrip('%'))),
                'short_window': window_config['short'],
                'long_window': window_config['long'],
                'burn_rate_threshold': window_config['burn_rate'],
                'budget_consumed': window_config['budget_consumed'],
                # NOTE(review): appending '_short'/'_long' to a full PromQL
                # expression does not produce valid PromQL; presumably a
                # later templating step substitutes the real range windows
                # -- confirm before shipping these rules.
                'condition': f"({sli_query}_short > {window_config['burn_rate']}) and ({sli_query}_long > {window_config['burn_rate']})",
                'annotations': {
                    'summary': f"High burn rate detected for {slo['sli_name']}",
                    'description': f"Error budget consumption rate is {window_config['burn_rate']}x normal, will exhaust {window_config['budget_consumed']} of monthly budget"
                }
            }
            alerts.append(alert)

        return alerts

    def _get_sli_query_for_burn_rate(self, slo: Dict[str, Any]) -> str:
        """Generate SLI query fragment for burn rate calculation.

        Picks a Prometheus-style expression by keyword-matching the SLI
        name: availability/success SLIs get a failure-ratio expression,
        error SLIs get an error-ratio expression, anything else falls back
        to a synthetic recording-rule name.

        NOTE(review): the rate() calls carry no range selector (e.g.
        ``[5m]``), so these fragments are templates, not runnable PromQL
        -- confirm how they are consumed downstream.
        """
        service_name = slo['service']
        sli_name = slo['sli_name'].lower().replace(' ', '_')

        if 'availability' in sli_name or 'success' in sli_name:
            # Failure ratio: 1 - (non-5xx requests / all requests).
            return f"(1 - (sum(rate(http_requests_total{{service='{service_name}',code!~'5..'}})) / sum(rate(http_requests_total{{service='{service_name}'}}))))"
        elif 'error' in sli_name:
            # Error ratio: 5xx requests / all requests.
            return f"(sum(rate(http_requests_total{{service='{service_name}',code=~'5..'}})) / sum(rate(http_requests_total{{service='{service_name}'}})))"
        else:
            # Fallback: assume a recording rule named after the SLI exists.
            return f"sli_burn_rate_{sli_name}"

    def _determine_alert_severity(self, budget_consumed_percent: float) -> str:
        """Determine alert severity based on budget consumption rate.

        A smaller budget-consumed percentage paired with (presumably) a
        shorter window means a faster burn, hence the higher severity for
        the lower thresholds -- consistent with the multi-window burn-rate
        convention, but verify against BURN_RATE_WINDOWS.
        """
        if budget_consumed_percent <= 2:
            return 'critical'
        elif budget_consumed_percent <= 5:
            return 'warning'
        else:
            return 'info'

    def generate_sla_recommendations(self, service_def: Dict[str, Any],
                                   slos: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Generate SLA recommendations for customer-facing services.

        Returns ``{'applicable': False, ...}`` for non-user-facing
        services; otherwise builds commitments (only for availability-style
        '>=' SLOs) plus a criticality-based penalty structure.
        """
        if not service_def.get('user_facing', False):
            return {
                'applicable': False,
                'reason': 'SLA not recommended for non-user-facing services'
            }

        criticality = service_def.get('criticality', 'medium')

        # SLA targets should be more conservative than SLO targets.
        # This is an absolute buffer of 0.1 percentage points (0.001 on a
        # 0..1 target scale), not a relative 0.1%.
        sla_buffer = 0.001 # 0.1% buffer below SLO

        sla_recommendations = {
            'applicable': True,
            'service': service_def.get('name'),
            'commitments': [],
            'penalties': self._generate_penalty_structure(criticality),
            'measurement_methodology': 'External synthetic monitoring from multiple geographic locations',
            'exclusions': [
                'Planned maintenance windows (with 72h advance notice)',
                'Customer-side network or infrastructure issues',
                'Force majeure events',
                'Third-party service dependencies beyond our control'
            ]
        }

        for slo in slos:
            # Only availability-style objectives become contractual
            # commitments; latency/other SLOs are intentionally excluded.
            if slo['operator'] == '>=' and 'availability' in slo['sli_name'].lower():
                # Never recommend committing below a 90% floor.
                sla_target = max(0.9, slo['target_value'] - sla_buffer)
                commitment = {
                    'metric': slo['sli_name'],
                    'target': sla_target,
                    'target_display': f"{sla_target * 100:.2f}%",
                    'measurement_window': 'monthly',
                    'measurement_method': 'Uptime monitoring with 1-minute granularity'
                }
                sla_recommendations['commitments'].append(commitment)

        return sla_recommendations

    def _generate_penalty_structure(self, criticality: str) -> List[Dict[str, Any]]:
        """Generate penalty structure based on service criticality.

        Returns a list of breach tiers (threshold + service-credit
        percentage); 'low' criticality and unknown values yield no tiers.
        """
        penalty_structures = {
            'critical': [
                {'breach_threshold': '< 99.99%', 'credit_percentage': 10},
                {'breach_threshold': '< 99.9%', 'credit_percentage': 25},
                {'breach_threshold': '< 99%', 'credit_percentage': 50}
            ],
            'high': [
                {'breach_threshold': '< 99.9%', 'credit_percentage': 10},
                {'breach_threshold': '< 99.5%', 'credit_percentage': 25}
            ],
            'medium': [
                {'breach_threshold': '< 99.5%', 'credit_percentage': 10}
            ],
            'low': []
        }

        return penalty_structures.get(criticality, [])

    def generate_framework(self, service_def: Dict[str, Any]) -> Dict[str, Any]:
        """Generate complete SLO framework.

        Orchestrates the full pipeline: SLIs -> SLOs -> error budgets ->
        SLA recommendations, then bundles everything with monitoring
        recommendations and an implementation guide.
        """
        # Generate SLIs
        slis = self.generate_slis(service_def)

        # Generate SLOs
        slos = self.generate_slos(service_def, slis)

        # Calculate error budgets
        error_budgets = self.calculate_error_budgets(slos)

        # Generate SLA recommendations
        sla_recommendations = self.generate_sla_recommendations(service_def, slos)

        # Create comprehensive framework
        framework = {
            'metadata': {
                'service': service_def,
                # TODO(review): datetime.utcnow() is deprecated since
                # Python 3.12; prefer datetime.now(timezone.utc).
                'generated_at': datetime.utcnow().isoformat() + 'Z',
                'framework_version': '1.0'
            },
            'slis': slis,
            'slos': slos,
            'error_budgets': error_budgets,
            'sla_recommendations': sla_recommendations,
            'monitoring_recommendations': self._generate_monitoring_recommendations(service_def),
            'implementation_guide': self._generate_implementation_guide(service_def, slis, slos)
        }

        return framework

    def _generate_monitoring_recommendations(self, service_def: Dict[str, Any]) -> Dict[str, Any]:
        """Generate monitoring tool recommendations.

        Base recommendations cover metrics, logging and tracing; 'web'
        services additionally get a synthetic-monitoring section.
        """
        service_type = service_def.get('type', 'api')

        recommendations = {
            'metrics': {
                'collection': 'Prometheus with service discovery',
                'retention': '90 days for raw metrics, 1 year for aggregated',
                'alerting': 'Prometheus Alertmanager with multi-window burn rate alerts'
            },
            'logging': {
                'format': 'Structured JSON logs with correlation IDs',
                'aggregation': 'ELK stack or equivalent with proper indexing',
                'retention': '30 days for debug logs, 90 days for error logs'
            },
            'tracing': {
                'sampling': 'Adaptive sampling with 1% base rate',
                'storage': 'Jaeger or Zipkin with 7-day retention',
                'integration': 'OpenTelemetry instrumentation'
            }
        }

        if service_type == 'web':
            recommendations['synthetic_monitoring'] = {
                'frequency': 'Every 1 minute from 3+ geographic locations',
                'checks': 'Full user journey simulation',
                'tools': 'Pingdom, DataDog Synthetics, or equivalent'
            }

        return recommendations

    def _generate_implementation_guide(self, service_def: Dict[str, Any],
                                     slis: List[Dict[str, Any]],
                                     slos: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Generate implementation guide for the SLO framework.

        NOTE: the returned guide is currently static -- the service_def,
        slis and slos arguments are accepted but not used.
        """
        return {
            'prerequisites': [
                'Service instrumented with metrics collection (Prometheus format)',
                'Structured logging with correlation IDs',
                'Monitoring infrastructure (Prometheus, Grafana, Alertmanager)',
                'Incident response processes and escalation policies'
            ],
            'implementation_steps': [
                {
                    'step': 1,
                    'title': 'Instrument Service',
                    'description': 'Add metrics collection for all defined SLIs',
                    'estimated_effort': '1-2 days'
                },
                {
                    'step': 2,
                    'title': 'Configure Recording Rules',
                    'description': 'Set up Prometheus recording rules for SLI calculations',
                    'estimated_effort': '4-8 hours'
                },
                {
                    'step': 3,
                    'title': 'Implement Burn Rate Alerts',
                    'description': 'Configure multi-window burn rate alerting rules',
                    'estimated_effort': '1 day'
                },
                {
                    'step': 4,
                    'title': 'Create SLO Dashboard',
                    'description': 'Build Grafana dashboard for SLO tracking and error budget monitoring',
                    'estimated_effort': '4-6 hours'
                },
                {
                    'step': 5,
                    'title': 'Test and Validate',
                    'description': 'Test alerting and validate SLI measurements against expectations',
                    'estimated_effort': '1-2 days'
                },
                {
                    'step': 6,
                    'title': 'Documentation and Training',
                    'description': 'Document runbooks and train team on SLO monitoring',
                    'estimated_effort': '1 day'
                }
            ],
            'validation_checklist': [
                'All SLIs produce expected metric values',
                'Burn rate alerts fire correctly during simulated outages',
                'Error budget calculations match manual verification',
                'Dashboard displays accurate SLO achievement rates',
                'Alert routing reaches correct escalation paths',
                'Runbooks are complete and tested'
            ]
        }

    def export_json(self, framework: Dict[str, Any], output_file: str) -> None:
        """Export framework as JSON (overwrites output_file if it exists)."""
        with open(output_file, 'w') as f:
            json.dump(framework, f, indent=2)

    def print_summary(self, framework: Dict[str, Any]) -> None:
        """Print human-readable summary of the SLO framework to stdout."""
        service = framework['metadata']['service']
        slis = framework['slis']
        slos = framework['slos']
        error_budgets = framework['error_budgets']

        print(f"\n{'='*60}")
        print(f"SLO FRAMEWORK SUMMARY FOR {service['name'].upper()}")
        print(f"{'='*60}")

        print(f"\nService Details:")
        print(f"  Type: {service['type']}")
        print(f"  Criticality: {service['criticality']}")
        print(f"  User Facing: {'Yes' if service.get('user_facing') else 'No'}")
        print(f"  Team: {service.get('team', 'Unknown')}")

        print(f"\nService Level Indicators ({len(slis)}):")
        for i, sli in enumerate(slis, 1):
            print(f"  {i}. {sli['name']}")
            print(f"     Description: {sli['description']}")
            print(f"     Type: {sli['type']}")
            print()

        print(f"Service Level Objectives ({len(slos)}):")
        for i, slo in enumerate(slos, 1):
            print(f"  {i}. {slo['name']}")
            print(f"     Target: {slo['target_display']}")
            print(f"     Measurement Window: {slo['measurement_window']}")
            print()

        print(f"Error Budget Summary:")
        for budget in error_budgets:
            print(f"  {budget['slo_name']}:")
            print(f"    Monthly Budget: {budget['error_budget_percentage']}")
            print(f"    Burn Rate Alerts: {len(budget['burn_rate_alerts'])}")
            print()

        sla = framework['sla_recommendations']
        if sla['applicable']:
            print(f"SLA Recommendations:")
            print(f"  Commitments: {len(sla['commitments'])}")
            print(f"  Penalty Tiers: {len(sla['penalties'])}")
        else:
            print(f"SLA Recommendations: {sla['reason']}")

        print(f"\nImplementation Timeline: 1-2 weeks")
        print(f"Framework generated at: {framework['metadata']['generated_at']}")
        print(f"{'='*60}\n")


def main() -> None:
    """Main function for CLI usage.

    Accepts either an --input service-definition file or the trio of
    --service-type / --criticality / --user-facing flags; generates the
    framework, optionally saves it as JSON, and always prints a summary.
    Exits with status 1 on any error.
    """
    parser = argparse.ArgumentParser(
        description='Generate comprehensive SLO frameworks for services',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Generate from service definition file
  python slo_designer.py --input service.json --output framework.json

  # Generate from command line parameters
  python slo_designer.py --service-type api --criticality high --user-facing true --output framework.json

  # Generate and display summary only
  python slo_designer.py --service-type web --criticality critical --user-facing true --summary-only
        """
    )

    parser.add_argument('--input', '-i',
                       help='Input service definition JSON file')
    parser.add_argument('--output', '-o',
                       help='Output framework JSON file')
    parser.add_argument('--service-type',
                       choices=['api', 'web', 'database', 'queue', 'batch', 'ml'],
                       help='Service type')
    parser.add_argument('--criticality',
                       choices=['critical', 'high', 'medium', 'low'],
                       help='Service criticality level')
    # String choices (not store_true) so the flag can be passed explicitly
    # as 'true'/'false'; converted to bool below.
    parser.add_argument('--user-facing',
                       choices=['true', 'false'],
                       help='Whether service is user-facing')
    parser.add_argument('--service-name',
                       help='Service name')
    parser.add_argument('--summary-only', action='store_true',
                       help='Only display summary, do not save JSON')

    args = parser.parse_args()

    # Either a definition file or the full CLI trio is required.
    if not args.input and not (args.service_type and args.criticality and args.user_facing):
        parser.error("Must provide either --input file or --service-type, --criticality, and --user-facing")

    designer = SLODesigner()

    try:
        # Load or create service definition
        if args.input:
            service_def = designer.load_service_definition(args.input)
        else:
            user_facing = args.user_facing.lower() == 'true'
            service_def = designer.create_service_definition(
                args.service_type, args.criticality, user_facing, args.service_name
            )

        # Generate framework
        framework = designer.generate_framework(service_def)

        # Output results
        if not args.summary_only:
            # Default output filename is derived from the service name.
            output_file = args.output or f"{service_def['name']}_slo_framework.json"
            designer.export_json(framework, output_file)
            print(f"SLO framework saved to: {output_file}")

        # Always show summary
        designer.print_summary(framework)

    except Exception as e:
        # Broad catch is acceptable at this top-level CLI boundary: report
        # and exit non-zero rather than dump a traceback at the user.
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == '__main__':
    main()