diff --git a/engineering-team/incident-commander/README.md b/engineering-team/incident-commander/README.md new file mode 100644 index 0000000..27fde44 --- /dev/null +++ b/engineering-team/incident-commander/README.md @@ -0,0 +1,252 @@ +# Incident Commander Skill + +A comprehensive incident response framework providing structured tools for managing technology incidents from detection through resolution and post-incident review. + +## Overview + +This skill implements battle-tested practices from SRE and DevOps teams at scale, providing: + +- **Automated Severity Classification** - Intelligent incident triage +- **Timeline Reconstruction** - Transform scattered events into coherent narratives +- **Post-Incident Review Generation** - Structured PIRs with RCA frameworks +- **Communication Templates** - Pre-built stakeholder communication +- **Comprehensive Documentation** - Reference guides for incident response + +## Quick Start + +### Classify an Incident + +```bash +# From JSON file +python scripts/incident_classifier.py --input incident.json --format text + +# From stdin text +echo "Database is down affecting all users" | python scripts/incident_classifier.py --format text + +# Interactive mode +python scripts/incident_classifier.py --interactive +``` + +### Reconstruct Timeline + +```bash +# Analyze event timeline +python scripts/timeline_reconstructor.py --input events.json --format text + +# With gap analysis +python scripts/timeline_reconstructor.py --input events.json --gap-analysis --format markdown +``` + +### Generate PIR Document + +```bash +# Basic PIR +python scripts/pir_generator.py --incident incident.json --format markdown + +# Comprehensive PIR with timeline +python scripts/pir_generator.py --incident incident.json --timeline timeline.json --rca-method fishbone +``` + +## Scripts + +### incident_classifier.py + +**Purpose:** Analyzes incident descriptions and provides severity classification, team recommendations, and response templates. 
+ +**Input:** JSON object with incident details or plain text description +**Output:** JSON + human-readable classification report + +**Example Input:** +```json +{ + "description": "Database connection timeouts causing 500 errors", + "service": "payment-api", + "affected_users": "80%", + "business_impact": "high" +} +``` + +**Key Features:** +- SEV1-4 severity classification +- Recommended response teams +- Initial action prioritization +- Communication templates +- Response timelines + +### timeline_reconstructor.py + +**Purpose:** Reconstructs incident timelines from timestamped events, identifies phases, and performs gap analysis. + +**Input:** JSON array of timestamped events +**Output:** Formatted timeline with phase analysis and metrics + +**Example Input:** +```json +[ + { + "timestamp": "2024-01-01T12:00:00Z", + "source": "monitoring", + "message": "High error rate detected", + "severity": "critical", + "actor": "system" + } +] +``` + +**Key Features:** +- Phase detection (detection → triage → mitigation → resolution) +- Duration analysis +- Gap identification +- Communication effectiveness analysis +- Response metrics + +### pir_generator.py + +**Purpose:** Generates comprehensive Post-Incident Review documents with multiple RCA frameworks. 
+ +**Input:** Incident data JSON, optional timeline data +**Output:** Structured PIR document with RCA analysis + +**Key Features:** +- Multiple RCA methods (5 Whys, Fishbone, Timeline, Bow Tie) +- Automated action item generation +- Lessons learned categorization +- Follow-up planning +- Completeness assessment + +## Sample Data + +The `assets/` directory contains sample data files for testing: + +- `sample_incident_classification.json` - Database connection pool exhaustion incident +- `sample_timeline_events.json` - Complete timeline with 21 events across phases +- `sample_incident_pir_data.json` - Comprehensive incident data for PIR generation +- `simple_incident.json` - Minimal incident for basic testing +- `simple_timeline_events.json` - Simple 4-event timeline + +## Expected Outputs + +The `expected_outputs/` directory contains reference outputs showing what each script produces: + +- `incident_classification_text_output.txt` - Detailed classification report +- `timeline_reconstruction_text_output.txt` - Complete timeline analysis +- `pir_markdown_output.md` - Full PIR document +- `simple_incident_classification.txt` - Basic classification example + +## Reference Documentation + +### references/incident_severity_matrix.md +Complete severity classification system with: +- SEV1-4 definitions and criteria +- Response requirements and timelines +- Escalation paths +- Communication requirements +- Decision trees and examples + +### references/rca_frameworks_guide.md +Detailed guide for root cause analysis: +- 5 Whys methodology +- Fishbone (Ishikawa) diagram analysis +- Timeline analysis techniques +- Bow Tie analysis for high-risk incidents +- Framework selection guidelines + +### references/communication_templates.md +Standardized communication templates: +- Severity-specific notification templates +- Stakeholder-specific messaging +- Escalation communications +- Resolution notifications +- Customer communication guidelines + +## Usage Patterns + +### End-to-End 
Incident Workflow + +1. **Initial Classification** +```bash +echo "Payment API returning 500 errors for 70% of requests" | \ + python scripts/incident_classifier.py --format text +``` + +2. **Timeline Reconstruction** (after collecting events) +```bash +python scripts/timeline_reconstructor.py \ + --input events.json \ + --gap-analysis \ + --format json \ + --output timeline.json +``` + +3. **PIR Generation** (after incident resolution) +```bash +python scripts/pir_generator.py \ + --incident incident.json \ + --timeline timeline.json \ + --rca-method fishbone \ + --output pir.md +``` + +### Integration Examples + +**CI/CD Pipeline Integration:** +```bash +# Classify deployment issues +cat deployment_error.log | python scripts/incident_classifier.py --format json +``` + +**Monitoring Integration:** +```bash +# Process alert events +curl -s "monitoring-api/events" | python scripts/timeline_reconstructor.py --format text +``` + +**Runbook Generation:** +Use classification output to automatically select appropriate runbooks and escalation procedures. 
+ +## Quality Standards + +- **Zero External Dependencies** - All scripts use only Python standard library +- **Dual Output Format** - Both JSON (machine-readable) and text (human-readable) +- **Robust Input Handling** - Graceful handling of missing or malformed data +- **Professional Defaults** - Opinionated, battle-tested configurations +- **Comprehensive Testing** - Sample data and expected outputs included + +## Technical Requirements + +- Python 3.6+ +- No external dependencies required +- Works with standard Unix tools (pipes, redirection) +- Cross-platform compatible + +## Severity Classification Reference + +| Severity | Description | Response Time | Update Frequency | +|----------|-------------|---------------|------------------| +| **SEV1** | Complete outage | 5 minutes | Every 15 minutes | +| **SEV2** | Major degradation | 15 minutes | Every 30 minutes | +| **SEV3** | Minor impact | 2 hours | At milestones | +| **SEV4** | Low impact | 1-2 days | Weekly | + +## Getting Help + +Each script includes comprehensive help: +```bash +python scripts/incident_classifier.py --help +python scripts/timeline_reconstructor.py --help +python scripts/pir_generator.py --help +``` + +For methodology questions, refer to the reference documentation in the `references/` directory. + +## Contributing + +When adding new features: +1. Maintain zero external dependencies +2. Add comprehensive examples to `assets/` +3. Update expected outputs in `expected_outputs/` +4. Follow the established patterns for argument parsing and output formatting + +## License + +This skill is part of the claude-skills repository. See the main repository LICENSE for details. 
\ No newline at end of file diff --git a/engineering-team/incident-commander/SKILL.md b/engineering-team/incident-commander/SKILL.md index 58b1a49..0895eb9 100644 --- a/engineering-team/incident-commander/SKILL.md +++ b/engineering-team/incident-commander/SKILL.md @@ -1,693 +1,668 @@ ---- -name: incident-commander -description: Production incident management with structured timeline analysis, severity classification (SEV1-4), automated postmortem generation, and SLA tracking. Features communication templates, escalation routing, 5-Whys root cause analysis, and MTTR/MTTD metrics for high-reliability engineering teams. -license: MIT -metadata: - version: 1.0.0 - author: Alireza Rezvani - category: engineering - domain: site-reliability - updated: 2026-02-16 - python-tools: incident_timeline_builder.py, severity_classifier.py, postmortem_generator.py - tech-stack: incident-management, sre, on-call, postmortem-analysis ---- +# Incident Commander Skill -# Incident Commander Expert +**Category:** Engineering Team +**Tier:** POWERFUL +**Author:** Claude Skills Team +**Version:** 1.0.0 +**Last Updated:** February 2026 -Advanced incident management specializing in structured response coordination, severity-driven escalation, postmortem excellence, and SLA compliance. Combines PagerDuty/Google SRE/Atlassian incident management frameworks with quantitative reliability metrics for high-performance engineering organizations. +## Overview + +The Incident Commander skill provides a comprehensive incident response framework for managing technology incidents from detection through resolution and post-incident review. This skill implements battle-tested practices from SRE and DevOps teams at scale, providing structured tools for severity classification, timeline reconstruction, and thorough post-incident analysis. 
+ +## Key Features + +- **Automated Severity Classification** - Intelligent incident triage based on impact and urgency metrics +- **Timeline Reconstruction** - Transform scattered logs and events into coherent incident narratives +- **Post-Incident Review Generation** - Structured PIRs with multiple RCA frameworks +- **Communication Templates** - Pre-built templates for stakeholder updates and escalations +- **Runbook Integration** - Generate actionable runbooks from incident patterns + +## Skills Included + +### Core Tools + +1. **Incident Classifier** (`incident_classifier.py`) + - Analyzes incident descriptions and outputs severity levels + - Recommends response teams and initial actions + - Generates communication templates based on severity + +2. **Timeline Reconstructor** (`timeline_reconstructor.py`) + - Processes timestamped events from multiple sources + - Reconstructs chronological incident timeline + - Identifies gaps and provides duration analysis + +3. **PIR Generator** (`pir_generator.py`) + - Creates comprehensive Post-Incident Review documents + - Applies multiple RCA frameworks (5 Whys, Fishbone, Timeline) + - Generates actionable follow-up items + +## Incident Response Framework + +### Severity Classification System + +#### SEV1 - Critical Outage +**Definition:** Complete service failure affecting all users or critical business functions + +**Characteristics:** +- Customer-facing services completely unavailable +- Data loss or corruption affecting users +- Security breaches with customer data exposure +- Revenue-generating systems down +- SLA violations with financial penalties + +**Response Requirements:** +- Immediate escalation to on-call engineer +- Incident Commander assigned within 5 minutes +- Executive notification within 15 minutes +- Public status page update within 15 minutes +- War room established +- All hands on deck if needed + +**Communication Frequency:** Every 15 minutes until resolution + +#### SEV2 - Major Impact 
+**Definition:** Significant degradation affecting subset of users or non-critical functions + +**Characteristics:** +- Partial service degradation (>25% of users affected) +- Performance issues causing user frustration +- Non-critical features unavailable +- Internal tools impacting productivity +- Data inconsistencies not affecting user experience + +**Response Requirements:** +- On-call engineer response within 15 minutes +- Incident Commander assigned within 30 minutes +- Status page update within 30 minutes +- Stakeholder notification within 1 hour +- Regular team updates + +**Communication Frequency:** Every 30 minutes during active response + +#### SEV3 - Minor Impact +**Definition:** Limited impact with workarounds available + +**Characteristics:** +- Single feature or component affected +- <25% of users impacted +- Workarounds available +- Performance degradation not significantly impacting UX +- Non-urgent monitoring alerts + +**Response Requirements:** +- Response within 2 hours during business hours +- Next business day response acceptable outside hours +- Internal team notification +- Optional status page update + +**Communication Frequency:** At key milestones only + +#### SEV4 - Low Impact +**Definition:** Minimal impact, cosmetic issues, or planned maintenance + +**Characteristics:** +- Cosmetic bugs +- Documentation issues +- Logging or monitoring gaps +- Performance issues with no user impact +- Development/test environment issues + +**Response Requirements:** +- Response within 1-2 business days +- Standard ticket/issue tracking +- No special escalation required + +**Communication Frequency:** Standard development cycle updates + +### Incident Commander Role + +#### Primary Responsibilities + +1. **Command and Control** + - Own the incident response process + - Make critical decisions about resource allocation + - Coordinate between technical teams and stakeholders + - Maintain situational awareness across all response streams + +2. 
**Communication Hub** + - Provide regular updates to stakeholders + - Manage external communications (status pages, customer notifications) + - Facilitate effective communication between response teams + - Shield responders from external distractions + +3. **Process Management** + - Ensure proper incident tracking and documentation + - Drive toward resolution while maintaining quality + - Coordinate handoffs between team members + - Plan and execute rollback strategies if needed + +4. **Post-Incident Leadership** + - Ensure thorough post-incident reviews are conducted + - Drive implementation of preventive measures + - Share learnings with broader organization + +#### Decision-Making Framework + +**Emergency Decisions (SEV1/2):** +- Incident Commander has full authority +- Bias toward action over analysis +- Document decisions for later review +- Consult subject matter experts but don't get blocked + +**Resource Allocation:** +- Can pull in any necessary team members +- Authority to escalate to senior leadership +- Can approve emergency spend for external resources +- Make call on communication channels and timing + +**Technical Decisions:** +- Lean on technical leads for implementation details +- Make final calls on trade-offs between speed and risk +- Approve rollback vs. 
fix-forward strategies +- Coordinate testing and validation approaches + +### Communication Templates + +#### Initial Incident Notification (SEV1/2) + +``` +Subject: [SEV{severity}] {Service Name} - {Brief Description} + +Incident Details: +- Start Time: {timestamp} +- Severity: SEV{level} +- Impact: {user impact description} +- Current Status: {investigating/mitigating/resolved} + +Technical Details: +- Affected Services: {service list} +- Symptoms: {what users are experiencing} +- Initial Assessment: {suspected root cause if known} + +Response Team: +- Incident Commander: {name} +- Technical Lead: {name} +- SMEs Engaged: {list} + +Next Update: {timestamp} +Status Page: {link} +War Room: {bridge/chat link} --- - -## Table of Contents - -- [Capabilities](#capabilities) -- [Input Requirements](#input-requirements) -- [Analysis Tools](#analysis-tools) -- [Methodology](#methodology) -- [Templates & Assets](#templates--assets) -- [Reference Frameworks](#reference-frameworks) -- [Implementation Workflows](#implementation-workflows) -- [Assessment & Measurement](#assessment--measurement) -- [Best Practices](#best-practices) -- [Advanced Techniques](#advanced-techniques) -- [Limitations & Considerations](#limitations--considerations) -- [Success Metrics & Outcomes](#success-metrics--outcomes) - ---- - -## Capabilities - -### Incident Timeline Intelligence -- **Structured Timeline Construction**: Chronological event assembly from detection through resolution with gap identification via `incident_timeline_builder.py` -- **Phase Duration Analysis**: Automated calculation of time-in-phase for Detection, Triage, Mitigation, and Resolution with bottleneck identification -- **Communication Log Correlation**: Maps status updates, escalation events, and stakeholder notifications against incident progression -- **Gap Detection**: Identifies periods of inactivity or missing log entries that indicate process failures or documentation gaps -- **Multi-Source Aggregation**: Consolidates 
events from monitoring alerts, Slack messages, PagerDuty pages, and manual entries into a unified timeline - -### Severity Classification & Escalation -- **Impact-First Classification**: Four-tier severity model (SEV1-SEV4) driven by customer impact, revenue exposure, and data integrity risk via `severity_classifier.py` -- **Dynamic Re-Classification**: Continuous severity reassessment as incident scope changes, with automatic escalation triggers -- **Escalation Routing Matrix**: Role-based escalation paths with time-boxed response requirements per severity level -- **Blast Radius Estimation**: Quantitative assessment of affected users, services, and revenue based on incident metadata -- **SLA Threshold Mapping**: Automatic SLA timer activation and breach prediction based on classified severity - -### Postmortem Excellence -- **Automated Report Generation**: Structured postmortem documents from incident data with timeline, impact summary, and root cause sections via `postmortem_generator.py` -- **5-Whys Root Cause Analysis**: Guided causal chain construction with depth validation and contributing factor identification -- **Action Item Extraction**: Automated identification of remediation tasks with priority scoring and ownership assignment -- **Pattern Recognition**: Cross-incident analysis to surface recurring failure modes and systemic weaknesses -- **Blameless Framing**: Language analysis to ensure postmortem narratives focus on systems and processes, not individuals - -### SLA & Reliability Metrics -- **MTTR Tracking**: Mean Time to Resolve computed per severity level with trend analysis and target comparison -- **MTTD Monitoring**: Mean Time to Detect measuring observability effectiveness from incident onset to first alert -- **MTBF Calculation**: Mean Time Between Failures per service, providing reliability baselines for capacity planning -- **SLA Compliance Scoring**: Real-time compliance percentages against defined availability targets (99.9%, 99.95%, 
99.99%) -- **Incident Frequency Analysis**: Trend detection in incident volume by severity, service, and time window - ---- - -## Input Requirements - -### Incident Data Structure -All analysis tools accept JSON input following this schema: - -```json -{ - "incident": { - "id": "INC-2026-0142", - "title": "Payment processing service degradation", - "severity": "SEV2", - "status": "resolved", - "commander": "Jane Chen", - "declared_at": "2026-02-15T14:23:00Z", - "resolved_at": "2026-02-15T16:47:00Z", - "services_affected": ["payment-api", "checkout-frontend", "order-service"], - "customer_impact": { - "affected_users": 12400, - "revenue_impact_usd": 84000, - "data_integrity": false - } - }, - "timeline": [ - { - "timestamp": "2026-02-15T14:18:00Z", - "type": "alert", - "source": "datadog", - "description": "P95 latency > 2000ms on payment-api", - "actor": "monitoring" - }, - { - "timestamp": "2026-02-15T14:23:00Z", - "type": "declaration", - "source": "slack", - "description": "SEV2 declared by on-call engineer", - "actor": "jane.chen" - } - ], - "root_cause": { - "summary": "Connection pool exhaustion due to upstream database failover", - "category": "infrastructure", - "five_whys": [ - "Payment API returned 503 errors", - "Connection pool was exhausted (0/50 available)", - "Database primary failed over to replica", - "Replica promotion took 47 seconds, exceeding 10s pool timeout", - "Failover health check interval was set to 30s instead of 5s" - ] - }, - "action_items": [ - { - "id": "AI-001", - "description": "Reduce database health check interval to 5 seconds", - "priority": "P1", - "owner": "platform-team", - "due_date": "2026-02-22", - "status": "open" - } - ], - "sla": { - "target_availability": 99.95, - "downtime_minutes": 144, - "monthly_budget_minutes": 21.6, - "remaining_budget_minutes": -122.4 - } -} +{Incident Commander Name} +{Contact Information} ``` -### Minimum Data Requirements -- **Timeline Builder**: Incident ID, declared_at timestamp, and 2+ 
timeline events with timestamps -- **Severity Classifier**: Services affected, customer impact metrics (affected users OR revenue impact), and incident description -- **Postmortem Generator**: Complete incident record with timeline (5+ events recommended), root cause summary, and at least 1 action item -- **SLA Analysis**: Target availability percentage and incident duration; historical incident data for trend analysis (6+ incidents recommended) +#### Executive Summary (SEV1) + +``` +Subject: URGENT - Customer-Impacting Outage - {Service Name} + +Executive Summary: +{2-3 sentence description of customer impact and business implications} + +Key Metrics: +- Time to Detection: {X minutes} +- Time to Engagement: {X minutes} +- Estimated Customer Impact: {number/percentage} +- Current Status: {status} +- ETA to Resolution: {time or "investigating"} + +Leadership Actions Required: +- [ ] Customer communication approval +- [ ] PR/Communications coordination +- [ ] Resource allocation decisions +- [ ] External vendor engagement + +Incident Commander: {name} ({contact}) +Next Update: {time} --- +This is an automated alert from our incident response system. +``` -## Analysis Tools +#### Customer Communication Template -### Incident Timeline Builder (`scripts/incident_timeline_builder.py`) -Constructs structured, chronological incident timelines from raw event data with phase analysis and gap detection. +``` +We are currently experiencing {brief description of issue} affecting {scope of impact}. 
-**Features**: -- Chronological event ordering with deduplication across sources -- Automatic phase classification (Detection, Triage, Mitigation, Resolution, Postmortem) -- Phase duration calculation with bottleneck identification -- Communication cadence analysis (flags gaps > 15 minutes during active incidents) -- Timeline gap detection for periods with no recorded activity -- Multi-format output (text table, JSON, markdown) +Our engineering team was alerted at {time} and is actively working to resolve the issue. We will provide updates every {frequency} until resolved. + +What we know: +- {factual statement of impact} +- {factual statement of scope} +- {brief status of response} + +What we're doing: +- {primary response action} +- {secondary response action} + +Workaround (if available): +{workaround steps or "No workaround currently available"} + +We apologize for the inconvenience and will share more information as it becomes available. + +Next update: {time} +Status page: {link} +``` + +### Stakeholder Management + +#### Stakeholder Classification + +**Internal Stakeholders:** +- **Engineering Leadership** - Technical decisions and resource allocation +- **Product Management** - Customer impact assessment and feature implications +- **Customer Support** - User communication and support ticket management +- **Sales/Account Management** - Customer relationship management for enterprise clients +- **Executive Team** - Business impact decisions and external communication approval +- **Legal/Compliance** - Regulatory reporting and liability assessment + +**External Stakeholders:** +- **Customers** - Service availability and impact communication +- **Partners** - API availability and integration impacts +- **Vendors** - Third-party service dependencies and support escalation +- **Regulators** - Compliance reporting for regulated industries +- **Public/Media** - Transparency for public-facing outages + +#### Communication Cadence by Stakeholder + +| Stakeholder | 
SEV1 | SEV2 | SEV3 | SEV4 | +|-------------|------|------|------|------| +| Engineering Leadership | Real-time | 30min | 4hrs | Daily | +| Executive Team | 15min | 1hr | EOD | Weekly | +| Customer Support | Real-time | 30min | 2hrs | As needed | +| Customers | 15min | 1hr | Optional | None | +| Partners | 30min | 2hrs | Optional | None | + +### Runbook Generation Framework + +#### Dynamic Runbook Components + +1. **Detection Playbooks** + - Monitoring alert definitions + - Triage decision trees + - Escalation trigger points + - Initial response actions + +2. **Response Playbooks** + - Step-by-step mitigation procedures + - Rollback instructions + - Validation checkpoints + - Communication checkpoints + +3. **Recovery Playbooks** + - Service restoration procedures + - Data consistency checks + - Performance validation + - User notification processes + +#### Runbook Template Structure + +```markdown +# {Service/Component} Incident Response Runbook + +## Quick Reference +- **Severity Indicators:** {list of conditions for each severity level} +- **Key Contacts:** {on-call rotations and escalation paths} +- **Critical Commands:** {list of emergency commands with descriptions} + +## Detection +### Monitoring Alerts +- {Alert name}: {description and thresholds} +- {Alert name}: {description and thresholds} + +### Manual Detection Signs +- {Symptom}: {what to look for and where} +- {Symptom}: {what to look for and where} + +## Initial Response (0-15 minutes) +1. **Assess Severity** + - [ ] Check {primary metric} + - [ ] Verify {secondary indicator} + - [ ] Classify as SEV{level} based on {criteria} + +2. **Establish Command** + - [ ] Page Incident Commander if SEV1/2 + - [ ] Create incident tracking ticket + - [ ] Join war room: {link/bridge info} + +3. 
**Initial Investigation** + - [ ] Check recent deployments: {deployment log location} + - [ ] Review error logs: {log location and queries} + - [ ] Verify dependencies: {dependency check commands} + +## Mitigation Strategies +### Strategy 1: {Name} +**Use when:** {conditions} +**Steps:** +1. {detailed step with commands} +2. {detailed step with expected outcomes} +3. {validation step} + +**Rollback Plan:** +1. {rollback step} +2. {verification step} + +### Strategy 2: {Name} +{similar structure} + +## Recovery and Validation +1. **Service Restoration** + - [ ] {restoration step} + - [ ] Wait for {metric} to return to normal + - [ ] Validate end-to-end functionality + +2. **Communication** + - [ ] Update status page + - [ ] Notify stakeholders + - [ ] Schedule PIR + +## Common Pitfalls +- **{Pitfall}:** {description and how to avoid} +- **{Pitfall}:** {description and how to avoid} + +## Reference Information +- **Architecture Diagram:** {link} +- **Monitoring Dashboard:** {link} +- **Related Runbooks:** {links to dependent service runbooks} +``` + +### Post-Incident Review (PIR) Framework + +#### PIR Timeline and Ownership + +**Timeline:** +- **24 hours:** Initial PIR draft completed by Incident Commander +- **3 business days:** Final PIR published with all stakeholder input +- **1 week:** Action items assigned with owners and due dates +- **4 weeks:** Follow-up review on action item progress + +**Roles:** +- **PIR Owner:** Incident Commander (can delegate writing but owns completion) +- **Technical Contributors:** All engineers involved in response +- **Review Committee:** Engineering leadership, affected product teams +- **Action Item Owners:** Assigned based on expertise and capacity + +#### Root Cause Analysis Frameworks + +#### 1. 
Five Whys Method + +The Five Whys technique involves asking "why" repeatedly to drill down to root causes: + +**Example Application:** +- **Problem:** Database became unresponsive during peak traffic +- **Why 1:** Why did the database become unresponsive? → Connection pool was exhausted +- **Why 2:** Why was the connection pool exhausted? → Application was creating more connections than usual +- **Why 3:** Why was the application creating more connections? → New feature wasn't properly connection pooling +- **Why 4:** Why wasn't the feature properly connection pooling? → Code review missed this pattern +- **Why 5:** Why did code review miss this? → No automated checks for connection pooling patterns + +**Best Practices:** +- Ask "why" at least 3 times, often need 5+ iterations +- Focus on process failures, not individual blame +- Each "why" should point to an actionable system improvement +- Consider multiple root cause paths, not just one linear chain + +#### 2. Fishbone (Ishikawa) Diagram + +Systematic analysis across multiple categories of potential causes: + +**Categories:** +- **People:** Training, experience, communication, handoffs +- **Process:** Procedures, change management, review processes +- **Technology:** Architecture, tooling, monitoring, automation +- **Environment:** Infrastructure, dependencies, external factors + +**Application Method:** +1. State the problem clearly at the "head" of the fishbone +2. For each category, brainstorm potential contributing factors +3. For each factor, ask what caused that factor (sub-causes) +4. Identify the factors most likely to be root causes +5. Validate root causes with evidence from the incident + +#### 3. Timeline Analysis + +Reconstruct the incident chronologically to identify decision points and missed opportunities: + +**Timeline Elements:** +- **Detection:** When was the issue first observable? When was it first detected? +- **Notification:** How quickly were the right people informed? 
+- **Response:** What actions were taken and how effective were they? +- **Communication:** When were stakeholders updated? +- **Resolution:** What finally resolved the issue? + +**Analysis Questions:** +- Where were there delays and what caused them? +- What decisions would we make differently with perfect information? +- Where did communication break down? +- What automation could have detected/resolved faster? + +### Escalation Paths + +#### Technical Escalation + +**Level 1:** On-call engineer +- **Responsibility:** Initial response and common issue resolution +- **Escalation Trigger:** Issue not resolved within SLA timeframe +- **Timeframe:** 15 minutes (SEV1), 30 minutes (SEV2) + +**Level 2:** Senior engineer/Team lead +- **Responsibility:** Complex technical issues requiring deeper expertise +- **Escalation Trigger:** Level 1 requests help or timeout occurs +- **Timeframe:** 30 minutes (SEV1), 1 hour (SEV2) + +**Level 3:** Engineering Manager/Staff Engineer +- **Responsibility:** Cross-team coordination and architectural decisions +- **Escalation Trigger:** Issue spans multiple systems or teams +- **Timeframe:** 45 minutes (SEV1), 2 hours (SEV2) + +**Level 4:** Director of Engineering/CTO +- **Responsibility:** Resource allocation and business impact decisions +- **Escalation Trigger:** Extended outage or significant business impact +- **Timeframe:** 1 hour (SEV1), 4 hours (SEV2) + +#### Business Escalation + +**Customer Impact Assessment:** +- **High:** Revenue loss, SLA breaches, customer churn risk +- **Medium:** User experience degradation, support ticket volume +- **Low:** Internal tools, development impact only + +**Escalation Matrix:** + +| Severity | Duration | Business Escalation | +|----------|----------|-------------------| +| SEV1 | Immediate | VP Engineering | +| SEV1 | 30 minutes | CTO + Customer Success VP | +| SEV1 | 1 hour | CEO + Full Executive Team | +| SEV2 | 2 hours | VP Engineering | +| SEV2 | 4 hours | CTO | +| SEV3 | 1 business day | 
Engineering Manager | + +### Status Page Management + +#### Update Principles + +1. **Transparency:** Provide factual information without speculation +2. **Timeliness:** Update within committed timeframes +3. **Clarity:** Use customer-friendly language, avoid technical jargon +4. **Completeness:** Include impact scope, status, and next update time + +#### Status Categories + +- **Operational:** All systems functioning normally +- **Degraded Performance:** Some users may experience slowness +- **Partial Outage:** Subset of features unavailable +- **Major Outage:** Service unavailable for most/all users +- **Under Maintenance:** Planned maintenance window + +#### Update Template + +``` +{Timestamp} - {Status Category} + +{Brief description of current state} + +Impact: {who is affected and how} +Cause: {root cause if known, "under investigation" if not} +Resolution: {what's being done to fix it} + +Next update: {specific time} + +We apologize for any inconvenience this may cause. +``` + +### Action Item Framework + +#### Action Item Categories + +1. **Immediate Fixes** + - Critical bugs discovered during incident + - Security vulnerabilities exposed + - Data integrity issues + +2. **Process Improvements** + - Communication gaps + - Escalation procedure updates + - Runbook additions/updates + +3. **Technical Debt** + - Architecture improvements + - Monitoring enhancements + - Automation opportunities + +4. 
**Organizational Changes** + - Team structure adjustments + - Training requirements + - Tool/platform investments + +#### Action Item Template + +``` +**Title:** {Concise description of the action} +**Priority:** {Critical/High/Medium/Low} +**Category:** {Fix/Process/Technical/Organizational} +**Owner:** {Assigned person} +**Due Date:** {Specific date} +**Success Criteria:** {How will we know this is complete} +**Dependencies:** {What needs to happen first} +**Related PIRs:** {Links to other incidents this addresses} + +**Description:** +{Detailed description of what needs to be done and why} + +**Implementation Plan:** +1. {Step 1} +2. {Step 2} +3. {Validation step} + +**Progress Updates:** +- {Date}: {Progress update} +- {Date}: {Progress update} +``` + +## Usage Examples + +### Example 1: Database Connection Pool Exhaustion -**Usage**: ```bash -# File input with text output -python scripts/incident_timeline_builder.py incident.json --format text +# Classify the incident +echo '{"description": "Users reporting 500 errors, database connections timing out", "affected_users": "80%", "business_impact": "high"}' | python scripts/incident_classifier.py -# File input with JSON output for downstream processing -python scripts/incident_timeline_builder.py incident.json --format json +# Reconstruct timeline from logs +python scripts/timeline_reconstructor.py --input assets/db_incident_events.json --output timeline.md -# Stdin support for pipeline integration -cat incident.json | python scripts/incident_timeline_builder.py --format text - -# Markdown output for postmortem documents -python scripts/incident_timeline_builder.py incident.json --format markdown - -# Filter events by phase -python scripts/incident_timeline_builder.py incident.json --phase mitigation --format text +# Generate PIR after resolution +python scripts/pir_generator.py --incident assets/db_incident_data.json --timeline timeline.md --output pir.md ``` -**Options**: -| Flag | Description | Default | 
-|------|-------------|---------| -| `--format` | Output format: `text`, `json`, `markdown` | `text` | -| `--phase` | Filter to specific phase: `detection`, `triage`, `mitigation`, `resolution` | all | -| `--gap-threshold` | Minutes of silence before flagging a gap | `15` | -| `--include-comms` | Include communication events in timeline | `true` | -| `--verbose` | Show phase duration breakdown and statistics | `false` | +### Example 2: API Rate Limiting Incident -**Output Description**: -- Ordered event list with timestamps, actors, sources, and phase tags -- Phase duration summary (e.g., "Triage: 12 minutes, Mitigation: 47 minutes") -- Communication cadence score (updates per 15-minute window) -- Gap warnings with recommended actions -- Total incident duration from first alert to resolution confirmation - -### Severity Classifier (`scripts/severity_classifier.py`) -Impact-driven severity classification with escalation routing and SLA timer activation. - -**Features**: -- Four-tier severity classification (SEV1-SEV4) based on quantitative impact thresholds -- Blast radius estimation: affected users, services, and revenue exposure -- Escalation path generation with role assignments and response time requirements -- SLA breach prediction based on current severity and elapsed time -- Re-classification recommendations when incident scope changes -- Confidence scoring for classification decisions - -**Classification Thresholds**: -- **SEV1** (Critical): >50% users affected OR >$500K/hour revenue impact OR data breach OR complete service outage -- **SEV2** (Major): >10% users affected OR >$50K/hour revenue impact OR major feature unavailable -- **SEV3** (Minor): >1% users affected OR >$5K/hour revenue impact OR degraded performance -- **SEV4** (Low): <1% users affected AND <$5K/hour revenue impact AND workaround available - -**Usage**: ```bash -# Classify from incident file -python scripts/severity_classifier.py incident.json --format text +# Quick classification from 
stdin +echo "API rate limits causing customer API calls to fail" | python scripts/incident_classifier.py --format text -# Classify with JSON output for automation -python scripts/severity_classifier.py incident.json --format json +# Build timeline from multiple sources +python scripts/timeline_reconstructor.py --input assets/api_incident_logs.json --detect-phases --gap-analysis -# Stdin support -cat incident.json | python scripts/severity_classifier.py --format text - -# Re-classify with updated scope -python scripts/severity_classifier.py incident.json --reclassify --format text - -# Include escalation routing in output -python scripts/severity_classifier.py incident.json --with-escalation --format text +# Generate comprehensive PIR +python scripts/pir_generator.py --incident assets/api_incident_summary.json --rca-method fishbone --action-items ``` -**Options**: -| Flag | Description | Default | -|------|-------------|---------| -| `--format` | Output format: `text`, `json` | `text` | -| `--reclassify` | Compare current vs. recommended severity | `false` | -| `--with-escalation` | Include escalation path and response times | `false` | -| `--sla-predict` | Predict SLA breach probability | `false` | -| `--verbose` | Show classification reasoning and confidence | `false` | - -**Output Description**: -- Severity level with confidence percentage (e.g., "SEV2 - 94% confidence") -- Impact summary: affected users, services, estimated revenue loss -- Escalation path: who to page, response time requirements, communication channels -- SLA status: time remaining before breach, recommended actions -- Re-classification recommendation if scope has changed - -### Postmortem Generator (`scripts/postmortem_generator.py`) -Automated blameless postmortem document generation with root cause analysis and action item tracking. 
- -**Features**: -- Complete postmortem document generation from incident data -- 5-Whys root cause chain validation (checks for depth and logical consistency) -- Action item extraction with priority scoring (P1-P4) and ownership assignment -- Impact quantification: downtime minutes, affected users, revenue loss, SLA budget consumed -- Contributing factor identification beyond primary root cause -- Cross-incident pattern matching for recurring failure modes -- Blameless language validation (flags accusatory phrasing) - -**Usage**: -```bash -# Generate postmortem in markdown format -python scripts/postmortem_generator.py incident.json --format markdown - -# Generate in JSON for integration with tracking systems -python scripts/postmortem_generator.py incident.json --format json - -# Stdin support -cat incident.json | python scripts/postmortem_generator.py --format markdown - -# Include cross-incident pattern analysis (requires historical data) -python scripts/postmortem_generator.py incident.json --history incidents/ --format markdown - -# Validate blameless language in existing postmortem -python scripts/postmortem_generator.py incident.json --validate-language --format text -``` - -**Options**: -| Flag | Description | Default | -|------|-------------|---------| -| `--format` | Output format: `markdown`, `json`, `text` | `markdown` | -| `--history` | Directory of historical incident JSON files for pattern analysis | none | -| `--validate-language` | Check for blame-assigning language patterns | `false` | -| `--include-timeline` | Embed full timeline in postmortem document | `true` | -| `--action-items-only` | Output only extracted action items | `false` | -| `--verbose` | Include classification reasoning and pattern details | `false` | - -**Output Description**: -- Complete postmortem document with: title, severity, duration, impact summary -- Chronological timeline embedded from timeline builder -- Root cause analysis with 5-Whys chain and contributing factors -- 
Action items table with ID, description, priority, owner, due date -- Lessons learned section with systemic improvement recommendations -- SLA impact statement with remaining monthly error budget - ---- - -## Methodology - -### The Incident Commander's Decision Framework - -#### Incident Lifecycle Model - -Every incident follows five phases. The Incident Commander owns the transitions between them. - -**Phase 1 - Detection** (Target: <5 minutes from onset to alert) -- Monitoring systems fire alerts based on predefined thresholds -- On-call engineer acknowledges alert within defined SLA (2 minutes for SEV1, 5 minutes for SEV2) -- Initial triage determines whether to declare a formal incident -- If customer-reported: escalate classification by one severity level automatically - -**Phase 2 - Triage** (Target: <10 minutes) -- Incident Commander assigned or self-declared -- Severity classified using impact-first methodology (not cause-first) -- Communication channel established (dedicated Slack channel, bridge line) -- Stakeholder notification triggered per severity level -- Responder roles assigned: IC, Technical Lead, Communications Lead, Scribe - -**Phase 3 - Mitigation** (Target: varies by severity) -- Focus on restoring service, not finding root cause -- Time-boxed investigation windows (15-minute check-ins for SEV1, 30-minute for SEV2) -- Escalation triggers if mitigation stalls beyond defined thresholds -- Customer communication cadence: every 15 minutes for SEV1, every 30 minutes for SEV2 -- Decision framework: rollback vs. forward-fix vs. 
failover - -**Phase 4 - Resolution** (Target: confirmed stable for 15+ minutes) -- Service confirmed restored to baseline metrics -- Monitoring confirms stability for minimum observation window -- Customer-facing all-clear communication sent -- Incident record updated with resolution summary -- Postmortem scheduled within 48 hours (24 hours for SEV1) - -**Phase 5 - Postmortem** (Target: completed within 5 business days) -- Blameless postmortem meeting conducted with all responders -- Timeline reconstructed and validated by participants -- 5-Whys root cause analysis completed to systemic level -- Action items assigned with owners, priorities, and due dates -- Postmortem published to incident knowledge base - -#### Severity Classification Philosophy - -This framework uses **impact-first classification**, not cause-first. The severity of an incident is determined by its effect on customers and business, never by the technical cause. - -Rationale: A typo in a config file that takes down all of production is a SEV1. A complex distributed systems failure that affects 0.1% of users is a SEV3. Cause complexity is irrelevant to severity -- only impact matters. - -**Classification must happen within the first 5 minutes of declaration.** Reclassification is expected and encouraged as more information surfaces. Upgrading severity is always acceptable; downgrading requires IC approval and documented justification. - -#### Communication Cadence Protocol - -Silence during an incident is a failure mode. 
The Incident Commander enforces communication discipline: - -| Severity | Internal Update | Customer Update | Executive Update | -|----------|----------------|-----------------|------------------| -| SEV1 | Every 10 min | Every 15 min | Every 30 min | -| SEV2 | Every 15 min | Every 30 min | Every 60 min | -| SEV3 | Every 30 min | Every 60 min | On resolution | -| SEV4 | Every 60 min | On resolution | Not required | - -Updates must contain: current status, actions being taken, expected next update time, and any changes in severity or scope. - -#### Blameless Postmortem Culture - -Postmortems are the highest-leverage activity in incident management. They fail when they become blame sessions. - -**Non-Negotiable Principles:** -1. Humans do not cause incidents. Systems that allow humans to trigger failures cause incidents. -2. Every postmortem must produce at least one systemic action item (process, tooling, or architecture change). -3. The 5-Whys analysis must reach a systemic root cause. "Engineer made a mistake" is never a root cause -- the question is why the system allowed that mistake to cause an outage. -4. Postmortem attendance is mandatory for all incident responders. Optional for anyone else who wants to learn. -5. Action items without owners and due dates are not action items. They are wishes. 
- ---- - -## Templates & Assets - -### Incident Response Runbook (`assets/incident_response_runbook.md`) -Step-by-step response protocol for active incidents including: -- Incident Commander checklist (declaration through resolution) -- Role assignments and responsibilities (IC, Tech Lead, Comms Lead, Scribe) -- Severity-specific escalation procedures with contact routing -- Communication templates for each update cadence -- Handoff protocol for long-running incidents (>4 hours) - -### Postmortem Template (`assets/postmortem_template.md`) -Production-ready blameless postmortem document featuring: -- Structured header with incident metadata (ID, severity, duration, commander) -- Impact quantification section (users, revenue, SLA budget) -- Chronological timeline with phase annotations -- 5-Whys root cause analysis framework -- Contributing factors and systemic weaknesses -- Action items table with priority, owner, due date, and tracking status -- Lessons learned and process improvement recommendations - -### Stakeholder Communication Templates (`assets/stakeholder_comms_templates.md`) -Pre-written communication templates for consistent messaging: -- Initial incident declaration (internal and external) -- Periodic status updates per severity level -- Resolution and all-clear notifications -- Executive briefing format for SEV1/SEV2 incidents -- Customer-facing status page update language -- Post-resolution follow-up communication - -### Sample Incident Data (`assets/sample_incident_data.json`) -Comprehensive incident dataset demonstrating: -- Multi-service payment processing outage with realistic timeline -- 24 timeline events across all five lifecycle phases -- Complete 5-Whys root cause chain with contributing factors -- 6 action items with varying priorities and ownership -- SLA impact calculation with monthly error budget tracking -- Cross-referenced monitoring alerts, Slack messages, and PagerDuty events - ---- - -## Reference Frameworks - -### SRE Incident 
Management Guide (`references/sre-incident-management-guide.md`) -Comprehensive incident management methodology derived from Google SRE, PagerDuty, and Atlassian practices: -- Incident Commander role definition and authority boundaries -- On-call rotation best practices (follow-the-sun, escalation tiers) -- Severity classification decision trees with worked examples -- Communication protocols for internal, customer, and executive audiences -- Incident review cadence (weekly incident review, monthly trend analysis, quarterly reliability review) -- Tooling integration patterns (PagerDuty, OpsGenie, Slack, Datadog, Grafana) -- Regulatory incident reporting requirements (SOC2, HIPAA, PCI-DSS, GDPR) - -### Reliability Metrics Framework (`references/reliability-metrics-framework.md`) -Quantitative reliability measurement and target-setting guide: -- MTTR, MTTD, MTBF definitions with calculation formulas and edge cases -- SLA/SLO/SLI hierarchy with implementation guidance -- Error budget policy design and enforcement mechanisms -- Incident frequency analysis with statistical trend detection -- Service-level reliability tiering (Tier 1 critical, Tier 2 important, Tier 3 standard) -- Dashboard design for operational visibility (what to measure, what to alert on, what to ignore) -- Benchmarking data: industry-standard targets by company maturity and service tier - ---- - -## Implementation Workflows - -### Active Incident Response - -#### Step 1: Detection & Declaration (0-5 minutes) -1. **Alert fires** from monitoring system (Datadog, PagerDuty, CloudWatch, custom) -2. **On-call acknowledges** within response SLA (2 min SEV1, 5 min SEV2) -3. **Initial assessment**: Is this a real incident or a false positive? -4. **Declare incident**: Create incident channel, page Incident Commander - ``` - /incident declare --severity SEV2 --title "Payment API 503 errors" --channel #inc-20260215-payments - ``` -5. 
**Classify severity** using `severity_classifier.py`: - ```bash - python scripts/severity_classifier.py incident.json --with-escalation --format text - ``` -6. **Assign roles**: IC, Technical Lead, Communications Lead, Scribe - -#### Step 2: Triage & Mobilization (5-15 minutes) -1. **IC confirms severity** and activates escalation path -2. **Page additional responders** based on affected services -3. **Establish communication rhythm**: Set timer for first status update -4. **Scribe begins timeline**: Record all events with timestamps -5. **Technical Lead begins investigation**: Check dashboards, recent deployments, dependency health -6. **Communications Lead sends initial notification** to stakeholders - -#### Step 3: Mitigation (15 minutes - varies) -1. **Focus on restoring service, not diagnosing root cause** -2. **Decision framework** at each check-in: - - Can we rollback the last deployment? (fastest) - - Can we failover to a healthy replica? (fast) - - Can we apply a targeted forward-fix? (moderate) - - Do we need to scale infrastructure? (slow) -3. **Time-boxed investigation**: If no progress in 15 minutes (SEV1) or 30 minutes (SEV2), escalate -4. **Customer communication**: Send status update per cadence protocol -5. **Re-classify severity** if scope changes: - ```bash - python scripts/severity_classifier.py incident_updated.json --reclassify --format text - ``` - -#### Step 4: Resolution & Verification (varies) -1. **Confirm fix deployed** and metrics returning to baseline -2. **Observation window**: 15 minutes stable for SEV1/SEV2, 30 minutes for SEV3/SEV4 -3. **Resolve incident**: Update status, send all-clear communication -4. **Schedule postmortem**: Within 24 hours for SEV1, 48 hours for SEV2, 5 business days for SEV3 -5. **On-call engineer writes initial incident summary** while context is fresh - -### Post-Incident Analysis - -#### Timeline Reconstruction (Day 1-2) -1. **Gather raw data** from all sources (monitoring, Slack, PagerDuty, git log) -2. 
**Build unified timeline**: - ```bash - python scripts/incident_timeline_builder.py incident.json --format markdown --verbose - ``` -3. **Identify gaps**: Missing events, unexplained delays, undocumented decisions -4. **Validate with responders**: Circulate timeline for corrections before postmortem meeting - -#### 5-Whys Root Cause Analysis (Postmortem Meeting) -1. **Start with the observable impact**: "Payment API returned 503 errors for 144 minutes" -2. **Ask "Why?" iteratively** -- each answer must be factual and verifiable -3. **Reach a systemic cause**: The final "why" must point to a process, tooling, or architecture gap -4. **Identify contributing factors**: What else made this incident worse or longer than necessary? -5. **Validate depth**: If the final cause is "human error," ask one more "why" - -#### Action Item Generation -1. **Categorize**: Prevention (stop recurrence), Detection (find faster), Mitigation (recover faster) -2. **Prioritize**: P1 items must be completed before next on-call rotation -3. **Assign ownership**: Every action item has exactly one owner (team, not individual) -4. **Set due dates**: P1 within 1 week, P2 within 2 weeks, P3 within 1 month -5. **Generate postmortem**: - ```bash - python scripts/postmortem_generator.py incident.json --format markdown --include-timeline - ``` - -### SLA Compliance Monitoring - -1. **Define SLOs per service tier**: - - Tier 1 (revenue-critical): 99.99% availability (52.6 min/year downtime budget) - - Tier 2 (customer-facing): 99.95% availability (4.38 hours/year) - - Tier 3 (internal tooling): 99.9% availability (8.77 hours/year) - -2. **Track error budget consumption**: Monthly rolling window with daily updates -3. **Trigger error budget policy** when >50% consumed: - - Freeze non-critical deployments - - Prioritize reliability work over feature work - - Require IC review for all production changes -4. 
**Monthly reliability review**: Present SLA compliance, incident trends, action item completion - -### On-Call Handoff Protocol - -1. **End-of-rotation summary**: Document active incidents, ongoing investigations, known risks -2. **Handoff meeting**: 15-minute synchronous handoff between outgoing and incoming on-call -3. **Runbook review**: Confirm incoming on-call has access to all runbooks and escalation paths -4. **Alert review**: Walk through any alerts that fired during the rotation and their resolutions -5. **Pending action items**: Transfer ownership of time-sensitive items to incoming on-call - ---- - -## Assessment & Measurement - -### Key Performance Indicators - -#### Response Effectiveness Metrics -- **MTTD (Mean Time to Detect)**: Time from incident onset to first alert. Target: <5 minutes for Tier 1 services, <15 minutes for Tier 2. Measures observability coverage and alert threshold quality. -- **MTTR (Mean Time to Resolve)**: Time from incident declaration to confirmed resolution. Target: <30 minutes for SEV1, <2 hours for SEV2, <8 hours for SEV3. The single most important operational metric. -- **MTBF (Mean Time Between Failures)**: Time between consecutive incidents per service. Target: increasing quarter-over-quarter. Measures systemic reliability improvement. -- **MTTA (Mean Time to Acknowledge)**: Time from alert to human acknowledgment. Target: <2 minutes for SEV1, <5 minutes for SEV2. Measures on-call responsiveness. - -#### Process Quality Metrics -- **Postmortem Completion Rate**: Percentage of SEV1-SEV3 incidents with completed postmortems. Target: 100% for SEV1-SEV2, >90% for SEV3. -- **Action Item Completion Rate**: Percentage of postmortem action items completed by due date. Target: >85% for P1, >70% for P2. Below 60% indicates systemic follow-through failure. -- **Postmortem Timeliness**: Days from resolution to published postmortem. Target: <3 business days for SEV1, <5 for SEV2. 
-- **Severity Accuracy**: Percentage of incidents where initial classification matched final assessment. Target: >80%. Low accuracy indicates classification training gaps. - -#### Reliability Metrics -- **SLA Compliance**: Percentage of time meeting availability targets per service tier. Target: 100% compliance with defined SLOs. -- **Error Budget Remaining**: Monthly remaining error budget as percentage. Target: >25% remaining at month-end. -- **Incident Frequency Trend**: Month-over-month incident count by severity. Target: decreasing or stable for SEV1-SEV2. -- **Repeat Incident Rate**: Percentage of incidents with same root cause as a previous incident. Target: <10%. Above 15% indicates postmortem action items are not effective. - -### Assessment Schedule -- **Per Incident**: MTTD, MTTR, severity accuracy, communication cadence adherence -- **Weekly**: Incident count review, open action item status, on-call load assessment -- **Monthly**: SLA compliance report, error budget status, MTTR trends, postmortem completion rates -- **Quarterly**: Reliability review with executive stakeholders, MTBF trends, incident pattern analysis, on-call health survey - -### Calibration & Validation -- Cross-reference MTTR calculations with customer-reported impact duration -- Validate severity classifications retrospectively during postmortem review -- Compare automated severity classifier output against IC decisions to improve model accuracy -- Audit action item effectiveness by tracking repeat incident rate per root cause category - ---- - ## Best Practices -### "Declare Early, Declare Often" -The single highest-leverage behavior in incident management is lowering the threshold for declaring incidents. Every organization that improves at incident response does so by declaring more incidents, not fewer. +### During Incident Response -**The cost of a false alarm is one wasted Slack channel. The cost of a missed incident is customer trust.** +1. 
**Maintain Calm Leadership** + - Stay composed under pressure + - Make decisive calls with incomplete information + - Communicate confidence while acknowledging uncertainty -Specific guidance: -- If two engineers are discussing whether something is an incident, it is an incident. Declare it. -- Any customer-reported issue that affects more than one user is an incident. Declare it. -- Any alert that requires more than 5 minutes of investigation is an incident. Declare it. -- Declaring an incident does not mean waking people up. It means creating a structured record. +2. **Document Everything** + - All actions taken and their outcomes + - Decision rationale, especially for controversial calls + - Timeline of events as they happen -### Anti-Patterns to Eliminate +3. **Effective Communication** + - Use clear, jargon-free language + - Provide regular updates even when there's no new information + - Manage stakeholder expectations proactively -**Hero Culture**: One engineer who "always fixes things" is a single point of failure, not an asset. If your incident response depends on a specific person being available, your process is broken. Fix the runbooks, not the rotation. +4. **Technical Excellence** + - Prefer rollbacks to risky fixes under pressure + - Validate fixes before declaring resolution + - Plan for secondary failures and cascading effects -**Blame Games**: The moment a postmortem asks "who did this?" instead of "why did our systems allow this?", the entire process loses value. Engineers who fear blame will hide information. Engineers who trust the process will share everything. +### Post-Incident -**Skipping Postmortems**: "We already know what happened" is the most dangerous sentence in incident management. The purpose of a postmortem is not to discover what happened -- it is to generate systemic improvements and share learnings across the organization. +1. 
**Blameless Culture** + - Focus on system failures, not individual mistakes + - Encourage honest reporting of what went wrong + - Celebrate learning and improvement opportunities -**Severity Inflation**: Classifying everything as SEV1 to get faster response trains the organization to ignore severity levels. Classify honestly. Respond proportionally. +2. **Action Item Discipline** + - Assign specific owners and due dates + - Track progress publicly + - Prioritize based on risk and effort -**Action Item Graveyards**: Postmortems that generate action items no one tracks are worse than no postmortem at all. They create a false sense of progress. If your action item completion rate is below 50%, stop generating new action items and complete the existing ones first. +3. **Knowledge Sharing** + - Share PIRs broadly within the organization + - Update runbooks based on lessons learned + - Conduct training sessions for common failure modes -### Communication During Incidents +4. **Continuous Improvement** + - Look for patterns across multiple incidents + - Invest in tooling and automation + - Regularly review and update processes -Template-driven communication eliminates cognitive load during high-stress situations: -- Never compose a customer update from scratch during an active incident -- Pre-written templates with fill-in-the-blank fields ensure consistent, professional communication -- The Communications Lead owns all external messaging; the IC approves content but does not write it -- Every update must answer three questions: What is happening? What are we doing about it? When is the next update? +## Integration with Existing Tools -### On-Call Health and Burnout Prevention +### Monitoring and Alerting +- PagerDuty/Opsgenie integration for escalation +- Datadog/Grafana for metrics and dashboards +- ELK/Splunk for log analysis and correlation -On-call is a tax on engineers' personal lives. 
Treating it as "just part of the job" without active management leads to burnout and attrition. +### Communication Platforms +- Slack/Teams for war room coordination +- Zoom/Meet for video bridges +- Status page providers (Statuspage.io, etc.) -**Non-Negotiable Standards:** -- Maximum on-call rotation: 1 week in 4 (25% on-call time). Below 1-in-3 requires immediate hiring. -- On-call engineers who are paged overnight get a late start or half-day the following day. No exceptions. -- Track pages-per-rotation. If any rotation consistently exceeds 5 pages, the alert thresholds need tuning. -- Quarterly on-call satisfaction surveys. Scores below 3/5 trigger mandatory process review. -- On-call compensation: either financial (on-call pay) or temporal (comp time). Uncompensated on-call is unacceptable. +### Documentation Systems +- Confluence/Notion for PIR storage +- GitHub/GitLab for runbook version control +- JIRA/Linear for action item tracking ---- +### Change Management +- CI/CD pipeline integration +- Deployment tracking systems +- Feature flag platforms for quick rollbacks -## Advanced Techniques +## Conclusion -### Chaos Engineering Integration -Proactive reliability testing through controlled failure injection: -- **Pre-Incident Drills**: Run tabletop exercises using `postmortem_generator.py` output from past incidents as scenarios -- **Game Days**: Scheduled chaos experiments (Chaos Monkey, Litmus, Gremlin) with full incident response activation -- **Runbook Validation**: Use chaos experiments to verify runbook accuracy and completeness before real incidents test them -- **Detection Validation**: Inject known failures to verify MTTD targets are achievable with current monitoring +The Incident Commander skill provides a comprehensive framework for managing incidents from detection through post-incident review. 
By implementing structured processes, clear communication templates, and thorough analysis tools, teams can improve their incident response capabilities and build more resilient systems. -### Automated Incident Detection -Reducing MTTD through intelligent alerting: -- **Anomaly Detection**: Statistical baselines (3-sigma) on key metrics with automatic incident creation above threshold -- **Composite Alerts**: Multi-signal correlation (latency + error rate + saturation) to reduce false positive rates below 5% -- **Customer Signal Integration**: Status page report volume, support ticket spike detection, social media monitoring -- **Deployment Correlation**: Automatic incident flagging when metric degradation occurs within 30 minutes of a deployment +The key to successful incident management is preparation, practice, and continuous learning. Use this framework as a starting point, but adapt it to your organization's specific needs, culture, and technical environment. -### Cross-Team Incident Coordination -Managing incidents that span organizational boundaries: -- **Unified Command Structure**: Single IC with authority across all affected teams, regardless of organizational reporting -- **Liaison Role**: Each affected team designates a liaison who communicates team-specific updates to the IC -- **Shared Timeline**: All teams contribute to a single timeline document, eliminating information silos -- **Joint Postmortems**: Cross-team postmortems with shared action items and joint ownership - -### Regulatory Incident Reporting -Meeting compliance obligations during incidents: -- **SOC2**: Document incident detection, response, and resolution within audit trail. Action items must be tracked to completion. -- **HIPAA**: Breach notification within 60 days for incidents involving PHI. Document risk assessment and mitigation steps. -- **PCI-DSS**: Immediate containment for cardholder data exposure. Forensic investigation required for confirmed breaches. 
-- **GDPR**: 72-hour notification to supervisory authority for personal data breaches. Document legal basis for processing decisions. -- **Automation**: `postmortem_generator.py --format json` output structured to feed directly into compliance reporting workflows - ---- - -## Limitations & Considerations - -### Data Quality Dependencies -- **Minimum Event Count**: Timeline analysis requires 5+ events for meaningful phase analysis; fewer events produce incomplete coverage -- **Timestamp Accuracy**: All analysis assumes synchronized timestamps (NTP); clock skew across systems degrades timeline accuracy -- **Source Coverage**: Timeline quality depends on capturing events from all relevant systems; missing sources create blind spots -- **Historical Data**: Cross-incident pattern analysis requires 10+ resolved incidents for statistically meaningful trends - -### Organizational Prerequisites -- **Blameless Culture**: Tools generate blameless framing, but cultural adoption requires sustained leadership commitment over 6+ months -- **On-Call Maturity**: Severity classification and escalation routing assume an established on-call rotation with defined response SLAs -- **Tooling Integration**: Full value requires integration with monitoring (Datadog/Grafana), communication (Slack), and paging (PagerDuty/OpsGenie) systems -- **Executive Buy-In**: Error budget policies and deployment freezes require executive sponsorship to enforce during business-critical periods - -### Scaling Considerations -- **Team Size**: Communication cadence protocols optimized for 3-8 responders; larger incidents require additional coordination roles (Operations Lead, Customer Liaison) -- **Incident Volume**: Organizations handling >20 incidents/week need automated triage to prevent IC fatigue and classification inconsistency -- **Geographic Distribution**: Follow-the-sun on-call requires adapted handoff protocols and timezone-aware SLA calculations -- **Multi-Product**: Shared infrastructure 
incidents affecting multiple products require product-specific impact assessment and communication tracks - -### Measurement Limitations -- **MTTR Variance**: Mean values obscure outliers; track P50, P90, and P99 MTTR for accurate performance assessment -- **Attribution Complexity**: Incidents with multiple contributing causes resist single-root-cause analysis; 5-Whys may oversimplify -- **Leading Indicators**: Most reliability metrics are lagging; invest in leading indicators (deployment frequency, change failure rate, alert noise ratio) -- **Comparison Pitfalls**: MTTR benchmarks vary dramatically by industry, company size, and service architecture; internal trends are more valuable than external comparisons - ---- - -## Success Metrics & Outcomes - -Organizations that implement this incident management framework consistently achieve: - -- **40-60% reduction in MTTR** within the first 6 months through structured response protocols and severity-driven escalation -- **70%+ reduction in MTTD** through improved monitoring coverage and composite alert configuration -- **90%+ postmortem completion rate** for SEV1-SEV2 incidents, up from the industry average of 40-50% -- **85%+ action item completion rate** within defined due dates, eliminating the "action item graveyard" anti-pattern -- **50% reduction in repeat incidents** (same root cause) within 12 months through systematic postmortem follow-through -- **30-40% improvement in on-call satisfaction scores** through rotation health management and burnout prevention -- **99.95%+ SLA compliance** for Tier 1 services through error budget policies and proactive reliability investment -- **Sub-5-minute severity classification** with >80% accuracy through impact-first methodology and trained Incident Commanders - -The framework transforms incident management from reactive firefighting into a structured, measurable engineering discipline. 
Teams stop treating incidents as exceptional events and start treating them as opportunities to systematically improve reliability, build organizational trust, and protect customer experience. - ---- - -*This skill combines Google SRE principles, PagerDuty operational best practices, and Atlassian incident management workflows into a unified, tool-supported framework. Success requires organizational commitment to blameless culture, consistent postmortem follow-through, and investment in observability. Adapt severity thresholds, communication cadences, and SLA targets to your specific organizational context and customer expectations.* +Remember: The goal isn't to prevent all incidents (which is impossible), but to detect them quickly, respond effectively, communicate clearly, and learn continuously. \ No newline at end of file diff --git a/engineering-team/incident-commander/assets/sample_incident_classification.json b/engineering-team/incident-commander/assets/sample_incident_classification.json new file mode 100644 index 0000000..00a7677 --- /dev/null +++ b/engineering-team/incident-commander/assets/sample_incident_classification.json @@ -0,0 +1,14 @@ +{ + "description": "Database connection timeouts causing 500 errors for payment processing API. Users unable to complete checkout. Error rate spiked from 0.1% to 45% starting at 14:30 UTC. 
Database monitoring shows connection pool exhaustion with 200/200 connections active.", + "service": "payment-api", + "affected_users": "80%", + "business_impact": "high", + "duration_minutes": 65, + "metadata": { + "error_rate": "45%", + "connection_pool_utilization": "100%", + "affected_regions": ["us-west", "us-east", "eu-west"], + "detection_method": "monitoring_alert", + "customer_escalations": 12 + } +} \ No newline at end of file diff --git a/engineering-team/incident-commander/assets/sample_incident_pir_data.json b/engineering-team/incident-commander/assets/sample_incident_pir_data.json new file mode 100644 index 0000000..c04749d --- /dev/null +++ b/engineering-team/incident-commander/assets/sample_incident_pir_data.json @@ -0,0 +1,74 @@ +{ + "incident_id": "INC-2024-0315-001", + "title": "Payment API Database Connection Pool Exhaustion", + "description": "Database connection pool exhaustion caused widespread 500 errors in payment processing API, preventing users from completing purchases. Root cause was an inefficient database query introduced in deployment v2.3.1.", + "severity": "sev2", + "start_time": "2024-03-15T14:30:00Z", + "end_time": "2024-03-15T15:35:00Z", + "duration": "1h 5m", + "affected_services": ["payment-api", "checkout-service", "subscription-billing"], + "customer_impact": "80% of users unable to complete payments or checkout. Approximately 2,400 failed payment attempts during the incident. Users experienced immediate 500 errors when attempting to pay.", + "business_impact": "Estimated revenue loss of $45,000 during outage period. No SLA breaches as resolution was within 2-hour window. 
12 customer escalations through support channels.", + "incident_commander": "Mike Rodriguez", + "responders": [ + "Sarah Chen - On-call Engineer, Primary Responder", + "Tom Wilson - Database Team Lead", + "Lisa Park - Database Engineer", + "Mike Rodriguez - Incident Commander", + "David Kumar - DevOps Engineer" + ], + "status": "resolved", + "detection_details": { + "detection_method": "automated_monitoring", + "detection_time": "2024-03-15T14:30:00Z", + "alert_source": "Datadog error rate threshold", + "time_to_detection": "immediate" + }, + "response_details": { + "time_to_response": "5 minutes", + "time_to_escalation": "10 minutes", + "time_to_resolution": "65 minutes", + "war_room_established": "2024-03-15T14:45:00Z", + "executives_notified": false, + "status_page_updated": true + }, + "technical_details": { + "root_cause": "Inefficient database query introduced in deployment v2.3.1 caused each payment validation to take 15 seconds instead of normal 0.1 seconds, exhausting the 200-connection database pool", + "affected_regions": ["us-west", "us-east", "eu-west"], + "error_metrics": { + "peak_error_rate": "45%", + "normal_error_rate": "0.1%", + "connection_pool_max": 200, + "connections_exhausted_at": "100%" + }, + "resolution_method": "rollback", + "rollback_target": "v2.2.9", + "rollback_duration": "7 minutes" + }, + "communication_log": [ + { + "timestamp": "2024-03-15T14:50:00Z", + "type": "status_page", + "message": "Investigating payment processing issues", + "audience": "customers" + }, + { + "timestamp": "2024-03-15T15:35:00Z", + "type": "status_page", + "message": "Payment processing issues resolved", + "audience": "customers" + } + ], + "lessons_learned_preview": [ + "Deployment v2.3.1 code review missed performance implications of query change", + "Load testing didn't include realistic database query patterns", + "Connection pool monitoring could have provided earlier warning", + "Rollback procedure worked effectively - 7 minute rollback time" + ], + 
"preliminary_action_items": [ + "Fix inefficient query for v2.3.2 deployment", + "Add database query performance checks to CI pipeline", + "Improve load testing to include database performance scenarios", + "Add connection pool utilization alerts" + ] +} \ No newline at end of file diff --git a/engineering-team/incident-commander/assets/sample_timeline_events.json b/engineering-team/incident-commander/assets/sample_timeline_events.json new file mode 100644 index 0000000..18438da --- /dev/null +++ b/engineering-team/incident-commander/assets/sample_timeline_events.json @@ -0,0 +1,263 @@ +[ + { + "timestamp": "2024-03-15T14:30:00Z", + "source": "datadog", + "type": "alert", + "message": "High error rate detected on payment-api: 45% error rate (threshold: 5%)", + "severity": "critical", + "actor": "monitoring-system", + "metadata": { + "alert_id": "ALT-001", + "metric_value": "45%", + "threshold": "5%" + } + }, + { + "timestamp": "2024-03-15T14:32:00Z", + "source": "pagerduty", + "type": "escalation", + "message": "Paged on-call engineer Sarah Chen for payment-api alerts", + "severity": "high", + "actor": "pagerduty-system", + "metadata": { + "incident_id": "PD-12345", + "responder": "sarah.chen@company.com" + } + }, + { + "timestamp": "2024-03-15T14:35:00Z", + "source": "slack", + "type": "communication", + "message": "Sarah Chen acknowledged the alert and is investigating payment-api issues", + "severity": "medium", + "actor": "sarah.chen", + "metadata": { + "channel": "#incidents", + "message_id": "1234567890.123456" + } + }, + { + "timestamp": "2024-03-15T14:38:00Z", + "source": "application_logs", + "type": "log", + "message": "Database connection pool exhausted: 200/200 connections active, unable to acquire new connections", + "severity": "critical", + "actor": "payment-api", + "metadata": { + "log_level": "ERROR", + "component": "database_pool", + "connection_count": 200, + "max_connections": 200 + } + }, + { + "timestamp": "2024-03-15T14:40:00Z", + "source": 
"slack", + "type": "escalation", + "message": "Sarah Chen: Escalating to incident commander - database connection pool exhausted, need database team", + "severity": "high", + "actor": "sarah.chen", + "metadata": { + "channel": "#incidents", + "escalation_reason": "database_expertise_needed" + } + }, + { + "timestamp": "2024-03-15T14:42:00Z", + "source": "pagerduty", + "type": "escalation", + "message": "Incident commander Mike Rodriguez assigned to incident PD-12345", + "severity": "high", + "actor": "pagerduty-system", + "metadata": { + "incident_commander": "mike.rodriguez@company.com", + "role": "incident_commander" + } + }, + { + "timestamp": "2024-03-15T14:45:00Z", + "source": "slack", + "type": "communication", + "message": "Mike Rodriguez: War room established in #war-room-payment-api. Engaging database team.", + "severity": "high", + "actor": "mike.rodriguez", + "metadata": { + "channel": "#incidents", + "war_room": "#war-room-payment-api" + } + }, + { + "timestamp": "2024-03-15T14:47:00Z", + "source": "pagerduty", + "type": "escalation", + "message": "Database team engineers paged: Tom Wilson, Lisa Park", + "severity": "medium", + "actor": "pagerduty-system", + "metadata": { + "team": "database-team", + "responders": ["tom.wilson@company.com", "lisa.park@company.com"] + } + }, + { + "timestamp": "2024-03-15T14:50:00Z", + "source": "statuspage", + "type": "communication", + "message": "Status page updated: Investigating payment processing issues", + "severity": "medium", + "actor": "mike.rodriguez", + "metadata": { + "status": "investigating", + "affected_systems": ["payment-api"] + } + }, + { + "timestamp": "2024-03-15T14:52:00Z", + "source": "slack", + "type": "communication", + "message": "Tom Wilson: Joining war room. Looking at database metrics now. 
Seeing unusual query patterns from recent deployment.", + "severity": "medium", + "actor": "tom.wilson", + "metadata": { + "channel": "#war-room-payment-api", + "investigation_focus": "database_metrics" + } + }, + { + "timestamp": "2024-03-15T14:55:00Z", + "source": "database_monitoring", + "type": "log", + "message": "Identified slow query introduced in deployment v2.3.1: payment validation taking 15s per request", + "severity": "critical", + "actor": "database-monitor", + "metadata": { + "deployment_version": "v2.3.1", + "query_time": "15s", + "normal_query_time": "0.1s" + } + }, + { + "timestamp": "2024-03-15T15:00:00Z", + "source": "slack", + "type": "communication", + "message": "Tom Wilson: Root cause identified - inefficient query in v2.3.1 deployment. Recommending immediate rollback.", + "severity": "high", + "actor": "tom.wilson", + "metadata": { + "channel": "#war-room-payment-api", + "root_cause": "inefficient_query", + "recommendation": "rollback" + } + }, + { + "timestamp": "2024-03-15T15:02:00Z", + "source": "slack", + "type": "communication", + "message": "Mike Rodriguez: Approved rollback to v2.2.9. 
Sarah initiating rollback procedure.", + "severity": "high", + "actor": "mike.rodriguez", + "metadata": { + "channel": "#war-room-payment-api", + "decision": "rollback_approved", + "target_version": "v2.2.9" + } + }, + { + "timestamp": "2024-03-15T15:05:00Z", + "source": "deployment_system", + "type": "action", + "message": "Rollback initiated: payment-api v2.3.1 → v2.2.9", + "severity": "medium", + "actor": "sarah.chen", + "metadata": { + "from_version": "v2.3.1", + "to_version": "v2.2.9", + "deployment_type": "rollback" + } + }, + { + "timestamp": "2024-03-15T15:12:00Z", + "source": "deployment_system", + "type": "action", + "message": "Rollback completed successfully: payment-api now running v2.2.9 across all regions", + "severity": "medium", + "actor": "deployment-system", + "metadata": { + "deployment_status": "completed", + "regions": ["us-west", "us-east", "eu-west"] + } + }, + { + "timestamp": "2024-03-15T15:15:00Z", + "source": "datadog", + "type": "log", + "message": "Error rate decreasing: payment-api error rate dropped to 8% and continuing to decline", + "severity": "medium", + "actor": "monitoring-system", + "metadata": { + "error_rate": "8%", + "trend": "decreasing" + } + }, + { + "timestamp": "2024-03-15T15:18:00Z", + "source": "database_monitoring", + "type": "log", + "message": "Connection pool utilization normalizing: 45/200 connections active", + "severity": "low", + "actor": "database-monitor", + "metadata": { + "connection_count": 45, + "max_connections": 200, + "utilization": "22.5%" + } + }, + { + "timestamp": "2024-03-15T15:25:00Z", + "source": "datadog", + "type": "log", + "message": "Error rate returned to normal: payment-api error rate now 0.2% (within normal range)", + "severity": "low", + "actor": "monitoring-system", + "metadata": { + "error_rate": "0.2%", + "status": "normal" + } + }, + { + "timestamp": "2024-03-15T15:30:00Z", + "source": "slack", + "type": "communication", + "message": "Mike Rodriguez: All metrics returned to normal. 
Declaring incident resolved. Thanks to all responders.", + "severity": "low", + "actor": "mike.rodriguez", + "metadata": { + "channel": "#war-room-payment-api", + "status": "resolved" + } + }, + { + "timestamp": "2024-03-15T15:35:00Z", + "source": "statuspage", + "type": "communication", + "message": "Status page updated: Payment processing issues resolved. All systems operational.", + "severity": "low", + "actor": "mike.rodriguez", + "metadata": { + "status": "resolved", + "duration": "65 minutes" + } + }, + { + "timestamp": "2024-03-15T15:40:00Z", + "source": "slack", + "type": "communication", + "message": "Mike Rodriguez: PIR scheduled for tomorrow 10am. Action item: fix the inefficient query in v2.3.2", + "severity": "low", + "actor": "mike.rodriguez", + "metadata": { + "channel": "#incidents", + "pir_time": "2024-03-16T10:00:00Z", + "action_item": "fix_query_v2.3.2" + } + } +] \ No newline at end of file diff --git a/engineering-team/incident-commander/assets/simple_incident.json b/engineering-team/incident-commander/assets/simple_incident.json new file mode 100644 index 0000000..b1af6a3 --- /dev/null +++ b/engineering-team/incident-commander/assets/simple_incident.json @@ -0,0 +1,6 @@ +{ + "description": "Users reporting slow page loads on the main website", + "service": "web-frontend", + "affected_users": "25%", + "business_impact": "medium" +} \ No newline at end of file diff --git a/engineering-team/incident-commander/assets/simple_timeline_events.json b/engineering-team/incident-commander/assets/simple_timeline_events.json new file mode 100644 index 0000000..75b1126 --- /dev/null +++ b/engineering-team/incident-commander/assets/simple_timeline_events.json @@ -0,0 +1,30 @@ +[ + { + "timestamp": "2024-03-10T09:00:00Z", + "source": "monitoring", + "message": "High CPU utilization detected on web servers", + "severity": "medium", + "actor": "system" + }, + { + "timestamp": "2024-03-10T09:05:00Z", + "source": "slack", + "message": "Engineer investigating high 
CPU alerts", + "severity": "medium", + "actor": "john.doe" + }, + { + "timestamp": "2024-03-10T09:15:00Z", + "source": "deployment", + "message": "Deployed hotfix to reduce CPU usage", + "severity": "low", + "actor": "john.doe" + }, + { + "timestamp": "2024-03-10T09:25:00Z", + "source": "monitoring", + "message": "CPU utilization returned to normal levels", + "severity": "low", + "actor": "system" + } +] \ No newline at end of file diff --git a/engineering-team/incident-commander/expected_outputs/incident_classification_text_output.txt b/engineering-team/incident-commander/expected_outputs/incident_classification_text_output.txt new file mode 100644 index 0000000..0182b8e --- /dev/null +++ b/engineering-team/incident-commander/expected_outputs/incident_classification_text_output.txt @@ -0,0 +1,44 @@ +============================================================ +INCIDENT CLASSIFICATION REPORT +============================================================ + +CLASSIFICATION: + Severity: SEV1 + Confidence: 100.0% + Reasoning: Classified as SEV1 based on: keywords: timeout, 500 error; user impact: 80% + Timestamp: 2026-02-16T12:41:46.644096+00:00 + +RECOMMENDED RESPONSE: + Primary Team: Analytics Team + Supporting Teams: SRE, API Team, Backend Engineering, Finance Engineering, Payments Team, DevOps, Compliance Team, Database Team, Platform Team, Data Engineering + Response Time: 5 minutes + +INITIAL ACTIONS: + 1. Establish incident command (Priority 1) + Timeout: 5 minutes + Page incident commander and establish war room + + 2. Create incident ticket (Priority 1) + Timeout: 2 minutes + Create tracking ticket with all known details + + 3. Update status page (Priority 2) + Timeout: 15 minutes + Post initial status page update acknowledging incident + + 4. Notify executives (Priority 2) + Timeout: 15 minutes + Alert executive team of customer-impacting outage + + 5. 
Engage subject matter experts (Priority 3) + Timeout: 10 minutes + Page relevant SMEs based on affected systems + +COMMUNICATION: + Subject: 🚨 [SEV1] payment-api - Database connection timeouts causing 500 errors fo... + Urgency: SEV1 + Recipients: on-call, engineering-leadership, executives, customer-success + Channels: pager, phone, slack, email, status-page + Update Frequency: Every 15 minutes + +============================================================ \ No newline at end of file diff --git a/engineering-team/incident-commander/expected_outputs/pir_markdown_output.md b/engineering-team/incident-commander/expected_outputs/pir_markdown_output.md new file mode 100644 index 0000000..c9f46ac --- /dev/null +++ b/engineering-team/incident-commander/expected_outputs/pir_markdown_output.md @@ -0,0 +1,88 @@ +# Post-Incident Review: Payment API Database Connection Pool Exhaustion + +## Executive Summary +On March 15, 2024, we experienced a sev2 incident affecting ['payment-api', 'checkout-service', 'subscription-billing']. The incident lasted 1h 5m and had the following impact: 80% of users unable to complete payments or checkout. Approximately 2,400 failed payment attempts during the incident. Users experienced immediate 500 errors when attempting to pay. The incident has been resolved and we have identified specific actions to prevent recurrence. + +## Incident Overview +- **Incident ID:** INC-2024-0315-001 +- **Date & Time:** 2024-03-15 14:30:00 UTC +- **Duration:** 1h 5m +- **Severity:** SEV2 +- **Status:** Resolved +- **Incident Commander:** Mike Rodriguez +- **Responders:** Sarah Chen - On-call Engineer, Primary Responder, Tom Wilson - Database Team Lead, Lisa Park - Database Engineer, Mike Rodriguez - Incident Commander, David Kumar - DevOps Engineer + +### Customer Impact +80% of users unable to complete payments or checkout. Approximately 2,400 failed payment attempts during the incident. Users experienced immediate 500 errors when attempting to pay. 
+ +### Business Impact +Estimated revenue loss of $45,000 during outage period. No SLA breaches as resolution was within 2-hour window. 12 customer escalations through support channels. + +## Timeline +No detailed timeline available. + +## Root Cause Analysis +### Analysis Method: 5 Whys Analysis + +#### Why Analysis + +**Why 1:** Why did Database connection pool exhaustion caused widespread 500 errors in payment processing API, preventing users from completing purchases. Root cause was an inefficient database query introduced in deployment v2.3.1.? +**Answer:** New deployment introduced a regression + +**Why 2:** Why wasn't this detected earlier? +**Answer:** Code review process missed the issue + +**Why 3:** Why didn't existing safeguards prevent this? +**Answer:** Testing environment didn't match production + +**Why 4:** Why wasn't there a backup mechanism? +**Answer:** Further investigation needed + +**Why 5:** Why wasn't this scenario anticipated? +**Answer:** Further investigation needed + + +## What Went Well +- The incident was successfully resolved +- Incident command was established +- Multiple team members collaborated on resolution + +## What Didn't Go Well +- Analysis in progress + +## Lessons Learned +Lessons learned to be documented following detailed analysis. + +## Action Items +Action items to be defined. 
+ +## Follow-up and Prevention +### Prevention Measures + +Based on the root cause analysis, the following preventive measures have been identified: + +- Implement comprehensive testing for similar scenarios +- Improve monitoring and alerting coverage +- Enhance error handling and resilience patterns + +### Follow-up Schedule + +- 1 week: Review action item progress +- 1 month: Evaluate effectiveness of implemented changes +- 3 months: Conduct follow-up assessment and update preventive measures + +## Appendix +### Additional Information + +- Incident ID: INC-2024-0315-001 +- Severity Classification: sev2 +- Affected Services: payment-api, checkout-service, subscription-billing + +### References + +- Incident tracking ticket: [Link TBD] +- Monitoring dashboards: [Link TBD] +- Communication thread: [Link TBD] + +--- +*Generated on 2026-02-16 by PIR Generator* \ No newline at end of file diff --git a/engineering-team/incident-commander/expected_outputs/simple_incident_classification.txt b/engineering-team/incident-commander/expected_outputs/simple_incident_classification.txt new file mode 100644 index 0000000..75d747d --- /dev/null +++ b/engineering-team/incident-commander/expected_outputs/simple_incident_classification.txt @@ -0,0 +1,44 @@ +============================================================ +INCIDENT CLASSIFICATION REPORT +============================================================ + +CLASSIFICATION: + Severity: SEV2 + Confidence: 100.0% + Reasoning: Classified as SEV2 based on: keywords: slow; user impact: 25% + Timestamp: 2026-02-16T12:42:41.889774+00:00 + +RECOMMENDED RESPONSE: + Primary Team: UX Engineering + Supporting Teams: Product Engineering, Frontend Team + Response Time: 15 minutes + +INITIAL ACTIONS: + 1. Assign incident commander (Priority 1) + Timeout: 30 minutes + Assign IC and establish coordination channel + + 2. Create incident tracking (Priority 1) + Timeout: 5 minutes + Create incident ticket with details and timeline + + 3. 
Assess customer impact (Priority 2) + Timeout: 15 minutes + Determine scope and severity of user impact + + 4. Engage response team (Priority 2) + Timeout: 30 minutes + Page appropriate technical responders + + 5. Begin investigation (Priority 3) + Timeout: 15 minutes + Start technical analysis and debugging + +COMMUNICATION: + Subject: ⚠️ [SEV2] web-frontend - Users reporting slow page loads on the main websit... + Urgency: SEV2 + Recipients: on-call, engineering-leadership, product-team + Channels: pager, slack, email + Update Frequency: Every 30 minutes + +============================================================ \ No newline at end of file diff --git a/engineering-team/incident-commander/expected_outputs/timeline_reconstruction_text_output.txt b/engineering-team/incident-commander/expected_outputs/timeline_reconstruction_text_output.txt new file mode 100644 index 0000000..f772409 --- /dev/null +++ b/engineering-team/incident-commander/expected_outputs/timeline_reconstruction_text_output.txt @@ -0,0 +1,110 @@ +================================================================================ +INCIDENT TIMELINE RECONSTRUCTION +================================================================================ + +OVERVIEW: + Time Range: 2024-03-15T14:30:00+00:00 to 2024-03-15T15:40:00+00:00 + Total Duration: 70 minutes + Total Events: 21 + Phases Detected: 12 + +PHASES: + DETECTION: + Start: 2024-03-15T14:30:00+00:00 + Duration: 0.0 minutes + Events: 1 + Description: Initial detection of the incident through monitoring or observation + + ESCALATION: + Start: 2024-03-15T14:32:00+00:00 + Duration: 0.0 minutes + Events: 1 + Description: Escalation to additional resources or higher severity response + + TRIAGE: + Start: 2024-03-15T14:35:00+00:00 + Duration: 0.0 minutes + Events: 1 + Description: Assessment and initial investigation of the incident + + ESCALATION: + Start: 2024-03-15T14:38:00+00:00 + Duration: 9.0 minutes + Events: 5 + Description: Escalation to 
additional resources or higher severity response + + TRIAGE: + Start: 2024-03-15T14:50:00+00:00 + Duration: 0.0 minutes + Events: 1 + Description: Assessment and initial investigation of the incident + + ESCALATION: + Start: 2024-03-15T14:52:00+00:00 + Duration: 10.0 minutes + Events: 4 + Description: Escalation to additional resources or higher severity response + + TRIAGE: + Start: 2024-03-15T15:05:00+00:00 + Duration: 7.0 minutes + Events: 2 + Description: Assessment and initial investigation of the incident + + DETECTION: + Start: 2024-03-15T15:15:00+00:00 + Duration: 0.0 minutes + Events: 1 + Description: Initial detection of the incident through monitoring or observation + + RESOLUTION: + Start: 2024-03-15T15:18:00+00:00 + Duration: 0.0 minutes + Events: 1 + Description: Confirmation that the incident has been resolved + + DETECTION: + Start: 2024-03-15T15:25:00+00:00 + Duration: 0.0 minutes + Events: 1 + Description: Initial detection of the incident through monitoring or observation + + RESOLUTION: + Start: 2024-03-15T15:30:00+00:00 + Duration: 5.0 minutes + Events: 2 + Description: Confirmation that the incident has been resolved + + TRIAGE: + Start: 2024-03-15T15:40:00+00:00 + Duration: 0.0 minutes + Events: 1 + Description: Assessment and initial investigation of the incident + +KEY METRICS: + Time to Mitigation: 0 minutes + Time to Resolution: 48.0 minutes + Events per Hour: 18.0 + Unique Sources: 7 + +INCIDENT NARRATIVE: +Incident Timeline Summary: +The incident began at 2024-03-15 14:30:00 UTC and concluded at 2024-03-15 15:40:00 UTC, lasting approximately 70 minutes. + +The incident progressed through 12 distinct phases: detection, escalation, triage, escalation, triage, escalation, triage, detection, resolution, detection, resolution, triage. 
+ +Key milestones: +- Detection: 14:30 (0 min) +- Escalation: 14:32 (0 min) +- Triage: 14:35 (0 min) +- Escalation: 14:38 (9 min) +- Triage: 14:50 (0 min) +- Escalation: 14:52 (10 min) +- Triage: 15:05 (7 min) +- Detection: 15:15 (0 min) +- Resolution: 15:18 (0 min) +- Detection: 15:25 (0 min) +- Resolution: 15:30 (5 min) +- Triage: 15:40 (0 min) + +================================================================================ \ No newline at end of file diff --git a/engineering-team/incident-commander/references/communication_templates.md b/engineering-team/incident-commander/references/communication_templates.md new file mode 100644 index 0000000..c3b370f --- /dev/null +++ b/engineering-team/incident-commander/references/communication_templates.md @@ -0,0 +1,591 @@ +# Incident Communication Templates + +## Overview + +This document provides standardized communication templates for incident response. These templates ensure consistent, clear communication across different severity levels and stakeholder groups. + +## Template Usage Guidelines + +### General Principles +1. **Be Clear and Concise** - Use simple language, avoid jargon +2. **Be Factual** - Only state what is known, avoid speculation +3. **Be Timely** - Send updates at committed intervals +4. **Be Actionable** - Include next steps and expected timelines +5. 
**Be Accountable** - Include contact information for follow-up + +### Template Selection +- Choose templates based on incident severity and audience +- Customize templates with specific incident details +- Always include next update time and contact information +- Escalate template types as severity increases + +--- + +## SEV1 Templates + +### Initial Alert - Internal Teams + +**Subject:** 🚨 [SEV1] CRITICAL: {Service} Complete Outage - Immediate Response Required + +``` +CRITICAL INCIDENT ALERT - IMMEDIATE ATTENTION REQUIRED + +Incident Summary: +- Service: {Service Name} +- Status: Complete Outage +- Start Time: {Timestamp} +- Customer Impact: {Impact Description} +- Estimated Affected Users: {Number/Percentage} + +Immediate Actions Needed: +✓ Incident Commander: {Name} - ASSIGNED +✓ War Room: {Bridge/Chat Link} - JOIN NOW +✓ On-Call Response: {Team} - PAGED +⏳ Executive Notification: In progress +⏳ Status Page Update: Within 15 minutes + +Current Situation: +{Brief description of what we know} + +What We're Doing: +{Immediate response actions being taken} + +Next Update: {Timestamp - 15 minutes from now} + +Incident Commander: {Name} +Contact: {Phone/Slack} + +THIS IS A CUSTOMER-IMPACTING INCIDENT REQUIRING IMMEDIATE ATTENTION +``` + +### Executive Notification - SEV1 + +**Subject:** 🚨 URGENT: Customer-Impacting Outage - {Service} + +``` +EXECUTIVE ALERT: Critical customer-facing incident + +Service: {Service Name} +Impact: {Customer impact description} +Duration: {Current duration} (started {start time}) +Business Impact: {Revenue/SLA/compliance implications} + +Customer Impact Summary: +- Affected Users: {Number/percentage} +- Revenue Impact: {$ amount if known} +- SLA Status: {Breach status} +- Customer Escalations: {Number if any} + +Response Status: +- Incident Commander: {Name} ({contact}) +- Response Team Size: {Number of engineers} +- Root Cause: {If known, otherwise "Under investigation"} +- ETA to Resolution: {If known, otherwise "Investigating"} + 
+Executive Actions Required: +- [ ] Customer communication approval needed +- [ ] Legal/compliance notification: {If applicable} +- [ ] PR/Media response preparation: {If needed} +- [ ] Resource allocation decisions: {If escalation needed} + +War Room: {Link} +Next Update: {15 minutes from now} + +This incident meets SEV1 criteria and requires executive oversight. + +{Incident Commander contact information} +``` + +### Customer Communication - SEV1 + +**Subject:** Service Disruption - Immediate Action Being Taken + +``` +We are currently experiencing a service disruption affecting {service description}. + +What's Happening: +{Clear, customer-friendly description of the issue} + +Impact: +{What customers are experiencing - be specific} + +What We're Doing: +We detected this issue at {time} and immediately mobilized our engineering team. We are actively working to resolve this issue and will provide updates every 15 minutes. + +Current Actions: +• {Action 1 - customer-friendly description} +• {Action 2 - customer-friendly description} +• {Action 3 - customer-friendly description} + +Workaround: +{If available, provide clear steps} +{If not available: "We are working on alternative solutions and will share them as soon as available."} + +Next Update: {Timestamp} +Status Page: {Link} +Support: {Contact information if different from usual} + +We sincerely apologize for the inconvenience and are committed to resolving this as quickly as possible. + +{Company Name} Team +``` + +### Status Page Update - SEV1 + +**Status:** Major Outage + +``` +{Timestamp} - Investigating + +We are currently investigating reports of {service} being unavailable. Our team has been alerted and is actively investigating the cause. + +Affected Services: {List of affected services} +Impact: {Customer-facing impact description} + +We will provide an update within 15 minutes. +``` + +``` +{Timestamp} - Identified + +We have identified the cause of the {service} outage. 
Our engineering team is implementing a fix. + +Root Cause: {Brief, customer-friendly explanation} +Expected Resolution: {Timeline if known} + +Next update in 15 minutes. +``` + +``` +{Timestamp} - Monitoring + +The fix has been implemented and we are monitoring the service recovery. + +Current Status: {Recovery progress} +Next Steps: {What we're monitoring} + +We expect full service restoration within {timeframe}. +``` + +``` +{Timestamp} - Resolved + +{Service} is now fully operational. We have confirmed that all functionality is working as expected. + +Total Duration: {Duration} +Root Cause: {Brief summary} + +We apologize for the inconvenience. A full post-incident review will be conducted and shared within 24 hours. +``` + +--- + +## SEV2 Templates + +### Team Notification - SEV2 + +**Subject:** ⚠️ [SEV2] {Service} Performance Issues - Response Team Mobilizing + +``` +SEV2 INCIDENT: Performance degradation requiring active response + +Incident Details: +- Service: {Service Name} +- Issue: {Description of performance issue} +- Start Time: {Timestamp} +- Affected Users: {Percentage/description} +- Business Impact: {Impact on business operations} + +Current Status: +{What we know about the issue} + +Response Team: +- Incident Commander: {Name} ({contact}) +- Primary Responder: {Name} ({team}) +- Supporting Teams: {List of engaged teams} + +Immediate Actions: +✓ {Action 1 - completed} +⏳ {Action 2 - in progress} +⏳ {Action 3 - next step} + +Metrics: +- Error Rate: {Current vs normal} +- Response Time: {Current vs normal} +- Throughput: {Current vs normal} + +Communication Plan: +- Internal Updates: Every 30 minutes +- Stakeholder Notification: {If needed} +- Status Page Update: {Planned/not needed} + +Coordination Channel: {Slack channel} +Next Update: {30 minutes from now} + +Incident Commander: {Name} | {Contact} +``` + +### Stakeholder Update - SEV2 + +**Subject:** [SEV2] Service Performance Update - {Service} + +``` +Service Performance Incident Update + 
+Service: {Service Name} +Duration: {Current duration} +Impact: {Description of user impact} + +Current Status: +{Brief status of the incident and response efforts} + +What We Know: +• {Key finding 1} +• {Key finding 2} +• {Key finding 3} + +What We're Doing: +• {Response action 1} +• {Response action 2} +• {Monitoring/verification steps} + +Customer Impact: +{Realistic assessment of what users are experiencing} + +Workaround: +{If available, provide steps} + +Expected Resolution: +{Timeline if known, otherwise "Continuing investigation"} + +Next Update: {30 minutes} +Contact: {Incident Commander information} + +This incident is being actively managed and does not currently require escalation. +``` + +### Customer Communication - SEV2 (Optional) + +**Subject:** Temporary Service Performance Issues + +``` +We are currently experiencing performance issues with {service name} that may affect your experience. + +What You Might Notice: +{Specific symptoms users might experience} + +What We're Doing: +Our team identified this issue at {time} and is actively working on a resolution. We expect to have this resolved within {timeframe}. + +Workaround: +{If applicable, provide simple workaround steps} + +We will update our status page at {link} with progress information. + +Thank you for your patience as we work to resolve this issue quickly. + +{Company Name} Support Team +``` + +--- + +## SEV3 Templates + +### Team Assignment - SEV3 + +**Subject:** [SEV3] Issue Assignment - {Component} Issue + +``` +SEV3 Issue Assignment + +Service/Component: {Affected component} +Issue: {Description} +Reported: {Timestamp} +Reporter: {Person/system that reported} + +Issue Details: +{Detailed description of the problem} + +Impact Assessment: +- Affected Users: {Scope} +- Business Impact: {Assessment} +- Urgency: {Business hours response appropriate} + +Assignment: +- Primary: {Engineer name} +- Team: {Responsible team} +- Expected Response: {Within 2-4 hours} + +Investigation Plan: +1. 
{Investigation step 1} +2. {Investigation step 2} +3. {Communication checkpoint} + +Workaround: +{If known, otherwise "Investigating alternatives"} + +This issue will be tracked in {ticket system} as {ticket number}. + +Team Lead: {Name} | {Contact} +``` + +### Status Update - SEV3 + +**Subject:** [SEV3] Progress Update - {Component} + +``` +SEV3 Issue Progress Update + +Issue: {Brief description} +Assigned to: {Engineer/Team} +Investigation Status: {Current progress} + +Findings So Far: +{What has been discovered during investigation} + +Next Steps: +{Planned actions and timeline} + +Impact Update: +{Any changes to scope or urgency} + +Expected Resolution: +{Timeline if known} + +This issue continues to be tracked as SEV3 with no escalation required. + +Contact: {Assigned engineer} | {Team lead} +``` + +--- + +## SEV4 Templates + +### Issue Documentation - SEV4 + +**Subject:** [SEV4] Issue Documented - {Description} + +``` +SEV4 Issue Logged + +Description: {Clear description of the issue} +Reporter: {Name/system} +Date: {Date reported} + +Impact: +{Minimal impact description} + +Priority Assessment: +This issue has been classified as SEV4 and will be addressed in the normal development cycle. + +Assignment: +- Team: {Responsible team} +- Sprint: {Target sprint} +- Estimated Effort: {Story points/hours} + +This issue is tracked as {ticket number} in {system}. 
+ +Product Owner: {Name} +``` + +--- + +## Escalation Templates + +### Severity Escalation + +**Subject:** ESCALATION: {Original Severity} → {New Severity} - {Service} + +``` +SEVERITY ESCALATION NOTIFICATION + +Original Classification: {Original severity} +New Classification: {New severity} +Escalation Time: {Timestamp} +Escalated By: {Name and role} + +Escalation Reasons: +• {Reason 1 - scope expansion/duration/impact} +• {Reason 2} +• {Reason 3} + +Updated Impact: +{New assessment of customer/business impact} + +Updated Response Requirements: +{New response team, communication frequency, etc.} + +Previous Response Actions: +{Summary of actions taken under previous severity} + +New Incident Commander: {If changed} +Updated Communication Plan: {New frequency/recipients} + +All stakeholders should adjust response according to {new severity} protocols. + +Incident Commander: {Name} | {Contact} +``` + +### Management Escalation + +**Subject:** MANAGEMENT ESCALATION: Extended {Severity} Incident - {Service} + +``` +Management Escalation Required + +Incident: {Service} {brief description} +Original Severity: {Severity} +Duration: {Current duration} +Escalation Trigger: {Duration threshold/scope change/customer escalation} + +Current Status: +{Brief status of incident response} + +Challenges Encountered: +• {Challenge 1} +• {Challenge 2} +• {Resource/expertise needs} + +Business Impact: +{Updated assessment of business implications} + +Management Decision Required: +• {Decision 1 - resource allocation/external expertise/communication} +• {Decision 2} + +Recommended Actions: +{Incident Commander's recommendations} + +This escalation follows standard procedures for {trigger type}. 
+ +Incident Commander: {Name} +Contact: {Phone/Slack} +War Room: {Link} +``` + +--- + +## Resolution Templates + +### Resolution Confirmation - All Severities + +**Subject:** RESOLVED: [{Severity}] {Service} Incident - {Brief Description} + +``` +INCIDENT RESOLVED + +Service: {Service Name} +Issue: {Brief description} +Duration: {Total duration} +Resolution Time: {Timestamp} + +Resolution Summary: +{Brief description of how the issue was resolved} + +Root Cause: +{Brief explanation - detailed PIR to follow} + +Impact Summary: +- Users Affected: {Final count/percentage} +- Business Impact: {Final assessment} +- Services Affected: {List} + +Resolution Actions Taken: +• {Action 1} +• {Action 2} +• {Verification steps} + +Monitoring: +We will continue monitoring {service} for {duration} to ensure stability. + +Next Steps: +• Post-incident review scheduled for {date} +• Action items to be tracked in {system} +• Follow-up communication: {If needed} + +Thank you to everyone who participated in the incident response. + +Incident Commander: {Name} +``` + +### Customer Resolution Communication + +**Subject:** Service Restored - Thank You for Your Patience + +``` +Service Update: Issue Resolved + +We're pleased to report that the {service} issues have been fully resolved as of {timestamp}. + +What Was Fixed: +{Customer-friendly explanation of the resolution} + +Duration: +The issue lasted {duration} from {start time} to {end time}. + +What We Learned: +{Brief, high-level takeaway} + +Our Commitment: +We are conducting a thorough review of this incident and will implement improvements to prevent similar issues in the future. A summary of our findings and improvements will be shared {timeframe}. + +We sincerely apologize for any inconvenience this may have caused and appreciate your patience while we worked to resolve the issue. + +If you continue to experience any problems, please contact our support team at {contact information}. 
+ +Thank you, +{Company Name} Team +``` + +--- + +## Template Customization Guidelines + +### Placeholders to Always Replace +- `{Service}` / `{Service Name}` - Specific service or component +- `{Timestamp}` - Specific date/time in consistent format +- `{Name}` / `{Contact}` - Actual names and contact information +- `{Duration}` - Actual time durations +- `{Link}` - Real URLs to war rooms, status pages, etc. + +### Language Guidelines +- Use active voice ("We are investigating" not "The issue is being investigated") +- Be specific about timelines ("within 30 minutes" not "soon") +- Avoid technical jargon in customer communications +- Include empathy in customer-facing messages +- Use consistent terminology throughout incident lifecycle + +### Timing Guidelines +| Severity | Initial Notification | Update Frequency | Resolution Notification | +|----------|---------------------|------------------|------------------------| +| SEV1 | Immediate (< 5 min) | Every 15 minutes | Immediate | +| SEV2 | Within 15 minutes | Every 30 minutes | Within 15 minutes | +| SEV3 | Within 2 hours | At milestones | Within 1 hour | +| SEV4 | Within 1 business day | Weekly | When resolved | + +### Audience-Specific Considerations + +#### Engineering Teams +- Include technical details +- Provide specific metrics and logs +- Include coordination channels +- List specific actions and owners + +#### Executive/Business +- Focus on business impact +- Include customer and revenue implications +- Provide clear timeline and resource needs +- Highlight any external factors (PR, legal, compliance) + +#### Customers +- Use plain language +- Focus on customer impact and workarounds +- Provide realistic timelines +- Include support contact information +- Show empathy and accountability + +--- + +**Last Updated:** February 2026 +**Next Review:** May 2026 +**Owner:** Incident Management Team \ No newline at end of file diff --git a/engineering-team/incident-commander/references/incident_severity_matrix.md 
b/engineering-team/incident-commander/references/incident_severity_matrix.md new file mode 100644 index 0000000..7ab1265 --- /dev/null +++ b/engineering-team/incident-commander/references/incident_severity_matrix.md @@ -0,0 +1,292 @@ +# Incident Severity Classification Matrix + +## Overview + +This document defines the severity classification system used for incident response. The classification determines response requirements, escalation paths, and communication frequency. + +## Severity Levels + +### SEV1 - Critical Outage + +**Definition:** Complete service failure affecting all users or critical business functions + +#### Impact Criteria +- Customer-facing services completely unavailable +- Data loss or corruption affecting users +- Security breaches with customer data exposure +- Revenue-generating systems down +- SLA violations with financial penalties +- > 75% of users affected + +#### Response Requirements +| Metric | Requirement | +|--------|-------------| +| **Response Time** | Immediate (0-5 minutes) | +| **Incident Commander** | Assigned within 5 minutes | +| **War Room** | Established within 10 minutes | +| **Executive Notification** | Within 15 minutes | +| **Public Status Page** | Updated within 15 minutes | +| **Customer Communication** | Within 30 minutes | + +#### Escalation Path +1. **Immediate**: On-call Engineer → Incident Commander +2. **15 minutes**: VP Engineering + Customer Success VP +3. **30 minutes**: CTO +4. 
**60 minutes**: CEO + Full Executive Team + +#### Communication Requirements +- **Frequency**: Every 15 minutes until resolution +- **Channels**: PagerDuty, Phone, Slack, Email, Status Page +- **Recipients**: All engineering, executives, customer success +- **Template**: SEV1 Executive Alert Template + +--- + +### SEV2 - Major Impact + +**Definition:** Significant degradation affecting subset of users or non-critical functions + +#### Impact Criteria +- Partial service degradation (25-75% of users affected) +- Performance issues causing user frustration +- Non-critical features unavailable +- Internal tools impacting productivity +- Data inconsistencies not affecting user experience +- API errors affecting integrations + +#### Response Requirements +| Metric | Requirement | +|--------|-------------| +| **Response Time** | 15 minutes | +| **Incident Commander** | Assigned within 30 minutes | +| **Status Page Update** | Within 30 minutes | +| **Stakeholder Notification** | Within 1 hour | +| **Team Assembly** | Within 30 minutes | + +#### Escalation Path +1. **Immediate**: On-call Engineer → Team Lead +2. **30 minutes**: Engineering Manager +3. **2 hours**: VP Engineering +4. 
**4 hours**: CTO (if unresolved) + +#### Communication Requirements +- **Frequency**: Every 30 minutes during active response +- **Channels**: PagerDuty, Slack, Email +- **Recipients**: Engineering team, product team, relevant stakeholders +- **Template**: SEV2 Major Impact Template + +--- + +### SEV3 - Minor Impact + +**Definition:** Limited impact with workarounds available + +#### Impact Criteria +- Single feature or component affected +- < 25% of users impacted +- Workarounds available +- Performance degradation not significantly impacting UX +- Non-urgent monitoring alerts +- Development/test environment issues + +#### Response Requirements +| Metric | Requirement | +|--------|-------------| +| **Response Time** | 2 hours (business hours) | +| **After Hours Response** | Next business day | +| **Team Assignment** | Within 4 hours | +| **Status Page Update** | Optional | +| **Internal Notification** | Within 2 hours | + +#### Escalation Path +1. **Immediate**: Assigned Engineer +2. **4 hours**: Team Lead +3. 
**1 business day**: Engineering Manager (if needed) + +#### Communication Requirements +- **Frequency**: At key milestones only +- **Channels**: Slack, Email +- **Recipients**: Assigned team, team lead +- **Template**: SEV3 Minor Impact Template + +--- + +### SEV4 - Low Impact + +**Definition:** Minimal impact, cosmetic issues, or planned maintenance + +#### Impact Criteria +- Cosmetic bugs +- Documentation issues +- Logging or monitoring gaps +- Performance issues with no user impact +- Development/test environment issues +- Feature requests or enhancements + +#### Response Requirements +| Metric | Requirement | +|--------|-------------| +| **Response Time** | 1-2 business days | +| **Assignment** | Next sprint planning | +| **Tracking** | Standard ticket system | +| **Escalation** | None required | + +#### Communication Requirements +- **Frequency**: Standard development cycle updates +- **Channels**: Ticket system +- **Recipients**: Product owner, assigned developer +- **Template**: Standard issue template + +## Classification Guidelines + +### User Impact Assessment + +| Impact Scope | Description | Typical Severity | +|--------------|-------------|------------------| +| **All Users** | 100% of users affected | SEV1 | +| **Major Subset** | 50-75% of users affected | SEV1/SEV2 | +| **Significant Subset** | 25-50% of users affected | SEV2 | +| **Limited Users** | 5-25% of users affected | SEV2/SEV3 | +| **Few Users** | < 5% of users affected | SEV3/SEV4 | +| **No User Impact** | Internal only | SEV4 | + +### Business Impact Assessment + +| Business Impact | Description | Severity Boost | +|-----------------|-------------|----------------| +| **Revenue Loss** | Direct revenue impact | +1 severity level | +| **SLA Breach** | Contract violations | +1 severity level | +| **Regulatory** | Compliance implications | +1 severity level | +| **Brand Damage** | Public-facing issues | +1 severity level | +| **Security** | Data or system security | +2 severity levels | + +### 
Duration Considerations + +| Duration | Impact on Classification | +|----------|--------------------------| +| **< 15 minutes** | May reduce severity by 1 level | +| **15-60 minutes** | Standard classification | +| **1-4 hours** | May increase severity by 1 level | +| **> 4 hours** | Significant severity increase | + +## Decision Tree + +``` +1. Is this a security incident with data exposure? + → YES: SEV1 (regardless of user count) + → NO: Continue to step 2 + +2. Are revenue-generating services completely down? + → YES: SEV1 + → NO: Continue to step 3 + +3. What percentage of users are affected? + → > 75%: SEV1 + → 25-75%: SEV2 + → 5-25%: SEV3 + → < 5%: SEV4 + +4. Apply business impact modifiers +5. Consider duration factors +6. When in doubt, err on higher severity +``` + +## Examples + +### SEV1 Examples +- Payment processing system completely down +- All user authentication failing +- Database corruption causing data loss +- Security breach with customer data exposed +- Website returning 500 errors for all users + +### SEV2 Examples +- Payment processing slow (30-second delays) +- Search functionality returning incomplete results +- API rate limits causing partner integration issues +- Dashboard displaying stale data (> 1 hour old) +- Mobile app crashing for 40% of users + +### SEV3 Examples +- Single feature in admin panel not working +- Email notifications delayed by 1 hour +- Non-critical API endpoint returning errors +- Cosmetic UI bug in settings page +- Development environment deployment failing + +### SEV4 Examples +- Typo in help documentation +- Log format change needed for analysis +- Non-critical performance optimization +- Internal tool enhancement request +- Test data cleanup needed + +## Escalation Triggers + +### Automatic Escalation +- SEV1 incidents automatically escalate every 30 minutes if unresolved +- SEV2 incidents escalate after 2 hours without significant progress +- Any incident with expanding scope increases severity +- Customer 
escalation to support triggers severity review + +### Manual Escalation +- Incident Commander can escalate at any time +- Technical leads can request escalation +- Business stakeholders can request severity review +- External factors (media attention, regulatory) trigger escalation + +## Communication Templates + +### SEV1 Executive Alert +``` +Subject: 🚨 CRITICAL INCIDENT - [Service] Complete Outage + +URGENT: Customer-facing service outage requiring immediate attention + +Service: [Service Name] +Start Time: [Timestamp] +Impact: [Description of customer impact] +Estimated Affected Users: [Number/Percentage] +Business Impact: [Revenue/SLA/Brand implications] + +Incident Commander: [Name] ([Contact]) +Response Team: [Team members engaged] + +Current Status: [Brief status update] +Next Update: [Timestamp - 15 minutes from now] +War Room: [Bridge/Chat link] + +This is a customer-impacting incident requiring executive awareness. +``` + +### SEV2 Major Impact +``` +Subject: ⚠️ [SEV2] [Service] - Major Performance Impact + +Major service degradation affecting user experience + +Service: [Service Name] +Start Time: [Timestamp] +Impact: [Description of user impact] +Scope: [Affected functionality/users] + +Response Team: [Team Lead] + [Team members] +Status: [Current mitigation efforts] +Workaround: [If available] + +Next Update: 30 minutes +Status Page: [Link if updated] +``` + +## Review and Updates + +This severity matrix should be reviewed quarterly and updated based on: +- Incident response learnings +- Business priority changes +- Service architecture evolution +- Regulatory requirement changes +- Customer feedback and SLA updates + +**Last Updated:** February 2026 +**Next Review:** May 2026 +**Owner:** Engineering Leadership \ No newline at end of file diff --git a/engineering-team/incident-commander/references/rca_frameworks_guide.md b/engineering-team/incident-commander/references/rca_frameworks_guide.md new file mode 100644 index 0000000..4c62fc8 --- /dev/null 
+++ b/engineering-team/incident-commander/references/rca_frameworks_guide.md @@ -0,0 +1,562 @@ +# Root Cause Analysis (RCA) Frameworks Guide + +## Overview + +This guide provides detailed instructions for applying various Root Cause Analysis frameworks during Post-Incident Reviews. Each framework offers a different perspective and approach to identifying underlying causes of incidents. + +## Framework Selection Guidelines + +| Incident Type | Recommended Framework | Why | +|---------------|----------------------|-----| +| **Process Failure** | 5 Whys | Simple, direct cause-effect chain | +| **Complex System Failure** | Fishbone + Timeline | Multiple contributing factors | +| **Human Error** | Fishbone | Systematic analysis of contributing factors | +| **Extended Incidents** | Timeline Analysis | Understanding decision points | +| **High-Risk Incidents** | Bow Tie | Comprehensive barrier analysis | +| **Recurring Issues** | 5 Whys + Fishbone | Deep dive into systemic issues | + +--- + +## 5 Whys Analysis Framework + +### Purpose +Iteratively drill down through cause-effect relationships to identify root causes. + +### When to Use +- Simple, linear cause-effect chains +- Time-pressured analysis +- Process-related failures +- Individual component failures + +### Process Steps + +#### Step 1: Problem Statement +Write a clear, specific problem statement. + +**Good Example:** +> "The payment API returned 500 errors for 2 hours on March 15, affecting 80% of checkout attempts." + +**Poor Example:** +> "The system was broken." + +#### Step 2: First Why +Ask why the problem occurred. Focus on immediate, observable causes. + +**Example:** +- **Why 1:** Why did the payment API return 500 errors? +- **Answer:** The database connection pool was exhausted. + +#### Step 3: Subsequent Whys +For each answer, ask "why" again. Continue until you reach a root cause. + +**Example Chain:** +- **Why 2:** Why was the database connection pool exhausted? 
+- **Answer:** The application was creating more connections than usual. + +- **Why 3:** Why was the application creating more connections? +- **Answer:** A new feature wasn't properly closing connections. + +- **Why 4:** Why wasn't the feature properly closing connections? +- **Answer:** Code review missed the connection leak pattern. + +- **Why 5:** Why did code review miss this pattern? +- **Answer:** We don't have automated checks for connection pooling best practices. + +#### Step 4: Validation +Verify that addressing the root cause would prevent the original problem. + +### Best Practices + +1. **Ask at least 3 "whys"** - Surface causes are rarely root causes +2. **Focus on process failures, not people** - Avoid blame, focus on system improvements +3. **Use evidence** - Support each answer with data or observations +4. **Consider multiple paths** - Some problems have multiple root causes +5. **Test the logic** - Work backwards from root cause to problem + +### Common Pitfalls + +- **Stopping too early** - First few whys often reveal symptoms, not causes +- **Single-cause assumption** - Complex systems often have multiple contributing factors +- **Blame focus** - Focusing on individual mistakes rather than system failures +- **Vague answers** - Use specific, actionable answers + +### 5 Whys Template + +```markdown +## 5 Whys Analysis + +**Problem Statement:** [Clear description of the incident] + +**Why 1:** [First why question] +**Answer:** [Specific, evidence-based answer] +**Evidence:** [Supporting data, logs, observations] + +**Why 2:** [Second why question] +**Answer:** [Specific answer based on Why 1] +**Evidence:** [Supporting evidence] + +[Continue for 3-7 iterations] + +**Root Cause(s) Identified:** +1. [Primary root cause] +2. 
[Secondary root cause if applicable] + +**Validation:** [Confirm that addressing root causes would prevent recurrence] +``` + +--- + +## Fishbone (Ishikawa) Diagram Framework + +### Purpose +Systematically analyze potential causes across multiple categories to identify contributing factors. + +### When to Use +- Complex incidents with multiple potential causes +- When human factors are suspected +- Systemic or organizational issues +- When 5 Whys doesn't reveal clear root causes + +### Categories + +#### People (Human Factors) +- **Training and Skills** + - Insufficient training on new systems + - Lack of domain expertise + - Skill gaps in team + - Knowledge not shared across team + +- **Communication** + - Poor communication between teams + - Unclear responsibilities + - Information not reaching right people + - Language/cultural barriers + +- **Decision Making** + - Decisions made under pressure + - Insufficient information for decisions + - Risk assessment inadequate + - Approval processes bypassed + +#### Process (Procedures and Workflows) +- **Documentation** + - Outdated procedures + - Missing runbooks + - Unclear instructions + - Process not documented + +- **Change Management** + - Inadequate change review + - Rushed deployments + - Insufficient testing + - Rollback procedures unclear + +- **Review and Approval** + - Code review gaps + - Architecture review skipped + - Security review insufficient + - Performance review missing + +#### Technology (Systems and Tools) +- **Architecture** + - Single points of failure + - Insufficient redundancy + - Scalability limitations + - Tight coupling between systems + +- **Monitoring and Alerting** + - Missing monitoring + - Alert fatigue + - Inadequate thresholds + - Poor alert routing + +- **Tools and Automation** + - Manual processes prone to error + - Tool limitations + - Automation gaps + - Integration issues + +#### Environment (External Factors) +- **Infrastructure** + - Hardware failures + - Network issues + - 
Capacity limitations + - Geographic dependencies + +- **Dependencies** + - Third-party service failures + - External API changes + - Vendor issues + - Supply chain problems + +- **External Pressure** + - Time pressure from business + - Resource constraints + - Regulatory changes + - Market conditions + +### Process Steps + +#### Step 1: Define the Problem +Place the incident at the "head" of the fishbone diagram. + +#### Step 2: Brainstorm Causes +For each category, brainstorm potential contributing factors. + +#### Step 3: Drill Down +For each factor, ask what caused that factor (sub-causes). + +#### Step 4: Identify Primary Causes +Mark the most likely contributing factors based on evidence. + +#### Step 5: Validate +Gather evidence to support or refute each suspected cause. + +### Fishbone Template + +```markdown +## Fishbone Analysis + +**Problem:** [Incident description] + +### People +**Training/Skills:** +- [Factor 1]: [Evidence/likelihood] +- [Factor 2]: [Evidence/likelihood] + +**Communication:** +- [Factor 1]: [Evidence/likelihood] + +**Decision Making:** +- [Factor 1]: [Evidence/likelihood] + +### Process +**Documentation:** +- [Factor 1]: [Evidence/likelihood] + +**Change Management:** +- [Factor 1]: [Evidence/likelihood] + +**Review/Approval:** +- [Factor 1]: [Evidence/likelihood] + +### Technology +**Architecture:** +- [Factor 1]: [Evidence/likelihood] + +**Monitoring:** +- [Factor 1]: [Evidence/likelihood] + +**Tools:** +- [Factor 1]: [Evidence/likelihood] + +### Environment +**Infrastructure:** +- [Factor 1]: [Evidence/likelihood] + +**Dependencies:** +- [Factor 1]: [Evidence/likelihood] + +**External Factors:** +- [Factor 1]: [Evidence/likelihood] + +### Primary Contributing Factors +1. [Factor with highest evidence/impact] +2. [Second most significant factor] +3. 
[Third most significant factor] + +### Root Cause Hypothesis +[Synthesized explanation of how factors combined to cause incident] +``` + +--- + +## Timeline Analysis Framework + +### Purpose +Analyze the chronological sequence of events to identify decision points, missed opportunities, and process gaps. + +### When to Use +- Extended incidents (> 1 hour) +- Complex multi-phase incidents +- When response effectiveness is questioned +- Communication or coordination failures + +### Analysis Dimensions + +#### Detection Analysis +- **Time to Detection:** How long from onset to first alert? +- **Detection Method:** How was the incident first identified? +- **Alert Effectiveness:** Were the right people notified quickly? +- **False Negatives:** What signals were missed? + +#### Response Analysis +- **Time to Response:** How long from detection to first response action? +- **Escalation Timing:** Were escalations timely and appropriate? +- **Resource Mobilization:** How quickly were the right people engaged? +- **Decision Points:** What key decisions were made and when? + +#### Communication Analysis +- **Internal Communication:** How effective was team coordination? +- **External Communication:** Were stakeholders informed appropriately? +- **Communication Gaps:** Where did information flow break down? +- **Update Frequency:** Were updates provided at appropriate intervals? + +#### Resolution Analysis +- **Mitigation Strategy:** Was the chosen approach optimal? +- **Alternative Paths:** What other options were considered? +- **Resource Allocation:** Were resources used effectively? +- **Verification:** How was resolution confirmed? + +### Process Steps + +#### Step 1: Event Reconstruction +Create comprehensive timeline with all available events. + +#### Step 2: Phase Identification +Identify distinct phases (detection, triage, escalation, mitigation, resolution). + +#### Step 3: Gap Analysis +Identify time gaps and analyze their causes. 
+ +#### Step 4: Decision Point Analysis +Examine key decision points and alternative paths. + +#### Step 5: Effectiveness Assessment +Evaluate the overall effectiveness of the response. + +### Timeline Template + +```markdown +## Timeline Analysis + +### Incident Phases +1. **Detection** ([start] - [end], [duration]) +2. **Triage** ([start] - [end], [duration]) +3. **Escalation** ([start] - [end], [duration]) +4. **Mitigation** ([start] - [end], [duration]) +5. **Resolution** ([start] - [end], [duration]) + +### Key Decision Points +**[Timestamp]:** [Decision made] +- **Context:** [Situation at time of decision] +- **Alternatives:** [Other options considered] +- **Outcome:** [Result of decision] +- **Assessment:** [Was this optimal?] + +### Communication Timeline +**[Timestamp]:** [Communication event] +- **Channel:** [Slack/Email/Phone/etc.] +- **Audience:** [Who was informed] +- **Content:** [What was communicated] +- **Effectiveness:** [Assessment] + +### Gaps and Delays +**[Time Period]:** [Description of gap] +- **Duration:** [Length of gap] +- **Cause:** [Why did gap occur] +- **Impact:** [Effect on incident response] + +### Response Effectiveness +**Strengths:** +- [What went well] +- [Effective decisions/actions] + +**Weaknesses:** +- [What could be improved] +- [Missed opportunities] + +### Root Causes from Timeline +1. [Process-based root cause] +2. [Communication-based root cause] +3. [Decision-making root cause] +``` + +--- + +## Bow Tie Analysis Framework + +### Purpose +Analyze both preventive measures (left side) and protective measures (right side) around an incident. + +### When to Use +- High-severity incidents (SEV1) +- Security incidents +- Safety-critical systems +- When comprehensive barrier analysis is needed + +### Components + +#### Hazards +What conditions create the potential for incidents? 
+ +**Examples:** +- High traffic loads +- Software deployments +- Human interactions with critical systems +- Third-party dependencies + +#### Top Event +What actually went wrong? This is the center of the bow tie. + +**Examples:** +- "Database became unresponsive" +- "Payment processing failed" +- "User authentication service crashed" + +#### Threats (Left Side) +What specific causes could lead to the top event? + +**Examples:** +- Code defects in new deployment +- Database connection pool exhaustion +- Network connectivity issues +- DDoS attack + +#### Consequences (Right Side) +What are the potential impacts of the top event? + +**Examples:** +- Revenue loss +- Customer churn +- Regulatory violations +- Brand damage +- Data loss + +#### Barriers +What controls exist (or could exist) to prevent threats or mitigate consequences? + +**Preventive Barriers (Left Side):** +- Code reviews +- Automated testing +- Load testing +- Input validation +- Rate limiting + +**Protective Barriers (Right Side):** +- Circuit breakers +- Failover systems +- Backup procedures +- Customer communication +- Rollback capabilities + +### Process Steps + +#### Step 1: Define the Top Event +Clearly state what went wrong. + +#### Step 2: Identify Threats +Brainstorm all possible causes that could lead to the top event. + +#### Step 3: Identify Consequences +List all potential impacts of the top event. + +#### Step 4: Map Existing Barriers +Identify current controls for each threat and consequence. + +#### Step 5: Assess Barrier Effectiveness +Evaluate how well each barrier worked (or failed). + +#### Step 6: Recommend Additional Barriers +Identify new controls needed to prevent recurrence. + +### Bow Tie Template + +```markdown +## Bow Tie Analysis + +**Top Event:** [What went wrong] + +### Threats (Potential Causes) +1. **[Threat 1]** + - Likelihood: [High/Medium/Low] + - Current Barriers: [Preventive controls] + - Barrier Effectiveness: [Assessment] + +2. 
**[Threat 2]** + - Likelihood: [High/Medium/Low] + - Current Barriers: [Preventive controls] + - Barrier Effectiveness: [Assessment] + +### Consequences (Potential Impacts) +1. **[Consequence 1]** + - Severity: [High/Medium/Low] + - Current Barriers: [Protective controls] + - Barrier Effectiveness: [Assessment] + +2. **[Consequence 2]** + - Severity: [High/Medium/Low] + - Current Barriers: [Protective controls] + - Barrier Effectiveness: [Assessment] + +### Barrier Analysis +**Effective Barriers:** +- [Barrier that worked well] +- [Why it was effective] + +**Failed Barriers:** +- [Barrier that failed] +- [Why it failed] +- [How to improve] + +**Missing Barriers:** +- [Needed preventive control] +- [Needed protective control] + +### Recommendations +**Preventive Measures:** +1. [New barrier to prevent threat] +2. [Improvement to existing barrier] + +**Protective Measures:** +1. [New barrier to mitigate consequence] +2. [Improvement to existing barrier] +``` + +--- + +## Framework Comparison + +| Framework | Time Required | Complexity | Best For | Output | +|-----------|---------------|------------|----------|---------| +| **5 Whys** | 30-60 minutes | Low | Simple, linear causes | Clear cause chain | +| **Fishbone** | 1-2 hours | Medium | Complex, multi-factor | Comprehensive factor map | +| **Timeline** | 2-3 hours | Medium | Extended incidents | Process improvements | +| **Bow Tie** | 2-4 hours | High | High-risk incidents | Barrier strategy | + +## Combining Frameworks + +### 5 Whys + Fishbone +Use 5 Whys for initial analysis, then Fishbone to explore contributing factors. + +### Timeline + 5 Whys +Use Timeline to identify key decision points, then 5 Whys on critical failures. + +### Fishbone + Bow Tie +Use Fishbone to identify causes, then Bow Tie to develop comprehensive prevention strategy. 
+ +## Quality Checklist + +- [ ] Root causes address systemic issues, not symptoms +- [ ] Analysis is backed by evidence, not assumptions +- [ ] Multiple perspectives considered (technical, process, human) +- [ ] Recommendations are specific and actionable +- [ ] Analysis focuses on prevention, not blame +- [ ] Findings are validated against incident timeline +- [ ] Contributing factors are prioritized by impact +- [ ] Root causes link clearly to preventive actions + +## Common Anti-Patterns + +- **Human Error as Root Cause** - Dig deeper into why human error occurred +- **Single Root Cause** - Complex systems usually have multiple contributing factors +- **Technology-Only Focus** - Consider process and organizational factors +- **Blame Assignment** - Focus on system improvements, not individual fault +- **Generic Recommendations** - Provide specific, measurable actions +- **Surface-Level Analysis** - Ensure you've reached true root causes + +--- + +**Last Updated:** February 2026 +**Next Review:** August 2026 +**Owner:** SRE Team + Engineering Leadership \ No newline at end of file diff --git a/engineering-team/incident-commander/scripts/incident_classifier.py b/engineering-team/incident-commander/scripts/incident_classifier.py new file mode 100644 index 0000000..8814e99 --- /dev/null +++ b/engineering-team/incident-commander/scripts/incident_classifier.py @@ -0,0 +1,914 @@ +#!/usr/bin/env python3 +""" +Incident Classifier + +Analyzes incident descriptions and outputs severity levels, recommended response teams, +initial actions, and communication templates. + +This tool uses pattern matching and keyword analysis to classify incidents according to +SEV1-4 criteria and provide structured response guidance. 
import argparse
import json
import sys
import re
from datetime import datetime, timezone
from typing import Dict, List, Tuple, Optional, Any


class IncidentClassifier:
    """
    Classifies incidents based on description, impact metrics, and business context.
    Provides severity assessment, team recommendations, and response templates.
    """

    def __init__(self):
        """Initialize the classifier with rules and templates."""
        self.severity_rules = self._load_severity_rules()
        self.team_mappings = self._load_team_mappings()
        self.communication_templates = self._load_communication_templates()
        self.action_templates = self._load_action_templates()

    def _load_severity_rules(self) -> Dict[str, Dict]:
        """Load severity classification rules and keywords.

        Each severity maps to keyword/indicator lists used for scoring plus
        response-time metadata (seconds).
        """
        return {
            "sev1": {
                "keywords": [
                    "down", "outage", "offline", "unavailable", "crashed", "failed",
                    "critical", "emergency", "dead", "broken", "timeout", "500 error",
                    "data loss", "corrupted", "breach", "security incident",
                    "revenue impact", "customer facing", "all users", "complete failure"
                ],
                "impact_indicators": [
                    "100%", "all users", "entire service", "complete",
                    "revenue loss", "sla violation", "customer churn",
                    "security breach", "data corruption", "regulatory"
                ],
                "duration_threshold": 0,  # Immediate classification
                "response_time": 300,  # 5 minutes
                "description": "Complete service failure affecting all users or critical business functions"
            },
            "sev2": {
                "keywords": [
                    "degraded", "slow", "performance", "errors", "partial",
                    "intermittent", "high latency", "timeouts", "some users",
                    "feature broken", "api errors", "database slow"
                ],
                "impact_indicators": [
                    "50%", "25-75%", "many users", "significant",
                    "performance degradation", "feature unavailable",
                    "support tickets", "user complaints"
                ],
                "duration_threshold": 300,  # 5 minutes
                "response_time": 900,  # 15 minutes
                "description": "Significant degradation affecting subset of users or non-critical functions"
            },
            "sev3": {
                "keywords": [
                    "minor", "cosmetic", "single feature", "workaround available",
                    "edge case", "rare issue", "non-critical", "internal tool",
                    "logging issue", "monitoring gap"
                ],
                "impact_indicators": [
                    "<25%", "few users", "limited impact",
                    "workaround exists", "internal only",
                    "development environment"
                ],
                "duration_threshold": 3600,  # 1 hour
                "response_time": 7200,  # 2 hours
                "description": "Limited impact with workarounds available"
            },
            "sev4": {
                "keywords": [
                    "cosmetic", "documentation", "typo", "minor bug",
                    "enhancement", "nice to have", "low priority",
                    "test environment", "dev tools"
                ],
                "impact_indicators": [
                    "no impact", "cosmetic only", "documentation",
                    "development", "testing", "non-production"
                ],
                "duration_threshold": 86400,  # 24 hours
                "response_time": 172800,  # 2 days
                "description": "Minimal impact, cosmetic issues, or planned maintenance"
            }
        }

    def _load_team_mappings(self) -> Dict[str, List[str]]:
        """Load team assignment rules based on service/component keywords."""
        return {
            "database": ["Database Team", "SRE", "Backend Engineering"],
            "frontend": ["Frontend Team", "UX Engineering", "Product Engineering"],
            "api": ["API Team", "Backend Engineering", "Platform Team"],
            "infrastructure": ["SRE", "DevOps", "Platform Team"],
            "security": ["Security Team", "SRE", "Compliance Team"],
            "network": ["Network Engineering", "SRE", "Infrastructure Team"],
            "authentication": ["Identity Team", "Security Team", "Backend Engineering"],
            "payment": ["Payments Team", "Finance Engineering", "Compliance Team"],
            "mobile": ["Mobile Team", "API Team", "QA Engineering"],
            "monitoring": ["SRE", "Platform Team", "DevOps"],
            "deployment": ["DevOps", "Release Engineering", "SRE"],
            "data": ["Data Engineering", "Analytics Team", "Backend Engineering"]
        }

    def _load_communication_templates(self) -> Dict[str, Dict]:
        """Load communication templates for each severity level.

        Template bodies contain {placeholder} fields that are filled in by the
        responder/caller, not by this class (only the subject is formatted here).
        """
        return {
            "sev1": {
                "subject": "🚨 [SEV1] {service} - {brief_description}",
                "body": """CRITICAL INCIDENT ALERT

Incident Details:
- Start Time: {timestamp}
- Severity: SEV1 - Critical Outage
- Service: {service}
- Impact: {impact_description}
- Current Status: Investigating

Customer Impact:
{customer_impact}

Response Team:
- Incident Commander: TBD (assigning now)
- Primary Responder: {primary_responder}
- SMEs Required: {subject_matter_experts}

Immediate Actions Taken:
{initial_actions}

War Room: {war_room_link}
Status Page: Will be updated within 15 minutes
Next Update: {next_update_time}

This is a customer-impacting incident requiring immediate attention.

{incident_commander_contact}"""
            },
            "sev2": {
                "subject": "⚠️ [SEV2] {service} - {brief_description}",
                "body": """MAJOR INCIDENT NOTIFICATION

Incident Details:
- Start Time: {timestamp}
- Severity: SEV2 - Major Impact
- Service: {service}
- Impact: {impact_description}
- Current Status: Investigating

User Impact:
{customer_impact}

Response Team:
- Primary Responder: {primary_responder}
- Supporting Team: {supporting_teams}
- Incident Commander: {incident_commander}

Initial Assessment:
{initial_assessment}

Next Steps:
{next_steps}

Updates will be provided every 30 minutes.

Status page: {status_page_link}

{contact_information}"""
            },
            "sev3": {
                "subject": "ℹ️ [SEV3] {service} - {brief_description}",
                "body": """MINOR INCIDENT NOTIFICATION

Incident Details:
- Start Time: {timestamp}
- Severity: SEV3 - Minor Impact
- Service: {service}
- Impact: {impact_description}
- Status: {current_status}

Details:
{incident_details}

Assigned Team: {assigned_team}
Estimated Resolution: {eta}

Workaround: {workaround}

This incident has limited customer impact and is being addressed during normal business hours.

{team_contact}"""
            },
            "sev4": {
                "subject": "[SEV4] {service} - {brief_description}",
                "body": """LOW PRIORITY ISSUE

Issue Details:
- Reported: {timestamp}
- Severity: SEV4 - Low Impact
- Component: {service}
- Description: {description}

This issue will be addressed in the normal development cycle.

Assigned to: {assigned_team}
Target Resolution: {target_date}

{standard_contact}"""
            }
        }

    def _load_action_templates(self) -> Dict[str, List[Dict]]:
        """Load initial action templates for each severity level."""
        return {
            "sev1": [
                {
                    "action": "Establish incident command",
                    "priority": 1,
                    "timeout_minutes": 5,
                    "description": "Page incident commander and establish war room"
                },
                {
                    "action": "Create incident ticket",
                    "priority": 1,
                    "timeout_minutes": 2,
                    "description": "Create tracking ticket with all known details"
                },
                {
                    "action": "Update status page",
                    "priority": 2,
                    "timeout_minutes": 15,
                    "description": "Post initial status page update acknowledging incident"
                },
                {
                    "action": "Notify executives",
                    "priority": 2,
                    "timeout_minutes": 15,
                    "description": "Alert executive team of customer-impacting outage"
                },
                {
                    "action": "Engage subject matter experts",
                    "priority": 3,
                    "timeout_minutes": 10,
                    "description": "Page relevant SMEs based on affected systems"
                },
                {
                    "action": "Begin technical investigation",
                    "priority": 3,
                    "timeout_minutes": 5,
                    "description": "Start technical diagnosis and mitigation efforts"
                }
            ],
            "sev2": [
                {
                    "action": "Assign incident commander",
                    "priority": 1,
                    "timeout_minutes": 30,
                    "description": "Assign IC and establish coordination channel"
                },
                {
                    "action": "Create incident tracking",
                    "priority": 1,
                    "timeout_minutes": 5,
                    "description": "Create incident ticket with details and timeline"
                },
                {
                    "action": "Assess customer impact",
                    "priority": 2,
                    "timeout_minutes": 15,
                    "description": "Determine scope and severity of user impact"
                },
                {
                    "action": "Engage response team",
                    "priority": 2,
                    "timeout_minutes": 30,
                    "description": "Page appropriate technical responders"
                },
                {
                    "action": "Begin investigation",
                    "priority": 3,
                    "timeout_minutes": 15,
                    "description": "Start technical analysis and debugging"
                },
                {
                    "action": "Plan status communication",
                    "priority": 3,
                    "timeout_minutes": 30,
                    "description": "Determine if status page update is needed"
                }
            ],
            "sev3": [
                {
                    "action": "Assign to appropriate team",
                    "priority": 1,
                    "timeout_minutes": 120,
                    "description": "Route to team with relevant expertise"
                },
                {
                    "action": "Create tracking ticket",
                    "priority": 1,
                    "timeout_minutes": 30,
                    "description": "Document issue in standard ticketing system"
                },
                {
                    "action": "Assess scope and impact",
                    "priority": 2,
                    "timeout_minutes": 60,
                    "description": "Understand full scope of the issue"
                },
                {
                    "action": "Identify workarounds",
                    "priority": 2,
                    "timeout_minutes": 60,
                    "description": "Find temporary solutions if possible"
                },
                {
                    "action": "Plan resolution approach",
                    "priority": 3,
                    "timeout_minutes": 120,
                    "description": "Develop plan for permanent fix"
                }
            ],
            "sev4": [
                {
                    "action": "Create backlog item",
                    "priority": 1,
                    "timeout_minutes": 1440,  # 24 hours
                    "description": "Add to team backlog for future sprint planning"
                },
                {
                    "action": "Triage and prioritize",
                    "priority": 2,
                    "timeout_minutes": 2880,  # 2 days
                    "description": "Review and prioritize against other work"
                },
                {
                    "action": "Assign owner",
                    "priority": 3,
                    "timeout_minutes": 4320,  # 3 days
                    "description": "Assign to appropriate developer when capacity allows"
                }
            ]
        }

    def classify_incident(self, incident_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Main classification method that analyzes incident data and returns
        comprehensive response recommendations.

        Args:
            incident_data: Dictionary containing incident information

        Returns:
            Dictionary with classification results and recommendations
        """
        # Extract key information from incident data
        description = incident_data.get('description', '').lower()
        affected_users = incident_data.get('affected_users', '0%')
        business_impact = incident_data.get('business_impact', 'unknown')
        service = incident_data.get('service', 'unknown service')
        duration = incident_data.get('duration_minutes', 0)

        # Classify severity
        severity = self._classify_severity(description, affected_users, business_impact, duration)

        # Determine response teams
        response_teams = self._determine_teams(description, service)

        # Generate initial actions
        initial_actions = self._generate_initial_actions(severity, incident_data)

        # Create communication template
        communication = self._generate_communication(severity, incident_data)

        # Calculate response timeline
        timeline = self._generate_timeline(severity)

        # Determine escalation path
        escalation = self._determine_escalation(severity, business_impact)

        return {
            "classification": {
                "severity": severity.upper(),
                "confidence": self._calculate_confidence(description, affected_users, business_impact),
                "reasoning": self._explain_classification(severity, description, affected_users),
                "timestamp": datetime.now(timezone.utc).isoformat()
            },
            "response": {
                "primary_team": response_teams[0] if response_teams else "General Engineering",
                "supporting_teams": response_teams[1:] if len(response_teams) > 1 else [],
                "all_teams": response_teams,
                "response_time_minutes": self.severity_rules[severity]["response_time"] // 60
            },
            "initial_actions": initial_actions,
            "communication": communication,
            "timeline": timeline,
            "escalation": escalation,
            "incident_data": {
                "service": service,
                "description": incident_data.get('description', ''),
                "affected_users": affected_users,
                "business_impact": business_impact,
                "duration_minutes": duration
            }
        }

    def _classify_severity(self, description: str, affected_users: str,
                           business_impact: str, duration: int) -> str:
        """Classify incident severity based on multiple factors."""
        scores = {"sev1": 0, "sev2": 0, "sev3": 0, "sev4": 0}

        # Keyword analysis
        for severity, rules in self.severity_rules.items():
            for keyword in rules["keywords"]:
                if keyword in description:
                    scores[severity] += 2

            for indicator in rules["impact_indicators"]:
                if indicator.lower() in description or indicator.lower() in affected_users.lower():
                    scores[severity] += 3

        # Business impact weighting
        if business_impact.lower() in ['critical', 'high', 'severe']:
            scores["sev1"] += 5
            scores["sev2"] += 3
        elif business_impact.lower() in ['medium', 'moderate']:
            scores["sev2"] += 3
            scores["sev3"] += 2
        elif business_impact.lower() in ['low', 'minimal']:
            scores["sev3"] += 2
            scores["sev4"] += 3

        # User impact analysis
        if '%' in affected_users:
            try:
                percentage = float(re.findall(r'\d+', affected_users)[0])
                if percentage >= 75:
                    scores["sev1"] += 4
                elif percentage >= 25:
                    scores["sev2"] += 4
                elif percentage >= 5:
                    scores["sev3"] += 3
                else:
                    scores["sev4"] += 2
            except (IndexError, ValueError):
                pass

        # Duration consideration
        if duration > 0:
            if duration >= 3600:  # 1 hour
                scores["sev1"] += 2
                scores["sev2"] += 1
            elif duration >= 1800:  # 30 minutes
                scores["sev2"] += 2
                scores["sev3"] += 1

        # Return highest scoring severity
        return max(scores, key=scores.get)

    def _determine_teams(self, description: str, service: str) -> List[str]:
        """Determine which teams should respond based on affected systems."""
        teams = set()
        text_to_analyze = f"{description} {service}".lower()

        for component, team_list in self.team_mappings.items():
            if component in text_to_analyze:
                teams.update(team_list)

        # Default teams if no specific match
        if not teams:
            teams = {"General Engineering", "SRE"}

        return list(teams)

    def _generate_initial_actions(self, severity: str, incident_data: Dict) -> List[Dict]:
        """Generate prioritized initial actions based on severity."""
        # FIX: list.copy() is shallow, so mutating the contained dicts below
        # previously wrote the "urgency" key into the shared templates in
        # self.action_templates. Copy each action dict individually instead.
        base_actions = [dict(action) for action in self.action_templates[severity]]

        # Customize actions based on incident details
        for action in base_actions:
            if severity in ["sev1", "sev2"]:
                action["urgency"] = "immediate" if severity == "sev1" else "high"
            else:
                action["urgency"] = "normal" if severity == "sev3" else "low"

        return base_actions

    def _generate_communication(self, severity: str, incident_data: Dict) -> Dict:
        """Generate communication template filled with incident data.

        Only the subject line is formatted here; the body is returned as a raw
        template whose {placeholders} are filled in later by the responder.
        """
        template = self.communication_templates[severity]

        service = incident_data.get('service', 'Unknown Service')
        description = incident_data.get('description', 'Incident detected')

        communication = {
            "subject": template["subject"].format(
                service=service,
                brief_description=description[:50] + "..." if len(description) > 50 else description
            ),
            "body": template["body"],
            "urgency": severity,
            "recipients": self._determine_recipients(severity),
            "channels": self._determine_channels(severity),
            "frequency_minutes": self._get_update_frequency(severity)
        }

        return communication

    def _generate_timeline(self, severity: str) -> Dict:
        """Generate expected response timeline."""
        rules = self.severity_rules[severity]

        milestones = []
        if severity == "sev1":
            milestones = [
                {"milestone": "Incident Commander assigned", "minutes": 5},
                {"milestone": "War room established", "minutes": 10},
                {"milestone": "Initial status page update", "minutes": 15},
                {"milestone": "Executive notification", "minutes": 15},
                {"milestone": "First customer update", "minutes": 30}
            ]
        elif severity == "sev2":
            milestones = [
                {"milestone": "Response team assembled", "minutes": 15},
                {"milestone": "Initial assessment complete", "minutes": 30},
                {"milestone": "Stakeholder notification", "minutes": 60},
                {"milestone": "Status page update (if needed)", "minutes": 60}
            ]
        elif severity == "sev3":
            milestones = [
                {"milestone": "Team assignment", "minutes": 120},
                {"milestone": "Initial triage complete", "minutes": 240},
                {"milestone": "Resolution plan created", "minutes": 480}
            ]
        else:  # sev4
            milestones = [
                {"milestone": "Backlog creation", "minutes": 1440},
                {"milestone": "Priority assessment", "minutes": 2880}
            ]

        return {
            "response_time_minutes": rules["response_time"] // 60,
            "milestones": milestones,
            "update_frequency_minutes": self._get_update_frequency(severity)
        }

    def _determine_escalation(self, severity: str, business_impact: str) -> Dict:
        """Determine escalation requirements and triggers."""
        escalation_rules = {
            "sev1": {
                "immediate": ["Incident Commander", "Engineering Manager"],
                "15_minutes": ["VP Engineering", "Customer Success"],
                "30_minutes": ["CTO"],
                "60_minutes": ["CEO", "All C-Suite"],
                "triggers": ["Extended outage", "Revenue impact", "Media attention"]
            },
            "sev2": {
                "immediate": ["Team Lead", "On-call Engineer"],
                "30_minutes": ["Engineering Manager"],
                "120_minutes": ["VP Engineering"],
                "triggers": ["No progress", "Expanding scope", "Customer escalation"]
            },
            "sev3": {
                "immediate": ["Assigned Engineer"],
                "240_minutes": ["Team Lead"],
                "triggers": ["Issue complexity", "Multiple teams needed"]
            },
            "sev4": {
                "immediate": ["Product Owner"],
                "triggers": ["Customer request", "Stakeholder priority"]
            }
        }

        return escalation_rules.get(severity, escalation_rules["sev4"])

    def _determine_recipients(self, severity: str) -> List[str]:
        """Determine who should receive notifications."""
        recipients = {
            "sev1": ["on-call", "engineering-leadership", "executives", "customer-success"],
            "sev2": ["on-call", "engineering-leadership", "product-team"],
            "sev3": ["assigned-team", "team-lead"],
            "sev4": ["assigned-engineer"]
        }
        return recipients.get(severity, recipients["sev4"])

    def _determine_channels(self, severity: str) -> List[str]:
        """Determine communication channels to use."""
        channels = {
            "sev1": ["pager", "phone", "slack", "email", "status-page"],
            "sev2": ["pager", "slack", "email"],
            "sev3": ["slack", "email"],
            "sev4": ["ticket-system"]
        }
        return channels.get(severity, channels["sev4"])

    def _get_update_frequency(self, severity: str) -> int:
        """Get recommended update frequency in minutes (0 = no cadence)."""
        frequencies = {"sev1": 15, "sev2": 30, "sev3": 240, "sev4": 0}
        return frequencies.get(severity, 0)

    def _calculate_confidence(self, description: str, affected_users: str, business_impact: str) -> float:
        """Calculate confidence score for the classification."""
        confidence = 0.5  # Base confidence

        # Higher confidence with more specific information
        if '%' in affected_users and any(char.isdigit() for char in affected_users):
            confidence += 0.2

        if business_impact.lower() in ['critical', 'high', 'medium', 'low']:
            confidence += 0.15

        if len(description.split()) > 5:  # Detailed description
            confidence += 0.15

        return min(confidence, 1.0)

    def _explain_classification(self, severity: str, description: str, affected_users: str) -> str:
        """Provide explanation for the classification decision."""
        rules = self.severity_rules[severity]

        matched_keywords = []
        for keyword in rules["keywords"]:
            if keyword in description.lower():
                matched_keywords.append(keyword)

        explanation = f"Classified as {severity.upper()} based on: "
        reasons = []

        if matched_keywords:
            reasons.append(f"keywords: {', '.join(matched_keywords[:3])}")

        if '%' in affected_users:
            reasons.append(f"user impact: {affected_users}")

        if not reasons:
            reasons.append("default classification based on available information")

        return explanation + "; ".join(reasons)


def format_json_output(result: Dict) -> str:
    """Format result as pretty JSON."""
    return json.dumps(result, indent=2, ensure_ascii=False)


def format_text_output(result: Dict) -> str:
    """Format result as human-readable text."""
    classification = result["classification"]
    response = result["response"]
    actions = result["initial_actions"]
    communication = result["communication"]

    output = []
    output.append("=" * 60)
    output.append("INCIDENT CLASSIFICATION REPORT")
    output.append("=" * 60)
    output.append("")

    # Classification section
    output.append("CLASSIFICATION:")
    output.append(f"  Severity: {classification['severity']}")
    output.append(f"  Confidence: {classification['confidence']:.1%}")
    output.append(f"  Reasoning: {classification['reasoning']}")
    output.append(f"  Timestamp: {classification['timestamp']}")
    output.append("")

    # Response section
    output.append("RECOMMENDED RESPONSE:")
    output.append(f"  Primary Team: {response['primary_team']}")
    if response['supporting_teams']:
        output.append(f"  Supporting Teams: {', '.join(response['supporting_teams'])}")
    output.append(f"  Response Time: {response['response_time_minutes']} minutes")
    output.append("")

    # Actions section
    output.append("INITIAL ACTIONS:")
    for i, action in enumerate(actions[:5], 1):  # Show first 5 actions
        output.append(f"  {i}. {action['action']} (Priority {action['priority']})")
        output.append(f"     Timeout: {action['timeout_minutes']} minutes")
        output.append(f"     {action['description']}")
    output.append("")

    # Communication section
    output.append("COMMUNICATION:")
    output.append(f"  Subject: {communication['subject']}")
    output.append(f"  Urgency: {communication['urgency'].upper()}")
    output.append(f"  Recipients: {', '.join(communication['recipients'])}")
    output.append(f"  Channels: {', '.join(communication['channels'])}")
    if communication['frequency_minutes'] > 0:
        output.append(f"  Update Frequency: Every {communication['frequency_minutes']} minutes")
    output.append("")

    output.append("=" * 60)

    return "\n".join(output)
def parse_input_text(text: str) -> Dict[str, Any]:
    """Parse free-form text input into structured incident data.

    Best-effort extraction of service name, user impact, and business impact
    from a plain-text description; unknown fields default to "unknown".
    """
    # Basic parsing - in a real system, this would be more sophisticated
    incident_data = {
        "description": text.strip(),
        "service": "unknown service",
        "affected_users": "unknown",
        "business_impact": "unknown"
    }

    # Try to extract service name
    service_patterns = [
        r'(?:service|api|database|server|application)\s+(\w+)',
        r'(\w+)(?:\s+(?:is|has|service|api|database))',
        r'(?:^|\s)(\w+)\s+(?:down|failed|broken)'
    ]

    for pattern in service_patterns:
        match = re.search(pattern, text.lower())
        if match:
            incident_data["service"] = match.group(1)
            break

    # Try to extract user impact
    impact_patterns = [
        r'(\d+%)\s+(?:of\s+)?(?:users?|customers?)',
        r'(?:all|every|100%)\s+(?:users?|customers?)',
        r'(?:some|many|several)\s+(?:users?|customers?)'
    ]

    for pattern in impact_patterns:
        match = re.search(pattern, text.lower())
        if match:
            # FIX: the second and third patterns define no capturing group, so
            # match.group(1) raised IndexError ("no such group") for text like
            # "all users". match.lastindex is None when no group captured, in
            # which case fall back to the whole match.
            incident_data["affected_users"] = match.group(1) if match.lastindex else match.group(0)
            break

    # Try to infer business impact
    if any(word in text.lower() for word in ['critical', 'urgent', 'emergency', 'down', 'outage']):
        incident_data["business_impact"] = "high"
    elif any(word in text.lower() for word in ['slow', 'degraded', 'performance']):
        incident_data["business_impact"] = "medium"
    elif any(word in text.lower() for word in ['minor', 'cosmetic', 'small']):
        incident_data["business_impact"] = "low"

    return incident_data


def interactive_mode():
    """Run in interactive mode, prompting user for input."""
    classifier = IncidentClassifier()

    print("🚨 Incident Classifier - Interactive Mode")
    print("=" * 50)
    print("Enter incident details (or 'quit' to exit):")
    print()

    while True:
        try:
            description = input("Incident description: ").strip()
            if description.lower() in ['quit', 'exit', 'q']:
                break

            if not description:
                print("Please provide an incident description.")
                continue

            service = input("Affected service (optional): ").strip() or "unknown"
            affected_users = input("Affected users (e.g., '50%', 'all users'): ").strip() or "unknown"
            business_impact = input("Business impact (high/medium/low): ").strip() or "unknown"

            incident_data = {
                "description": description,
                "service": service,
                "affected_users": affected_users,
                "business_impact": business_impact
            }

            result = classifier.classify_incident(incident_data)
            print("\n" + "=" * 50)
            print(format_text_output(result))
            print("=" * 50)
            print()

        except KeyboardInterrupt:
            print("\n\nExiting...")
            break
        except Exception as e:
            print(f"Error: {e}")


def main():
    """Main function with argument parsing and execution."""
    parser = argparse.ArgumentParser(
        description="Classify incidents and provide response recommendations",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
    python incident_classifier.py --input incident.json
    echo "Database is down" | python incident_classifier.py --format text
    python incident_classifier.py --interactive

Input JSON format:
    {
        "description": "Database connection timeouts",
        "service": "user-service",
        "affected_users": "80%",
        "business_impact": "high"
    }
        """
    )

    parser.add_argument(
        "--input", "-i",
        help="Input file path (JSON format) or '-' for stdin"
    )

    parser.add_argument(
        "--format", "-f",
        choices=["json", "text"],
        default="json",
        help="Output format (default: json)"
    )

    parser.add_argument(
        "--interactive",
        action="store_true",
        help="Run in interactive mode"
    )

    parser.add_argument(
        "--output", "-o",
        help="Output file path (default: stdout)"
    )

    args = parser.parse_args()

    # Interactive mode
    if args.interactive:
        interactive_mode()
        return

    classifier = IncidentClassifier()

    try:
        # Read input
        if args.input == "-" or (not args.input and not sys.stdin.isatty()):
            # Read from stdin
            input_text = sys.stdin.read().strip()
            if not input_text:
                parser.error("No input provided")

            # Try to parse as JSON first, then as text
            try:
                incident_data = json.loads(input_text)
            except json.JSONDecodeError:
                incident_data = parse_input_text(input_text)

        elif args.input:
            # Read from file
            with open(args.input, 'r') as f:
                incident_data = json.load(f)
        else:
            parser.error("No input specified. Use --input, --interactive, or pipe data to stdin.")

        # Validate required fields
        if not isinstance(incident_data, dict):
            parser.error("Input must be a JSON object")

        if "description" not in incident_data:
            parser.error("Input must contain 'description' field")

        # Classify incident
        result = classifier.classify_incident(incident_data)

        # Format output
        if args.format == "json":
            output = format_json_output(result)
        else:
            output = format_text_output(result)

        # Write output
        if args.output:
            with open(args.output, 'w') as f:
                f.write(output)
                f.write('\n')
        else:
            print(output)

    except FileNotFoundError as e:
        print(f"Error: File not found - {e}", file=sys.stderr)
        sys.exit(1)
    except json.JSONDecodeError as e:
        print(f"Error: Invalid JSON - {e}", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
+ +Usage: + python pir_generator.py --incident incident.json --timeline timeline.json --output pir.md + python pir_generator.py --incident incident.json --rca-method fishbone --action-items + cat incident.json | python pir_generator.py --format markdown +""" + +import argparse +import json +import sys +import re +from datetime import datetime, timezone, timedelta +from typing import Dict, List, Optional, Any, Tuple +from collections import defaultdict, Counter + + +class PIRGenerator: + """ + Generates comprehensive Post-Incident Review documents with multiple + RCA frameworks, lessons learned, and actionable follow-up items. + """ + + def __init__(self): + """Initialize the PIR generator with templates and frameworks.""" + self.rca_frameworks = self._load_rca_frameworks() + self.pir_templates = self._load_pir_templates() + self.severity_guidelines = self._load_severity_guidelines() + self.action_item_types = self._load_action_item_types() + self.lessons_learned_categories = self._load_lessons_learned_categories() + + def _load_rca_frameworks(self) -> Dict[str, Dict]: + """Load root cause analysis framework definitions.""" + return { + "five_whys": { + "name": "5 Whys Analysis", + "description": "Iterative questioning technique to explore cause-and-effect relationships", + "steps": [ + "State the problem clearly", + "Ask why the problem occurred", + "For each answer, ask why again", + "Continue until root cause is identified", + "Verify the root cause addresses the original problem" + ], + "min_iterations": 3, + "max_iterations": 7 + }, + "fishbone": { + "name": "Fishbone (Ishikawa) Diagram", + "description": "Systematic analysis across multiple categories of potential causes", + "categories": [ + { + "name": "People", + "description": "Human factors, training, communication, experience", + "examples": ["Training gaps", "Communication failures", "Skill deficits", "Staffing issues"] + }, + { + "name": "Process", + "description": "Procedures, workflows, change 
management, review processes", + "examples": ["Missing procedures", "Inadequate reviews", "Change management gaps", "Documentation issues"] + }, + { + "name": "Technology", + "description": "Systems, tools, architecture, automation", + "examples": ["Architecture limitations", "Tool deficiencies", "Automation gaps", "Infrastructure issues"] + }, + { + "name": "Environment", + "description": "External factors, dependencies, infrastructure", + "examples": ["Third-party dependencies", "Network issues", "Hardware failures", "External service outages"] + } + ] + }, + "timeline": { + "name": "Timeline Analysis", + "description": "Chronological analysis of events to identify decision points and missed opportunities", + "focus_areas": [ + "Detection timing and effectiveness", + "Response time and escalation paths", + "Decision points and alternative paths", + "Communication effectiveness", + "Mitigation strategy effectiveness" + ] + }, + "bow_tie": { + "name": "Bow Tie Analysis", + "description": "Analysis of both preventive and protective measures around an incident", + "components": [ + "Hazards (what could go wrong)", + "Top events (what actually went wrong)", + "Threats (what caused it)", + "Consequences (what was the impact)", + "Barriers (what preventive/protective measures exist or could exist)" + ] + } + } + + def _load_pir_templates(self) -> Dict[str, str]: + """Load PIR document templates for different severity levels.""" + return { + "comprehensive": """# Post-Incident Review: {incident_title} + +## Executive Summary +{executive_summary} + +## Incident Overview +- **Incident ID:** {incident_id} +- **Date & Time:** {incident_date} +- **Duration:** {duration} +- **Severity:** {severity} +- **Status:** {status} +- **Incident Commander:** {incident_commander} +- **Responders:** {responders} + +### Customer Impact +{customer_impact} + +### Business Impact +{business_impact} + +## Timeline +{timeline_section} + +## Root Cause Analysis +{rca_section} + +## What Went 
Well +{what_went_well} + +## What Didn't Go Well +{what_went_wrong} + +## Lessons Learned +{lessons_learned} + +## Action Items +{action_items} + +## Follow-up and Prevention +{prevention_measures} + +## Appendix +{appendix_section} + +--- +*Generated on {generation_date} by PIR Generator* +""", + "standard": """# Post-Incident Review: {incident_title} + +## Summary +{executive_summary} + +## Incident Details +- **Date:** {incident_date} +- **Duration:** {duration} +- **Severity:** {severity} +- **Impact:** {customer_impact} + +## Timeline +{timeline_section} + +## Root Cause +{rca_section} + +## Action Items +{action_items} + +## Lessons Learned +{lessons_learned} + +--- +*Generated on {generation_date}* +""", + "brief": """# Incident Review: {incident_title} + +**Date:** {incident_date} | **Duration:** {duration} | **Severity:** {severity} + +## What Happened +{executive_summary} + +## Root Cause +{rca_section} + +## Actions +{action_items} + +--- +*{generation_date}* +""" + } + + def _load_severity_guidelines(self) -> Dict[str, Dict]: + """Load severity-specific PIR guidelines.""" + return { + "sev1": { + "required_sections": ["executive_summary", "timeline", "rca", "action_items", "lessons_learned"], + "required_attendees": ["incident_commander", "technical_leads", "engineering_manager", "product_manager"], + "timeline_requirement": "Complete timeline with 15-minute intervals", + "rca_methods": ["five_whys", "fishbone", "timeline"], + "review_deadline_hours": 24, + "follow_up_weeks": 4 + }, + "sev2": { + "required_sections": ["summary", "timeline", "rca", "action_items"], + "required_attendees": ["incident_commander", "technical_leads", "team_lead"], + "timeline_requirement": "Key milestone timeline", + "rca_methods": ["five_whys", "timeline"], + "review_deadline_hours": 72, + "follow_up_weeks": 2 + }, + "sev3": { + "required_sections": ["summary", "rca", "action_items"], + "required_attendees": ["technical_lead", "team_member"], + "timeline_requirement": 
"Basic timeline", + "rca_methods": ["five_whys"], + "review_deadline_hours": 168, # 1 week + "follow_up_weeks": 1 + }, + "sev4": { + "required_sections": ["summary", "action_items"], + "required_attendees": ["assigned_engineer"], + "timeline_requirement": "Optional", + "rca_methods": ["brief_analysis"], + "review_deadline_hours": 336, # 2 weeks + "follow_up_weeks": 0 + } + } + + def _load_action_item_types(self) -> Dict[str, Dict]: + """Load action item categorization and templates.""" + return { + "immediate_fix": { + "priority": "P0", + "timeline": "24-48 hours", + "description": "Critical bugs or security issues that need immediate attention", + "template": "Fix {issue_description} to prevent recurrence of {incident_type}", + "owners": ["engineer", "team_lead"] + }, + "process_improvement": { + "priority": "P1", + "timeline": "1-2 weeks", + "description": "Process gaps or communication issues identified", + "template": "Improve {process_area} to address {gap_description}", + "owners": ["team_lead", "process_owner"] + }, + "monitoring_alerting": { + "priority": "P1", + "timeline": "1 week", + "description": "Missing monitoring or alerting capabilities", + "template": "Implement {monitoring_type} for {system_component}", + "owners": ["sre", "engineer"] + }, + "documentation": { + "priority": "P2", + "timeline": "2-3 weeks", + "description": "Documentation gaps or runbook updates", + "template": "Update {documentation_type} to include {missing_information}", + "owners": ["technical_writer", "engineer"] + }, + "training": { + "priority": "P2", + "timeline": "1 month", + "description": "Training needs or knowledge gaps", + "template": "Provide {training_type} training on {topic}", + "owners": ["training_coordinator", "subject_matter_expert"] + }, + "architectural": { + "priority": "P1-P3", + "timeline": "1-3 months", + "description": "System design or architecture improvements", + "template": "Redesign {system_component} to improve {quality_attribute}", + "owners": 
["architect", "engineering_manager"] + }, + "tooling": { + "priority": "P2", + "timeline": "2-4 weeks", + "description": "Tool improvements or new tool requirements", + "template": "Implement {tool_type} to support {use_case}", + "owners": ["devops", "engineer"] + } + } + + def _load_lessons_learned_categories(self) -> Dict[str, List[str]]: + """Load categories for organizing lessons learned.""" + return { + "detection_and_monitoring": [ + "Monitoring gaps identified", + "Alert fatigue issues", + "Detection timing improvements", + "Observability enhancements" + ], + "response_and_escalation": [ + "Response time improvements", + "Escalation path optimization", + "Communication effectiveness", + "Resource allocation lessons" + ], + "technical_systems": [ + "Architecture resilience", + "Failure mode analysis", + "Performance bottlenecks", + "Dependency management" + ], + "process_and_procedures": [ + "Runbook effectiveness", + "Change management gaps", + "Review process improvements", + "Documentation quality" + ], + "team_and_culture": [ + "Training needs identified", + "Cross-team collaboration", + "Knowledge sharing gaps", + "Decision-making processes" + ] + } + + def generate_pir(self, incident_data: Dict[str, Any], timeline_data: Optional[Dict] = None, + rca_method: str = "five_whys", template_type: str = "comprehensive") -> Dict[str, Any]: + """ + Generate a comprehensive PIR document from incident data. 
+ + Args: + incident_data: Core incident information + timeline_data: Optional timeline reconstruction data + rca_method: RCA framework to use + template_type: PIR template type (comprehensive, standard, brief) + + Returns: + Dictionary containing PIR document and metadata + """ + # Extract incident information + incident_info = self._extract_incident_info(incident_data) + + # Generate root cause analysis + rca_results = self._perform_rca(incident_data, timeline_data, rca_method) + + # Generate lessons learned + lessons_learned = self._generate_lessons_learned(incident_data, timeline_data, rca_results) + + # Generate action items + action_items = self._generate_action_items(incident_data, rca_results, lessons_learned) + + # Create timeline section + timeline_section = self._create_timeline_section(timeline_data, incident_info["severity"]) + + # Generate document sections + sections = self._generate_document_sections( + incident_info, rca_results, lessons_learned, action_items, timeline_section + ) + + # Build final document + template = self.pir_templates[template_type] + pir_document = template.format(**sections) + + # Generate metadata + metadata = self._generate_metadata(incident_info, rca_results, action_items) + + return { + "pir_document": pir_document, + "metadata": metadata, + "incident_info": incident_info, + "rca_results": rca_results, + "lessons_learned": lessons_learned, + "action_items": action_items, + "generation_timestamp": datetime.now(timezone.utc).isoformat() + } + + def _extract_incident_info(self, incident_data: Dict) -> Dict[str, Any]: + """Extract and normalize incident information.""" + return { + "incident_id": incident_data.get("incident_id", "INC-" + datetime.now().strftime("%Y%m%d-%H%M")), + "title": incident_data.get("title", incident_data.get("description", "Incident")[:50]), + "description": incident_data.get("description", "No description provided"), + "severity": incident_data.get("severity", "unknown").lower(), + "start_time": 
self._parse_timestamp(incident_data.get("start_time", incident_data.get("timestamp", ""))), + "end_time": self._parse_timestamp(incident_data.get("end_time", "")), + "duration": self._calculate_duration(incident_data), + "affected_services": incident_data.get("affected_services", []), + "customer_impact": incident_data.get("customer_impact", "Unknown impact"), + "business_impact": incident_data.get("business_impact", "Unknown business impact"), + "incident_commander": incident_data.get("incident_commander", "TBD"), + "responders": incident_data.get("responders", []), + "status": incident_data.get("status", "resolved") + } + + def _parse_timestamp(self, timestamp_str: str) -> Optional[datetime]: + """Parse timestamp string to datetime object.""" + if not timestamp_str: + return None + + formats = [ + "%Y-%m-%dT%H:%M:%S.%fZ", + "%Y-%m-%dT%H:%M:%SZ", + "%Y-%m-%d %H:%M:%S", + "%m/%d/%Y %H:%M:%S" + ] + + for fmt in formats: + try: + dt = datetime.strptime(timestamp_str, fmt) + if dt.tzinfo is None: + dt = dt.replace(tzinfo=timezone.utc) + return dt + except ValueError: + continue + + return None + + def _calculate_duration(self, incident_data: Dict) -> str: + """Calculate incident duration in human-readable format.""" + start_time = self._parse_timestamp(incident_data.get("start_time", "")) + end_time = self._parse_timestamp(incident_data.get("end_time", "")) + + if start_time and end_time: + duration = end_time - start_time + total_minutes = int(duration.total_seconds() / 60) + + if total_minutes < 60: + return f"{total_minutes} minutes" + elif total_minutes < 1440: # Less than 24 hours + hours = total_minutes // 60 + minutes = total_minutes % 60 + return f"{hours}h {minutes}m" + else: + days = total_minutes // 1440 + hours = (total_minutes % 1440) // 60 + return f"{days}d {hours}h" + + return incident_data.get("duration", "Unknown duration") + + def _perform_rca(self, incident_data: Dict, timeline_data: Optional[Dict], method: str) -> Dict[str, Any]: + """Perform root 
cause analysis using specified method.""" + if method == "five_whys": + return self._five_whys_analysis(incident_data, timeline_data) + elif method == "fishbone": + return self._fishbone_analysis(incident_data, timeline_data) + elif method == "timeline": + return self._timeline_analysis(incident_data, timeline_data) + elif method == "bow_tie": + return self._bow_tie_analysis(incident_data, timeline_data) + else: + return self._five_whys_analysis(incident_data, timeline_data) # Default + + def _five_whys_analysis(self, incident_data: Dict, timeline_data: Optional[Dict]) -> Dict[str, Any]: + """Perform 5 Whys root cause analysis.""" + problem_statement = incident_data.get("description", "Incident occurred") + + # Generate why questions based on incident data + whys = [] + current_issue = problem_statement + + # Generate systematic why questions + why_patterns = [ + f"Why did {current_issue}?", + "Why wasn't this detected earlier?", + "Why didn't existing safeguards prevent this?", + "Why wasn't there a backup mechanism?", + "Why wasn't this scenario anticipated?" 
+ ] + + # Try to infer answers from incident data + potential_answers = self._infer_why_answers(incident_data, timeline_data) + + for i, why_question in enumerate(why_patterns): + answer = potential_answers[i] if i < len(potential_answers) else "Further investigation needed" + whys.append({ + "question": why_question, + "answer": answer, + "evidence": self._find_supporting_evidence(answer, incident_data, timeline_data) + }) + + # Identify root causes from the analysis + root_causes = self._extract_root_causes(whys) + + return { + "method": "five_whys", + "problem_statement": problem_statement, + "why_analysis": whys, + "root_causes": root_causes, + "confidence": self._calculate_rca_confidence(whys, incident_data) + } + + def _fishbone_analysis(self, incident_data: Dict, timeline_data: Optional[Dict]) -> Dict[str, Any]: + """Perform Fishbone (Ishikawa) diagram analysis.""" + problem_statement = incident_data.get("description", "Incident occurred") + + # Analyze each category + categories = {} + for category_info in self.rca_frameworks["fishbone"]["categories"]: + category_name = category_info["name"] + contributing_factors = self._identify_category_factors( + category_name, incident_data, timeline_data + ) + categories[category_name] = { + "description": category_info["description"], + "factors": contributing_factors, + "examples": category_info["examples"] + } + + # Identify primary contributing factors + primary_factors = self._identify_primary_factors(categories) + + # Generate root cause hypothesis + root_causes = self._synthesize_fishbone_root_causes(categories, primary_factors) + + return { + "method": "fishbone", + "problem_statement": problem_statement, + "categories": categories, + "primary_factors": primary_factors, + "root_causes": root_causes, + "confidence": self._calculate_rca_confidence(categories, incident_data) + } + + def _timeline_analysis(self, incident_data: Dict, timeline_data: Optional[Dict]) -> Dict[str, Any]: + """Perform timeline-based root 
cause analysis.""" + if not timeline_data: + return {"method": "timeline", "error": "No timeline data provided"} + + # Extract key decision points + decision_points = self._extract_decision_points(timeline_data) + + # Identify missed opportunities + missed_opportunities = self._identify_missed_opportunities(timeline_data) + + # Analyze response effectiveness + response_analysis = self._analyze_response_effectiveness(timeline_data) + + # Generate timeline-based root causes + root_causes = self._extract_timeline_root_causes( + decision_points, missed_opportunities, response_analysis + ) + + return { + "method": "timeline", + "decision_points": decision_points, + "missed_opportunities": missed_opportunities, + "response_analysis": response_analysis, + "root_causes": root_causes, + "confidence": self._calculate_rca_confidence(timeline_data, incident_data) + } + + def _bow_tie_analysis(self, incident_data: Dict, timeline_data: Optional[Dict]) -> Dict[str, Any]: + """Perform Bow Tie analysis.""" + # Identify the top event (what went wrong) + top_event = incident_data.get("description", "Service failure") + + # Identify threats (what caused it) + threats = self._identify_threats(incident_data, timeline_data) + + # Identify consequences (impact) + consequences = self._identify_consequences(incident_data) + + # Identify existing barriers + existing_barriers = self._identify_existing_barriers(incident_data, timeline_data) + + # Recommend additional barriers + recommended_barriers = self._recommend_additional_barriers(threats, consequences) + + return { + "method": "bow_tie", + "top_event": top_event, + "threats": threats, + "consequences": consequences, + "existing_barriers": existing_barriers, + "recommended_barriers": recommended_barriers, + "confidence": self._calculate_rca_confidence(threats, incident_data) + } + + def _infer_why_answers(self, incident_data: Dict, timeline_data: Optional[Dict]) -> List[str]: + """Infer potential answers to why questions from available 
data.""" + answers = [] + + # Look for clues in incident description + description = incident_data.get("description", "").lower() + + # Common patterns and their inferred answers + if "database" in description and ("timeout" in description or "slow" in description): + answers.append("Database connection pool was exhausted") + answers.append("Connection pool configuration was insufficient for peak load") + answers.append("Load testing didn't include realistic database scenarios") + elif "deployment" in description or "release" in description: + answers.append("New deployment introduced a regression") + answers.append("Code review process missed the issue") + answers.append("Testing environment didn't match production") + elif "network" in description or "connectivity" in description: + answers.append("Network infrastructure had unexpected load") + answers.append("Network monitoring wasn't comprehensive enough") + answers.append("Redundancy mechanisms failed simultaneously") + else: + # Generic answers based on common root causes + answers.extend([ + "System couldn't handle the load/request volume", + "Monitoring didn't detect the issue early enough", + "Error handling mechanisms were insufficient", + "Dependencies failed without proper circuit breakers", + "System lacked sufficient redundancy/resilience" + ]) + + return answers[:5] # Return up to 5 answers + + def _find_supporting_evidence(self, answer: str, incident_data: Dict, timeline_data: Optional[Dict]) -> List[str]: + """Find supporting evidence for RCA answers.""" + evidence = [] + + # Look for supporting information in incident data + if timeline_data and "timeline" in timeline_data: + events = timeline_data["timeline"].get("events", []) + for event in events: + event_message = event.get("message", "").lower() + if any(keyword in event_message for keyword in answer.lower().split()): + evidence.append(f"Timeline event: {event['message']}") + + # Check incident metadata for supporting info + metadata = 
incident_data.get("metadata", {}) + for key, value in metadata.items(): + if isinstance(value, str) and any(keyword in value.lower() for keyword in answer.lower().split()): + evidence.append(f"Incident metadata: {key} = {value}") + + return evidence[:3] # Return top 3 pieces of evidence + + def _extract_root_causes(self, whys: List[Dict]) -> List[Dict]: + """Extract root causes from 5 Whys analysis.""" + root_causes = [] + + # The deepest "why" answers are typically closest to root causes + if len(whys) >= 3: + for i, why in enumerate(whys[-2:]): # Look at last 2 whys + if "further investigation needed" not in why["answer"].lower(): + root_causes.append({ + "cause": why["answer"], + "category": self._categorize_root_cause(why["answer"]), + "evidence": why["evidence"], + "confidence": "high" if len(why["evidence"]) > 1 else "medium" + }) + + return root_causes + + def _categorize_root_cause(self, cause: str) -> str: + """Categorize a root cause into standard categories.""" + cause_lower = cause.lower() + + if any(keyword in cause_lower for keyword in ["process", "procedure", "review", "change management"]): + return "Process" + elif any(keyword in cause_lower for keyword in ["training", "knowledge", "skill", "experience"]): + return "People" + elif any(keyword in cause_lower for keyword in ["system", "architecture", "code", "configuration"]): + return "Technology" + elif any(keyword in cause_lower for keyword in ["network", "infrastructure", "dependency", "third-party"]): + return "Environment" + else: + return "Unknown" + + def _identify_category_factors(self, category: str, incident_data: Dict, timeline_data: Optional[Dict]) -> List[Dict]: + """Identify contributing factors for a Fishbone category.""" + factors = [] + description = incident_data.get("description", "").lower() + + if category == "People": + if "misconfigured" in description or "human error" in description: + factors.append({"factor": "Configuration error", "likelihood": "high"}) + if timeline_data 
and self._has_delayed_response(timeline_data): + factors.append({"factor": "Delayed incident response", "likelihood": "medium"}) + + elif category == "Process": + if "deployment" in description: + factors.append({"factor": "Insufficient deployment validation", "likelihood": "high"}) + if "code review" in incident_data.get("context", "").lower(): + factors.append({"factor": "Code review process gaps", "likelihood": "medium"}) + + elif category == "Technology": + if "database" in description: + factors.append({"factor": "Database performance limitations", "likelihood": "high"}) + if "timeout" in description or "latency" in description: + factors.append({"factor": "System performance bottlenecks", "likelihood": "high"}) + + elif category == "Environment": + if "network" in description: + factors.append({"factor": "Network infrastructure issues", "likelihood": "medium"}) + if "third-party" in description or "external" in description: + factors.append({"factor": "External service dependencies", "likelihood": "medium"}) + + return factors + + def _identify_primary_factors(self, categories: Dict) -> List[Dict]: + """Identify primary contributing factors across all categories.""" + primary_factors = [] + + for category_name, category_data in categories.items(): + high_likelihood_factors = [ + f for f in category_data["factors"] + if f.get("likelihood") == "high" + ] + primary_factors.extend([ + {**factor, "category": category_name} + for factor in high_likelihood_factors + ]) + + return primary_factors + + def _synthesize_fishbone_root_causes(self, categories: Dict, primary_factors: List[Dict]) -> List[Dict]: + """Synthesize root causes from Fishbone analysis.""" + root_causes = [] + + # Group primary factors by category + category_factors = defaultdict(list) + for factor in primary_factors: + category_factors[factor["category"]].append(factor) + + # Create root causes from categories with multiple factors + for category, factors in category_factors.items(): + if 
len(factors) > 1: + root_causes.append({ + "cause": f"Multiple {category.lower()} issues contributed to the incident", + "category": category, + "contributing_factors": [f["factor"] for f in factors], + "confidence": "high" + }) + elif len(factors) == 1: + root_causes.append({ + "cause": factors[0]["factor"], + "category": category, + "confidence": "medium" + }) + + return root_causes + + def _has_delayed_response(self, timeline_data: Dict) -> bool: + """Check if timeline shows delayed response patterns.""" + if not timeline_data or "gap_analysis" not in timeline_data: + return False + + gaps = timeline_data["gap_analysis"].get("gaps", []) + return any(gap.get("type") == "phase_transition" for gap in gaps) + + def _extract_decision_points(self, timeline_data: Dict) -> List[Dict]: + """Extract key decision points from timeline.""" + decision_points = [] + + if "timeline" in timeline_data and "phases" in timeline_data["timeline"]: + phases = timeline_data["timeline"]["phases"] + + for i, phase in enumerate(phases): + if phase["name"] in ["escalation", "mitigation"]: + decision_points.append({ + "timestamp": phase["start_time"], + "decision": f"Initiated {phase['name']} phase", + "phase": phase["name"], + "duration": phase["duration_minutes"] + }) + + return decision_points + + def _identify_missed_opportunities(self, timeline_data: Dict) -> List[Dict]: + """Identify missed opportunities from gap analysis.""" + missed_opportunities = [] + + if "gap_analysis" in timeline_data: + gaps = timeline_data["gap_analysis"].get("gaps", []) + + for gap in gaps: + if gap.get("severity") == "critical": + missed_opportunities.append({ + "opportunity": f"Earlier {gap['type'].replace('_', ' ')}", + "gap_minutes": gap["gap_minutes"], + "potential_impact": "Could have reduced incident duration" + }) + + return missed_opportunities + + def _analyze_response_effectiveness(self, timeline_data: Dict) -> Dict[str, Any]: + """Analyze the effectiveness of incident response.""" + effectiveness 
= { + "overall_rating": "unknown", + "strengths": [], + "weaknesses": [], + "metrics": {} + } + + if "metrics" in timeline_data: + metrics = timeline_data["metrics"] + duration_metrics = metrics.get("duration_metrics", {}) + + # Analyze response times + time_to_mitigation = duration_metrics.get("time_to_mitigation_minutes", 0) + time_to_resolution = duration_metrics.get("time_to_resolution_minutes", 0) + + if time_to_mitigation <= 30: + effectiveness["strengths"].append("Quick mitigation response") + else: + effectiveness["weaknesses"].append("Slow mitigation response") + + if time_to_resolution <= 120: + effectiveness["strengths"].append("Fast resolution") + else: + effectiveness["weaknesses"].append("Extended resolution time") + + effectiveness["metrics"] = { + "time_to_mitigation": time_to_mitigation, + "time_to_resolution": time_to_resolution + } + + # Overall rating based on strengths vs weaknesses + if len(effectiveness["strengths"]) > len(effectiveness["weaknesses"]): + effectiveness["overall_rating"] = "effective" + elif len(effectiveness["weaknesses"]) > len(effectiveness["strengths"]): + effectiveness["overall_rating"] = "needs_improvement" + else: + effectiveness["overall_rating"] = "mixed" + + return effectiveness + + def _extract_timeline_root_causes(self, decision_points: List, missed_opportunities: List, + response_analysis: Dict) -> List[Dict]: + """Extract root causes from timeline analysis.""" + root_causes = [] + + # Root causes from missed opportunities + for opportunity in missed_opportunities: + if opportunity["gap_minutes"] > 60: # Significant gaps + root_causes.append({ + "cause": f"Delayed response: {opportunity['opportunity']}", + "category": "Process", + "evidence": f"{opportunity['gap_minutes']} minute gap identified", + "confidence": "high" + }) + + # Root causes from response effectiveness + for weakness in response_analysis.get("weaknesses", []): + root_causes.append({ + "cause": weakness, + "category": "Process", + "evidence": 
"Timeline analysis", + "confidence": "medium" + }) + + return root_causes + + def _identify_threats(self, incident_data: Dict, timeline_data: Optional[Dict]) -> List[Dict]: + """Identify threats for Bow Tie analysis.""" + threats = [] + description = incident_data.get("description", "").lower() + + if "deployment" in description: + threats.append({"threat": "Defective code deployment", "likelihood": "medium"}) + if "load" in description or "traffic" in description: + threats.append({"threat": "Unexpected load increase", "likelihood": "high"}) + if "database" in description: + threats.append({"threat": "Database performance degradation", "likelihood": "medium"}) + + return threats + + def _identify_consequences(self, incident_data: Dict) -> List[Dict]: + """Identify consequences for Bow Tie analysis.""" + consequences = [] + + customer_impact = incident_data.get("customer_impact", "").lower() + business_impact = incident_data.get("business_impact", "").lower() + + if "all users" in customer_impact or "complete outage" in customer_impact: + consequences.append({"consequence": "Complete service unavailability", "severity": "critical"}) + + if "revenue" in business_impact: + consequences.append({"consequence": "Revenue loss", "severity": "high"}) + + return consequences + + def _identify_existing_barriers(self, incident_data: Dict, timeline_data: Optional[Dict]) -> List[Dict]: + """Identify existing preventive/protective barriers.""" + barriers = [] + + # Look for evidence of existing controls + if timeline_data and "timeline" in timeline_data: + events = timeline_data["timeline"].get("events", []) + + for event in events: + message = event.get("message", "").lower() + if "alert" in message or "monitoring" in message: + barriers.append({ + "barrier": "Monitoring and alerting system", + "type": "detective", + "effectiveness": "partial" + }) + elif "rollback" in message: + barriers.append({ + "barrier": "Rollback capability", + "type": "corrective", + "effectiveness": 
"effective" + }) + + return barriers + + def _recommend_additional_barriers(self, threats: List[Dict], consequences: List[Dict]) -> List[Dict]: + """Recommend additional barriers based on threats and consequences.""" + recommendations = [] + + for threat in threats: + if "deployment" in threat["threat"].lower(): + recommendations.append({ + "barrier": "Enhanced pre-deployment testing", + "type": "preventive", + "justification": "Prevent defective deployments reaching production" + }) + elif "load" in threat["threat"].lower(): + recommendations.append({ + "barrier": "Auto-scaling and load shedding", + "type": "preventive", + "justification": "Handle unexpected load increases automatically" + }) + + return recommendations + + def _calculate_rca_confidence(self, analysis_data: Any, incident_data: Dict) -> str: + """Calculate confidence level for RCA results.""" + # Simple heuristic based on available data + confidence_score = 0 + + # More detailed incident data increases confidence + if incident_data.get("description") and len(incident_data["description"]) > 50: + confidence_score += 1 + + if incident_data.get("timeline") or incident_data.get("events"): + confidence_score += 2 + + if incident_data.get("logs") or incident_data.get("monitoring_data"): + confidence_score += 2 + + # Analysis data completeness + if isinstance(analysis_data, list) and len(analysis_data) > 3: + confidence_score += 1 + elif isinstance(analysis_data, dict) and len(analysis_data) > 5: + confidence_score += 1 + + if confidence_score >= 4: + return "high" + elif confidence_score >= 2: + return "medium" + else: + return "low" + + def _generate_lessons_learned(self, incident_data: Dict, timeline_data: Optional[Dict], + rca_results: Dict) -> Dict[str, List[str]]: + """Generate categorized lessons learned.""" + lessons = defaultdict(list) + + # Lessons from RCA + root_causes = rca_results.get("root_causes", []) + for root_cause in root_causes: + category = root_cause.get("category", 
"technical_systems").lower() + category_key = self._map_to_lessons_category(category) + + lesson = f"Identified: {root_cause['cause']}" + lessons[category_key].append(lesson) + + # Lessons from timeline analysis + if timeline_data and "gap_analysis" in timeline_data: + gaps = timeline_data["gap_analysis"].get("gaps", []) + for gap in gaps: + if gap.get("severity") == "critical": + lessons["response_and_escalation"].append( + f"Response time gap: {gap['type'].replace('_', ' ')} took {gap['gap_minutes']} minutes" + ) + + # Generic lessons based on incident characteristics + severity = incident_data.get("severity", "").lower() + if severity in ["sev1", "critical"]: + lessons["detection_and_monitoring"].append( + "Critical incidents require immediate detection and alerting" + ) + + return dict(lessons) + + def _map_to_lessons_category(self, category: str) -> str: + """Map RCA category to lessons learned category.""" + mapping = { + "people": "team_and_culture", + "process": "process_and_procedures", + "technology": "technical_systems", + "environment": "technical_systems", + "unknown": "process_and_procedures" + } + return mapping.get(category, "technical_systems") + + def _generate_action_items(self, incident_data: Dict, rca_results: Dict, + lessons_learned: Dict) -> List[Dict]: + """Generate actionable follow-up items.""" + action_items = [] + + # Actions from root causes + root_causes = rca_results.get("root_causes", []) + for root_cause in root_causes: + action_type = self._determine_action_type(root_cause) + action_template = self.action_item_types[action_type] + + action_items.append({ + "title": f"Address: {root_cause['cause'][:50]}...", + "description": root_cause["cause"], + "type": action_type, + "priority": action_template["priority"], + "timeline": action_template["timeline"], + "owner": "TBD", + "success_criteria": f"Prevent recurrence of {root_cause['cause'][:30]}...", + "related_root_cause": root_cause + }) + + # Actions from lessons learned + for 
category, lessons in lessons_learned.items(): + if len(lessons) > 1: # Multiple lessons in same category indicate systematic issue + action_items.append({ + "title": f"Improve {category.replace('_', ' ')}", + "description": f"Address multiple issues identified in {category}", + "type": "process_improvement", + "priority": "P1", + "timeline": "2-3 weeks", + "owner": "TBD", + "success_criteria": f"Comprehensive review and improvement of {category}" + }) + + # Standard actions based on severity + severity = incident_data.get("severity", "").lower() + if severity in ["sev1", "critical"]: + action_items.append({ + "title": "Conduct comprehensive post-incident review", + "description": "Schedule PIR meeting with all stakeholders", + "type": "process_improvement", + "priority": "P0", + "timeline": "24-48 hours", + "owner": incident_data.get("incident_commander", "TBD"), + "success_criteria": "PIR completed and documented" + }) + + return action_items + + def _determine_action_type(self, root_cause: Dict) -> str: + """Determine action item type based on root cause.""" + cause_text = root_cause.get("cause", "").lower() + category = root_cause.get("category", "").lower() + + if any(keyword in cause_text for keyword in ["bug", "error", "failure", "crash"]): + return "immediate_fix" + elif any(keyword in cause_text for keyword in ["monitor", "alert", "detect"]): + return "monitoring_alerting" + elif any(keyword in cause_text for keyword in ["process", "procedure", "review"]): + return "process_improvement" + elif any(keyword in cause_text for keyword in ["document", "runbook", "knowledge"]): + return "documentation" + elif any(keyword in cause_text for keyword in ["training", "skill", "knowledge"]): + return "training" + elif any(keyword in cause_text for keyword in ["architecture", "design", "system"]): + return "architectural" + else: + return "process_improvement" # Default + + def _create_timeline_section(self, timeline_data: Optional[Dict], severity: str) -> str: + 
"""Create timeline section for PIR document.""" + if not timeline_data: + return "No detailed timeline available." + + timeline_content = [] + + if "timeline" in timeline_data and "phases" in timeline_data["timeline"]: + timeline_content.append("### Phase Timeline") + timeline_content.append("") + + phases = timeline_data["timeline"]["phases"] + for phase in phases: + timeline_content.append(f"**{phase['name'].title()} Phase**") + timeline_content.append(f"- Start: {phase['start_time']}") + timeline_content.append(f"- Duration: {phase['duration_minutes']} minutes") + timeline_content.append(f"- Events: {phase['event_count']}") + timeline_content.append("") + + if "metrics" in timeline_data: + metrics = timeline_data["metrics"] + duration_metrics = metrics.get("duration_metrics", {}) + + timeline_content.append("### Key Metrics") + timeline_content.append("") + timeline_content.append(f"- Total Duration: {duration_metrics.get('total_duration_minutes', 'N/A')} minutes") + timeline_content.append(f"- Time to Mitigation: {duration_metrics.get('time_to_mitigation_minutes', 'N/A')} minutes") + timeline_content.append(f"- Time to Resolution: {duration_metrics.get('time_to_resolution_minutes', 'N/A')} minutes") + timeline_content.append("") + + return "\n".join(timeline_content) + + def _generate_document_sections(self, incident_info: Dict, rca_results: Dict, + lessons_learned: Dict, action_items: List[Dict], + timeline_section: str) -> Dict[str, str]: + """Generate all document sections for PIR template.""" + sections = {} + + # Basic information + sections["incident_title"] = incident_info["title"] + sections["incident_id"] = incident_info["incident_id"] + sections["incident_date"] = incident_info["start_time"].strftime("%Y-%m-%d %H:%M:%S UTC") if incident_info["start_time"] else "Unknown" + sections["duration"] = incident_info["duration"] + sections["severity"] = incident_info["severity"].upper() + sections["status"] = incident_info["status"].title() + 
sections["incident_commander"] = incident_info["incident_commander"] + sections["responders"] = ", ".join(incident_info["responders"]) if incident_info["responders"] else "TBD" + sections["generation_date"] = datetime.now().strftime("%Y-%m-%d") + + # Impact sections + sections["customer_impact"] = incident_info["customer_impact"] + sections["business_impact"] = incident_info["business_impact"] + + # Executive summary + sections["executive_summary"] = self._create_executive_summary(incident_info, rca_results) + + # Timeline + sections["timeline_section"] = timeline_section + + # RCA section + sections["rca_section"] = self._create_rca_section(rca_results) + + # What went well/wrong + sections["what_went_well"] = self._create_what_went_well_section(incident_info, rca_results) + sections["what_went_wrong"] = self._create_what_went_wrong_section(rca_results, lessons_learned) + + # Lessons learned + sections["lessons_learned"] = self._create_lessons_learned_section(lessons_learned) + + # Action items + sections["action_items"] = self._create_action_items_section(action_items) + + # Prevention and appendix + sections["prevention_measures"] = self._create_prevention_section(rca_results, action_items) + sections["appendix_section"] = self._create_appendix_section(incident_info) + + return sections + + def _create_executive_summary(self, incident_info: Dict, rca_results: Dict) -> str: + """Create executive summary section.""" + summary_parts = [] + + # Incident description + summary_parts.append(f"On {incident_info['start_time'].strftime('%B %d, %Y') if incident_info['start_time'] else 'an unknown date'}, we experienced a {incident_info['severity']} incident affecting {incident_info.get('affected_services', ['our services'])}.") + + # Duration and impact + summary_parts.append(f"The incident lasted {incident_info['duration']} and had the following impact: {incident_info['customer_impact']}") + + # Root cause summary + root_causes = rca_results.get("root_causes", []) + if 
root_causes: + primary_cause = root_causes[0]["cause"] + summary_parts.append(f"Root cause analysis identified the primary issue as: {primary_cause}") + + # Resolution + summary_parts.append(f"The incident has been {incident_info['status']} and we have identified specific actions to prevent recurrence.") + + return " ".join(summary_parts) + + def _create_rca_section(self, rca_results: Dict) -> str: + """Create RCA section content.""" + rca_content = [] + + method = rca_results.get("method", "unknown") + rca_content.append(f"### Analysis Method: {self.rca_frameworks.get(method, {}).get('name', method)}") + rca_content.append("") + + if method == "five_whys" and "why_analysis" in rca_results: + rca_content.append("#### Why Analysis") + rca_content.append("") + + for i, why in enumerate(rca_results["why_analysis"], 1): + rca_content.append(f"**Why {i}:** {why['question']}") + rca_content.append(f"**Answer:** {why['answer']}") + if why["evidence"]: + rca_content.append(f"**Evidence:** {', '.join(why['evidence'])}") + rca_content.append("") + + elif method == "fishbone" and "categories" in rca_results: + rca_content.append("#### Contributing Factor Analysis") + rca_content.append("") + + for category, data in rca_results["categories"].items(): + if data["factors"]: + rca_content.append(f"**{category}:**") + for factor in data["factors"]: + rca_content.append(f"- {factor['factor']} (likelihood: {factor.get('likelihood', 'unknown')})") + rca_content.append("") + + # Root causes summary + root_causes = rca_results.get("root_causes", []) + if root_causes: + rca_content.append("#### Identified Root Causes") + rca_content.append("") + + for i, cause in enumerate(root_causes, 1): + rca_content.append(f"{i}. 
**{cause['cause']}**") + rca_content.append(f" - Category: {cause.get('category', 'Unknown')}") + rca_content.append(f" - Confidence: {cause.get('confidence', 'Unknown')}") + if cause.get("evidence"): + rca_content.append(f" - Evidence: {cause['evidence']}") + rca_content.append("") + + return "\n".join(rca_content) + + def _create_what_went_well_section(self, incident_info: Dict, rca_results: Dict) -> str: + """Create what went well section.""" + positives = [] + + # Generic positive aspects + if incident_info["status"] == "resolved": + positives.append("The incident was successfully resolved") + + if incident_info["incident_commander"] != "TBD": + positives.append("Incident command was established") + + if len(incident_info.get("responders", [])) > 1: + positives.append("Multiple team members collaborated on resolution") + + # Analysis-specific positives + if rca_results.get("confidence") == "high": + positives.append("Root cause analysis provided clear insights") + + if not positives: + positives.append("Incident response process was followed") + + return "\n".join([f"- {positive}" for positive in positives]) + + def _create_what_went_wrong_section(self, rca_results: Dict, lessons_learned: Dict) -> str: + """Create what went wrong section.""" + issues = [] + + # Issues from RCA + root_causes = rca_results.get("root_causes", []) + for cause in root_causes[:3]: # Show top 3 + issues.append(cause["cause"]) + + # Issues from lessons learned + for category, lessons in lessons_learned.items(): + if lessons: + issues.append(f"{category.replace('_', ' ').title()}: {lessons[0]}") + + if not issues: + issues.append("Analysis in progress") + + return "\n".join([f"- {issue}" for issue in issues]) + + def _create_lessons_learned_section(self, lessons_learned: Dict) -> str: + """Create lessons learned section.""" + content = [] + + for category, lessons in lessons_learned.items(): + if lessons: + content.append(f"### {category.replace('_', ' ').title()}") + content.append("") 
+ + for lesson in lessons: + content.append(f"- {lesson}") + + content.append("") + + if not content: + content.append("Lessons learned to be documented following detailed analysis.") + + return "\n".join(content) + + def _create_action_items_section(self, action_items: List[Dict]) -> str: + """Create action items section.""" + if not action_items: + return "Action items to be defined." + + content = [] + + # Group by priority + priority_groups = defaultdict(list) + for item in action_items: + priority_groups[item.get("priority", "P3")].append(item) + + for priority in ["P0", "P1", "P2", "P3"]: + items = priority_groups.get(priority, []) + if items: + content.append(f"### {priority} - {self._get_priority_description(priority)}") + content.append("") + + for item in items: + content.append(f"**{item['title']}**") + content.append(f"- Owner: {item.get('owner', 'TBD')}") + content.append(f"- Timeline: {item.get('timeline', 'TBD')}") + content.append(f"- Success Criteria: {item.get('success_criteria', 'TBD')}") + content.append("") + + return "\n".join(content) + + def _get_priority_description(self, priority: str) -> str: + """Get human-readable priority description.""" + descriptions = { + "P0": "Critical - Immediate Action Required", + "P1": "High Priority - Complete Within 1-2 Weeks", + "P2": "Medium Priority - Complete Within 1 Month", + "P3": "Low Priority - Complete When Capacity Allows" + } + return descriptions.get(priority, "Unknown Priority") + + def _create_prevention_section(self, rca_results: Dict, action_items: List[Dict]) -> str: + """Create prevention and follow-up section.""" + content = [] + + content.append("### Prevention Measures") + content.append("") + content.append("Based on the root cause analysis, the following preventive measures have been identified:") + content.append("") + + # Extract prevention-focused action items + prevention_items = [item for item in action_items if "prevent" in item.get("description", "").lower()] + + if 
prevention_items: + for item in prevention_items: + content.append(f"- {item['title']}: {item.get('description', '')}") + else: + content.append("- Implement comprehensive testing for similar scenarios") + content.append("- Improve monitoring and alerting coverage") + content.append("- Enhance error handling and resilience patterns") + + content.append("") + content.append("### Follow-up Schedule") + content.append("") + content.append("- 1 week: Review action item progress") + content.append("- 1 month: Evaluate effectiveness of implemented changes") + content.append("- 3 months: Conduct follow-up assessment and update preventive measures") + + return "\n".join(content) + + def _create_appendix_section(self, incident_info: Dict) -> str: + """Create appendix section.""" + content = [] + + content.append("### Additional Information") + content.append("") + content.append(f"- Incident ID: {incident_info['incident_id']}") + content.append(f"- Severity Classification: {incident_info['severity']}") + + if incident_info.get("affected_services"): + content.append(f"- Affected Services: {', '.join(incident_info['affected_services'])}") + + content.append("") + content.append("### References") + content.append("") + content.append("- Incident tracking ticket: [Link TBD]") + content.append("- Monitoring dashboards: [Link TBD]") + content.append("- Communication thread: [Link TBD]") + + return "\n".join(content) + + def _generate_metadata(self, incident_info: Dict, rca_results: Dict, action_items: List[Dict]) -> Dict[str, Any]: + """Generate PIR metadata for tracking and analysis.""" + return { + "pir_id": f"PIR-{incident_info['incident_id']}", + "incident_severity": incident_info["severity"], + "rca_method": rca_results.get("method", "unknown"), + "rca_confidence": rca_results.get("confidence", "unknown"), + "total_action_items": len(action_items), + "critical_action_items": len([item for item in action_items if item.get("priority") == "P0"]), + 
"estimated_prevention_timeline": self._estimate_prevention_timeline(action_items), + "categories_affected": list(set(item.get("type", "unknown") for item in action_items)), + "review_completeness": self._assess_review_completeness(incident_info, rca_results, action_items) + } + + def _estimate_prevention_timeline(self, action_items: List[Dict]) -> str: + """Estimate timeline for implementing all prevention measures.""" + if not action_items: + return "unknown" + + # Find the longest timeline among action items + max_weeks = 0 + for item in action_items: + timeline = item.get("timeline", "") + if "week" in timeline: + try: + weeks = int(re.findall(r'\d+', timeline)[0]) + max_weeks = max(max_weeks, weeks) + except (IndexError, ValueError): + pass + elif "month" in timeline: + try: + months = int(re.findall(r'\d+', timeline)[0]) + max_weeks = max(max_weeks, months * 4) + except (IndexError, ValueError): + pass + + if max_weeks == 0: + return "1-2 weeks" + elif max_weeks <= 4: + return f"{max_weeks} weeks" + else: + return f"{max_weeks // 4} months" + + def _assess_review_completeness(self, incident_info: Dict, rca_results: Dict, action_items: List[Dict]) -> float: + """Assess completeness of the PIR (0-1 score).""" + score = 0.0 + + # Basic information completeness + if incident_info.get("description"): + score += 0.1 + if incident_info.get("start_time"): + score += 0.1 + if incident_info.get("customer_impact"): + score += 0.1 + + # RCA completeness + if rca_results.get("root_causes"): + score += 0.2 + if rca_results.get("confidence") in ["medium", "high"]: + score += 0.1 + + # Action items completeness + if action_items: + score += 0.2 + if any(item.get("owner") and item["owner"] != "TBD" for item in action_items): + score += 0.1 + + # Additional factors + if incident_info.get("incident_commander") != "TBD": + score += 0.1 + if len(action_items) >= 3: # Multiple action items show thorough analysis + score += 0.1 + + return min(score, 1.0) + + +def 
def format_json_output(result: Dict) -> str:
    """Serialize the full result dict as pretty-printed JSON."""
    return json.dumps(result, indent=2, ensure_ascii=False)


def format_markdown_output(result: Dict) -> str:
    """Return the rendered Markdown PIR document from the result dict."""
    return result.get("pir_document", "Error: No PIR document generated")


def format_text_output(result: Dict) -> str:
    """Render a compact human-readable summary of the PIR result."""
    if "error" in result:
        return f"Error: {result['error']}"

    metadata = result.get("metadata", {})
    incident_info = result.get("incident_info", {})
    rca_results = result.get("rca_results", {})
    action_items = result.get("action_items", [])

    banner = "=" * 60
    lines = [banner, "POST-INCIDENT REVIEW SUMMARY", banner, ""]

    # Basic incident information.
    lines.append("INCIDENT INFORMATION:")
    lines.append(f" PIR ID: {metadata.get('pir_id', 'Unknown')}")
    lines.append(f" Severity: {incident_info.get('severity', 'Unknown').upper()}")
    lines.append(f" Duration: {incident_info.get('duration', 'Unknown')}")
    lines.append(f" Status: {incident_info.get('status', 'Unknown').title()}")
    lines.append("")

    # RCA summary.
    lines.append("ROOT CAUSE ANALYSIS:")
    lines.append(f" Method: {rca_results.get('method', 'Unknown')}")
    lines.append(f" Confidence: {rca_results.get('confidence', 'Unknown').title()}")

    root_causes = rca_results.get("root_causes", [])
    if root_causes:
        lines.append(f" Root Causes Identified: {len(root_causes)}")
        for i, cause in enumerate(root_causes[:3], 1):
            lines.append(f" {i}. {cause.get('cause', 'Unknown')[:60]}...")
        lines.append("")

    # Action item summary.
    lines.append("ACTION ITEMS:")
    lines.append(f" Total Actions: {len(action_items)}")
    lines.append(f" Critical (P0): {metadata.get('critical_action_items', 0)}")
    lines.append(f" Prevention Timeline: {metadata.get('estimated_prevention_timeline', 'Unknown')}")

    if action_items:
        lines.append(" Top Actions:")
        for item in action_items[:3]:
            lines.append(f" - {item.get('title', 'Unknown')[:50]}...")
        lines.append("")

    # Completeness gauge.
    completeness = metadata.get("review_completeness", 0) * 100
    lines.append(f"REVIEW COMPLETENESS: {completeness:.0f}%")
    lines.append("")
    lines.append(banner)

    return "\n".join(lines)


def main():
    """CLI entry point: parse arguments, load inputs, generate and emit the PIR."""
    parser = argparse.ArgumentParser(
        description="Generate Post-Incident Review documents with RCA and action items",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python pir_generator.py --incident incident.json --output pir.md
  python pir_generator.py --incident incident.json --rca-method fishbone
  cat incident.json | python pir_generator.py --format markdown

Incident JSON format:
  {
    "incident_id": "INC-2024-001",
    "title": "Database performance degradation",
    "description": "Users experiencing slow response times",
    "severity": "sev2",
    "start_time": "2024-01-01T12:00:00Z",
    "end_time": "2024-01-01T14:30:00Z",
    "customer_impact": "50% of users affected by slow page loads",
    "business_impact": "Moderate user experience degradation",
    "incident_commander": "Alice Smith",
    "responders": ["Bob Jones", "Carol Johnson"]
  }
        """
    )

    parser.add_argument(
        "--incident", "-i",
        help="Incident data file (JSON) or '-' for stdin"
    )
    parser.add_argument(
        "--timeline", "-t",
        help="Timeline reconstruction file (JSON)"
    )
    parser.add_argument(
        "--output", "-o",
        help="Output file path (default: stdout)"
    )
    parser.add_argument(
        "--format", "-f",
        choices=["json", "markdown", "text"],
        default="markdown",
        help="Output format (default: markdown)"
    )
    parser.add_argument(
        "--rca-method",
        choices=["five_whys", "fishbone", "timeline", "bow_tie"],
        default="five_whys",
        help="Root cause analysis method (default: five_whys)"
    )
    parser.add_argument(
        "--template-type",
        choices=["comprehensive", "standard", "brief"],
        default="comprehensive",
        help="PIR template type (default: comprehensive)"
    )
    # NOTE(review): this flag is parsed but never read below.
    parser.add_argument(
        "--action-items",
        action="store_true",
        help="Generate detailed action items"
    )

    args = parser.parse_args()

    generator = PIRGenerator()

    try:
        # Incident data: explicit stdin, piped stdin, or a file path.
        if args.incident == "-" or (not args.incident and not sys.stdin.isatty()):
            input_text = sys.stdin.read().strip()
            if not input_text:
                parser.error("No incident data provided")
            incident_data = json.loads(input_text)
        elif args.incident:
            with open(args.incident, 'r') as fh:
                incident_data = json.load(fh)
        else:
            parser.error("No incident data specified. Use --incident or pipe data to stdin.")

        # Optional timeline reconstruction data.
        timeline_data = None
        if args.timeline:
            with open(args.timeline, 'r') as fh:
                timeline_data = json.load(fh)

        # Minimal validation before generation.
        if not isinstance(incident_data, dict):
            parser.error("Incident data must be a JSON object")
        if not incident_data.get("description") and not incident_data.get("title"):
            parser.error("Incident data must contain 'description' or 'title'")

        result = generator.generate_pir(
            incident_data=incident_data,
            timeline_data=timeline_data,
            rca_method=args.rca_method,
            template_type=args.template_type
        )

        # Pick the formatter by name; text is the fallback.
        formatters = {"json": format_json_output, "markdown": format_markdown_output}
        output = formatters.get(args.format, format_text_output)(result)

        if args.output:
            with open(args.output, 'w') as fh:
                fh.write(output)
                fh.write('\n')
        else:
            print(output)

    except FileNotFoundError as e:
        print(f"Error: File not found - {e}", file=sys.stderr)
        sys.exit(1)
    except json.JSONDecodeError as e:
        print(f"Error: Invalid JSON - {e}", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
#!/usr/bin/env python3
"""
Timeline Reconstructor

Reconstructs incident timelines from timestamped events (logs, alerts, Slack messages).
Identifies incident phases, calculates durations, and performs gap analysis.

This tool processes chronological event data and creates a coherent narrative
of how an incident progressed from detection through resolution.

Usage:
    python timeline_reconstructor.py --input events.json --output timeline.md
    python timeline_reconstructor.py --input events.json --detect-phases --gap-analysis
    cat events.json | python timeline_reconstructor.py --format text
"""

import argparse
import json
import sys
import re
from datetime import datetime, timezone, timedelta
from typing import Dict, List, Optional, Any, Tuple
from collections import defaultdict, namedtuple


# Normalized event record shared by every reconstruction step.
Event = namedtuple('Event', ['timestamp', 'source', 'type', 'message', 'severity', 'actor', 'metadata'])

# Detected incident phase: bounds, duration (minutes), member events.
Phase = namedtuple('Phase', ['name', 'start_time', 'end_time', 'duration', 'events', 'description'])


class TimelineReconstructor:
    """
    Reconstructs incident timelines from disparate event sources.
    Identifies phases, calculates metrics, and performs gap analysis.
    """

    def __init__(self):
        """Load phase-detection rules, event taxonomies, severity map and gap thresholds."""
        self.phase_patterns = self._load_phase_patterns()
        self.event_types = self._load_event_types()
        self.severity_mapping = self._load_severity_mapping()
        self.gap_thresholds = self._load_gap_thresholds()

    def _load_phase_patterns(self) -> Dict[str, Dict]:
        """Return the keyword/type rules used to classify events into phases."""
        return {
            "detection": {
                "keywords": [
                    "alert", "alarm", "triggered", "fired", "detected", "noticed",
                    "monitoring", "threshold exceeded", "anomaly", "spike",
                    "error rate", "latency increase", "timeout", "failure"
                ],
                "event_types": ["alert", "monitoring", "notification"],
                "priority": 1,
                "description": "Initial detection of the incident through monitoring or observation"
            },
            "triage": {
                "keywords": [
                    "investigating", "triaging", "assessing", "evaluating",
                    "checking", "looking into", "analyzing", "reviewing",
                    "diagnosis", "troubleshooting", "examining"
                ],
                "event_types": ["investigation", "communication", "action"],
                "priority": 2,
                "description": "Assessment and initial investigation of the incident"
            },
            "escalation": {
                "keywords": [
                    "escalating", "paging", "calling in", "requesting help",
                    "engaging", "involving", "notifying", "alerting team",
                    "incident commander", "war room", "all hands"
                ],
                "event_types": ["escalation", "communication", "notification"],
                "priority": 3,
                "description": "Escalation to additional resources or higher severity response"
            },
            "mitigation": {
                "keywords": [
                    "fixing", "patching", "deploying", "rolling back", "restarting",
                    "scaling", "rerouting", "bypassing", "workaround",
                    "implementing fix", "applying solution", "remediation"
                ],
                "event_types": ["deployment", "action", "fix"],
                "priority": 4,
                "description": "Active mitigation efforts to resolve the incident"
            },
            "resolution": {
                "keywords": [
                    "resolved", "fixed", "restored", "recovered", "back online",
                    "working", "normal", "stable", "healthy", "operational",
                    "incident closed", "service restored"
                ],
                "event_types": ["resolution", "confirmation"],
                "priority": 5,
                "description": "Confirmation that the incident has been resolved"
            },
            "review": {
                "keywords": [
                    "post-mortem", "retrospective", "review", "lessons learned",
                    "pir", "post-incident", "analysis", "follow-up",
                    "action items", "improvements"
                ],
                "event_types": ["review", "documentation"],
                "priority": 6,
                "description": "Post-incident review and documentation activities"
            }
        }

    def _load_event_types(self) -> Dict[str, Dict]:
        """Return source/indicator rules used to classify raw events by type."""
        return {
            "alert": {
                "sources": ["monitoring", "nagios", "datadog", "newrelic", "prometheus"],
                "indicators": ["alert", "alarm", "threshold", "metric"],
                "severity_boost": 2
            },
            "log": {
                "sources": ["application", "server", "container", "system"],
                "indicators": ["error", "exception", "warn", "fail"],
                "severity_boost": 1
            },
            "communication": {
                "sources": ["slack", "teams", "email", "chat"],
                "indicators": ["message", "notification", "update"],
                "severity_boost": 0
            },
            "deployment": {
                "sources": ["ci/cd", "jenkins", "github", "gitlab", "deploy"],
                "indicators": ["deploy", "release", "build", "merge"],
                "severity_boost": 3
            },
            "action": {
                "sources": ["manual", "script", "automation", "operator"],
                "indicators": ["executed", "ran", "performed", "applied"],
                "severity_boost": 2
            },
            "escalation": {
                "sources": ["pagerduty", "opsgenie", "oncall", "escalation"],
                "indicators": ["paged", "escalated", "notified", "assigned"],
                "severity_boost": 3
            }
        }

    def _load_severity_mapping(self) -> Dict[str, int]:
        """Return the string-label -> numeric severity mapping (0 = unknown)."""
        return {
            "critical": 5, "crit": 5, "sev1": 5, "p1": 5,
            "high": 4, "major": 4, "sev2": 4, "p2": 4,
            "medium": 3, "moderate": 3, "sev3": 3, "p3": 3,
            "low": 2, "minor": 2, "sev4": 2, "p4": 2,
            "info": 1, "informational": 1, "debug": 1,
            "unknown": 0
        }

    def _load_gap_thresholds(self) -> Dict[str, int]:
        """Return gap-analysis thresholds, all expressed in minutes."""
        return {
            "detection_to_triage": 15,       # Should start investigating within 15 min
            "triage_to_mitigation": 30,      # Should start mitigation within 30 min
            "mitigation_to_resolution": 120, # Should resolve within 2 hours
            "communication_gap": 30,         # Should communicate every 30 min
            "action_gap": 60,                # Should take actions every hour
            "phase_transition": 45           # Should transition phases within 45 min
        }

    def reconstruct_timeline(self, events_data: List[Dict]) -> Dict[str, Any]:
        """
        Main reconstruction method that processes events and builds the timeline.

        Args:
            events_data: List of raw event dictionaries.

        Returns:
            Dictionary with timeline analysis and metrics, or an 'error' key
            when no event could be parsed.
        """
        parsed = self._parse_events(events_data)
        if not parsed:
            return {"error": "No valid events found"}

        # Chronological order is required by every downstream step.
        parsed.sort(key=lambda ev: ev.timestamp)

        phases = self._detect_phases(parsed)
        metrics = self._calculate_metrics(parsed, phases)
        gap_analysis = self._analyze_gaps(parsed, phases)
        narrative = self._generate_narrative(parsed, phases)
        summary = self._generate_summary(parsed, phases, metrics)

        first, last = parsed[0], parsed[-1]
        return {
            "timeline": {
                "total_events": len(parsed),
                "time_range": {
                    "start": first.timestamp.isoformat(),
                    "end": last.timestamp.isoformat(),
                    "duration_minutes": int((last.timestamp - first.timestamp).total_seconds() / 60)
                },
                "phases": [self._phase_to_dict(phase) for phase in phases],
                "events": [self._event_to_dict(ev) for ev in parsed]
            },
            "metrics": metrics,
            "gap_analysis": gap_analysis,
            "narrative": narrative,
            "summary": summary,
            "reconstruction_timestamp": datetime.now(timezone.utc).isoformat()
        }
_parse_events(self, events_data: List[Dict]) -> List[Event]: + """Parse raw event data into normalized Event objects.""" + events = [] + + for event_dict in events_data: + try: + # Parse timestamp + timestamp_str = event_dict.get("timestamp", event_dict.get("time", "")) + if not timestamp_str: + continue + + timestamp = self._parse_timestamp(timestamp_str) + if not timestamp: + continue + + # Extract other fields + source = event_dict.get("source", "unknown") + event_type = self._classify_event_type(event_dict) + message = event_dict.get("message", event_dict.get("description", "")) + severity = self._parse_severity(event_dict.get("severity", event_dict.get("level", "unknown"))) + actor = event_dict.get("actor", event_dict.get("user", "system")) + + # Extract metadata + metadata = {k: v for k, v in event_dict.items() + if k not in ["timestamp", "time", "source", "type", "message", "severity", "actor"]} + + event = Event( + timestamp=timestamp, + source=source, + type=event_type, + message=message, + severity=severity, + actor=actor, + metadata=metadata + ) + + events.append(event) + + except Exception as e: + # Skip invalid events but log them + continue + + return events + + def _parse_timestamp(self, timestamp_str: str) -> Optional[datetime]: + """Parse various timestamp formats.""" + # Common timestamp formats + formats = [ + "%Y-%m-%dT%H:%M:%S.%fZ", # ISO with microseconds + "%Y-%m-%dT%H:%M:%SZ", # ISO without microseconds + "%Y-%m-%d %H:%M:%S", # Standard format + "%m/%d/%Y %H:%M:%S", # US format + "%d/%m/%Y %H:%M:%S", # EU format + "%Y-%m-%d %H:%M:%S.%f", # With microseconds + "%Y%m%d_%H%M%S", # Compact format + ] + + for fmt in formats: + try: + dt = datetime.strptime(timestamp_str, fmt) + # Ensure timezone awareness + if dt.tzinfo is None: + dt = dt.replace(tzinfo=timezone.utc) + return dt + except ValueError: + continue + + # Try parsing as Unix timestamp + try: + timestamp_float = float(timestamp_str) + return datetime.fromtimestamp(timestamp_float, 
tz=timezone.utc) + except ValueError: + pass + + return None + + def _classify_event_type(self, event_dict: Dict) -> str: + """Classify event type based on source and content.""" + source = event_dict.get("source", "").lower() + message = event_dict.get("message", "").lower() + event_type = event_dict.get("type", "").lower() + + # Check explicit type first + if event_type in self.event_types: + return event_type + + # Classify based on source and content + for type_name, type_info in self.event_types.items(): + # Check source patterns + if any(src in source for src in type_info["sources"]): + return type_name + + # Check message indicators + if any(indicator in message for indicator in type_info["indicators"]): + return type_name + + return "unknown" + + def _parse_severity(self, severity_str: str) -> int: + """Parse severity string to numeric value.""" + severity_clean = str(severity_str).lower().strip() + return self.severity_mapping.get(severity_clean, 0) + + def _detect_phases(self, events: List[Event]) -> List[Phase]: + """Detect incident phases based on event patterns.""" + phases = [] + current_phase = None + phase_events = [] + + for event in events: + detected_phase = self._identify_phase(event) + + if detected_phase != current_phase: + # End current phase if exists + if current_phase and phase_events: + phase_obj = Phase( + name=current_phase, + start_time=phase_events[0].timestamp, + end_time=phase_events[-1].timestamp, + duration=(phase_events[-1].timestamp - phase_events[0].timestamp).total_seconds() / 60, + events=phase_events.copy(), + description=self.phase_patterns[current_phase]["description"] + ) + phases.append(phase_obj) + + # Start new phase + current_phase = detected_phase + phase_events = [event] + else: + phase_events.append(event) + + # Add final phase + if current_phase and phase_events: + phase_obj = Phase( + name=current_phase, + start_time=phase_events[0].timestamp, + end_time=phase_events[-1].timestamp, + 
duration=(phase_events[-1].timestamp - phase_events[0].timestamp).total_seconds() / 60, + events=phase_events, + description=self.phase_patterns[current_phase]["description"] + ) + phases.append(phase_obj) + + return self._merge_adjacent_phases(phases) + + def _identify_phase(self, event: Event) -> str: + """Identify which phase an event belongs to.""" + message_lower = event.message.lower() + + # Score each phase based on keywords and event type + phase_scores = {} + + for phase_name, pattern_info in self.phase_patterns.items(): + score = 0 + + # Keyword matching + for keyword in pattern_info["keywords"]: + if keyword in message_lower: + score += 2 + + # Event type matching + if event.type in pattern_info["event_types"]: + score += 3 + + # Severity boost for certain phases + if phase_name == "escalation" and event.severity >= 4: + score += 2 + + phase_scores[phase_name] = score + + # Return highest scoring phase, default to triage + if phase_scores and max(phase_scores.values()) > 0: + return max(phase_scores, key=phase_scores.get) + + return "triage" # Default phase + + def _merge_adjacent_phases(self, phases: List[Phase]) -> List[Phase]: + """Merge adjacent phases of the same type.""" + if not phases: + return phases + + merged = [] + current_phase = phases[0] + + for next_phase in phases[1:]: + if (next_phase.name == current_phase.name and + (next_phase.start_time - current_phase.end_time).total_seconds() < 300): # 5 min gap + # Merge phases + merged_events = current_phase.events + next_phase.events + current_phase = Phase( + name=current_phase.name, + start_time=current_phase.start_time, + end_time=next_phase.end_time, + duration=(next_phase.end_time - current_phase.start_time).total_seconds() / 60, + events=merged_events, + description=current_phase.description + ) + else: + merged.append(current_phase) + current_phase = next_phase + + merged.append(current_phase) + return merged + + def _calculate_metrics(self, events: List[Event], phases: List[Phase]) -> 
Dict[str, Any]: + """Calculate timeline metrics and KPIs.""" + if not events or not phases: + return {} + + start_time = events[0].timestamp + end_time = events[-1].timestamp + total_duration = (end_time - start_time).total_seconds() / 60 + + # Phase timing metrics + phase_durations = {phase.name: phase.duration for phase in phases} + + # Detection metrics + detection_time = 0 + if phases and phases[0].name == "detection": + detection_time = phases[0].duration + + # Time to mitigation + mitigation_start = None + for phase in phases: + if phase.name == "mitigation": + mitigation_start = (phase.start_time - start_time).total_seconds() / 60 + break + + # Time to resolution + resolution_time = None + for phase in phases: + if phase.name == "resolution": + resolution_time = (phase.start_time - start_time).total_seconds() / 60 + break + + # Communication frequency + comm_events = [e for e in events if e.type == "communication"] + comm_frequency = len(comm_events) / (total_duration / 60) if total_duration > 0 else 0 + + # Action frequency + action_events = [e for e in events if e.type == "action"] + action_frequency = len(action_events) / (total_duration / 60) if total_duration > 0 else 0 + + # Event source distribution + source_counts = defaultdict(int) + for event in events: + source_counts[event.source] += 1 + + return { + "duration_metrics": { + "total_duration_minutes": round(total_duration, 1), + "detection_duration_minutes": round(detection_time, 1), + "time_to_mitigation_minutes": round(mitigation_start or 0, 1), + "time_to_resolution_minutes": round(resolution_time or 0, 1), + "phase_durations": {k: round(v, 1) for k, v in phase_durations.items()} + }, + "activity_metrics": { + "total_events": len(events), + "events_per_hour": round((len(events) / (total_duration / 60)) if total_duration > 0 else 0, 1), + "communication_frequency": round(comm_frequency, 1), + "action_frequency": round(action_frequency, 1), + "unique_sources": len(source_counts), + 
"unique_actors": len(set(e.actor for e in events)) + }, + "phase_metrics": { + "total_phases": len(phases), + "phase_sequence": [p.name for p in phases], + "longest_phase": max(phases, key=lambda p: p.duration).name if phases else None, + "shortest_phase": min(phases, key=lambda p: p.duration).name if phases else None + }, + "source_distribution": dict(source_counts) + } + + def _analyze_gaps(self, events: List[Event], phases: List[Phase]) -> Dict[str, Any]: + """Perform gap analysis to identify potential issues.""" + gaps = [] + warnings = [] + + # Check phase transition timing + for i in range(len(phases) - 1): + current_phase = phases[i] + next_phase = phases[i + 1] + + transition_gap = (next_phase.start_time - current_phase.end_time).total_seconds() / 60 + threshold_key = f"{current_phase.name}_to_{next_phase.name}" + threshold = self.gap_thresholds.get(threshold_key, self.gap_thresholds["phase_transition"]) + + if transition_gap > threshold: + gaps.append({ + "type": "phase_transition", + "from_phase": current_phase.name, + "to_phase": next_phase.name, + "gap_minutes": round(transition_gap, 1), + "threshold_minutes": threshold, + "severity": "warning" if transition_gap < threshold * 2 else "critical" + }) + + # Check communication gaps + comm_events = [e for e in events if e.type == "communication"] + for i in range(len(comm_events) - 1): + gap_minutes = (comm_events[i+1].timestamp - comm_events[i].timestamp).total_seconds() / 60 + if gap_minutes > self.gap_thresholds["communication_gap"]: + gaps.append({ + "type": "communication_gap", + "gap_minutes": round(gap_minutes, 1), + "threshold_minutes": self.gap_thresholds["communication_gap"], + "severity": "warning" if gap_minutes < self.gap_thresholds["communication_gap"] * 2 else "critical" + }) + + # Check for missing phases + expected_phases = ["detection", "triage", "mitigation", "resolution"] + actual_phases = [p.name for p in phases] + missing_phases = [p for p in expected_phases if p not in actual_phases] 
+ + for missing_phase in missing_phases: + warnings.append({ + "type": "missing_phase", + "phase": missing_phase, + "message": f"Expected phase '{missing_phase}' not detected in timeline" + }) + + # Check for unusually long phases + for phase in phases: + if phase.duration > 180: # 3 hours + warnings.append({ + "type": "long_phase", + "phase": phase.name, + "duration_minutes": round(phase.duration, 1), + "message": f"Phase '{phase.name}' lasted {phase.duration:.0f} minutes, which is unusually long" + }) + + return { + "gaps": gaps, + "warnings": warnings, + "gap_summary": { + "total_gaps": len(gaps), + "critical_gaps": len([g for g in gaps if g.get("severity") == "critical"]), + "warning_gaps": len([g for g in gaps if g.get("severity") == "warning"]), + "missing_phases": len(missing_phases) + } + } + + def _generate_narrative(self, events: List[Event], phases: List[Phase]) -> Dict[str, Any]: + """Generate human-readable incident narrative.""" + if not events or not phases: + return {"error": "Insufficient data for narrative generation"} + + # Create phase-based narrative + phase_narratives = [] + for phase in phases: + key_events = self._extract_key_events(phase.events) + narrative_text = self._create_phase_narrative(phase, key_events) + + phase_narratives.append({ + "phase": phase.name, + "start_time": phase.start_time.isoformat(), + "duration_minutes": round(phase.duration, 1), + "narrative": narrative_text, + "key_events": len(key_events), + "total_events": len(phase.events) + }) + + # Create overall summary + start_time = events[0].timestamp + end_time = events[-1].timestamp + total_duration = (end_time - start_time).total_seconds() / 60 + + summary = f"""Incident Timeline Summary: +The incident began at {start_time.strftime('%Y-%m-%d %H:%M:%S UTC')} and concluded at {end_time.strftime('%Y-%m-%d %H:%M:%S UTC')}, lasting approximately {total_duration:.0f} minutes. 
+ +The incident progressed through {len(phases)} distinct phases: {', '.join(p.name for p in phases)}. + +Key milestones:""" + + for phase in phases: + summary += f"\n- {phase.name.title()}: {phase.start_time.strftime('%H:%M')} ({phase.duration:.0f} min)" + + return { + "summary": summary, + "phase_narratives": phase_narratives, + "timeline_type": self._classify_timeline_pattern(phases), + "complexity_score": self._calculate_complexity_score(events, phases) + } + + def _extract_key_events(self, events: List[Event]) -> List[Event]: + """Extract the most important events from a phase.""" + # Sort by severity and timestamp + sorted_events = sorted(events, key=lambda e: (e.severity, e.timestamp), reverse=True) + + # Take top events, but ensure chronological representation + key_events = [] + + # Always include first and last events + if events: + key_events.append(events[0]) + if len(events) > 1: + key_events.append(events[-1]) + + # Add high-severity events + high_severity_events = [e for e in events if e.severity >= 4] + key_events.extend(high_severity_events[:3]) + + # Remove duplicates while preserving order + seen = set() + unique_events = [] + for event in key_events: + event_key = (event.timestamp, event.message) + if event_key not in seen: + seen.add(event_key) + unique_events.append(event) + + return sorted(unique_events, key=lambda e: e.timestamp) + + def _create_phase_narrative(self, phase: Phase, key_events: List[Event]) -> str: + """Create narrative text for a phase.""" + phase_templates = { + "detection": "The incident was first detected when {first_event}. {additional_details}", + "triage": "Initial investigation began with {first_event}. The team {investigation_actions}", + "escalation": "The incident was escalated when {escalation_trigger}. {escalation_actions}", + "mitigation": "Mitigation efforts started with {first_action}. {mitigation_steps}", + "resolution": "The incident was resolved when {resolution_event}. 
{confirmation_steps}", + "review": "Post-incident review activities included {review_activities}" + } + + template = phase_templates.get(phase.name, "During the {phase_name} phase, {activities}") + + if not key_events: + return f"The {phase.name} phase lasted {phase.duration:.0f} minutes with {len(phase.events)} events." + + first_event = key_events[0].message + + # Customize based on phase + if phase.name == "detection": + return template.format( + first_event=first_event, + additional_details=f"This phase lasted {phase.duration:.0f} minutes with {len(phase.events)} total events." + ) + elif phase.name == "triage": + actions = [e.message for e in key_events if "investigating" in e.message.lower() or "checking" in e.message.lower()] + investigation_text = "performed various diagnostic activities" if not actions else f"focused on {actions[0]}" + return template.format( + first_event=first_event, + investigation_actions=investigation_text + ) + else: + return f"During the {phase.name} phase ({phase.duration:.0f} minutes), key activities included: {first_event}" + + def _classify_timeline_pattern(self, phases: List[Phase]) -> str: + """Classify the overall timeline pattern.""" + phase_names = [p.name for p in phases] + + if "escalation" in phase_names and phases[0].name == "detection": + return "standard_escalation" + elif len(phases) <= 3: + return "simple_resolution" + elif "review" in phase_names: + return "comprehensive_response" + else: + return "complex_incident" + + def _calculate_complexity_score(self, events: List[Event], phases: List[Phase]) -> float: + """Calculate incident complexity score (0-10).""" + score = 0.0 + + # Phase count contributes to complexity + score += min(len(phases) * 1.5, 6.0) + + # Event count contributes to complexity + score += min(len(events) / 20, 2.0) + + # Duration contributes to complexity + if events: + duration_hours = (events[-1].timestamp - events[0].timestamp).total_seconds() / 3600 + score += min(duration_hours / 2, 2.0) + 
+ return min(score, 10.0) + + def _generate_summary(self, events: List[Event], phases: List[Phase], metrics: Dict) -> Dict[str, Any]: + """Generate comprehensive incident summary.""" + if not events: + return {} + + # Key statistics + start_time = events[0].timestamp + end_time = events[-1].timestamp + duration_minutes = metrics.get("duration_metrics", {}).get("total_duration_minutes", 0) + + # Phase analysis + phase_analysis = {} + for phase in phases: + phase_analysis[phase.name] = { + "duration_minutes": round(phase.duration, 1), + "event_count": len(phase.events), + "start_time": phase.start_time.isoformat(), + "end_time": phase.end_time.isoformat() + } + + # Actor involvement + actors = defaultdict(int) + for event in events: + actors[event.actor] += 1 + + return { + "incident_overview": { + "start_time": start_time.isoformat(), + "end_time": end_time.isoformat(), + "total_duration_minutes": round(duration_minutes, 1), + "total_events": len(events), + "phases_detected": len(phases) + }, + "phase_analysis": phase_analysis, + "key_participants": dict(actors), + "event_sources": dict(defaultdict(int, {e.source: 1 for e in events})), + "complexity_indicators": { + "unique_sources": len(set(e.source for e in events)), + "unique_actors": len(set(e.actor for e in events)), + "high_severity_events": len([e for e in events if e.severity >= 4]), + "phase_transitions": len(phases) - 1 if phases else 0 + } + } + + def _event_to_dict(self, event: Event) -> Dict: + """Convert Event namedtuple to dictionary.""" + return { + "timestamp": event.timestamp.isoformat(), + "source": event.source, + "type": event.type, + "message": event.message, + "severity": event.severity, + "actor": event.actor, + "metadata": event.metadata + } + + def _phase_to_dict(self, phase: Phase) -> Dict: + """Convert Phase namedtuple to dictionary.""" + return { + "name": phase.name, + "start_time": phase.start_time.isoformat(), + "end_time": phase.end_time.isoformat(), + "duration_minutes": 
round(phase.duration, 1), + "event_count": len(phase.events), + "description": phase.description + } + + +def format_json_output(result: Dict) -> str: + """Format result as pretty JSON.""" + return json.dumps(result, indent=2, ensure_ascii=False) + + +def format_text_output(result: Dict) -> str: + """Format result as human-readable text.""" + if "error" in result: + return f"Error: {result['error']}" + + timeline = result["timeline"] + metrics = result["metrics"] + narrative = result["narrative"] + + output = [] + output.append("=" * 80) + output.append("INCIDENT TIMELINE RECONSTRUCTION") + output.append("=" * 80) + output.append("") + + # Overview + time_range = timeline["time_range"] + output.append("OVERVIEW:") + output.append(f" Time Range: {time_range['start']} to {time_range['end']}") + output.append(f" Total Duration: {time_range['duration_minutes']} minutes") + output.append(f" Total Events: {timeline['total_events']}") + output.append(f" Phases Detected: {len(timeline['phases'])}") + output.append("") + + # Phase summary + output.append("PHASES:") + for phase in timeline["phases"]: + output.append(f" {phase['name'].upper()}:") + output.append(f" Start: {phase['start_time']}") + output.append(f" Duration: {phase['duration_minutes']} minutes") + output.append(f" Events: {phase['event_count']}") + output.append(f" Description: {phase['description']}") + output.append("") + + # Key metrics + if "duration_metrics" in metrics: + duration_metrics = metrics["duration_metrics"] + output.append("KEY METRICS:") + output.append(f" Time to Mitigation: {duration_metrics.get('time_to_mitigation_minutes', 'N/A')} minutes") + output.append(f" Time to Resolution: {duration_metrics.get('time_to_resolution_minutes', 'N/A')} minutes") + + if "activity_metrics" in metrics: + activity = metrics["activity_metrics"] + output.append(f" Events per Hour: {activity.get('events_per_hour', 'N/A')}") + output.append(f" Unique Sources: {activity.get('unique_sources', 'N/A')}") + 
output.append("") + + # Narrative + if "summary" in narrative: + output.append("INCIDENT NARRATIVE:") + output.append(narrative["summary"]) + output.append("") + + # Gap analysis + if "gap_analysis" in result and result["gap_analysis"]["gaps"]: + output.append("GAP ANALYSIS:") + for gap in result["gap_analysis"]["gaps"][:5]: # Show first 5 gaps + output.append(f" {gap['type'].replace('_', ' ').title()}: {gap['gap_minutes']} min gap (threshold: {gap['threshold_minutes']} min)") + output.append("") + + output.append("=" * 80) + + return "\n".join(output) + + +def format_markdown_output(result: Dict) -> str: + """Format result as Markdown timeline.""" + if "error" in result: + return f"# Error\n\n{result['error']}" + + timeline = result["timeline"] + narrative = result.get("narrative", {}) + + output = [] + output.append("# Incident Timeline") + output.append("") + + # Overview + time_range = timeline["time_range"] + output.append("## Overview") + output.append("") + output.append(f"- **Duration:** {time_range['duration_minutes']} minutes") + output.append(f"- **Start Time:** {time_range['start']}") + output.append(f"- **End Time:** {time_range['end']}") + output.append(f"- **Total Events:** {timeline['total_events']}") + output.append("") + + # Narrative summary + if "summary" in narrative: + output.append("## Summary") + output.append("") + output.append(narrative["summary"]) + output.append("") + + # Phase timeline + output.append("## Phase Timeline") + output.append("") + + for phase in timeline["phases"]: + output.append(f"### {phase['name'].title()} Phase") + output.append("") + output.append(f"**Duration:** {phase['duration_minutes']} minutes ") + output.append(f"**Start:** {phase['start_time']} ") + output.append(f"**Events:** {phase['event_count']} ") + output.append("") + output.append(phase["description"]) + output.append("") + + # Detailed timeline + output.append("## Detailed Event Timeline") + output.append("") + + for event in timeline["events"]: + 
timestamp = datetime.fromisoformat(event["timestamp"].replace('Z', '+00:00')) + output.append(f"**{timestamp.strftime('%H:%M:%S')}** [{event['source']}] {event['message']}") + output.append("") + + return "\n".join(output) + + +def main(): + """Main function with argument parsing and execution.""" + parser = argparse.ArgumentParser( + description="Reconstruct incident timeline from timestamped events", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + python timeline_reconstructor.py --input events.json --output timeline.md + python timeline_reconstructor.py --input events.json --detect-phases --gap-analysis + cat events.json | python timeline_reconstructor.py --format text + +Input JSON format: + [ + { + "timestamp": "2024-01-01T12:00:00Z", + "source": "monitoring", + "type": "alert", + "message": "High error rate detected", + "severity": "critical", + "actor": "system" + } + ] + """ + ) + + parser.add_argument( + "--input", "-i", + help="Input file path (JSON format) or '-' for stdin" + ) + + parser.add_argument( + "--output", "-o", + help="Output file path (default: stdout)" + ) + + parser.add_argument( + "--format", "-f", + choices=["json", "text", "markdown"], + default="json", + help="Output format (default: json)" + ) + + parser.add_argument( + "--detect-phases", + action="store_true", + help="Enable advanced phase detection" + ) + + parser.add_argument( + "--gap-analysis", + action="store_true", + help="Perform gap analysis on timeline" + ) + + parser.add_argument( + "--min-events", + type=int, + default=1, + help="Minimum number of events required (default: 1)" + ) + + args = parser.parse_args() + + reconstructor = TimelineReconstructor() + + try: + # Read input + if args.input == "-" or (not args.input and not sys.stdin.isatty()): + # Read from stdin + input_text = sys.stdin.read().strip() + if not input_text: + parser.error("No input provided") + events_data = json.loads(input_text) + elif args.input: + # Read from file + 
with open(args.input, 'r') as f: + events_data = json.load(f) + else: + parser.error("No input specified. Use --input or pipe data to stdin.") + + # Validate input + if not isinstance(events_data, list): + parser.error("Input must be a JSON array of events") + + if len(events_data) < args.min_events: + parser.error(f"Minimum {args.min_events} events required") + + # Reconstruct timeline + result = reconstructor.reconstruct_timeline(events_data) + + # Format output + if args.format == "json": + output = format_json_output(result) + elif args.format == "markdown": + output = format_markdown_output(result) + else: + output = format_text_output(result) + + # Write output + if args.output: + with open(args.output, 'w') as f: + f.write(output) + f.write('\n') + else: + print(output) + + except FileNotFoundError as e: + print(f"Error: File not found - {e}", file=sys.stderr) + sys.exit(1) + except json.JSONDecodeError as e: + print(f"Error: Invalid JSON - {e}", file=sys.stderr) + sys.exit(1) + except Exception as e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(1) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/engineering/api-design-reviewer/references/api_antipatterns.md b/engineering/api-design-reviewer/references/api_antipatterns.md new file mode 100644 index 0000000..1e2bb99 --- /dev/null +++ b/engineering/api-design-reviewer/references/api_antipatterns.md @@ -0,0 +1,680 @@ +# Common API Anti-Patterns and How to Avoid Them + +## Introduction + +This document outlines common anti-patterns in REST API design that can lead to poor developer experience, maintenance nightmares, and scalability issues. Each anti-pattern is accompanied by examples and recommended solutions. + +## 1. Verb-Based URLs (The RPC Trap) + +### Anti-Pattern +Using verbs in URLs instead of treating endpoints as resources. 
+ +``` +❌ Bad Examples: +POST /api/getUsers +POST /api/createUser +GET /api/deleteUser/123 +POST /api/updateUserPassword +GET /api/calculateOrderTotal/456 +``` + +### Why It's Bad +- Violates REST principles +- Makes the API feel like RPC instead of REST +- HTTP methods lose their semantic meaning +- Reduces cacheability +- Harder to understand resource relationships + +### Solution +``` +✅ Good Examples: +GET /api/users # Get users +POST /api/users # Create user +DELETE /api/users/123 # Delete user +PATCH /api/users/123/password # Update password +GET /api/orders/456/total # Get order total +``` + +## 2. Inconsistent Naming Conventions + +### Anti-Pattern +Mixed naming conventions across the API. + +```json +❌ Bad Examples: +{ + "user_id": 123, // snake_case + "firstName": "John", // camelCase + "last-name": "Doe", // kebab-case + "EMAIL": "john@example.com", // UPPER_CASE + "IsActive": true // PascalCase +} +``` + +### Why It's Bad +- Confuses developers +- Increases cognitive load +- Makes code generation difficult +- Reduces API adoption + +### Solution +```json +✅ Choose one convention and stick to it (camelCase recommended): +{ + "userId": 123, + "firstName": "John", + "lastName": "Doe", + "email": "john@example.com", + "isActive": true +} +``` + +## 3. Ignoring HTTP Status Codes + +### Anti-Pattern +Always returning HTTP 200 regardless of the actual result. + +```json +❌ Bad Example: +HTTP/1.1 200 OK +{ + "status": "error", + "code": 404, + "message": "User not found" +} +``` + +### Why It's Bad +- Breaks HTTP semantics +- Prevents proper error handling by clients +- Breaks caching and proxies +- Makes monitoring and debugging harder + +### Solution +```json +✅ Good Example: +HTTP/1.1 404 Not Found +{ + "error": { + "code": "USER_NOT_FOUND", + "message": "User with ID 123 not found", + "requestId": "req-abc123" + } +} +``` + +## 4. Overly Complex Nested Resources + +### Anti-Pattern +Creating deeply nested URL structures that are hard to navigate. 
+ +``` +❌ Bad Example: +/companies/123/departments/456/teams/789/members/012/projects/345/tasks/678/comments/901 +``` + +### Why It's Bad +- URLs become unwieldy +- Creates tight coupling between resources +- Makes independent resource access difficult +- Complicates authorization logic + +### Solution +``` +✅ Good Examples: +/tasks/678 # Direct access to task +/tasks/678/comments # Task comments +/users/012/tasks # User's tasks +/projects/345?team=789 # Project filtering +``` + +## 5. Inconsistent Error Response Formats + +### Anti-Pattern +Different error response structures across endpoints. + +```json +❌ Bad Examples: +# Endpoint 1 +{"error": "Invalid email"} + +# Endpoint 2 +{"success": false, "msg": "User not found", "code": 404} + +# Endpoint 3 +{"errors": [{"field": "name", "message": "Required"}]} +``` + +### Why It's Bad +- Makes error handling complex for clients +- Reduces code reusability +- Poor developer experience + +### Solution +```json +✅ Standardized Error Format: +{ + "error": { + "code": "VALIDATION_ERROR", + "message": "The request contains invalid data", + "details": [ + { + "field": "email", + "code": "INVALID_FORMAT", + "message": "Email address is not valid" + } + ], + "requestId": "req-123456", + "timestamp": "2024-02-16T13:00:00Z" + } +} +``` + +## 6. Missing or Poor Pagination + +### Anti-Pattern +Returning all results in a single response or inconsistent pagination. 
+ +```json +❌ Bad Examples: +# No pagination (returns 10,000 records) +GET /api/users + +# Inconsistent pagination parameters +GET /api/users?page=1&size=10 +GET /api/orders?offset=0&limit=20 +GET /api/products?start=0&count=50 +``` + +### Why It's Bad +- Can cause performance issues +- May overwhelm clients +- Inconsistent pagination parameters confuse developers +- No way to estimate total results + +### Solution +```json +✅ Good Example: +GET /api/users?page=1&pageSize=10 + +{ + "data": [...], + "pagination": { + "page": 1, + "pageSize": 10, + "total": 150, + "totalPages": 15, + "hasNext": true, + "hasPrev": false + } +} +``` + +## 7. Exposing Internal Implementation Details + +### Anti-Pattern +URLs and field names that reflect database structure or internal architecture. + +``` +❌ Bad Examples: +/api/user_table/123 +/api/db_orders +/api/legacy_customer_data +/api/temp_migration_users + +Response fields: +{ + "user_id_pk": 123, + "internal_ref_code": "usr_abc", + "db_created_timestamp": 1645123456 +} +``` + +### Why It's Bad +- Couples API to internal implementation +- Makes refactoring difficult +- Exposes unnecessary technical details +- Reduces API longevity + +### Solution +``` +✅ Good Examples: +/api/users/123 +/api/orders +/api/customers + +Response fields: +{ + "id": 123, + "referenceCode": "usr_abc", + "createdAt": "2024-02-16T13:00:00Z" +} +``` + +## 8. Overloading Single Endpoint + +### Anti-Pattern +Using one endpoint for multiple unrelated operations based on request parameters. 
+ +``` +❌ Bad Example: +POST /api/user-actions +{ + "action": "create_user", + "userData": {...} +} + +POST /api/user-actions +{ + "action": "delete_user", + "userId": 123 +} + +POST /api/user-actions +{ + "action": "send_email", + "userId": 123, + "emailType": "welcome" +} +``` + +### Why It's Bad +- Breaks REST principles +- Makes documentation complex +- Complicates client implementation +- Reduces discoverability + +### Solution +``` +✅ Good Examples: +POST /api/users # Create user +DELETE /api/users/123 # Delete user +POST /api/users/123/emails # Send email to user +``` + +## 9. Lack of Versioning Strategy + +### Anti-Pattern +Making breaking changes without version management. + +``` +❌ Bad Examples: +# Original API +{ + "name": "John Doe", + "age": 30 +} + +# Later (breaking change with no versioning) +{ + "firstName": "John", + "lastName": "Doe", + "birthDate": "1994-02-16" +} +``` + +### Why It's Bad +- Breaks existing clients +- Forces all clients to update simultaneously +- No graceful migration path +- Reduces API stability + +### Solution +``` +✅ Good Examples: +# Version 1 +GET /api/v1/users/123 +{ + "name": "John Doe", + "age": 30 +} + +# Version 2 (with both versions supported) +GET /api/v2/users/123 +{ + "firstName": "John", + "lastName": "Doe", + "birthDate": "1994-02-16", + "age": 30 // Backwards compatibility +} +``` + +## 10. Poor Error Messages + +### Anti-Pattern +Vague, unhelpful, or technical error messages. 
+
+```json
+❌ Bad Examples:
+{"error": "Something went wrong"}
+{"error": "Invalid input"}
+{"error": "SQL constraint violation: FK_user_profile_id"}
+{"error": "NullPointerException at line 247"}
+```
+
+### Why It's Bad
+- Doesn't help developers fix issues
+- Increases support burden
+- Poor developer experience
+- May expose sensitive information
+
+### Solution
+```json
+✅ Good Examples:
+{
+  "error": {
+    "code": "VALIDATION_ERROR",
+    "message": "The email address is required and must be in a valid format",
+    "details": [
+      {
+        "field": "email",
+        "code": "REQUIRED",
+        "message": "Email address is required"
+      }
+    ]
+  }
+}
+```
+
+## 11. Ignoring Content Negotiation
+
+### Anti-Pattern
+Hard-coding response format without considering client preferences.
+
+```
+❌ Bad Example:
+# Always returns JSON regardless of Accept header
+GET /api/users/123
+Accept: application/xml
+# Returns JSON anyway
+```
+
+### Why It's Bad
+- Reduces API flexibility
+- Ignores HTTP standards
+- Makes integration harder for diverse clients
+
+### Solution
+```
+✅ Good Example:
+GET /api/users/123
+Accept: application/xml
+
+HTTP/1.1 200 OK
+Content-Type: application/xml
+
+<user>
+  <id>123</id>
+  <name>John Doe</name>
+</user>
+```
+
+## 12. Stateful API Design
+
+### Anti-Pattern
+Maintaining session state on the server between requests.
+
+```
+❌ Bad Example:
+# Step 1: Initialize session
+POST /api/session/init
+
+# Step 2: Set context (requires step 1)
+POST /api/session/set-user/123
+
+# Step 3: Get data (requires steps 1 & 2)
+GET /api/session/user-data
+```
+
+### Why It's Bad
+- Breaks REST statelessness principle
+- Reduces scalability
+- Makes caching difficult
+- Complicates error recovery
+
+### Solution
+```
+✅ Good Example:
+# Self-contained requests
+GET /api/users/123/data
+Authorization: Bearer jwt-token-with-context
+```
+
+## 13. Inconsistent HTTP Method Usage
+
+### Anti-Pattern
+Using HTTP methods inappropriately or inconsistently.
+ +``` +❌ Bad Examples: +GET /api/users/123/delete # DELETE operation with GET +POST /api/users/123/get # GET operation with POST +PUT /api/users # Creating with PUT on collection +GET /api/users/search # Search with side effects +``` + +### Why It's Bad +- Violates HTTP semantics +- Breaks caching and idempotency expectations +- Confuses developers and tools + +### Solution +``` +✅ Good Examples: +DELETE /api/users/123 # Delete with DELETE +GET /api/users/123 # Get with GET +POST /api/users # Create on collection +GET /api/users?q=search # Safe search with GET +``` + +## 14. Missing Rate Limiting Information + +### Anti-Pattern +Not providing rate limiting information to clients. + +``` +❌ Bad Example: +HTTP/1.1 429 Too Many Requests +{ + "error": "Rate limit exceeded" +} +``` + +### Why It's Bad +- Clients don't know when to retry +- No information about current limits +- Difficult to implement proper backoff strategies + +### Solution +``` +✅ Good Example: +HTTP/1.1 429 Too Many Requests +X-RateLimit-Limit: 1000 +X-RateLimit-Remaining: 0 +X-RateLimit-Reset: 1640995200 +Retry-After: 3600 + +{ + "error": { + "code": "RATE_LIMIT_EXCEEDED", + "message": "API rate limit exceeded", + "retryAfter": 3600 + } +} +``` + +## 15. Chatty API Design + +### Anti-Pattern +Requiring multiple API calls to accomplish common tasks. 
+ +``` +❌ Bad Example: +# Get user profile requires 4 API calls +GET /api/users/123 # Basic info +GET /api/users/123/profile # Profile details +GET /api/users/123/settings # User settings +GET /api/users/123/stats # User statistics +``` + +### Why It's Bad +- Increases latency +- Creates network overhead +- Makes mobile apps inefficient +- Complicates client implementation + +### Solution +``` +✅ Good Examples: +# Single call with expansion +GET /api/users/123?include=profile,settings,stats + +# Or provide composite endpoints +GET /api/users/123/dashboard + +# Or batch operations +POST /api/batch +{ + "requests": [ + {"method": "GET", "url": "/users/123"}, + {"method": "GET", "url": "/users/123/profile"} + ] +} +``` + +## 16. No Input Validation + +### Anti-Pattern +Accepting and processing invalid input without proper validation. + +```json +❌ Bad Example: +POST /api/users +{ + "email": "not-an-email", + "age": -5, + "name": "" +} + +# API processes this and fails later or stores invalid data +``` + +### Why It's Bad +- Leads to data corruption +- Security vulnerabilities +- Difficult to debug issues +- Poor user experience + +### Solution +```json +✅ Good Example: +POST /api/users +{ + "email": "not-an-email", + "age": -5, + "name": "" +} + +HTTP/1.1 400 Bad Request +{ + "error": { + "code": "VALIDATION_ERROR", + "message": "The request contains invalid data", + "details": [ + { + "field": "email", + "code": "INVALID_FORMAT", + "message": "Email must be a valid email address" + }, + { + "field": "age", + "code": "INVALID_RANGE", + "message": "Age must be between 0 and 150" + }, + { + "field": "name", + "code": "REQUIRED", + "message": "Name is required and cannot be empty" + } + ] + } +} +``` + +## 17. Synchronous Long-Running Operations + +### Anti-Pattern +Blocking the client with long-running operations in synchronous endpoints. 
+ +``` +❌ Bad Example: +POST /api/reports/generate +# Client waits 30 seconds for response +``` + +### Why It's Bad +- Poor user experience +- Timeouts and connection issues +- Resource waste on client and server +- Doesn't scale well + +### Solution +``` +✅ Good Example: +# Async pattern +POST /api/reports +HTTP/1.1 202 Accepted +Location: /api/reports/job-123 +{ + "jobId": "job-123", + "status": "processing", + "estimatedCompletion": "2024-02-16T13:05:00Z" +} + +# Check status +GET /api/reports/job-123 +{ + "jobId": "job-123", + "status": "completed", + "result": "/api/reports/download/report-456" +} +``` + +## Prevention Strategies + +### 1. API Design Reviews +- Implement mandatory design reviews +- Use checklists based on these anti-patterns +- Include multiple stakeholders + +### 2. API Style Guides +- Create and enforce API style guides +- Use linting tools for consistency +- Regular training for development teams + +### 3. Automated Testing +- Test for common anti-patterns +- Include contract testing +- Monitor API usage patterns + +### 4. Documentation Standards +- Require comprehensive API documentation +- Include examples and error scenarios +- Keep documentation up-to-date + +### 5. Client Feedback +- Regularly collect feedback from API consumers +- Monitor API usage analytics +- Conduct developer experience surveys + +## Conclusion + +Avoiding these anti-patterns requires: +- Understanding REST principles +- Consistent design standards +- Regular review and refactoring +- Focus on developer experience +- Proper tooling and automation + +Remember: A well-designed API is an asset that grows in value over time, while a poorly designed API becomes a liability that hampers development and adoption. 
\ No newline at end of file diff --git a/engineering/api-design-reviewer/references/rest_design_rules.md b/engineering/api-design-reviewer/references/rest_design_rules.md new file mode 100644 index 0000000..1eb9b1f --- /dev/null +++ b/engineering/api-design-reviewer/references/rest_design_rules.md @@ -0,0 +1,487 @@ +# REST API Design Rules Reference + +## Core Principles + +### 1. Resources, Not Actions +REST APIs should focus on **resources** (nouns) rather than **actions** (verbs). The HTTP methods provide the actions. + +``` +✅ Good: +GET /users # Get all users +GET /users/123 # Get user 123 +POST /users # Create new user +PUT /users/123 # Update user 123 +DELETE /users/123 # Delete user 123 + +❌ Bad: +POST /getUsers +POST /createUser +POST /updateUser/123 +POST /deleteUser/123 +``` + +### 2. Hierarchical Resource Structure +Use hierarchical URLs to represent resource relationships: + +``` +/users/123/orders/456/items/789 +``` + +But avoid excessive nesting (max 3-4 levels): + +``` +❌ Too deep: /companies/123/departments/456/teams/789/members/012/tasks/345 +✅ Better: /tasks/345?member=012&team=789 +``` + +## Resource Naming Conventions + +### URLs Should Use Kebab-Case +``` +✅ Good: +/user-profiles +/order-items +/shipping-addresses + +❌ Bad: +/userProfiles +/user_profiles +/orderItems +``` + +### Collections vs Individual Resources +``` +Collection: /users +Individual: /users/123 +Sub-resource: /users/123/orders +``` + +### Pluralization Rules +- Use **plural nouns** for collections: `/users`, `/orders` +- Use **singular nouns** for single resources: `/user-profile`, `/current-session` +- Be consistent throughout your API + +## HTTP Methods Usage + +### GET - Safe and Idempotent +- **Purpose**: Retrieve data +- **Safe**: No side effects +- **Idempotent**: Multiple calls return same result +- **Request Body**: Should not have one +- **Cacheable**: Yes + +``` +GET /users/123 +GET /users?status=active&limit=10 +``` + +### POST - Not Idempotent +- **Purpose**: 
Create resources, non-idempotent operations +- **Safe**: No +- **Idempotent**: No +- **Request Body**: Usually required +- **Cacheable**: Generally no + +``` +POST /users # Create new user +POST /users/123/activate # Activate user (action) +``` + +### PUT - Idempotent +- **Purpose**: Create or completely replace a resource +- **Safe**: No +- **Idempotent**: Yes +- **Request Body**: Required (complete resource) +- **Cacheable**: No + +``` +PUT /users/123 # Replace entire user resource +``` + +### PATCH - Partial Update +- **Purpose**: Partially update a resource +- **Safe**: No +- **Idempotent**: Not necessarily +- **Request Body**: Required (partial resource) +- **Cacheable**: No + +``` +PATCH /users/123 # Update only specified fields +``` + +### DELETE - Idempotent +- **Purpose**: Remove a resource +- **Safe**: No +- **Idempotent**: Yes (same result if called multiple times) +- **Request Body**: Usually not needed +- **Cacheable**: No + +``` +DELETE /users/123 +``` + +## Status Codes + +### Success Codes (2xx) +- **200 OK**: Standard success response +- **201 Created**: Resource created successfully (POST) +- **202 Accepted**: Request accepted for processing (async) +- **204 No Content**: Success with no response body (DELETE, PUT) + +### Redirection Codes (3xx) +- **301 Moved Permanently**: Resource permanently moved +- **302 Found**: Temporary redirect +- **304 Not Modified**: Use cached version + +### Client Error Codes (4xx) +- **400 Bad Request**: Invalid request syntax or data +- **401 Unauthorized**: Authentication required +- **403 Forbidden**: Access denied (user authenticated but not authorized) +- **404 Not Found**: Resource not found +- **405 Method Not Allowed**: HTTP method not supported +- **409 Conflict**: Resource conflict (duplicates, version mismatch) +- **422 Unprocessable Entity**: Valid syntax but semantic errors +- **429 Too Many Requests**: Rate limit exceeded + +### Server Error Codes (5xx) +- **500 Internal Server Error**: Unexpected 
server error +- **502 Bad Gateway**: Invalid response from upstream server +- **503 Service Unavailable**: Server temporarily unavailable +- **504 Gateway Timeout**: Upstream server timeout + +## URL Design Patterns + +### Query Parameters for Filtering +``` +GET /users?status=active +GET /users?role=admin&department=engineering +GET /orders?created_after=2024-01-01&status=pending +``` + +### Pagination Parameters +``` +# Offset-based +GET /users?offset=20&limit=10 + +# Cursor-based +GET /users?cursor=eyJpZCI6MTIzfQ&limit=10 + +# Page-based +GET /users?page=3&page_size=10 +``` + +### Sorting Parameters +``` +GET /users?sort=created_at # Ascending +GET /users?sort=-created_at # Descending (prefix with -) +GET /users?sort=last_name,first_name # Multiple fields +``` + +### Field Selection +``` +GET /users?fields=id,name,email +GET /users/123?include=orders,profile +GET /users/123?exclude=internal_notes +``` + +### Search Parameters +``` +GET /users?q=john +GET /products?search=laptop&category=electronics +``` + +## Response Format Standards + +### Consistent Response Structure +```json +{ + "data": { + "id": 123, + "name": "John Doe", + "email": "john@example.com" + }, + "meta": { + "timestamp": "2024-02-16T13:00:00Z", + "version": "1.0" + } +} +``` + +### Collection Responses +```json +{ + "data": [ + {"id": 1, "name": "Item 1"}, + {"id": 2, "name": "Item 2"} + ], + "pagination": { + "total": 150, + "page": 1, + "pageSize": 10, + "totalPages": 15, + "hasNext": true, + "hasPrev": false + }, + "meta": { + "timestamp": "2024-02-16T13:00:00Z" + } +} +``` + +### Error Response Format +```json +{ + "error": { + "code": "VALIDATION_ERROR", + "message": "The request contains invalid parameters", + "details": [ + { + "field": "email", + "code": "INVALID_FORMAT", + "message": "Email address is not valid" + } + ], + "requestId": "req-123456", + "timestamp": "2024-02-16T13:00:00Z" + } +} +``` + +## Field Naming Conventions + +### Use camelCase for JSON Fields +```json +✅ Good: 
+{ + "firstName": "John", + "lastName": "Doe", + "createdAt": "2024-02-16T13:00:00Z", + "isActive": true +} + +❌ Bad: +{ + "first_name": "John", + "LastName": "Doe", + "created-at": "2024-02-16T13:00:00Z" +} +``` + +### Boolean Fields +Use positive, clear names with "is", "has", "can", or "should" prefixes: + +```json +✅ Good: +{ + "isActive": true, + "hasPermission": false, + "canEdit": true, + "shouldNotify": false +} + +❌ Bad: +{ + "active": true, + "disabled": false, // Double negative + "permission": false // Unclear meaning +} +``` + +### Date/Time Fields +- Use ISO 8601 format: `2024-02-16T13:00:00Z` +- Include timezone information +- Use consistent field naming: + +```json +{ + "createdAt": "2024-02-16T13:00:00Z", + "updatedAt": "2024-02-16T13:30:00Z", + "deletedAt": null, + "publishedAt": "2024-02-16T14:00:00Z" +} +``` + +## Content Negotiation + +### Accept Headers +``` +Accept: application/json +Accept: application/xml +Accept: application/json; version=1 +``` + +### Content-Type Headers +``` +Content-Type: application/json +Content-Type: application/json; charset=utf-8 +Content-Type: multipart/form-data +``` + +### Versioning via Headers +``` +Accept: application/vnd.myapi.v1+json +API-Version: 1.0 +``` + +## Caching Guidelines + +### Cache-Control Headers +``` +Cache-Control: public, max-age=3600 # Cache for 1 hour +Cache-Control: private, max-age=0 # Don't cache +Cache-Control: no-cache, must-revalidate # Always validate +``` + +### ETags for Conditional Requests +``` +HTTP/1.1 200 OK +ETag: "123456789" +Last-Modified: Wed, 21 Oct 2015 07:28:00 GMT + +# Client subsequent request: +If-None-Match: "123456789" +If-Modified-Since: Wed, 21 Oct 2015 07:28:00 GMT +``` + +## Security Headers + +### Authentication +``` +Authorization: Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9... 
+Authorization: Basic dXNlcjpwYXNzd29yZA== +Authorization: Api-Key abc123def456 +``` + +### CORS Headers +``` +Access-Control-Allow-Origin: https://example.com +Access-Control-Allow-Methods: GET, POST, PUT, DELETE +Access-Control-Allow-Headers: Content-Type, Authorization +``` + +## Rate Limiting + +### Rate Limit Headers +``` +X-RateLimit-Limit: 1000 +X-RateLimit-Remaining: 999 +X-RateLimit-Reset: 1640995200 +X-RateLimit-Window: 3600 +``` + +### Rate Limit Exceeded Response +```json +HTTP/1.1 429 Too Many Requests +Retry-After: 3600 + +{ + "error": { + "code": "RATE_LIMIT_EXCEEDED", + "message": "API rate limit exceeded", + "details": { + "limit": 1000, + "window": "1 hour", + "retryAfter": 3600 + } + } +} +``` + +## Hypermedia (HATEOAS) + +### Links in Responses +```json +{ + "id": 123, + "name": "John Doe", + "email": "john@example.com", + "_links": { + "self": { + "href": "/users/123" + }, + "orders": { + "href": "/users/123/orders" + }, + "edit": { + "href": "/users/123", + "method": "PUT" + }, + "delete": { + "href": "/users/123", + "method": "DELETE" + } + } +} +``` + +### Link Relations +- **self**: Link to the resource itself +- **edit**: Link to edit the resource +- **delete**: Link to delete the resource +- **related**: Link to related resources +- **next/prev**: Pagination links + +## Common Anti-Patterns to Avoid + +### 1. Verbs in URLs +``` +❌ Bad: /api/getUser/123 +✅ Good: GET /api/users/123 +``` + +### 2. Inconsistent Naming +``` +❌ Bad: /user-profiles and /userAddresses +✅ Good: /user-profiles and /user-addresses +``` + +### 3. Deep Nesting +``` +❌ Bad: /companies/123/departments/456/teams/789/members/012 +✅ Good: /team-members/012?team=789 +``` + +### 4. Ignoring HTTP Status Codes +``` +❌ Bad: Always return 200 with error info in body +✅ Good: Use appropriate status codes (404, 400, 500, etc.) +``` + +### 5. Exposing Internal Structure +``` +❌ Bad: /api/database_table_users +✅ Good: /api/users +``` + +### 6. 
No Versioning Strategy +``` +❌ Bad: Breaking changes without version management +✅ Good: /api/v1/users or Accept: application/vnd.api+json;version=1 +``` + +### 7. Inconsistent Error Responses +``` +❌ Bad: Different error formats for different endpoints +✅ Good: Standardized error response structure +``` + +## Best Practices Summary + +1. **Use nouns for resources, not verbs** +2. **Leverage HTTP methods correctly** +3. **Maintain consistent naming conventions** +4. **Implement proper error handling** +5. **Use appropriate HTTP status codes** +6. **Design for cacheability** +7. **Implement security from the start** +8. **Plan for versioning** +9. **Provide comprehensive documentation** +10. **Follow HATEOAS principles when applicable** + +## Further Reading + +- [RFC 7231 - HTTP/1.1 Semantics and Content](https://tools.ietf.org/html/rfc7231) +- [RFC 6570 - URI Template](https://tools.ietf.org/html/rfc6570) +- [OpenAPI Specification](https://swagger.io/specification/) +- [REST API Design Best Practices](https://www.restapitutorial.com/) +- [HTTP Status Code Definitions](https://httpstatuses.com/) \ No newline at end of file diff --git a/engineering/api-design-reviewer/scripts/api_scorecard.py b/engineering/api-design-reviewer/scripts/api_scorecard.py new file mode 100644 index 0000000..dc67336 --- /dev/null +++ b/engineering/api-design-reviewer/scripts/api_scorecard.py @@ -0,0 +1,1661 @@ +#!/usr/bin/env python3 +""" +API Scorecard - Comprehensive API design quality assessment tool. + +This script evaluates API designs across multiple dimensions and generates +a detailed scorecard with letter grades and improvement recommendations. 
+ +Scoring Dimensions: +- Consistency (30%): Naming conventions, response patterns, structural consistency +- Documentation (20%): Completeness and clarity of API documentation +- Security (20%): Authentication, authorization, and security best practices +- Usability (15%): Ease of use, discoverability, and developer experience +- Performance (15%): Caching, pagination, and efficiency patterns + +Generates letter grades (A-F) with detailed breakdowns and actionable recommendations. +""" + +import argparse +import json +import re +import sys +from typing import Any, Dict, List, Optional, Set, Tuple +from dataclasses import dataclass, field +from enum import Enum +import math + + +class ScoreCategory(Enum): + """Scoring categories.""" + CONSISTENCY = "consistency" + DOCUMENTATION = "documentation" + SECURITY = "security" + USABILITY = "usability" + PERFORMANCE = "performance" + + +@dataclass +class CategoryScore: + """Score for a specific category.""" + category: ScoreCategory + score: float # 0-100 + max_score: float # Usually 100 + weight: float # Percentage weight in overall score + issues: List[str] = field(default_factory=list) + recommendations: List[str] = field(default_factory=list) + + @property + def letter_grade(self) -> str: + """Convert score to letter grade.""" + if self.score >= 90: + return "A" + elif self.score >= 80: + return "B" + elif self.score >= 70: + return "C" + elif self.score >= 60: + return "D" + else: + return "F" + + @property + def weighted_score(self) -> float: + """Calculate weighted contribution to overall score.""" + return (self.score / 100.0) * self.weight + + +@dataclass +class APIScorecard: + """Complete API scorecard with all category scores.""" + category_scores: Dict[ScoreCategory, CategoryScore] = field(default_factory=dict) + overall_score: float = 0.0 + overall_grade: str = "F" + total_endpoints: int = 0 + api_info: Dict[str, Any] = field(default_factory=dict) + + def calculate_overall_score(self) -> None: + """Calculate 
overall weighted score and grade.""" + self.overall_score = sum(score.weighted_score for score in self.category_scores.values()) + + if self.overall_score >= 90: + self.overall_grade = "A" + elif self.overall_score >= 80: + self.overall_grade = "B" + elif self.overall_score >= 70: + self.overall_grade = "C" + elif self.overall_score >= 60: + self.overall_grade = "D" + else: + self.overall_grade = "F" + + def get_top_recommendations(self, limit: int = 5) -> List[str]: + """Get top recommendations across all categories.""" + all_recommendations = [] + for category_score in self.category_scores.values(): + for rec in category_score.recommendations: + all_recommendations.append(f"{category_score.category.value.title()}: {rec}") + + # Sort by category weight (highest impact first) + weighted_recs = [] + for category_score in sorted(self.category_scores.values(), + key=lambda x: x.weight, reverse=True): + for rec in category_score.recommendations[:2]: # Top 2 per category + weighted_recs.append(f"{category_score.category.value.title()}: {rec}") + + return weighted_recs[:limit] + + +class APIScoringEngine: + """Main API scoring engine.""" + + def __init__(self): + self.scorecard = APIScorecard() + self.spec: Optional[Dict] = None + + # Regex patterns for validation + self.kebab_case_pattern = re.compile(r'^[a-z]+(?:-[a-z0-9]+)*$') + self.camel_case_pattern = re.compile(r'^[a-z][a-zA-Z0-9]*$') + self.pascal_case_pattern = re.compile(r'^[A-Z][a-zA-Z0-9]*$') + + # HTTP methods + self.http_methods = {'GET', 'POST', 'PUT', 'PATCH', 'DELETE', 'HEAD', 'OPTIONS'} + + # Category weights (must sum to 100) + self.category_weights = { + ScoreCategory.CONSISTENCY: 30.0, + ScoreCategory.DOCUMENTATION: 20.0, + ScoreCategory.SECURITY: 20.0, + ScoreCategory.USABILITY: 15.0, + ScoreCategory.PERFORMANCE: 15.0 + } + + def score_api(self, spec: Dict[str, Any]) -> APIScorecard: + """Generate comprehensive API scorecard.""" + self.spec = spec + self.scorecard = APIScorecard() + + # Extract 
basic API info + self._extract_api_info() + + # Score each category + self._score_consistency() + self._score_documentation() + self._score_security() + self._score_usability() + self._score_performance() + + # Calculate overall score + self.scorecard.calculate_overall_score() + + return self.scorecard + + def _extract_api_info(self) -> None: + """Extract basic API information.""" + info = self.spec.get('info', {}) + paths = self.spec.get('paths', {}) + + self.scorecard.api_info = { + 'title': info.get('title', 'Unknown API'), + 'version': info.get('version', ''), + 'description': info.get('description', ''), + 'total_paths': len(paths), + 'openapi_version': self.spec.get('openapi', self.spec.get('swagger', '')) + } + + # Count total endpoints + endpoint_count = 0 + for path_obj in paths.values(): + if isinstance(path_obj, dict): + endpoint_count += len([m for m in path_obj.keys() + if m.upper() in self.http_methods]) + + self.scorecard.total_endpoints = endpoint_count + + def _score_consistency(self) -> None: + """Score API consistency (30% weight).""" + category = ScoreCategory.CONSISTENCY + score = CategoryScore( + category=category, + score=0.0, + max_score=100.0, + weight=self.category_weights[category] + ) + + consistency_checks = [ + self._check_naming_consistency(), + self._check_response_consistency(), + self._check_error_format_consistency(), + self._check_parameter_consistency(), + self._check_url_structure_consistency(), + self._check_http_method_consistency(), + self._check_status_code_consistency() + ] + + # Average the consistency scores + valid_scores = [s for s in consistency_checks if s is not None] + if valid_scores: + score.score = sum(valid_scores) / len(valid_scores) + + # Add specific recommendations based on low scores + if score.score < 70: + score.recommendations.extend([ + "Review naming conventions across all endpoints and schemas", + "Standardize response formats and error structures", + "Ensure consistent HTTP method usage patterns" + 
]) + elif score.score < 85: + score.recommendations.extend([ + "Minor consistency improvements needed in naming or response formats", + "Consider creating API design guidelines document" + ]) + + self.scorecard.category_scores[category] = score + + def _check_naming_consistency(self) -> float: + """Check naming convention consistency.""" + paths = self.spec.get('paths', {}) + schemas = self.spec.get('components', {}).get('schemas', {}) + + total_checks = 0 + passed_checks = 0 + + # Check path naming (should be kebab-case) + for path in paths.keys(): + segments = [seg for seg in path.split('/') if seg and not seg.startswith('{')] + for segment in segments: + total_checks += 1 + if self.kebab_case_pattern.match(segment) or re.match(r'^v\d+$', segment): + passed_checks += 1 + + # Check schema naming (should be PascalCase) + for schema_name in schemas.keys(): + total_checks += 1 + if self.pascal_case_pattern.match(schema_name): + passed_checks += 1 + + # Check property naming within schemas + for schema in schemas.values(): + if isinstance(schema, dict) and 'properties' in schema: + for prop_name in schema['properties'].keys(): + total_checks += 1 + if self.camel_case_pattern.match(prop_name): + passed_checks += 1 + + return (passed_checks / total_checks * 100) if total_checks > 0 else 100 + + def _check_response_consistency(self) -> float: + """Check response format consistency.""" + paths = self.spec.get('paths', {}) + + response_patterns = [] + total_responses = 0 + + for path_obj in paths.values(): + if not isinstance(path_obj, dict): + continue + + for method, operation in path_obj.items(): + if method.upper() not in self.http_methods or not isinstance(operation, dict): + continue + + responses = operation.get('responses', {}) + for status_code, response in responses.items(): + if not isinstance(response, dict): + continue + + total_responses += 1 + content = response.get('content', {}) + + # Analyze response structure + for media_type, media_obj in 
content.items(): + schema = media_obj.get('schema', {}) + pattern = self._extract_schema_pattern(schema) + response_patterns.append(pattern) + + # Calculate consistency by comparing patterns + if not response_patterns: + return 100 + + pattern_counts = {} + for pattern in response_patterns: + pattern_key = json.dumps(pattern, sort_keys=True) + pattern_counts[pattern_key] = pattern_counts.get(pattern_key, 0) + 1 + + # Most common pattern should dominate for good consistency + max_count = max(pattern_counts.values()) if pattern_counts else 0 + consistency_ratio = max_count / len(response_patterns) if response_patterns else 1 + + return consistency_ratio * 100 + + def _extract_schema_pattern(self, schema: Dict[str, Any]) -> Dict[str, Any]: + """Extract a pattern from a schema for consistency checking.""" + if not isinstance(schema, dict): + return {} + + pattern = { + 'type': schema.get('type'), + 'has_properties': 'properties' in schema, + 'has_items': 'items' in schema, + 'required_count': len(schema.get('required', [])), + 'property_count': len(schema.get('properties', {})) + } + + return pattern + + def _check_error_format_consistency(self) -> float: + """Check error response format consistency.""" + paths = self.spec.get('paths', {}) + error_responses = [] + + for path_obj in paths.values(): + if not isinstance(path_obj, dict): + continue + + for method, operation in path_obj.items(): + if method.upper() not in self.http_methods: + continue + + responses = operation.get('responses', {}) + for status_code, response in responses.items(): + try: + code_int = int(status_code) + if code_int >= 400: # Error responses + content = response.get('content', {}) + for media_type, media_obj in content.items(): + schema = media_obj.get('schema', {}) + error_responses.append(self._extract_schema_pattern(schema)) + except ValueError: + continue + + if not error_responses: + return 80 # No error responses defined - somewhat concerning + + # Check consistency of error response 
formats + pattern_counts = {} + for pattern in error_responses: + pattern_key = json.dumps(pattern, sort_keys=True) + pattern_counts[pattern_key] = pattern_counts.get(pattern_key, 0) + 1 + + max_count = max(pattern_counts.values()) if pattern_counts else 0 + consistency_ratio = max_count / len(error_responses) if error_responses else 1 + + return consistency_ratio * 100 + + def _check_parameter_consistency(self) -> float: + """Check parameter naming and usage consistency.""" + paths = self.spec.get('paths', {}) + + query_params = [] + path_params = [] + header_params = [] + + for path_obj in paths.values(): + if not isinstance(path_obj, dict): + continue + + for method, operation in path_obj.items(): + if method.upper() not in self.http_methods: + continue + + parameters = operation.get('parameters', []) + for param in parameters: + if not isinstance(param, dict): + continue + + param_name = param.get('name', '') + param_in = param.get('in', '') + + if param_in == 'query': + query_params.append(param_name) + elif param_in == 'path': + path_params.append(param_name) + elif param_in == 'header': + header_params.append(param_name) + + # Check naming consistency for each parameter type + scores = [] + + # Query parameters should be camelCase or kebab-case + if query_params: + valid_query = sum(1 for p in query_params + if self.camel_case_pattern.match(p) or self.kebab_case_pattern.match(p)) + scores.append((valid_query / len(query_params)) * 100) + + # Path parameters should be camelCase or kebab-case + if path_params: + valid_path = sum(1 for p in path_params + if self.camel_case_pattern.match(p) or self.kebab_case_pattern.match(p)) + scores.append((valid_path / len(path_params)) * 100) + + return sum(scores) / len(scores) if scores else 100 + + def _check_url_structure_consistency(self) -> float: + """Check URL structure and pattern consistency.""" + paths = self.spec.get('paths', {}) + + total_paths = len(paths) + if total_paths == 0: + return 0 + + structure_score 
= 0 + + # Check for consistent versioning + versioned_paths = 0 + for path in paths.keys(): + if re.search(r'/v\d+/', path): + versioned_paths += 1 + + # Either all or none should be versioned for consistency + if versioned_paths == 0 or versioned_paths == total_paths: + structure_score += 25 + elif versioned_paths > total_paths * 0.8: + structure_score += 20 + + # Check for reasonable path depth + reasonable_depth = 0 + for path in paths.keys(): + segments = [seg for seg in path.split('/') if seg] + if 2 <= len(segments) <= 5: # Reasonable depth + reasonable_depth += 1 + + structure_score += (reasonable_depth / total_paths) * 25 + + # Check for RESTful resource patterns + restful_patterns = 0 + for path in paths.keys(): + # Look for patterns like /resources/{id} or /resources + if re.match(r'^/[a-z-]+(/\{[^}]+\})?(/[a-z-]+)*$', path): + restful_patterns += 1 + + structure_score += (restful_patterns / total_paths) * 30 + + # Check for consistent trailing slash usage + with_slash = sum(1 for path in paths.keys() if path.endswith('/')) + without_slash = total_paths - with_slash + + # Either all or none should have trailing slashes + if with_slash == 0 or without_slash == 0: + structure_score += 20 + elif min(with_slash, without_slash) < total_paths * 0.1: + structure_score += 15 + + return min(structure_score, 100) + + def _check_http_method_consistency(self) -> float: + """Check HTTP method usage consistency.""" + paths = self.spec.get('paths', {}) + + method_usage = {} + total_operations = 0 + + for path, path_obj in paths.items(): + if not isinstance(path_obj, dict): + continue + + for method in path_obj.keys(): + if method.upper() in self.http_methods: + method_upper = method.upper() + total_operations += 1 + + # Analyze method usage patterns + if method_upper not in method_usage: + method_usage[method_upper] = {'count': 0, 'appropriate': 0} + + method_usage[method_upper]['count'] += 1 + + # Check if method usage seems appropriate + if 
self._is_method_usage_appropriate(path, method_upper, path_obj[method]): + method_usage[method_upper]['appropriate'] += 1 + + if total_operations == 0: + return 0 + + # Calculate appropriateness score + total_appropriate = sum(data['appropriate'] for data in method_usage.values()) + return (total_appropriate / total_operations) * 100 + + def _is_method_usage_appropriate(self, path: str, method: str, operation: Dict) -> bool: + """Check if HTTP method usage is appropriate for the endpoint.""" + # Simple heuristics for method appropriateness + has_request_body = 'requestBody' in operation + path_has_id = '{' in path and '}' in path + + if method == 'GET': + return not has_request_body # GET should not have body + elif method == 'POST': + return not path_has_id # POST typically for collections + elif method == 'PUT': + return path_has_id and has_request_body # PUT for specific resources + elif method == 'PATCH': + return path_has_id # PATCH for specific resources + elif method == 'DELETE': + return path_has_id # DELETE for specific resources + + return True # Default to appropriate for other methods + + def _check_status_code_consistency(self) -> float: + """Check HTTP status code usage consistency.""" + paths = self.spec.get('paths', {}) + + method_status_patterns = {} + total_operations = 0 + + for path_obj in paths.values(): + if not isinstance(path_obj, dict): + continue + + for method, operation in path_obj.items(): + if method.upper() not in self.http_methods: + continue + + total_operations += 1 + responses = operation.get('responses', {}) + status_codes = set(responses.keys()) + + if method.upper() not in method_status_patterns: + method_status_patterns[method.upper()] = [] + + method_status_patterns[method.upper()].append(status_codes) + + if total_operations == 0: + return 0 + + # Check consistency within each method type + consistency_scores = [] + + for method, status_patterns in method_status_patterns.items(): + if not status_patterns: + continue + + # 
def _score_documentation(self) -> None:
    """Score API documentation quality and record it on the scorecard.

    Averages six sub-checks (API info, endpoints, schemas, parameters,
    responses, example coverage), attaches threshold-based
    recommendations, and stores the result under
    ScoreCategory.DOCUMENTATION (20% weight).
    """
    category = ScoreCategory.DOCUMENTATION
    result = CategoryScore(
        category=category,
        score=0.0,
        max_score=100.0,
        weight=self.category_weights[category]
    )

    checks = [
        self._check_api_level_documentation(),
        self._check_endpoint_documentation(),
        self._check_schema_documentation(),
        self._check_parameter_documentation(),
        self._check_response_documentation(),
        self._check_example_coverage(),
    ]
    usable = [value for value in checks if value is not None]
    if usable:
        result.score = sum(usable) / len(usable)

    # Recommendation tiers keyed by how far the score falls short.
    if result.score < 60:
        result.recommendations.extend([
            "Add comprehensive descriptions to all API components",
            "Include examples for complex operations and schemas",
            "Document all parameters and response fields"
        ])
    elif result.score < 80:
        result.recommendations.extend([
            "Improve documentation completeness for some endpoints",
            "Add more examples to enhance developer experience"
        ])

    self.scorecard.category_scores[category] = result
float: + """Check API-level documentation completeness.""" + info = self.spec.get('info', {}) + score = 0 + + # Required fields + if info.get('title'): + score += 20 + if info.get('version'): + score += 20 + if info.get('description') and len(info['description']) > 20: + score += 30 + + # Optional but recommended fields + if info.get('contact'): + score += 15 + if info.get('license'): + score += 15 + + return score + + def _check_endpoint_documentation(self) -> float: + """Check endpoint-level documentation completeness.""" + paths = self.spec.get('paths', {}) + + total_operations = 0 + documented_operations = 0 + + for path_obj in paths.values(): + if not isinstance(path_obj, dict): + continue + + for method, operation in path_obj.items(): + if method.upper() not in self.http_methods: + continue + + total_operations += 1 + doc_score = 0 + + if operation.get('summary'): + doc_score += 1 + if operation.get('description') and len(operation['description']) > 20: + doc_score += 1 + if operation.get('operationId'): + doc_score += 1 + + # Consider it documented if it has at least 2/3 elements + if doc_score >= 2: + documented_operations += 1 + + return (documented_operations / total_operations * 100) if total_operations > 0 else 100 + + def _check_schema_documentation(self) -> float: + """Check schema documentation completeness.""" + schemas = self.spec.get('components', {}).get('schemas', {}) + + if not schemas: + return 80 # No schemas to document + + total_schemas = len(schemas) + documented_schemas = 0 + + for schema_name, schema in schemas.items(): + if not isinstance(schema, dict): + continue + + doc_elements = 0 + + # Schema-level description + if schema.get('description'): + doc_elements += 1 + + # Property descriptions + properties = schema.get('properties', {}) + if properties: + described_props = sum(1 for prop in properties.values() + if isinstance(prop, dict) and prop.get('description')) + if described_props > len(properties) * 0.5: # At least 50% documented 
+ doc_elements += 1 + + # Examples + if schema.get('example') or any( + isinstance(prop, dict) and prop.get('example') + for prop in properties.values() + ): + doc_elements += 1 + + if doc_elements >= 2: + documented_schemas += 1 + + return (documented_schemas / total_schemas * 100) if total_schemas > 0 else 100 + + def _check_parameter_documentation(self) -> float: + """Check parameter documentation completeness.""" + paths = self.spec.get('paths', {}) + + total_params = 0 + documented_params = 0 + + for path_obj in paths.values(): + if not isinstance(path_obj, dict): + continue + + for method, operation in path_obj.items(): + if method.upper() not in self.http_methods: + continue + + parameters = operation.get('parameters', []) + for param in parameters: + if not isinstance(param, dict): + continue + + total_params += 1 + + doc_score = 0 + if param.get('description'): + doc_score += 1 + if param.get('example') or (param.get('schema', {}).get('example')): + doc_score += 1 + + if doc_score >= 1: # At least description + documented_params += 1 + + return (documented_params / total_params * 100) if total_params > 0 else 100 + + def _check_response_documentation(self) -> float: + """Check response documentation completeness.""" + paths = self.spec.get('paths', {}) + + total_responses = 0 + documented_responses = 0 + + for path_obj in paths.values(): + if not isinstance(path_obj, dict): + continue + + for method, operation in path_obj.items(): + if method.upper() not in self.http_methods: + continue + + responses = operation.get('responses', {}) + for status_code, response in responses.items(): + if not isinstance(response, dict): + continue + + total_responses += 1 + + if response.get('description'): + documented_responses += 1 + + return (documented_responses / total_responses * 100) if total_responses > 0 else 100 + + def _check_example_coverage(self) -> float: + """Check example coverage across the API.""" + paths = self.spec.get('paths', {}) + schemas = 
self.spec.get('components', {}).get('schemas', {}) + + # Check examples in operations + total_operations = 0 + operations_with_examples = 0 + + for path_obj in paths.values(): + if not isinstance(path_obj, dict): + continue + + for method, operation in path_obj.items(): + if method.upper() not in self.http_methods: + continue + + total_operations += 1 + + has_example = False + + # Check request body examples + request_body = operation.get('requestBody', {}) + if self._has_examples(request_body.get('content', {})): + has_example = True + + # Check response examples + responses = operation.get('responses', {}) + for response in responses.values(): + if isinstance(response, dict) and self._has_examples(response.get('content', {})): + has_example = True + break + + if has_example: + operations_with_examples += 1 + + # Check examples in schemas + total_schemas = len(schemas) + schemas_with_examples = 0 + + for schema in schemas.values(): + if isinstance(schema, dict) and self._schema_has_examples(schema): + schemas_with_examples += 1 + + # Combine scores + operation_score = (operations_with_examples / total_operations * 100) if total_operations > 0 else 100 + schema_score = (schemas_with_examples / total_schemas * 100) if total_schemas > 0 else 100 + + return (operation_score + schema_score) / 2 + + def _has_examples(self, content: Dict[str, Any]) -> bool: + """Check if content has examples.""" + for media_type, media_obj in content.items(): + if isinstance(media_obj, dict): + if media_obj.get('example') or media_obj.get('examples'): + return True + return False + + def _schema_has_examples(self, schema: Dict[str, Any]) -> bool: + """Check if schema has examples.""" + if schema.get('example'): + return True + + properties = schema.get('properties', {}) + for prop in properties.values(): + if isinstance(prop, dict) and prop.get('example'): + return True + + return False + + def _score_security(self) -> None: + """Score API security implementation (20% weight).""" + 
category = ScoreCategory.SECURITY + score = CategoryScore( + category=category, + score=0.0, + max_score=100.0, + weight=self.category_weights[category] + ) + + security_checks = [ + self._check_security_schemes(), + self._check_security_requirements(), + self._check_https_usage(), + self._check_authentication_patterns(), + self._check_sensitive_data_handling() + ] + + valid_scores = [s for s in security_checks if s is not None] + if valid_scores: + score.score = sum(valid_scores) / len(valid_scores) + + # Add recommendations + if score.score < 50: + score.recommendations.extend([ + "Implement comprehensive security schemes (OAuth2, API keys, etc.)", + "Ensure all endpoints have appropriate security requirements", + "Add input validation and rate limiting patterns" + ]) + elif score.score < 80: + score.recommendations.extend([ + "Review security coverage for all endpoints", + "Consider additional security measures for sensitive operations" + ]) + + self.scorecard.category_scores[category] = score + + def _check_security_schemes(self) -> float: + """Check security scheme definitions.""" + security_schemes = self.spec.get('components', {}).get('securitySchemes', {}) + + if not security_schemes: + return 20 # Very low score for no security + + score = 40 # Base score for having security schemes + + scheme_types = set() + for scheme in security_schemes.values(): + if isinstance(scheme, dict): + scheme_type = scheme.get('type') + scheme_types.add(scheme_type) + + # Bonus for modern security schemes + if 'oauth2' in scheme_types: + score += 30 + if 'apiKey' in scheme_types: + score += 15 + if 'http' in scheme_types: + score += 15 + + return min(score, 100) + + def _check_security_requirements(self) -> float: + """Check security requirement coverage.""" + paths = self.spec.get('paths', {}) + global_security = self.spec.get('security', []) + + total_operations = 0 + secured_operations = 0 + + for path_obj in paths.values(): + if not isinstance(path_obj, dict): + continue + 
+ for method, operation in path_obj.items(): + if method.upper() not in self.http_methods: + continue + + total_operations += 1 + + # Check if operation has security requirements + operation_security = operation.get('security') + + if operation_security is not None: + secured_operations += 1 + elif global_security: + secured_operations += 1 + + return (secured_operations / total_operations * 100) if total_operations > 0 else 0 + + def _check_https_usage(self) -> float: + """Check HTTPS enforcement.""" + servers = self.spec.get('servers', []) + + if not servers: + return 60 # No servers defined - assume HTTPS + + https_servers = 0 + for server in servers: + if isinstance(server, dict): + url = server.get('url', '') + if url.startswith('https://') or not url.startswith('http://'): + https_servers += 1 + + return (https_servers / len(servers) * 100) if servers else 100 + + def _check_authentication_patterns(self) -> float: + """Check authentication pattern quality.""" + security_schemes = self.spec.get('components', {}).get('securitySchemes', {}) + + if not security_schemes: + return 0 + + pattern_scores = [] + + for scheme in security_schemes.values(): + if not isinstance(scheme, dict): + continue + + scheme_type = scheme.get('type', '').lower() + + if scheme_type == 'oauth2': + # OAuth2 is highly recommended + flows = scheme.get('flows', {}) + if flows: + pattern_scores.append(95) + else: + pattern_scores.append(80) + elif scheme_type == 'http': + scheme_scheme = scheme.get('scheme', '').lower() + if scheme_scheme == 'bearer': + pattern_scores.append(85) + elif scheme_scheme == 'basic': + pattern_scores.append(60) # Less secure + else: + pattern_scores.append(70) + elif scheme_type == 'apikey': + location = scheme.get('in', '').lower() + if location == 'header': + pattern_scores.append(75) + else: + pattern_scores.append(60) # Query/cookie less secure + else: + pattern_scores.append(50) # Unknown scheme + + return sum(pattern_scores) / len(pattern_scores) if 
pattern_scores else 0 + + def _check_sensitive_data_handling(self) -> float: + """Check sensitive data handling patterns.""" + # This is a simplified check - in reality would need more sophisticated analysis + schemas = self.spec.get('components', {}).get('schemas', {}) + + score = 80 # Default good score + + # Look for potential sensitive fields without proper handling + sensitive_field_names = {'password', 'secret', 'token', 'key', 'ssn', 'credit_card'} + + for schema in schemas.values(): + if not isinstance(schema, dict): + continue + + properties = schema.get('properties', {}) + for prop_name, prop_def in properties.items(): + if not isinstance(prop_def, dict): + continue + + # Check for sensitive field names + if any(sensitive in prop_name.lower() for sensitive in sensitive_field_names): + # Check if it's marked as sensitive (writeOnly, format: password, etc.) + if not (prop_def.get('writeOnly') or + prop_def.get('format') == 'password' or + 'password' in prop_def.get('description', '').lower()): + score -= 10 # Penalty for exposed sensitive field + + return max(score, 0) + + def _score_usability(self) -> None: + """Score API usability and developer experience (15% weight).""" + category = ScoreCategory.USABILITY + score = CategoryScore( + category=category, + score=0.0, + max_score=100.0, + weight=self.category_weights[category] + ) + + usability_checks = [ + self._check_discoverability(), + self._check_error_handling(), + self._check_filtering_and_searching(), + self._check_resource_relationships(), + self._check_developer_experience() + ] + + valid_scores = [s for s in usability_checks if s is not None] + if valid_scores: + score.score = sum(valid_scores) / len(valid_scores) + + # Add recommendations + if score.score < 60: + score.recommendations.extend([ + "Improve error messages with actionable guidance", + "Add filtering and search capabilities to list endpoints", + "Enhance resource discoverability with better linking" + ]) + elif score.score < 80: + 
score.recommendations.extend([ + "Consider adding HATEOAS links for better discoverability", + "Enhance developer experience with better examples" + ]) + + self.scorecard.category_scores[category] = score + + def _check_discoverability(self) -> float: + """Check API discoverability features.""" + paths = self.spec.get('paths', {}) + + # Look for root/discovery endpoints + has_root = '/' in paths or any(path == '/api' or path.startswith('/api/') for path in paths) + + # Look for HATEOAS patterns in responses + hateoas_score = 0 + total_responses = 0 + + for path_obj in paths.values(): + if not isinstance(path_obj, dict): + continue + + for method, operation in path_obj.items(): + if method.upper() not in self.http_methods: + continue + + responses = operation.get('responses', {}) + for response in responses.values(): + if not isinstance(response, dict): + continue + + total_responses += 1 + + # Look for link-like properties in response schemas + content = response.get('content', {}) + for media_obj in content.values(): + schema = media_obj.get('schema', {}) + if self._has_link_properties(schema): + hateoas_score += 1 + break + + discovery_score = 50 if has_root else 30 + if total_responses > 0: + hateoas_ratio = hateoas_score / total_responses + discovery_score += hateoas_ratio * 50 + + return min(discovery_score, 100) + + def _has_link_properties(self, schema: Dict[str, Any]) -> bool: + """Check if schema has link-like properties.""" + if not isinstance(schema, dict): + return False + + properties = schema.get('properties', {}) + link_indicators = {'links', '_links', 'href', 'url', 'self', 'next', 'prev'} + + return any(prop_name.lower() in link_indicators for prop_name in properties.keys()) + + def _check_error_handling(self) -> float: + """Check error handling quality.""" + paths = self.spec.get('paths', {}) + + total_operations = 0 + operations_with_errors = 0 + detailed_error_responses = 0 + + for path_obj in paths.values(): + if not isinstance(path_obj, dict): 
+ continue + + for method, operation in path_obj.items(): + if method.upper() not in self.http_methods: + continue + + total_operations += 1 + responses = operation.get('responses', {}) + + # Check for error responses + has_error_responses = any( + status_code.startswith('4') or status_code.startswith('5') + for status_code in responses.keys() + ) + + if has_error_responses: + operations_with_errors += 1 + + # Check for detailed error schemas + for status_code, response in responses.items(): + if (status_code.startswith('4') or status_code.startswith('5')) and isinstance(response, dict): + content = response.get('content', {}) + for media_obj in content.values(): + schema = media_obj.get('schema', {}) + if self._has_detailed_error_schema(schema): + detailed_error_responses += 1 + break + break + + if total_operations == 0: + return 0 + + error_coverage = (operations_with_errors / total_operations) * 60 + error_detail = (detailed_error_responses / operations_with_errors * 40) if operations_with_errors > 0 else 0 + + return error_coverage + error_detail + + def _has_detailed_error_schema(self, schema: Dict[str, Any]) -> bool: + """Check if error schema has detailed information.""" + if not isinstance(schema, dict): + return False + + properties = schema.get('properties', {}) + error_fields = {'error', 'message', 'details', 'code', 'timestamp'} + + matching_fields = sum(1 for field in error_fields if field in properties) + return matching_fields >= 2 # At least 2 standard error fields + + def _check_filtering_and_searching(self) -> float: + """Check filtering and search capabilities.""" + paths = self.spec.get('paths', {}) + + collection_endpoints = 0 + endpoints_with_filtering = 0 + + for path, path_obj in paths.items(): + if not isinstance(path_obj, dict): + continue + + # Identify collection endpoints (no path parameters) + if '{' not in path: + get_operation = path_obj.get('get') + if get_operation: + collection_endpoints += 1 + + # Check for filtering/search 
parameters + parameters = get_operation.get('parameters', []) + filter_params = {'filter', 'search', 'q', 'query', 'limit', 'page', 'offset'} + + has_filtering = any( + isinstance(param, dict) and param.get('name', '').lower() in filter_params + for param in parameters + ) + + if has_filtering: + endpoints_with_filtering += 1 + + return (endpoints_with_filtering / collection_endpoints * 100) if collection_endpoints > 0 else 100 + + def _check_resource_relationships(self) -> float: + """Check resource relationship handling.""" + paths = self.spec.get('paths', {}) + schemas = self.spec.get('components', {}).get('schemas', {}) + + # Look for nested resource patterns + nested_resources = 0 + total_resource_paths = 0 + + for path in paths.keys(): + # Skip root paths + if path.count('/') >= 3: # e.g., /api/users/123/orders + total_resource_paths += 1 + if '{' in path: + nested_resources += 1 + + # Look for relationship fields in schemas + schemas_with_relations = 0 + for schema in schemas.values(): + if not isinstance(schema, dict): + continue + + properties = schema.get('properties', {}) + relation_indicators = {'id', '_id', 'ref', 'link', 'relationship'} + + has_relations = any( + any(indicator in prop_name.lower() for indicator in relation_indicators) + for prop_name in properties.keys() + ) + + if has_relations: + schemas_with_relations += 1 + + nested_score = (nested_resources / total_resource_paths * 50) if total_resource_paths > 0 else 25 + schema_score = (schemas_with_relations / len(schemas) * 50) if schemas else 25 + + return nested_score + schema_score + + def _check_developer_experience(self) -> float: + """Check overall developer experience factors.""" + # This is a composite score based on various DX factors + factors = [] + + # Factor 1: Consistent response structure + factors.append(self._check_response_consistency()) + + # Factor 2: Clear operation IDs + paths = self.spec.get('paths', {}) + total_operations = 0 + operations_with_ids = 0 + + for path_obj 
in paths.values(): + if not isinstance(path_obj, dict): + continue + + for method, operation in path_obj.items(): + if method.upper() not in self.http_methods: + continue + + total_operations += 1 + if isinstance(operation, dict) and operation.get('operationId'): + operations_with_ids += 1 + + operation_id_score = (operations_with_ids / total_operations * 100) if total_operations > 0 else 100 + factors.append(operation_id_score) + + # Factor 3: Reasonable path complexity + avg_path_complexity = 0 + if paths: + complexities = [] + for path in paths.keys(): + segments = [seg for seg in path.split('/') if seg] + complexities.append(len(segments)) + + avg_complexity = sum(complexities) / len(complexities) + # Optimal complexity is 3-4 segments + if 3 <= avg_complexity <= 4: + avg_path_complexity = 100 + elif 2 <= avg_complexity <= 5: + avg_path_complexity = 80 + else: + avg_path_complexity = 60 + + factors.append(avg_path_complexity) + + return sum(factors) / len(factors) if factors else 0 + + def _score_performance(self) -> None: + """Score API performance patterns (15% weight).""" + category = ScoreCategory.PERFORMANCE + score = CategoryScore( + category=category, + score=0.0, + max_score=100.0, + weight=self.category_weights[category] + ) + + performance_checks = [ + self._check_caching_headers(), + self._check_pagination_patterns(), + self._check_compression_support(), + self._check_efficiency_patterns(), + self._check_batch_operations() + ] + + valid_scores = [s for s in performance_checks if s is not None] + if valid_scores: + score.score = sum(valid_scores) / len(valid_scores) + + # Add recommendations + if score.score < 60: + score.recommendations.extend([ + "Implement pagination for list endpoints", + "Add caching headers for cacheable responses", + "Consider batch operations for bulk updates" + ]) + elif score.score < 80: + score.recommendations.extend([ + "Review caching strategies for better performance", + "Consider field selection parameters for large 
responses" + ]) + + self.scorecard.category_scores[category] = score + + def _check_caching_headers(self) -> float: + """Check caching header implementation.""" + paths = self.spec.get('paths', {}) + + get_operations = 0 + cacheable_operations = 0 + + for path_obj in paths.values(): + if not isinstance(path_obj, dict): + continue + + get_operation = path_obj.get('get') + if get_operation and isinstance(get_operation, dict): + get_operations += 1 + + # Check for caching-related headers in responses + responses = get_operation.get('responses', {}) + for response in responses.values(): + if not isinstance(response, dict): + continue + + headers = response.get('headers', {}) + cache_headers = {'cache-control', 'etag', 'last-modified', 'expires'} + + if any(header.lower() in cache_headers for header in headers.keys()): + cacheable_operations += 1 + break + + return (cacheable_operations / get_operations * 100) if get_operations > 0 else 50 + + def _check_pagination_patterns(self) -> float: + """Check pagination implementation.""" + paths = self.spec.get('paths', {}) + + collection_endpoints = 0 + paginated_endpoints = 0 + + for path, path_obj in paths.items(): + if not isinstance(path_obj, dict): + continue + + # Identify collection endpoints + if '{' not in path: # No path parameters = collection + get_operation = path_obj.get('get') + if get_operation and isinstance(get_operation, dict): + collection_endpoints += 1 + + # Check for pagination parameters + parameters = get_operation.get('parameters', []) + pagination_params = {'limit', 'offset', 'page', 'pagesize', 'per_page', 'cursor'} + + has_pagination = any( + isinstance(param, dict) and param.get('name', '').lower() in pagination_params + for param in parameters + ) + + if has_pagination: + paginated_endpoints += 1 + + return (paginated_endpoints / collection_endpoints * 100) if collection_endpoints > 0 else 100 + + def _check_compression_support(self) -> float: + """Check compression support indicators.""" + # 
This is speculative - OpenAPI doesn't directly specify compression + # Look for indicators that compression is considered + + servers = self.spec.get('servers', []) + + # Check if any server descriptions mention compression + compression_mentions = 0 + for server in servers: + if isinstance(server, dict): + description = server.get('description', '').lower() + if any(term in description for term in ['gzip', 'compress', 'deflate']): + compression_mentions += 1 + + # Base score - assume compression is handled at server level + base_score = 70 + + if compression_mentions > 0: + return min(base_score + (compression_mentions * 10), 100) + + return base_score + + def _check_efficiency_patterns(self) -> float: + """Check efficiency patterns like field selection.""" + paths = self.spec.get('paths', {}) + + total_get_operations = 0 + operations_with_selection = 0 + + for path_obj in paths.values(): + if not isinstance(path_obj, dict): + continue + + get_operation = path_obj.get('get') + if get_operation and isinstance(get_operation, dict): + total_get_operations += 1 + + # Check for field selection parameters + parameters = get_operation.get('parameters', []) + selection_params = {'fields', 'select', 'include', 'exclude'} + + has_selection = any( + isinstance(param, dict) and param.get('name', '').lower() in selection_params + for param in parameters + ) + + if has_selection: + operations_with_selection += 1 + + return (operations_with_selection / total_get_operations * 100) if total_get_operations > 0 else 60 + + def _check_batch_operations(self) -> float: + """Check for batch operation support.""" + paths = self.spec.get('paths', {}) + + # Look for batch endpoints + batch_indicators = ['batch', 'bulk', 'multi'] + batch_endpoints = 0 + + for path in paths.keys(): + if any(indicator in path.lower() for indicator in batch_indicators): + batch_endpoints += 1 + + # Look for array-based request bodies (indicating batch operations) + array_operations = 0 + 
def generate_json_report(self) -> str:
    """Serialize the scorecard to a pretty-printed JSON string.

    The payload mirrors the text report: overall grade/score, API info,
    per-category breakdown (score, grade, weight, weighted contribution,
    issues, recommendations) and the top recommendations list.
    """
    payload = {
        "overall": {
            "score": round(self.scorecard.overall_score, 2),
            "grade": self.scorecard.overall_grade,
            "totalEndpoints": self.scorecard.total_endpoints
        },
        "api_info": self.scorecard.api_info,
        "categories": {
            category.value: {
                "score": round(entry.score, 2),
                "grade": entry.letter_grade,
                "weight": entry.weight,
                "weightedScore": round(entry.weighted_score, 2),
                "issues": entry.issues,
                "recommendations": entry.recommendations
            }
            for category, entry in self.scorecard.category_scores.items()
        },
        "topRecommendations": self.scorecard.get_top_recommendations()
    }
    return json.dumps(payload, indent=2)
Endpoints: {self.scorecard.total_endpoints}", + "", + f"🏆 OVERALL GRADE: {self.scorecard.overall_grade} ({self.scorecard.overall_score:.1f}/100.0)", + "", + "═══════════════════════════════════════════════════════════════", + "DETAILED BREAKDOWN:", + "═══════════════════════════════════════════════════════════════" + ] + + # Sort categories by weight (most important first) + sorted_categories = sorted( + self.scorecard.category_scores.items(), + key=lambda x: x[1].weight, + reverse=True + ) + + for category, score in sorted_categories: + category_name = category.value.title().replace('_', ' ') + + lines.extend([ + "", + f"📊 {category_name.upper()} - Grade: {score.letter_grade} ({score.score:.1f}/100)", + f" Weight: {score.weight}% | Contribution: {score.weighted_score:.1f} points", + " " + "─" * 50 + ]) + + if score.recommendations: + lines.append(" 💡 Recommendations:") + for rec in score.recommendations[:3]: # Top 3 recommendations + lines.append(f" • {rec}") + else: + lines.append(" ✅ No specific recommendations - performing well!") + + # Overall assessment + lines.extend([ + "", + "═══════════════════════════════════════════════════════════════", + "OVERALL ASSESSMENT:", + "═══════════════════════════════════════════════════════════════" + ]) + + if self.scorecard.overall_grade == "A": + lines.extend([ + "🏆 EXCELLENT! Your API demonstrates outstanding design quality.", + " Continue following these best practices and consider sharing", + " your approach as a reference for other teams." + ]) + elif self.scorecard.overall_grade == "B": + lines.extend([ + "✅ GOOD! Your API follows most best practices with room for", + " minor improvements. Focus on the recommendations above", + " to achieve excellence." + ]) + elif self.scorecard.overall_grade == "C": + lines.extend([ + "⚠️ FAIR! Your API has a solid foundation but several areas", + " need improvement. Prioritize the high-weight categories", + " for maximum impact." 
def main():
    """Main CLI entry point.

    Parses arguments, loads the OpenAPI spec, scores it with
    APIScoringEngine, and emits the report as text or JSON.

    Returns:
        Process exit code: 0 on success, 1 on load/scoring/write failure
        or when the achieved grade is below ``--min-grade``.
    """
    parser = argparse.ArgumentParser(
        description="Generate comprehensive API design quality scorecard",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python api_scorecard.py openapi.json
  python api_scorecard.py --format json openapi.json > scorecard.json
  python api_scorecard.py --output scorecard.txt openapi.json
        """
    )

    parser.add_argument(
        'spec_file',
        help='OpenAPI/Swagger specification file (JSON format)'
    )

    parser.add_argument(
        '--format',
        choices=['text', 'json'],
        default='text',
        help='Output format (default: text)'
    )

    parser.add_argument(
        '--output',
        help='Output file (default: stdout)'
    )

    parser.add_argument(
        '--min-grade',
        choices=['A', 'B', 'C', 'D', 'F'],
        help='Exit with code 1 if grade is below minimum'
    )

    args = parser.parse_args()

    # Load specification file. Explicit utf-8 so specs with non-ASCII
    # descriptions parse identically on every platform.
    try:
        with open(args.spec_file, 'r', encoding='utf-8') as f:
            spec = json.load(f)
    except FileNotFoundError:
        print(f"Error: Specification file '{args.spec_file}' not found.", file=sys.stderr)
        return 1
    except json.JSONDecodeError as e:
        print(f"Error: Invalid JSON in '{args.spec_file}': {e}", file=sys.stderr)
        return 1

    # Initialize scoring engine and generate scorecard
    engine = APIScoringEngine()

    try:
        scorecard = engine.score_api(spec)
    except Exception as e:
        print(f"Error during scoring: {e}", file=sys.stderr)
        return 1

    # Generate report
    if args.format == 'json':
        output = engine.generate_json_report()
    else:
        output = engine.generate_text_report()

    # Write output. utf-8 is required here: the text report contains
    # box-drawing and emoji characters that raise UnicodeEncodeError
    # under Windows' default cp1252 codec.
    if args.output:
        try:
            with open(args.output, 'w', encoding='utf-8') as f:
                f.write(output)
            print(f"Scorecard written to {args.output}")
        except IOError as e:
            print(f"Error writing to '{args.output}': {e}", file=sys.stderr)
            return 1
    else:
        print(output)

    # Check minimum grade requirement (grades ordered worst -> best).
    if args.min_grade:
        grade_order = ['F', 'D', 'C', 'B', 'A']
        current_grade_index = grade_order.index(scorecard.overall_grade)
        min_grade_index = grade_order.index(args.min_grade)

        if current_grade_index < min_grade_index:
            print(f"Grade {scorecard.overall_grade} is below minimum required grade {args.min_grade}", file=sys.stderr)
            return 1

    return 0


if __name__ == '__main__':
    sys.exit(main())
+ +This script analyzes two versions of an API specification and detects potentially +breaking changes including: +- Removed endpoints +- Modified response structures +- Removed or renamed fields +- Field type changes +- New required fields +- HTTP status code changes +- Parameter changes + +Generates detailed reports with migration guides for each breaking change. +""" + +import argparse +import json +import sys +from typing import Any, Dict, List, Set, Optional, Tuple, Union +from dataclasses import dataclass, field +from enum import Enum + + +class ChangeType(Enum): + """Types of API changes.""" + BREAKING = "breaking" + POTENTIALLY_BREAKING = "potentially_breaking" + NON_BREAKING = "non_breaking" + ENHANCEMENT = "enhancement" + + +class ChangeSeverity(Enum): + """Severity levels for changes.""" + CRITICAL = "critical" # Will definitely break clients + HIGH = "high" # Likely to break some clients + MEDIUM = "medium" # May break clients depending on usage + LOW = "low" # Minor impact, unlikely to break clients + INFO = "info" # Informational, no breaking impact + + +@dataclass +class Change: + """Represents a detected change between API versions.""" + change_type: ChangeType + severity: ChangeSeverity + category: str + path: str + message: str + old_value: Any = None + new_value: Any = None + migration_guide: str = "" + impact_description: str = "" + + def to_dict(self) -> Dict[str, Any]: + """Convert change to dictionary for JSON serialization.""" + return { + "changeType": self.change_type.value, + "severity": self.severity.value, + "category": self.category, + "path": self.path, + "message": self.message, + "oldValue": self.old_value, + "newValue": self.new_value, + "migrationGuide": self.migration_guide, + "impactDescription": self.impact_description + } + + +@dataclass +class ComparisonReport: + """Complete comparison report between two API versions.""" + changes: List[Change] = field(default_factory=list) + summary: Dict[str, int] = 
field(default_factory=dict) + + def add_change(self, change: Change) -> None: + """Add a change to the report.""" + self.changes.append(change) + + def calculate_summary(self) -> None: + """Calculate summary statistics.""" + self.summary = { + "total_changes": len(self.changes), + "breaking_changes": len([c for c in self.changes if c.change_type == ChangeType.BREAKING]), + "potentially_breaking_changes": len([c for c in self.changes if c.change_type == ChangeType.POTENTIALLY_BREAKING]), + "non_breaking_changes": len([c for c in self.changes if c.change_type == ChangeType.NON_BREAKING]), + "enhancements": len([c for c in self.changes if c.change_type == ChangeType.ENHANCEMENT]), + "critical_severity": len([c for c in self.changes if c.severity == ChangeSeverity.CRITICAL]), + "high_severity": len([c for c in self.changes if c.severity == ChangeSeverity.HIGH]), + "medium_severity": len([c for c in self.changes if c.severity == ChangeSeverity.MEDIUM]), + "low_severity": len([c for c in self.changes if c.severity == ChangeSeverity.LOW]), + "info_severity": len([c for c in self.changes if c.severity == ChangeSeverity.INFO]) + } + + def has_breaking_changes(self) -> bool: + """Check if report contains any breaking changes.""" + return any(c.change_type in [ChangeType.BREAKING, ChangeType.POTENTIALLY_BREAKING] + for c in self.changes) + + +class BreakingChangeDetector: + """Main breaking change detection engine.""" + + def __init__(self): + self.report = ComparisonReport() + self.old_spec: Optional[Dict] = None + self.new_spec: Optional[Dict] = None + + def compare_specs(self, old_spec: Dict[str, Any], new_spec: Dict[str, Any]) -> ComparisonReport: + """Compare two API specifications and detect changes.""" + self.old_spec = old_spec + self.new_spec = new_spec + self.report = ComparisonReport() + + # Compare different sections of the API specification + self._compare_info_section() + self._compare_servers_section() + self._compare_paths_section() + 
self._compare_components_section() + self._compare_security_section() + + # Calculate summary statistics + self.report.calculate_summary() + + return self.report + + def _compare_info_section(self) -> None: + """Compare API info sections.""" + old_info = self.old_spec.get('info', {}) + new_info = self.new_spec.get('info', {}) + + # Version comparison + old_version = old_info.get('version', '') + new_version = new_info.get('version', '') + + if old_version != new_version: + self.report.add_change(Change( + change_type=ChangeType.NON_BREAKING, + severity=ChangeSeverity.INFO, + category="versioning", + path="/info/version", + message=f"API version changed from '{old_version}' to '{new_version}'", + old_value=old_version, + new_value=new_version, + impact_description="Version change indicates API evolution" + )) + + # Title comparison + old_title = old_info.get('title', '') + new_title = new_info.get('title', '') + + if old_title != new_title: + self.report.add_change(Change( + change_type=ChangeType.NON_BREAKING, + severity=ChangeSeverity.INFO, + category="metadata", + path="/info/title", + message=f"API title changed from '{old_title}' to '{new_title}'", + old_value=old_title, + new_value=new_title, + impact_description="Title change is cosmetic and doesn't affect functionality" + )) + + def _compare_servers_section(self) -> None: + """Compare server configurations.""" + old_servers = self.old_spec.get('servers', []) + new_servers = self.new_spec.get('servers', []) + + old_urls = {server.get('url', '') for server in old_servers if isinstance(server, dict)} + new_urls = {server.get('url', '') for server in new_servers if isinstance(server, dict)} + + # Removed servers + removed_urls = old_urls - new_urls + for url in removed_urls: + self.report.add_change(Change( + change_type=ChangeType.BREAKING, + severity=ChangeSeverity.HIGH, + category="servers", + path="/servers", + message=f"Server URL removed: {url}", + old_value=url, + new_value=None, + 
migration_guide=f"Update client configurations to use alternative server URLs: {list(new_urls)}", + impact_description="Clients configured to use removed server URL will fail to connect" + )) + + # Added servers + added_urls = new_urls - old_urls + for url in added_urls: + self.report.add_change(Change( + change_type=ChangeType.ENHANCEMENT, + severity=ChangeSeverity.INFO, + category="servers", + path="/servers", + message=f"New server URL added: {url}", + old_value=None, + new_value=url, + impact_description="New server option provides additional deployment flexibility" + )) + + def _compare_paths_section(self) -> None: + """Compare API paths and operations.""" + old_paths = self.old_spec.get('paths', {}) + new_paths = self.new_spec.get('paths', {}) + + # Find removed, added, and modified paths + old_path_set = set(old_paths.keys()) + new_path_set = set(new_paths.keys()) + + removed_paths = old_path_set - new_path_set + added_paths = new_path_set - old_path_set + common_paths = old_path_set & new_path_set + + # Handle removed paths + for path in removed_paths: + old_operations = self._extract_operations(old_paths[path]) + for method in old_operations: + self.report.add_change(Change( + change_type=ChangeType.BREAKING, + severity=ChangeSeverity.CRITICAL, + category="endpoints", + path=f"/paths{path}", + message=f"Endpoint removed: {method.upper()} {path}", + old_value=f"{method.upper()} {path}", + new_value=None, + migration_guide=self._generate_endpoint_removal_migration(path, method, new_paths), + impact_description="Clients using this endpoint will receive 404 errors" + )) + + # Handle added paths + for path in added_paths: + new_operations = self._extract_operations(new_paths[path]) + for method in new_operations: + self.report.add_change(Change( + change_type=ChangeType.ENHANCEMENT, + severity=ChangeSeverity.INFO, + category="endpoints", + path=f"/paths{path}", + message=f"New endpoint added: {method.upper()} {path}", + old_value=None, + 
new_value=f"{method.upper()} {path}", + impact_description="New functionality available to clients" + )) + + # Handle modified paths + for path in common_paths: + self._compare_path_operations(path, old_paths[path], new_paths[path]) + + def _extract_operations(self, path_object: Dict[str, Any]) -> List[str]: + """Extract HTTP operations from a path object.""" + http_methods = {'get', 'post', 'put', 'patch', 'delete', 'head', 'options', 'trace'} + return [method for method in path_object.keys() if method.lower() in http_methods] + + def _compare_path_operations(self, path: str, old_path_obj: Dict, new_path_obj: Dict) -> None: + """Compare operations within a specific path.""" + old_operations = set(self._extract_operations(old_path_obj)) + new_operations = set(self._extract_operations(new_path_obj)) + + # Removed operations + removed_ops = old_operations - new_operations + for method in removed_ops: + self.report.add_change(Change( + change_type=ChangeType.BREAKING, + severity=ChangeSeverity.CRITICAL, + category="endpoints", + path=f"/paths{path}/{method}", + message=f"HTTP method removed: {method.upper()} {path}", + old_value=f"{method.upper()} {path}", + new_value=None, + migration_guide=self._generate_method_removal_migration(path, method, new_operations), + impact_description="Clients using this method will receive 405 Method Not Allowed errors" + )) + + # Added operations + added_ops = new_operations - old_operations + for method in added_ops: + self.report.add_change(Change( + change_type=ChangeType.ENHANCEMENT, + severity=ChangeSeverity.INFO, + category="endpoints", + path=f"/paths{path}/{method}", + message=f"New HTTP method added: {method.upper()} {path}", + old_value=None, + new_value=f"{method.upper()} {path}", + impact_description="New method provides additional functionality for this resource" + )) + + # Modified operations + common_ops = old_operations & new_operations + for method in common_ops: + self._compare_operation_details(path, method, 
old_path_obj[method], new_path_obj[method]) + + def _compare_operation_details(self, path: str, method: str, old_op: Dict, new_op: Dict) -> None: + """Compare details of individual operations.""" + operation_path = f"/paths{path}/{method}" + + # Compare parameters + self._compare_parameters(operation_path, old_op.get('parameters', []), new_op.get('parameters', [])) + + # Compare request body + self._compare_request_body(operation_path, old_op.get('requestBody'), new_op.get('requestBody')) + + # Compare responses + self._compare_responses(operation_path, old_op.get('responses', {}), new_op.get('responses', {})) + + # Compare security requirements + self._compare_security_requirements(operation_path, old_op.get('security'), new_op.get('security')) + + def _compare_parameters(self, base_path: str, old_params: List[Dict], new_params: List[Dict]) -> None: + """Compare operation parameters.""" + # Create lookup dictionaries + old_param_map = {(p.get('name'), p.get('in')): p for p in old_params} + new_param_map = {(p.get('name'), p.get('in')): p for p in new_params} + + old_param_keys = set(old_param_map.keys()) + new_param_keys = set(new_param_map.keys()) + + # Removed parameters + removed_params = old_param_keys - new_param_keys + for param_key in removed_params: + name, location = param_key + self.report.add_change(Change( + change_type=ChangeType.BREAKING, + severity=ChangeSeverity.HIGH, + category="parameters", + path=f"{base_path}/parameters", + message=f"Parameter removed: {name} (in: {location})", + old_value=old_param_map[param_key], + new_value=None, + migration_guide=f"Remove '{name}' parameter from {location} when calling this endpoint", + impact_description="Clients sending this parameter may receive validation errors" + )) + + # Added parameters + added_params = new_param_keys - old_param_keys + for param_key in added_params: + name, location = param_key + new_param = new_param_map[param_key] + is_required = new_param.get('required', False) + + if 
is_required: + self.report.add_change(Change( + change_type=ChangeType.BREAKING, + severity=ChangeSeverity.CRITICAL, + category="parameters", + path=f"{base_path}/parameters", + message=f"New required parameter added: {name} (in: {location})", + old_value=None, + new_value=new_param, + migration_guide=f"Add required '{name}' parameter to {location} when calling this endpoint", + impact_description="Clients not providing this parameter will receive 400 Bad Request errors" + )) + else: + self.report.add_change(Change( + change_type=ChangeType.NON_BREAKING, + severity=ChangeSeverity.INFO, + category="parameters", + path=f"{base_path}/parameters", + message=f"New optional parameter added: {name} (in: {location})", + old_value=None, + new_value=new_param, + impact_description="Optional parameter provides additional functionality" + )) + + # Modified parameters + common_params = old_param_keys & new_param_keys + for param_key in common_params: + name, location = param_key + old_param = old_param_map[param_key] + new_param = new_param_map[param_key] + self._compare_parameter_details(base_path, name, location, old_param, new_param) + + def _compare_parameter_details(self, base_path: str, name: str, location: str, + old_param: Dict, new_param: Dict) -> None: + """Compare individual parameter details.""" + param_path = f"{base_path}/parameters/{name}" + + # Required status change + old_required = old_param.get('required', False) + new_required = new_param.get('required', False) + + if old_required != new_required: + if new_required: + self.report.add_change(Change( + change_type=ChangeType.BREAKING, + severity=ChangeSeverity.HIGH, + category="parameters", + path=param_path, + message=f"Parameter '{name}' is now required (was optional)", + old_value=old_required, + new_value=new_required, + migration_guide=f"Ensure '{name}' parameter is always provided when calling this endpoint", + impact_description="Clients not providing this parameter will receive validation errors" + )) 
+ else: + self.report.add_change(Change( + change_type=ChangeType.NON_BREAKING, + severity=ChangeSeverity.INFO, + category="parameters", + path=param_path, + message=f"Parameter '{name}' is now optional (was required)", + old_value=old_required, + new_value=new_required, + impact_description="Parameter is now optional, providing more flexibility to clients" + )) + + # Schema/type changes + old_schema = old_param.get('schema', {}) + new_schema = new_param.get('schema', {}) + + if old_schema != new_schema: + self._compare_schemas(param_path, old_schema, new_schema, f"parameter '{name}'") + + def _compare_request_body(self, base_path: str, old_body: Optional[Dict], new_body: Optional[Dict]) -> None: + """Compare request body specifications.""" + body_path = f"{base_path}/requestBody" + + # Request body added + if old_body is None and new_body is not None: + is_required = new_body.get('required', False) + if is_required: + self.report.add_change(Change( + change_type=ChangeType.BREAKING, + severity=ChangeSeverity.HIGH, + category="request_body", + path=body_path, + message="Required request body added", + old_value=None, + new_value=new_body, + migration_guide="Include request body with appropriate content type when calling this endpoint", + impact_description="Clients not providing request body will receive validation errors" + )) + else: + self.report.add_change(Change( + change_type=ChangeType.NON_BREAKING, + severity=ChangeSeverity.INFO, + category="request_body", + path=body_path, + message="Optional request body added", + old_value=None, + new_value=new_body, + impact_description="Optional request body provides additional functionality" + )) + + # Request body removed + elif old_body is not None and new_body is None: + self.report.add_change(Change( + change_type=ChangeType.BREAKING, + severity=ChangeSeverity.HIGH, + category="request_body", + path=body_path, + message="Request body removed", + old_value=old_body, + new_value=None, + migration_guide="Remove 
request body when calling this endpoint", + impact_description="Clients sending request body may receive validation errors" + )) + + # Request body modified + elif old_body is not None and new_body is not None: + self._compare_request_body_details(body_path, old_body, new_body) + + def _compare_request_body_details(self, base_path: str, old_body: Dict, new_body: Dict) -> None: + """Compare request body details.""" + # Required status change + old_required = old_body.get('required', False) + new_required = new_body.get('required', False) + + if old_required != new_required: + if new_required: + self.report.add_change(Change( + change_type=ChangeType.BREAKING, + severity=ChangeSeverity.HIGH, + category="request_body", + path=base_path, + message="Request body is now required (was optional)", + old_value=old_required, + new_value=new_required, + migration_guide="Always include request body when calling this endpoint", + impact_description="Clients not providing request body will receive validation errors" + )) + else: + self.report.add_change(Change( + change_type=ChangeType.NON_BREAKING, + severity=ChangeSeverity.INFO, + category="request_body", + path=base_path, + message="Request body is now optional (was required)", + old_value=old_required, + new_value=new_required, + impact_description="Request body is now optional, providing more flexibility" + )) + + # Content type changes + old_content = old_body.get('content', {}) + new_content = new_body.get('content', {}) + self._compare_content_types(base_path, old_content, new_content, "request body") + + def _compare_responses(self, base_path: str, old_responses: Dict, new_responses: Dict) -> None: + """Compare response specifications.""" + responses_path = f"{base_path}/responses" + + old_status_codes = set(old_responses.keys()) + new_status_codes = set(new_responses.keys()) + + # Removed status codes + removed_codes = old_status_codes - new_status_codes + for code in removed_codes: + self.report.add_change(Change( + 
change_type=ChangeType.BREAKING, + severity=ChangeSeverity.HIGH, + category="responses", + path=f"{responses_path}/{code}", + message=f"Response status code {code} removed", + old_value=old_responses[code], + new_value=None, + migration_guide=f"Handle alternative status codes: {list(new_status_codes)}", + impact_description=f"Clients expecting status code {code} need to handle different responses" + )) + + # Added status codes + added_codes = new_status_codes - old_status_codes + for code in added_codes: + self.report.add_change(Change( + change_type=ChangeType.NON_BREAKING, + severity=ChangeSeverity.INFO, + category="responses", + path=f"{responses_path}/{code}", + message=f"New response status code {code} added", + old_value=None, + new_value=new_responses[code], + impact_description="New status code provides more specific response information" + )) + + # Modified responses + common_codes = old_status_codes & new_status_codes + for code in common_codes: + self._compare_response_details(responses_path, code, old_responses[code], new_responses[code]) + + def _compare_response_details(self, base_path: str, status_code: str, + old_response: Dict, new_response: Dict) -> None: + """Compare individual response details.""" + response_path = f"{base_path}/{status_code}" + + # Compare content types and schemas + old_content = old_response.get('content', {}) + new_content = new_response.get('content', {}) + + self._compare_content_types(response_path, old_content, new_content, f"response {status_code}") + + def _compare_content_types(self, base_path: str, old_content: Dict, new_content: Dict, context: str) -> None: + """Compare content types and their schemas.""" + old_types = set(old_content.keys()) + new_types = set(new_content.keys()) + + # Removed content types + removed_types = old_types - new_types + for content_type in removed_types: + self.report.add_change(Change( + change_type=ChangeType.BREAKING, + severity=ChangeSeverity.HIGH, + category="content_types", + 
path=f"{base_path}/content", + message=f"Content type '{content_type}' removed from {context}", + old_value=content_type, + new_value=None, + migration_guide=f"Use alternative content types: {list(new_types)}", + impact_description=f"Clients expecting '{content_type}' need to handle different formats" + )) + + # Added content types + added_types = new_types - old_types + for content_type in added_types: + self.report.add_change(Change( + change_type=ChangeType.ENHANCEMENT, + severity=ChangeSeverity.INFO, + category="content_types", + path=f"{base_path}/content", + message=f"New content type '{content_type}' added to {context}", + old_value=None, + new_value=content_type, + impact_description=f"Additional format option available for {context}" + )) + + # Modified schemas for common content types + common_types = old_types & new_types + for content_type in common_types: + old_media = old_content[content_type] + new_media = new_content[content_type] + + old_schema = old_media.get('schema', {}) + new_schema = new_media.get('schema', {}) + + if old_schema != new_schema: + schema_path = f"{base_path}/content/{content_type}/schema" + self._compare_schemas(schema_path, old_schema, new_schema, f"{context} ({content_type})") + + def _compare_schemas(self, base_path: str, old_schema: Dict, new_schema: Dict, context: str) -> None: + """Compare schema definitions.""" + # Type changes + old_type = old_schema.get('type') + new_type = new_schema.get('type') + + if old_type != new_type and old_type is not None and new_type is not None: + self.report.add_change(Change( + change_type=ChangeType.BREAKING, + severity=ChangeSeverity.CRITICAL, + category="schema", + path=base_path, + message=f"Schema type changed from '{old_type}' to '{new_type}' for {context}", + old_value=old_type, + new_value=new_type, + migration_guide=f"Update client code to handle {new_type} instead of {old_type}", + impact_description="Type change will break client parsing and validation" + )) + + # Property 
changes for object types + if old_schema.get('type') == 'object' and new_schema.get('type') == 'object': + self._compare_object_properties(base_path, old_schema, new_schema, context) + + # Array item changes + if old_schema.get('type') == 'array' and new_schema.get('type') == 'array': + old_items = old_schema.get('items', {}) + new_items = new_schema.get('items', {}) + if old_items != new_items: + self._compare_schemas(f"{base_path}/items", old_items, new_items, f"{context} items") + + def _compare_object_properties(self, base_path: str, old_schema: Dict, new_schema: Dict, context: str) -> None: + """Compare object schema properties.""" + old_props = old_schema.get('properties', {}) + new_props = new_schema.get('properties', {}) + old_required = set(old_schema.get('required', [])) + new_required = set(new_schema.get('required', [])) + + old_prop_names = set(old_props.keys()) + new_prop_names = set(new_props.keys()) + + # Removed properties + removed_props = old_prop_names - new_prop_names + for prop_name in removed_props: + severity = ChangeSeverity.CRITICAL if prop_name in old_required else ChangeSeverity.HIGH + self.report.add_change(Change( + change_type=ChangeType.BREAKING, + severity=severity, + category="schema", + path=f"{base_path}/properties", + message=f"Property '{prop_name}' removed from {context}", + old_value=old_props[prop_name], + new_value=None, + migration_guide=f"Remove references to '{prop_name}' property in client code", + impact_description="Clients expecting this property will receive incomplete data" + )) + + # Added properties + added_props = new_prop_names - old_prop_names + for prop_name in added_props: + if prop_name in new_required: + # This is handled separately in required field changes + pass + else: + self.report.add_change(Change( + change_type=ChangeType.NON_BREAKING, + severity=ChangeSeverity.INFO, + category="schema", + path=f"{base_path}/properties", + message=f"New optional property '{prop_name}' added to {context}", + 
old_value=None, + new_value=new_props[prop_name], + impact_description="New property provides additional data without breaking existing clients" + )) + + # Required field changes + added_required = new_required - old_required + removed_required = old_required - new_required + + for prop_name in added_required: + self.report.add_change(Change( + change_type=ChangeType.BREAKING, + severity=ChangeSeverity.CRITICAL, + category="schema", + path=f"{base_path}/properties", + message=f"Property '{prop_name}' is now required in {context}", + old_value=False, + new_value=True, + migration_guide=f"Ensure '{prop_name}' is always provided when sending {context}", + impact_description="Clients not providing this property will receive validation errors" + )) + + for prop_name in removed_required: + self.report.add_change(Change( + change_type=ChangeType.NON_BREAKING, + severity=ChangeSeverity.INFO, + category="schema", + path=f"{base_path}/properties", + message=f"Property '{prop_name}' is no longer required in {context}", + old_value=True, + new_value=False, + impact_description="Property is now optional, providing more flexibility" + )) + + # Modified properties + common_props = old_prop_names & new_prop_names + for prop_name in common_props: + old_prop = old_props[prop_name] + new_prop = new_props[prop_name] + if old_prop != new_prop: + self._compare_schemas(f"{base_path}/properties/{prop_name}", + old_prop, new_prop, f"{context}.{prop_name}") + + def _compare_security_requirements(self, base_path: str, old_security: Optional[List], + new_security: Optional[List]) -> None: + """Compare security requirements.""" + # Simplified security comparison - could be expanded + if old_security != new_security: + severity = ChangeSeverity.HIGH if new_security else ChangeSeverity.CRITICAL + change_type = ChangeType.BREAKING + + if old_security is None and new_security is not None: + message = "Security requirements added" + migration_guide = "Ensure proper authentication/authorization when 
calling this endpoint" + impact = "Endpoint now requires authentication" + elif old_security is not None and new_security is None: + message = "Security requirements removed" + migration_guide = "Authentication is no longer required for this endpoint" + impact = "Endpoint is now publicly accessible" + severity = ChangeSeverity.MEDIUM # Less severe, more permissive + else: + message = "Security requirements modified" + migration_guide = "Update authentication/authorization method for this endpoint" + impact = "Different authentication method required" + + self.report.add_change(Change( + change_type=change_type, + severity=severity, + category="security", + path=f"{base_path}/security", + message=message, + old_value=old_security, + new_value=new_security, + migration_guide=migration_guide, + impact_description=impact + )) + + def _compare_components_section(self) -> None: + """Compare components sections.""" + old_components = self.old_spec.get('components', {}) + new_components = self.new_spec.get('components', {}) + + # Compare schemas + old_schemas = old_components.get('schemas', {}) + new_schemas = new_components.get('schemas', {}) + + old_schema_names = set(old_schemas.keys()) + new_schema_names = set(new_schemas.keys()) + + # Removed schemas + removed_schemas = old_schema_names - new_schema_names + for schema_name in removed_schemas: + self.report.add_change(Change( + change_type=ChangeType.BREAKING, + severity=ChangeSeverity.HIGH, + category="components", + path=f"/components/schemas/{schema_name}", + message=f"Schema '{schema_name}' removed from components", + old_value=old_schemas[schema_name], + new_value=None, + migration_guide=f"Remove references to schema '{schema_name}' or use alternative schemas", + impact_description="References to this schema will fail validation" + )) + + # Added schemas + added_schemas = new_schema_names - old_schema_names + for schema_name in added_schemas: + self.report.add_change(Change( + change_type=ChangeType.ENHANCEMENT, + 
severity=ChangeSeverity.INFO, + category="components", + path=f"/components/schemas/{schema_name}", + message=f"New schema '{schema_name}' added to components", + old_value=None, + new_value=new_schemas[schema_name], + impact_description="New reusable schema available" + )) + + # Modified schemas + common_schemas = old_schema_names & new_schema_names + for schema_name in common_schemas: + old_schema = old_schemas[schema_name] + new_schema = new_schemas[schema_name] + if old_schema != new_schema: + self._compare_schemas(f"/components/schemas/{schema_name}", + old_schema, new_schema, f"schema '{schema_name}'") + + def _compare_security_section(self) -> None: + """Compare security definitions.""" + old_security_schemes = self.old_spec.get('components', {}).get('securitySchemes', {}) + new_security_schemes = self.new_spec.get('components', {}).get('securitySchemes', {}) + + if old_security_schemes != new_security_schemes: + # Simplified comparison - could be more detailed + self.report.add_change(Change( + change_type=ChangeType.POTENTIALLY_BREAKING, + severity=ChangeSeverity.MEDIUM, + category="security", + path="/components/securitySchemes", + message="Security scheme definitions changed", + old_value=old_security_schemes, + new_value=new_security_schemes, + migration_guide="Review authentication implementation for compatibility with new security schemes", + impact_description="Authentication mechanisms may have changed" + )) + + def _generate_endpoint_removal_migration(self, removed_path: str, method: str, + remaining_paths: Dict[str, Any]) -> str: + """Generate migration guide for removed endpoints.""" + # Look for similar endpoints + similar_paths = [] + path_segments = removed_path.strip('/').split('/') + + for existing_path in remaining_paths.keys(): + existing_segments = existing_path.strip('/').split('/') + if len(existing_segments) == len(path_segments): + # Check similarity + similarity = sum(1 for i, seg in enumerate(path_segments) + if i < 
len(existing_segments) and seg == existing_segments[i]) + if similarity >= len(path_segments) * 0.5: # At least 50% similar + similar_paths.append(existing_path) + + if similar_paths: + return f"Consider using alternative endpoints: {', '.join(similar_paths[:3])}" + else: + return "No direct replacement available. Review API documentation for alternative approaches." + + def _generate_method_removal_migration(self, path: str, removed_method: str, + remaining_methods: Set[str]) -> str: + """Generate migration guide for removed HTTP methods.""" + method_alternatives = { + 'get': ['head'], + 'post': ['put', 'patch'], + 'put': ['post', 'patch'], + 'patch': ['put', 'post'], + 'delete': [] + } + + alternatives = [] + for alt_method in method_alternatives.get(removed_method.lower(), []): + if alt_method in remaining_methods: + alternatives.append(alt_method.upper()) + + if alternatives: + return f"Use alternative methods: {', '.join(alternatives)}" + else: + return f"No alternative HTTP methods available for {path}" + + def generate_json_report(self) -> str: + """Generate JSON format report.""" + report_data = { + "summary": self.report.summary, + "hasBreakingChanges": self.report.has_breaking_changes(), + "changes": [change.to_dict() for change in self.report.changes] + } + + return json.dumps(report_data, indent=2) + + def generate_text_report(self) -> str: + """Generate human-readable text report.""" + lines = [ + "═══════════════════════════════════════════════════════════════", + " BREAKING CHANGE ANALYSIS REPORT", + "═══════════════════════════════════════════════════════════════", + "", + "SUMMARY:", + f" Total Changes: {self.report.summary.get('total_changes', 0)}", + f" 🔴 Breaking Changes: {self.report.summary.get('breaking_changes', 0)}", + f" 🟡 Potentially Breaking: {self.report.summary.get('potentially_breaking_changes', 0)}", + f" 🟢 Non-Breaking Changes: {self.report.summary.get('non_breaking_changes', 0)}", + f" ✨ Enhancements: 
{self.report.summary.get('enhancements', 0)}", + "", + "SEVERITY BREAKDOWN:", + f" 🚨 Critical: {self.report.summary.get('critical_severity', 0)}", + f" ⚠️ High: {self.report.summary.get('high_severity', 0)}", + f" ⚪ Medium: {self.report.summary.get('medium_severity', 0)}", + f" 🔵 Low: {self.report.summary.get('low_severity', 0)}", + f" ℹ️ Info: {self.report.summary.get('info_severity', 0)}", + "" + ] + + if not self.report.changes: + lines.extend([ + "🎉 No changes detected between the API versions!", + "" + ]) + else: + # Group changes by type and severity + breaking_changes = [c for c in self.report.changes if c.change_type == ChangeType.BREAKING] + potentially_breaking = [c for c in self.report.changes if c.change_type == ChangeType.POTENTIALLY_BREAKING] + non_breaking = [c for c in self.report.changes if c.change_type == ChangeType.NON_BREAKING] + enhancements = [c for c in self.report.changes if c.change_type == ChangeType.ENHANCEMENT] + + # Breaking changes section + if breaking_changes: + lines.extend([ + "🔴 BREAKING CHANGES:", + "═" * 60 + ]) + for change in sorted(breaking_changes, key=lambda x: x.severity.value): + self._add_change_to_report(lines, change) + lines.append("") + + # Potentially breaking changes section + if potentially_breaking: + lines.extend([ + "🟡 POTENTIALLY BREAKING CHANGES:", + "═" * 60 + ]) + for change in sorted(potentially_breaking, key=lambda x: x.severity.value): + self._add_change_to_report(lines, change) + lines.append("") + + # Non-breaking changes section + if non_breaking: + lines.extend([ + "🟢 NON-BREAKING CHANGES:", + "═" * 60 + ]) + for change in non_breaking: + self._add_change_to_report(lines, change) + lines.append("") + + # Enhancements section + if enhancements: + lines.extend([ + "✨ ENHANCEMENTS:", + "═" * 60 + ]) + for change in enhancements: + self._add_change_to_report(lines, change) + lines.append("") + + # Add overall assessment + lines.extend([ + 
"═══════════════════════════════════════════════════════════════", + "OVERALL ASSESSMENT:", + "═══════════════════════════════════════════════════════════════" + ]) + + if self.report.has_breaking_changes(): + breaking_count = self.report.summary.get('breaking_changes', 0) + potentially_breaking_count = self.report.summary.get('potentially_breaking_changes', 0) + + if breaking_count > 0: + lines.extend([ + f"⛔ MAJOR VERSION BUMP REQUIRED", + f" This API version contains {breaking_count} breaking changes that will", + f" definitely break existing clients. A major version bump is required.", + "" + ]) + elif potentially_breaking_count > 0: + lines.extend([ + f"⚠️ MINOR VERSION BUMP RECOMMENDED", + f" This API version contains {potentially_breaking_count} potentially breaking", + f" changes. Consider a minor version bump and communicate changes to clients.", + "" + ]) + else: + lines.extend([ + "✅ PATCH VERSION BUMP ACCEPTABLE", + " No breaking changes detected. This version is backward compatible", + " with existing clients.", + "" + ]) + + return "\n".join(lines) + + def _add_change_to_report(self, lines: List[str], change: Change) -> None: + """Add a change to the text report.""" + severity_icons = { + ChangeSeverity.CRITICAL: "🚨", + ChangeSeverity.HIGH: "⚠️ ", + ChangeSeverity.MEDIUM: "⚪", + ChangeSeverity.LOW: "🔵", + ChangeSeverity.INFO: "ℹ️ " + } + + icon = severity_icons.get(change.severity, "❓") + + lines.extend([ + f"{icon} {change.severity.value.upper()}: {change.message}", + f" Path: {change.path}", + f" Category: {change.category}" + ]) + + if change.impact_description: + lines.append(f" Impact: {change.impact_description}") + + if change.migration_guide: + lines.append(f" 💡 Migration: {change.migration_guide}") + + lines.append("") + + +def main(): + """Main CLI entry point.""" + parser = argparse.ArgumentParser( + description="Compare API specification versions to detect breaking changes", + formatter_class=argparse.RawDescriptionHelpFormatter, + 
epilog=""" +Examples: + python breaking_change_detector.py v1.json v2.json + python breaking_change_detector.py --format json v1.json v2.json > changes.json + python breaking_change_detector.py --output report.txt v1.json v2.json + """ + ) + + parser.add_argument( + 'old_spec', + help='Old API specification file (JSON format)' + ) + + parser.add_argument( + 'new_spec', + help='New API specification file (JSON format)' + ) + + parser.add_argument( + '--format', + choices=['text', 'json'], + default='text', + help='Output format (default: text)' + ) + + parser.add_argument( + '--output', + help='Output file (default: stdout)' + ) + + parser.add_argument( + '--exit-on-breaking', + action='store_true', + help='Exit with code 1 if breaking changes are detected' + ) + + args = parser.parse_args() + + # Load specification files + try: + with open(args.old_spec, 'r') as f: + old_spec = json.load(f) + except FileNotFoundError: + print(f"Error: Old specification file '{args.old_spec}' not found.", file=sys.stderr) + return 1 + except json.JSONDecodeError as e: + print(f"Error: Invalid JSON in '{args.old_spec}': {e}", file=sys.stderr) + return 1 + + try: + with open(args.new_spec, 'r') as f: + new_spec = json.load(f) + except FileNotFoundError: + print(f"Error: New specification file '{args.new_spec}' not found.", file=sys.stderr) + return 1 + except json.JSONDecodeError as e: + print(f"Error: Invalid JSON in '{args.new_spec}': {e}", file=sys.stderr) + return 1 + + # Initialize detector and compare specifications + detector = BreakingChangeDetector() + + try: + report = detector.compare_specs(old_spec, new_spec) + except Exception as e: + print(f"Error during comparison: {e}", file=sys.stderr) + return 1 + + # Generate report + if args.format == 'json': + output = detector.generate_json_report() + else: + output = detector.generate_text_report() + + # Write output + if args.output: + try: + with open(args.output, 'w') as f: + f.write(output) + print(f"Breaking change report 
written to {args.output}") + except IOError as e: + print(f"Error writing to '{args.output}': {e}", file=sys.stderr) + return 1 + else: + print(output) + + # Exit with appropriate code + if args.exit_on_breaking and report.has_breaking_changes(): + return 1 + + return 0 + + +if __name__ == '__main__': + sys.exit(main()) \ No newline at end of file diff --git a/engineering/interview-system-designer/SKILL.md b/engineering/interview-system-designer/SKILL.md new file mode 100644 index 0000000..adb7d8c --- /dev/null +++ b/engineering/interview-system-designer/SKILL.md @@ -0,0 +1,458 @@ +--- +name: interview-system-designer +description: This skill should be used when the user asks to "design interview processes", "create hiring pipelines", "calibrate interview loops", "generate interview questions", "design competency matrices", "analyze interviewer bias", "create scoring rubrics", "build question banks", or "optimize hiring systems". Use for designing role-specific interview loops, competency assessments, and hiring calibration systems. +--- + +# Interview System Designer + +Comprehensive interview system design, competency assessment, and hiring process optimization. 
+ +## Table of Contents + +- [Quick Start](#quick-start) +- [Tools Overview](#tools-overview) + - [Interview Loop Designer](#1-interview-loop-designer) + - [Question Bank Generator](#2-question-bank-generator) + - [Hiring Calibrator](#3-hiring-calibrator) +- [Interview System Workflows](#interview-system-workflows) + - [Role-Specific Loop Design](#role-specific-loop-design) + - [Competency Matrix Development](#competency-matrix-development) + - [Question Bank Creation](#question-bank-creation) + - [Bias Mitigation Framework](#bias-mitigation-framework) + - [Hiring Bar Calibration](#hiring-bar-calibration) +- [Competency Frameworks](#competency-frameworks) +- [Scoring & Calibration](#scoring--calibration) +- [Reference Documentation](#reference-documentation) +- [Industry Standards](#industry-standards) + +--- + +## Quick Start + +```bash +# Design a complete interview loop for a senior software engineer role +python loop_designer.py --role "Senior Software Engineer" --level senior --team platform --output loops/ + +# Generate a comprehensive question bank for a product manager position +python question_bank_generator.py --role "Product Manager" --level senior --competencies leadership,strategy,analytics --output questions/ + +# Analyze interview calibration across multiple candidates and interviewers +python hiring_calibrator.py --input interview_data.json --output calibration_report.json --analysis-type full +``` + +--- + +## Tools Overview + +### 1. Interview Loop Designer + +Generates calibrated interview loops tailored to specific roles, levels, and teams. 
+ +**Input:** Role definition (title, level, team, competency requirements) +**Output:** Complete interview loop with rounds, focus areas, time allocation, scorecard templates + +**Key Features:** +- Role-specific competency mapping +- Level-appropriate question difficulty +- Interviewer skill requirements +- Time-optimized scheduling +- Standardized scorecards + +**Usage:** +```bash +# Design loop for a specific role +python loop_designer.py --role "Staff Data Scientist" --level staff --team ml-platform + +# Generate loop with specific focus areas +python loop_designer.py --role "Engineering Manager" --level senior --competencies leadership,technical,strategy + +# Create loop for multiple levels +python loop_designer.py --role "Backend Engineer" --levels junior,mid,senior --output loops/backend/ +``` + +### 2. Question Bank Generator + +Creates comprehensive, competency-based interview questions with detailed scoring criteria. + +**Input:** Role requirements, competency areas, experience level +**Output:** Structured question bank with scoring rubrics, follow-up probes, and calibration examples + +**Key Features:** +- Competency-based question organization +- Level-appropriate difficulty progression +- Behavioral and technical question types +- Anti-bias question design +- Calibration examples (poor/good/great answers) + +**Usage:** +```bash +# Generate questions for technical competencies +python question_bank_generator.py --role "Frontend Engineer" --competencies react,typescript,system-design + +# Create behavioral question bank +python question_bank_generator.py --role "Product Manager" --question-types behavioral,leadership --output pm_questions/ + +# Generate questions for all levels +python question_bank_generator.py --role "DevOps Engineer" --levels junior,mid,senior,staff +``` + +### 3. Hiring Calibrator + +Analyzes interview scores to detect bias, calibration issues, and recommends improvements. 
+ +**Input:** Interview results data (candidate scores, interviewer feedback, demographics) +**Output:** Calibration analysis, bias detection report, interviewer coaching recommendations + +**Key Features:** +- Statistical bias detection +- Interviewer calibration analysis +- Score distribution analysis +- Recommendation engine +- Trend tracking over time + +**Usage:** +```bash +# Analyze calibration across all interviews +python hiring_calibrator.py --input interview_results.json --analysis-type comprehensive + +# Focus on specific competency areas +python hiring_calibrator.py --input data.json --competencies technical,leadership --output bias_report.json + +# Track calibration trends over time +python hiring_calibrator.py --input historical_data.json --trend-analysis --period quarterly +``` + +--- + +## Interview System Workflows + +### Role-Specific Loop Design + +#### Software Engineering Roles + +**Junior/Mid Software Engineer (2-4 years)** +- **Duration:** 3-4 hours across 3-4 rounds +- **Focus Areas:** Coding fundamentals, debugging, system understanding, growth mindset +- **Rounds:** + 1. Technical Phone Screen (45min) - Coding fundamentals, algorithms + 2. Coding Deep Dive (60min) - Problem-solving, code quality, testing + 3. System Design Basics (45min) - Component interaction, basic scalability + 4. Behavioral & Values (30min) - Team collaboration, learning agility + +**Senior Software Engineer (5-8 years)** +- **Duration:** 4-5 hours across 4-5 rounds +- **Focus Areas:** System design, technical leadership, mentoring capability, domain expertise +- **Rounds:** + 1. Technical Phone Screen (45min) - Advanced algorithms, optimization + 2. System Design (60min) - Scalability, trade-offs, architectural decisions + 3. Coding Excellence (60min) - Code quality, testing strategies, refactoring + 4. Technical Leadership (45min) - Mentoring, technical decisions, cross-team collaboration + 5. 
Behavioral & Culture (30min) - Leadership examples, conflict resolution + +**Staff+ Engineer (8+ years)** +- **Duration:** 5-6 hours across 5-6 rounds +- **Focus Areas:** Architectural vision, organizational impact, technical strategy, cross-functional leadership +- **Rounds:** + 1. Technical Phone Screen (45min) - System architecture, complex problem-solving + 2. Architecture Design (90min) - Large-scale systems, technology choices, evolution patterns + 3. Technical Strategy (60min) - Technical roadmaps, technology adoption, risk assessment + 4. Leadership & Influence (60min) - Cross-team impact, technical vision, stakeholder management + 5. Coding & Best Practices (45min) - Code quality standards, development processes + 6. Cultural & Strategic Fit (30min) - Company values, strategic thinking + +#### Product Management Roles + +**Product Manager (3-6 years)** +- **Duration:** 3-4 hours across 4 rounds +- **Focus Areas:** Product sense, analytical thinking, stakeholder management, execution +- **Rounds:** + 1. Product Sense (60min) - Feature prioritization, user empathy, market understanding + 2. Analytical Thinking (45min) - Data interpretation, metrics design, experimentation + 3. Execution & Process (45min) - Project management, cross-functional collaboration + 4. Behavioral & Leadership (30min) - Stakeholder management, conflict resolution + +**Senior Product Manager (6-10 years)** +- **Duration:** 4-5 hours across 4-5 rounds +- **Focus Areas:** Product strategy, team leadership, business impact, market analysis +- **Rounds:** + 1. Product Strategy (75min) - Market analysis, competitive positioning, roadmap planning + 2. Leadership & Influence (60min) - Team building, stakeholder management, decision-making + 3. Data & Analytics (45min) - Advanced metrics, experimentation design, business intelligence + 4. Technical Collaboration (45min) - Technical trade-offs, engineering partnership + 5. 
Case Study Presentation (45min) - Past impact, lessons learned, strategic thinking + +#### Design Roles + +**UX Designer (2-5 years)** +- **Duration:** 3-4 hours across 3-4 rounds +- **Focus Areas:** Design process, user research, visual design, collaboration +- **Rounds:** + 1. Portfolio Review (60min) - Design process, problem-solving approach, visual skills + 2. Design Challenge (90min) - User-centered design, wireframing, iteration + 3. Collaboration & Process (45min) - Cross-functional work, feedback incorporation + 4. Behavioral & Values (30min) - User advocacy, creative problem-solving + +**Senior UX Designer (5+ years)** +- **Duration:** 4-5 hours across 4-5 rounds +- **Focus Areas:** Design leadership, system thinking, research methodology, business impact +- **Rounds:** + 1. Portfolio Deep Dive (75min) - Design impact, methodology, leadership examples + 2. Design System Challenge (90min) - Systems thinking, scalability, consistency + 3. Research & Strategy (60min) - User research methods, data-driven design decisions + 4. Leadership & Mentoring (45min) - Design team leadership, process improvement + 5. 
Business & Strategy (30min) - Design's business impact, stakeholder management + +### Competency Matrix Development + +#### Technical Competencies + +**Software Engineering** +- **Coding Proficiency:** Algorithm design, data structures, language expertise +- **System Design:** Architecture patterns, scalability, performance optimization +- **Testing & Quality:** Unit testing, integration testing, code review practices +- **DevOps & Tools:** CI/CD, monitoring, debugging, development workflows + +**Data Science & Analytics** +- **Statistical Analysis:** Statistical methods, hypothesis testing, experimental design +- **Machine Learning:** Algorithm selection, model evaluation, feature engineering +- **Data Engineering:** ETL processes, data pipeline design, data quality +- **Business Intelligence:** Metrics design, dashboard creation, stakeholder communication + +**Product Management** +- **Product Strategy:** Market analysis, competitive research, roadmap planning +- **User Research:** User interviews, usability testing, persona development +- **Data Analysis:** Metrics interpretation, A/B testing, cohort analysis +- **Technical Understanding:** API design, database concepts, system architecture + +#### Behavioral Competencies + +**Leadership & Influence** +- **Team Building:** Hiring, onboarding, team culture development +- **Mentoring & Coaching:** Skill development, career guidance, feedback delivery +- **Strategic Thinking:** Long-term planning, vision setting, decision-making frameworks +- **Change Management:** Process improvement, organizational change, resistance handling + +**Communication & Collaboration** +- **Stakeholder Management:** Expectation setting, conflict resolution, alignment building +- **Cross-Functional Partnership:** Engineering-Product-Design collaboration +- **Presentation Skills:** Technical communication, executive briefings, documentation +- **Active Listening:** Empathy, question asking, perspective taking + +**Problem-Solving & 
Innovation** +- **Analytical Thinking:** Problem decomposition, root cause analysis, hypothesis formation +- **Creative Problem-Solving:** Alternative solution generation, constraint navigation +- **Learning Agility:** Skill acquisition, adaptation to change, knowledge transfer +- **Risk Assessment:** Uncertainty navigation, trade-off analysis, mitigation planning + +### Question Bank Creation + +#### Technical Questions by Level + +**Junior Level Questions** +- **Coding:** "Implement a function to find the second largest element in an array" +- **System Design:** "How would you design a simple URL shortener for 1000 users?" +- **Debugging:** "Walk through how you would debug a slow-loading web page" + +**Senior Level Questions** +- **Architecture:** "Design a real-time chat system supporting 1M concurrent users" +- **Leadership:** "Describe how you would onboard a new team member in your area" +- **Trade-offs:** "Compare microservices vs monolith for a rapidly scaling startup" + +**Staff+ Level Questions** +- **Strategy:** "How would you evaluate and introduce a new programming language to the organization?" +- **Influence:** "Describe a time you drove technical consensus across multiple teams" +- **Vision:** "How do you balance technical debt against feature development?" 
+ +#### Behavioral Questions Framework + +**STAR Method Implementation** +- **Situation:** Context and background of the scenario +- **Task:** Specific challenge or goal that needed to be addressed +- **Action:** Concrete steps taken to address the challenge +- **Result:** Measurable outcomes and lessons learned + +**Sample Questions:** +- "Tell me about a time you had to influence a decision without formal authority" +- "Describe a situation where you had to deliver difficult feedback to a colleague" +- "Give an example of when you had to adapt your communication style for different audiences" +- "Walk me through a time when you had to make a decision with incomplete information" + +### Bias Mitigation Framework + +#### Structural Bias Prevention + +**Interview Panel Composition** +- Diverse interviewer panels (gender, ethnicity, experience level) +- Rotating panel assignments to prevent pattern bias +- Anonymous resume screening for initial phone screens +- Standardized question sets to ensure consistency + +**Process Standardization** +- Structured interview guides with required probing questions +- Consistent time allocation across all candidates +- Standardized evaluation criteria and scoring rubrics +- Required justification for all scoring decisions + +#### Cognitive Bias Recognition + +**Common Interview Biases** +- **Halo Effect:** One strong impression influences overall assessment +- **Confirmation Bias:** Seeking information that confirms initial impressions +- **Similarity Bias:** Favoring candidates with similar backgrounds/experiences +- **Contrast Effect:** Comparing candidates against each other rather than standard +- **Anchoring Bias:** Over-relying on first piece of information received + +**Mitigation Strategies** +- Pre-interview bias awareness training for all interviewers +- Structured debrief sessions with independent score recording +- Regular calibration sessions with example candidate discussions +- Statistical monitoring of scoring 
patterns by interviewer and demographic + +### Hiring Bar Calibration + +#### Calibration Methodology + +**Regular Calibration Sessions** +- Monthly interviewer calibration meetings +- Shadow interviewing for new interviewers (minimum 5 sessions) +- Quarterly cross-team calibration reviews +- Annual hiring bar review and adjustment process + +**Performance Tracking** +- New hire performance correlation with interview scores +- Interviewer accuracy tracking (prediction vs actual performance) +- False positive/negative analysis +- Offer acceptance rate analysis by interviewer + +**Feedback Loops** +- Six-month new hire performance reviews +- Manager feedback on interview process effectiveness +- Candidate experience surveys and feedback integration +- Continuous process improvement based on data analysis + +--- + +## Competency Frameworks + +### Engineering Competency Levels + +#### Level 1-2: Individual Contributor (Junior/Mid) +- **Technical Skills:** Language proficiency, testing basics, code review participation +- **Problem Solving:** Structured approach to debugging, logical thinking +- **Communication:** Clear status updates, effective question asking +- **Learning:** Proactive skill development, mentorship seeking + +#### Level 3-4: Senior Individual Contributor +- **Technical Leadership:** Architecture decisions, code quality advocacy +- **Mentoring:** Junior developer guidance, knowledge sharing +- **Project Ownership:** End-to-end feature delivery, stakeholder communication +- **Innovation:** Process improvement, technology evaluation + +#### Level 5-6: Staff+ Engineer +- **Organizational Impact:** Cross-team technical leadership, strategic planning +- **Technical Vision:** Long-term architectural planning, technology roadmap +- **People Development:** Team growth, hiring contribution, culture building +- **External Influence:** Industry contribution, thought leadership + +### Product Management Competency Levels + +#### Level 1-2: Associate/Product 
Manager +- **Product Execution:** Feature specification, requirements gathering +- **User Focus:** User research participation, feedback collection +- **Data Analysis:** Basic metrics analysis, experiment interpretation +- **Stakeholder Management:** Cross-functional collaboration, communication + +#### Level 3-4: Senior Product Manager +- **Strategic Thinking:** Market analysis, competitive positioning +- **Leadership:** Cross-functional team leadership, decision making +- **Business Impact:** Revenue impact, market share growth +- **Process Innovation:** Product development process improvement + +#### Level 5-6: Principal Product Manager +- **Vision Setting:** Product strategy, market direction +- **Organizational Influence:** Executive communication, team building +- **Innovation Leadership:** New market creation, disruptive thinking +- **Talent Development:** PM team growth, hiring leadership + +--- + +## Scoring & Calibration + +### Scoring Rubric Framework + +#### 4-Point Scoring Scale +- **4 - Exceeds Expectations:** Demonstrates mastery beyond required level +- **3 - Meets Expectations:** Solid performance meeting all requirements +- **2 - Partially Meets:** Shows potential but has development areas +- **1 - Does Not Meet:** Significant gaps in required competencies + +#### Competency-Specific Scoring + +**Technical Competencies** +- Code Quality (4): Clean, maintainable, well-tested code with excellent documentation +- Code Quality (3): Functional code with good structure and basic testing +- Code Quality (2): Working code with some structural issues or missing tests +- Code Quality (1): Non-functional or poorly structured code with significant issues + +**Leadership Competencies** +- Team Influence (4): Drives team success, develops others, creates lasting positive change +- Team Influence (3): Contributes positively to team dynamics and outcomes +- Team Influence (2): Shows leadership potential with some effective examples +- Team Influence (1): Limited 
evidence of leadership ability or negative team impact + +### Calibration Standards + +#### Statistical Benchmarks +- Target score distribution: 20% (4s), 40% (3s), 30% (2s), 10% (1s) +- Interviewer consistency target: <0.5 standard deviation from team average +- Pass rate target: 15-25% for most roles (varies by level and market conditions) +- Time to hire target: 2-3 weeks from first interview to offer + +#### Quality Metrics +- New hire 6-month performance correlation: >0.6 with interview scores +- Interviewer agreement rate: >80% within 1 point on final recommendations +- Candidate experience satisfaction: >4.0/5.0 average rating +- Offer acceptance rate: >85% for preferred candidates + +--- + +## Reference Documentation + +### Interview Templates +- Role-specific interview guides and question banks +- Scorecard templates for consistent evaluation +- Debrief facilitation guides for effective team discussions + +### Bias Mitigation Resources +- Unconscious bias training materials and exercises +- Structured interviewing best practices checklist +- Demographic diversity tracking and reporting templates + +### Calibration Tools +- Interview performance correlation analysis templates +- Interviewer coaching and development frameworks +- Hiring pipeline metrics and dashboard specifications + +--- + +## Industry Standards + +### Best Practices Integration +- Google's structured interviewing methodology +- Amazon's Leadership Principles assessment framework +- Microsoft's competency-based evaluation system +- Netflix's culture fit assessment approach + +### Compliance & Legal Considerations +- EEOC compliance requirements and documentation +- ADA accommodation procedures and guidelines +- International hiring law considerations +- Privacy and data protection requirements (GDPR, CCPA) + +### Continuous Improvement Framework +- Regular process auditing and refinement cycles +- Industry benchmarking and comparative analysis +- Technology integration for interview 
optimization +- Candidate experience enhancement initiatives + +This comprehensive interview system design framework provides the structure and tools necessary to build fair, effective, and scalable hiring processes that consistently identify top talent while minimizing bias and maximizing candidate experience. \ No newline at end of file diff --git a/engineering/interview-system-designer/loop_designer.py b/engineering/interview-system-designer/loop_designer.py new file mode 100644 index 0000000..b6cf046 --- /dev/null +++ b/engineering/interview-system-designer/loop_designer.py @@ -0,0 +1,908 @@ +#!/usr/bin/env python3 +""" +Interview Loop Designer + +Generates calibrated interview loops tailored to specific roles, levels, and teams. +Creates complete interview loops with rounds, focus areas, time allocation, +interviewer skill requirements, and scorecard templates. + +Usage: + python loop_designer.py --role "Senior Software Engineer" --level senior --team platform + python loop_designer.py --role "Product Manager" --level mid --competencies leadership,strategy + python loop_designer.py --input role_definition.json --output loops/ +""" + +import os +import sys +import json +import argparse +from datetime import datetime, timedelta +from typing import Dict, List, Optional, Any, Tuple +from collections import defaultdict + + +class InterviewLoopDesigner: + """Designs comprehensive interview loops based on role requirements.""" + + def __init__(self): + self.competency_frameworks = self._init_competency_frameworks() + self.role_templates = self._init_role_templates() + self.interviewer_skills = self._init_interviewer_skills() + + def _init_competency_frameworks(self) -> Dict[str, Dict]: + """Initialize competency frameworks for different roles.""" + return { + "software_engineer": { + "junior": { + "required": ["coding_fundamentals", "debugging", "testing_basics", "version_control"], + "preferred": ["system_understanding", "code_review", "collaboration"], + 
"focus_areas": ["technical_execution", "learning_agility", "team_collaboration"] + }, + "mid": { + "required": ["advanced_coding", "system_design_basics", "testing_strategy", "debugging_complex"], + "preferred": ["mentoring_basics", "technical_communication", "project_ownership"], + "focus_areas": ["technical_depth", "system_thinking", "ownership"] + }, + "senior": { + "required": ["system_architecture", "technical_leadership", "mentoring", "cross_team_collab"], + "preferred": ["technology_evaluation", "process_improvement", "hiring_contribution"], + "focus_areas": ["technical_leadership", "system_architecture", "people_development"] + }, + "staff": { + "required": ["architectural_vision", "organizational_impact", "technical_strategy", "team_building"], + "preferred": ["industry_influence", "innovation_leadership", "executive_communication"], + "focus_areas": ["organizational_impact", "technical_vision", "strategic_influence"] + }, + "principal": { + "required": ["company_wide_impact", "technical_vision", "talent_development", "strategic_planning"], + "preferred": ["industry_leadership", "board_communication", "market_influence"], + "focus_areas": ["strategic_leadership", "organizational_transformation", "external_influence"] + } + }, + "product_manager": { + "junior": { + "required": ["product_execution", "user_research", "data_analysis", "stakeholder_comm"], + "preferred": ["market_awareness", "technical_understanding", "project_management"], + "focus_areas": ["execution_excellence", "user_focus", "analytical_thinking"] + }, + "mid": { + "required": ["product_strategy", "cross_functional_leadership", "metrics_design", "market_analysis"], + "preferred": ["team_building", "technical_collaboration", "competitive_analysis"], + "focus_areas": ["strategic_thinking", "leadership", "business_impact"] + }, + "senior": { + "required": ["business_strategy", "team_leadership", "p&l_ownership", "market_positioning"], + "preferred": ["hiring_leadership", "board_communication", 
"partnership_development"], + "focus_areas": ["business_leadership", "market_strategy", "organizational_impact"] + }, + "staff": { + "required": ["portfolio_management", "organizational_leadership", "strategic_planning", "market_creation"], + "preferred": ["executive_presence", "investor_relations", "acquisition_strategy"], + "focus_areas": ["strategic_leadership", "market_innovation", "organizational_transformation"] + } + }, + "designer": { + "junior": { + "required": ["design_fundamentals", "user_research", "prototyping", "design_tools"], + "preferred": ["user_empathy", "visual_design", "collaboration"], + "focus_areas": ["design_execution", "user_research", "creative_problem_solving"] + }, + "mid": { + "required": ["design_systems", "user_testing", "cross_functional_collab", "design_strategy"], + "preferred": ["mentoring", "process_improvement", "business_understanding"], + "focus_areas": ["design_leadership", "system_thinking", "business_impact"] + }, + "senior": { + "required": ["design_leadership", "team_building", "strategic_design", "stakeholder_management"], + "preferred": ["design_culture", "hiring_leadership", "executive_communication"], + "focus_areas": ["design_strategy", "team_leadership", "organizational_impact"] + } + }, + "data_scientist": { + "junior": { + "required": ["statistical_analysis", "python_r", "data_visualization", "sql"], + "preferred": ["machine_learning", "business_understanding", "communication"], + "focus_areas": ["analytical_skills", "technical_execution", "business_impact"] + }, + "mid": { + "required": ["advanced_ml", "experiment_design", "data_engineering", "stakeholder_comm"], + "preferred": ["mentoring", "project_leadership", "product_collaboration"], + "focus_areas": ["advanced_analytics", "project_leadership", "cross_functional_impact"] + }, + "senior": { + "required": ["data_strategy", "team_leadership", "ml_systems", "business_strategy"], + "preferred": ["hiring_leadership", "executive_communication", 
"technology_evaluation"], + "focus_areas": ["strategic_leadership", "technical_vision", "organizational_impact"] + } + }, + "devops_engineer": { + "junior": { + "required": ["infrastructure_basics", "scripting", "monitoring", "troubleshooting"], + "preferred": ["automation", "cloud_platforms", "security_awareness"], + "focus_areas": ["operational_excellence", "automation_mindset", "problem_solving"] + }, + "mid": { + "required": ["ci_cd_design", "infrastructure_as_code", "security_implementation", "performance_optimization"], + "preferred": ["team_collaboration", "incident_management", "capacity_planning"], + "focus_areas": ["system_reliability", "automation_leadership", "cross_team_collaboration"] + }, + "senior": { + "required": ["platform_architecture", "team_leadership", "security_strategy", "organizational_impact"], + "preferred": ["hiring_contribution", "technology_evaluation", "executive_communication"], + "focus_areas": ["platform_leadership", "strategic_thinking", "organizational_transformation"] + } + }, + "engineering_manager": { + "junior": { + "required": ["team_leadership", "technical_background", "people_management", "project_coordination"], + "preferred": ["hiring_experience", "performance_management", "technical_mentoring"], + "focus_areas": ["people_leadership", "team_building", "execution_excellence"] + }, + "senior": { + "required": ["organizational_leadership", "strategic_planning", "talent_development", "cross_functional_leadership"], + "preferred": ["technical_vision", "culture_building", "executive_communication"], + "focus_areas": ["organizational_impact", "strategic_leadership", "talent_development"] + }, + "staff": { + "required": ["multi_team_leadership", "organizational_strategy", "executive_presence", "cultural_transformation"], + "preferred": ["board_communication", "market_understanding", "acquisition_integration"], + "focus_areas": ["organizational_transformation", "strategic_leadership", "cultural_evolution"] + } + } + } + + def 
_init_role_templates(self) -> Dict[str, Dict]: + """Initialize role-specific interview templates.""" + return { + "software_engineer": { + "core_rounds": ["technical_phone_screen", "coding_deep_dive", "system_design", "behavioral"], + "optional_rounds": ["technical_leadership", "domain_expertise", "culture_fit"], + "total_duration_range": (180, 360), # 3-6 hours + "required_competencies": ["coding", "problem_solving", "communication"] + }, + "product_manager": { + "core_rounds": ["product_sense", "analytical_thinking", "execution_process", "behavioral"], + "optional_rounds": ["strategic_thinking", "technical_collaboration", "leadership"], + "total_duration_range": (180, 300), # 3-5 hours + "required_competencies": ["product_strategy", "analytical_thinking", "stakeholder_management"] + }, + "designer": { + "core_rounds": ["portfolio_review", "design_challenge", "collaboration_process", "behavioral"], + "optional_rounds": ["design_system_thinking", "research_methodology", "leadership"], + "total_duration_range": (180, 300), # 3-5 hours + "required_competencies": ["design_process", "user_empathy", "visual_communication"] + }, + "data_scientist": { + "core_rounds": ["technical_assessment", "case_study", "statistical_thinking", "behavioral"], + "optional_rounds": ["ml_systems", "business_strategy", "technical_leadership"], + "total_duration_range": (210, 330), # 3.5-5.5 hours + "required_competencies": ["statistical_analysis", "programming", "business_acumen"] + }, + "devops_engineer": { + "core_rounds": ["technical_assessment", "system_design", "troubleshooting", "behavioral"], + "optional_rounds": ["security_assessment", "automation_design", "leadership"], + "total_duration_range": (180, 300), # 3-5 hours + "required_competencies": ["infrastructure", "automation", "problem_solving"] + }, + "engineering_manager": { + "core_rounds": ["leadership_assessment", "technical_background", "people_management", "behavioral"], + "optional_rounds": ["strategic_thinking", 
"hiring_assessment", "culture_building"], + "total_duration_range": (240, 360), # 4-6 hours + "required_competencies": ["people_leadership", "technical_understanding", "strategic_thinking"] + } + } + + def _init_interviewer_skills(self) -> Dict[str, Dict]: + """Initialize interviewer skill requirements for different round types.""" + return { + "technical_phone_screen": { + "required_skills": ["technical_assessment", "coding_evaluation"], + "preferred_experience": ["same_domain", "senior_level"], + "calibration_level": "standard" + }, + "coding_deep_dive": { + "required_skills": ["advanced_technical", "code_quality_assessment"], + "preferred_experience": ["senior_engineer", "system_design"], + "calibration_level": "high" + }, + "system_design": { + "required_skills": ["architecture_design", "scalability_assessment"], + "preferred_experience": ["senior_architect", "large_scale_systems"], + "calibration_level": "high" + }, + "behavioral": { + "required_skills": ["behavioral_interviewing", "competency_assessment"], + "preferred_experience": ["hiring_manager", "people_leadership"], + "calibration_level": "standard" + }, + "technical_leadership": { + "required_skills": ["leadership_assessment", "technical_mentoring"], + "preferred_experience": ["engineering_manager", "tech_lead"], + "calibration_level": "high" + }, + "product_sense": { + "required_skills": ["product_evaluation", "market_analysis"], + "preferred_experience": ["product_manager", "product_leadership"], + "calibration_level": "high" + }, + "analytical_thinking": { + "required_skills": ["data_analysis", "metrics_evaluation"], + "preferred_experience": ["data_analyst", "product_manager"], + "calibration_level": "standard" + }, + "design_challenge": { + "required_skills": ["design_evaluation", "user_experience"], + "preferred_experience": ["senior_designer", "design_manager"], + "calibration_level": "high" + } + } + + def generate_interview_loop(self, role: str, level: str, team: Optional[str] = None, + 
def generate_interview_loop(self, role: str, level: str, team: Optional[str] = None,
                            competencies: Optional[List[str]] = None) -> Dict[str, Any]:
    """Build a complete, calibrated interview loop for a role/level pair.

    Normalizes free-form role and level strings onto the known competency
    frameworks (falling back to the closest match), then assembles the
    rounds, a suggested schedule, a scorecard template, interviewer
    requirements and calibration notes into a single payload.
    """
    # Normalize free-form input ("Senior Software-Engineer" -> software_engineer).
    normalized_role = role.lower().replace(" ", "_").replace("-", "_")
    normalized_level = level.lower()

    # Fall back to the nearest known role/level when no exact match exists.
    if normalized_role not in self.competency_frameworks:
        normalized_role = self._find_closest_role(normalized_role)
    if normalized_level not in self.competency_frameworks[normalized_role]:
        normalized_level = self._find_closest_level(normalized_role, normalized_level)

    framework = self.competency_frameworks[normalized_role][normalized_level]
    template = self.role_templates.get(normalized_role, self.role_templates["software_engineer"])

    # Assemble every artifact of the loop.
    designed_rounds = self._design_rounds(normalized_role, normalized_level, framework, template, competencies)
    suggested_schedule = self._create_schedule(designed_rounds)
    scorecard = self._generate_scorecard(normalized_role, normalized_level, framework)
    interviewer_reqs = self._define_interviewer_requirements(designed_rounds)

    return {
        "role": role,
        "level": level,
        "team": team,
        "generated_at": datetime.now().isoformat(),
        "total_duration_minutes": sum(info["duration_minutes"] for info in designed_rounds.values()),
        "total_rounds": len(designed_rounds),
        "rounds": designed_rounds,
        "suggested_schedule": suggested_schedule,
        "scorecard_template": scorecard,
        "interviewer_requirements": interviewer_reqs,
        "competency_framework": framework,
        "calibration_notes": self._generate_calibration_notes(normalized_role, normalized_level)
    }
"analyst": "data_scientist", + "ml": "data_scientist", + "ops": "devops_engineer", + "sre": "devops_engineer", + "infrastructure": "devops_engineer", + "manager": "engineering_manager", + "lead": "engineering_manager" + } + + for key_part in role_key.split("_"): + if key_part in role_mappings: + return role_mappings[key_part] + + return "software_engineer" # Default fallback + + def _find_closest_level(self, role_key: str, level_key: str) -> str: + """Find the closest matching level for the role.""" + available_levels = list(self.competency_frameworks[role_key].keys()) + + level_mappings = { + "entry": "junior", + "associate": "junior", + "jr": "junior", + "mid": "mid", + "middle": "mid", + "sr": "senior", + "senior": "senior", + "staff": "staff", + "principal": "principal", + "lead": "senior", + "manager": "senior" + } + + mapped_level = level_mappings.get(level_key, level_key) + + if mapped_level in available_levels: + return mapped_level + elif "senior" in available_levels: + return "senior" + else: + return available_levels[0] + + def _design_rounds(self, role_key: str, level_key: str, competency_req: Dict, + role_template: Dict, custom_competencies: Optional[List[str]]) -> Dict[str, Dict]: + """Design the specific interview rounds based on role and level.""" + rounds = {} + + # Determine which rounds to include + core_rounds = role_template["core_rounds"].copy() + optional_rounds = role_template["optional_rounds"].copy() + + # Add optional rounds based on level + if level_key in ["senior", "staff", "principal"]: + if "technical_leadership" in optional_rounds and role_key in ["software_engineer", "engineering_manager"]: + core_rounds.append("technical_leadership") + if "strategic_thinking" in optional_rounds and role_key in ["product_manager", "engineering_manager"]: + core_rounds.append("strategic_thinking") + if "design_system_thinking" in optional_rounds and role_key == "designer": + core_rounds.append("design_system_thinking") + + if level_key in ["staff", 
"principal"]: + if "domain_expertise" in optional_rounds: + core_rounds.append("domain_expertise") + + # Define round details + round_definitions = self._get_round_definitions() + + for i, round_type in enumerate(core_rounds, 1): + if round_type in round_definitions: + round_def = round_definitions[round_type].copy() + round_def["order"] = i + round_def["focus_areas"] = self._customize_focus_areas(round_type, competency_req, custom_competencies) + rounds[f"round_{i}_{round_type}"] = round_def + + return rounds + + def _get_round_definitions(self) -> Dict[str, Dict]: + """Get predefined round definitions with standard durations and formats.""" + return { + "technical_phone_screen": { + "name": "Technical Phone Screen", + "duration_minutes": 45, + "format": "virtual", + "objectives": ["Assess coding fundamentals", "Evaluate problem-solving approach", "Screen for basic technical competency"], + "question_types": ["coding_problems", "technical_concepts", "experience_questions"], + "evaluation_criteria": ["technical_accuracy", "problem_solving_process", "communication_clarity"] + }, + "coding_deep_dive": { + "name": "Coding Deep Dive", + "duration_minutes": 75, + "format": "in_person_or_virtual", + "objectives": ["Evaluate coding skills in depth", "Assess code quality and testing", "Review debugging approach"], + "question_types": ["complex_coding_problems", "code_review", "testing_strategy"], + "evaluation_criteria": ["code_quality", "testing_approach", "debugging_skills", "optimization_thinking"] + }, + "system_design": { + "name": "System Design", + "duration_minutes": 75, + "format": "collaborative_whiteboard", + "objectives": ["Assess architectural thinking", "Evaluate scalability considerations", "Review trade-off analysis"], + "question_types": ["system_architecture", "scalability_design", "trade_off_analysis"], + "evaluation_criteria": ["architectural_thinking", "scalability_awareness", "trade_off_reasoning"] + }, + "behavioral": { + "name": "Behavioral 
Interview", + "duration_minutes": 45, + "format": "conversational", + "objectives": ["Assess cultural fit", "Evaluate past experiences", "Review leadership examples"], + "question_types": ["star_method_questions", "situational_scenarios", "values_alignment"], + "evaluation_criteria": ["communication_skills", "leadership_examples", "cultural_alignment"] + }, + "technical_leadership": { + "name": "Technical Leadership", + "duration_minutes": 60, + "format": "discussion_based", + "objectives": ["Evaluate mentoring capability", "Assess technical decision making", "Review cross-team collaboration"], + "question_types": ["leadership_scenarios", "technical_decisions", "mentoring_examples"], + "evaluation_criteria": ["leadership_potential", "technical_judgment", "influence_skills"] + }, + "product_sense": { + "name": "Product Sense", + "duration_minutes": 75, + "format": "case_study", + "objectives": ["Assess product intuition", "Evaluate user empathy", "Review market understanding"], + "question_types": ["product_scenarios", "feature_prioritization", "user_journey_analysis"], + "evaluation_criteria": ["product_intuition", "user_empathy", "analytical_thinking"] + }, + "analytical_thinking": { + "name": "Analytical Thinking", + "duration_minutes": 60, + "format": "data_analysis", + "objectives": ["Evaluate data interpretation", "Assess metric design", "Review experiment planning"], + "question_types": ["data_interpretation", "metric_design", "experiment_analysis"], + "evaluation_criteria": ["analytical_rigor", "metric_intuition", "experimental_thinking"] + }, + "design_challenge": { + "name": "Design Challenge", + "duration_minutes": 90, + "format": "hands_on_design", + "objectives": ["Assess design process", "Evaluate user-centered thinking", "Review iteration approach"], + "question_types": ["design_problems", "user_research", "design_critique"], + "evaluation_criteria": ["design_process", "user_focus", "visual_communication"] + }, + "portfolio_review": { + "name": 
"Portfolio Review", + "duration_minutes": 75, + "format": "presentation_discussion", + "objectives": ["Review past work", "Assess design thinking", "Evaluate impact measurement"], + "question_types": ["portfolio_walkthrough", "design_decisions", "impact_stories"], + "evaluation_criteria": ["design_quality", "process_thinking", "business_impact"] + } + } + + def _customize_focus_areas(self, round_type: str, competency_req: Dict, + custom_competencies: Optional[List[str]]) -> List[str]: + """Customize focus areas based on role competency requirements.""" + base_focus_areas = competency_req.get("focus_areas", []) + + round_focus_mapping = { + "technical_phone_screen": ["coding_fundamentals", "problem_solving"], + "coding_deep_dive": ["technical_execution", "code_quality"], + "system_design": ["system_thinking", "architectural_reasoning"], + "behavioral": ["cultural_fit", "communication", "teamwork"], + "technical_leadership": ["leadership", "mentoring", "influence"], + "product_sense": ["product_intuition", "user_empathy"], + "analytical_thinking": ["data_analysis", "metric_design"], + "design_challenge": ["design_process", "user_focus"] + } + + focus_areas = round_focus_mapping.get(round_type, []) + + # Add custom competencies if specified + if custom_competencies: + focus_areas.extend([comp for comp in custom_competencies if comp not in focus_areas]) + + # Add role-specific focus areas + focus_areas.extend([area for area in base_focus_areas if area not in focus_areas]) + + return focus_areas[:5] # Limit to top 5 focus areas + + def _create_schedule(self, rounds: Dict[str, Dict]) -> Dict[str, Any]: + """Create a suggested interview schedule.""" + sorted_rounds = sorted(rounds.items(), key=lambda x: x[1]["order"]) + + # Calculate optimal scheduling + total_duration = sum(round_info["duration_minutes"] for _, round_info in sorted_rounds) + + if total_duration <= 240: # 4 hours or less - single day + schedule_type = "single_day" + day_structure = 
self._create_single_day_schedule(sorted_rounds) + else: # Multi-day schedule + schedule_type = "multi_day" + day_structure = self._create_multi_day_schedule(sorted_rounds) + + return { + "type": schedule_type, + "total_duration_minutes": total_duration, + "recommended_breaks": self._calculate_breaks(total_duration), + "day_structure": day_structure, + "logistics_notes": self._generate_logistics_notes(sorted_rounds) + } + + def _create_single_day_schedule(self, rounds: List[Tuple[str, Dict]]) -> Dict[str, Any]: + """Create a single-day interview schedule.""" + start_time = datetime.strptime("09:00", "%H:%M") + current_time = start_time + + schedule = [] + + for round_name, round_info in rounds: + # Add break if needed (after 90 minutes of interviews) + if schedule and sum(item.get("duration_minutes", 0) for item in schedule if "break" not in item.get("type", "")) >= 90: + schedule.append({ + "type": "break", + "start_time": current_time.strftime("%H:%M"), + "duration_minutes": 15, + "end_time": (current_time + timedelta(minutes=15)).strftime("%H:%M") + }) + current_time += timedelta(minutes=15) + + # Add the interview round + end_time = current_time + timedelta(minutes=round_info["duration_minutes"]) + schedule.append({ + "type": "interview", + "round_name": round_name, + "title": round_info["name"], + "start_time": current_time.strftime("%H:%M"), + "end_time": end_time.strftime("%H:%M"), + "duration_minutes": round_info["duration_minutes"], + "format": round_info["format"] + }) + current_time = end_time + + return { + "day_1": { + "date": "TBD", + "start_time": start_time.strftime("%H:%M"), + "end_time": current_time.strftime("%H:%M"), + "rounds": schedule + } + } + + def _create_multi_day_schedule(self, rounds: List[Tuple[str, Dict]]) -> Dict[str, Any]: + """Create a multi-day interview schedule.""" + # Split rounds across days (max 4 hours per day) + max_daily_minutes = 240 + days = {} + current_day = 1 + current_day_duration = 0 + current_day_rounds = [] + + for 
round_name, round_info in rounds: + duration = round_info["duration_minutes"] + 15 # Add buffer time + + if current_day_duration + duration > max_daily_minutes and current_day_rounds: + # Finalize current day + days[f"day_{current_day}"] = self._finalize_day_schedule(current_day_rounds) + current_day += 1 + current_day_duration = 0 + current_day_rounds = [] + + current_day_rounds.append((round_name, round_info)) + current_day_duration += duration + + # Finalize last day + if current_day_rounds: + days[f"day_{current_day}"] = self._finalize_day_schedule(current_day_rounds) + + return days + + def _finalize_day_schedule(self, day_rounds: List[Tuple[str, Dict]]) -> Dict[str, Any]: + """Finalize the schedule for a specific day.""" + start_time = datetime.strptime("09:00", "%H:%M") + current_time = start_time + schedule = [] + + for round_name, round_info in day_rounds: + end_time = current_time + timedelta(minutes=round_info["duration_minutes"]) + schedule.append({ + "type": "interview", + "round_name": round_name, + "title": round_info["name"], + "start_time": current_time.strftime("%H:%M"), + "end_time": end_time.strftime("%H:%M"), + "duration_minutes": round_info["duration_minutes"], + "format": round_info["format"] + }) + current_time = end_time + timedelta(minutes=15) # 15-min buffer + + return { + "date": "TBD", + "start_time": start_time.strftime("%H:%M"), + "end_time": (current_time - timedelta(minutes=15)).strftime("%H:%M"), + "rounds": schedule + } + + def _calculate_breaks(self, total_duration: int) -> List[Dict[str, Any]]: + """Calculate recommended breaks based on total duration.""" + breaks = [] + + if total_duration >= 120: # 2+ hours + breaks.append({"type": "short_break", "duration": 15, "after_minutes": 90}) + + if total_duration >= 240: # 4+ hours + breaks.append({"type": "lunch_break", "duration": 60, "after_minutes": 180}) + + if total_duration >= 360: # 6+ hours + breaks.append({"type": "short_break", "duration": 15, "after_minutes": 300}) + + 
return breaks + + def _generate_scorecard(self, role_key: str, level_key: str, competency_req: Dict) -> Dict[str, Any]: + """Generate a scorecard template for the interview loop.""" + scoring_dimensions = [] + + # Add competency-based scoring dimensions + for competency in competency_req["required"]: + scoring_dimensions.append({ + "dimension": competency, + "weight": "high", + "scale": "1-4", + "description": f"Assessment of {competency.replace('_', ' ')} competency" + }) + + for competency in competency_req.get("preferred", []): + scoring_dimensions.append({ + "dimension": competency, + "weight": "medium", + "scale": "1-4", + "description": f"Assessment of {competency.replace('_', ' ')} competency" + }) + + # Add standard dimensions + standard_dimensions = [ + {"dimension": "communication", "weight": "high", "scale": "1-4"}, + {"dimension": "cultural_fit", "weight": "medium", "scale": "1-4"}, + {"dimension": "learning_agility", "weight": "medium", "scale": "1-4"} + ] + + scoring_dimensions.extend(standard_dimensions) + + return { + "scoring_scale": { + "4": "Exceeds Expectations - Demonstrates mastery beyond required level", + "3": "Meets Expectations - Solid performance meeting all requirements", + "2": "Partially Meets - Shows potential but has development areas", + "1": "Does Not Meet - Significant gaps in required competencies" + }, + "dimensions": scoring_dimensions, + "overall_recommendation": { + "options": ["Strong Hire", "Hire", "No Hire", "Strong No Hire"], + "criteria": "Based on weighted average and minimum thresholds" + }, + "calibration_notes": { + "required": True, + "min_length": 100, + "sections": ["strengths", "areas_for_development", "specific_examples"] + } + } + + def _define_interviewer_requirements(self, rounds: Dict[str, Dict]) -> Dict[str, Dict]: + """Define interviewer skill requirements for each round.""" + requirements = {} + + for round_name, round_info in rounds.items(): + round_type = round_name.split("_", 2)[-1] # Extract round 
type + + if round_type in self.interviewer_skills: + skill_req = self.interviewer_skills[round_type].copy() + skill_req["suggested_interviewers"] = self._suggest_interviewer_profiles(round_type) + requirements[round_name] = skill_req + else: + # Default requirements + requirements[round_name] = { + "required_skills": ["interviewing_basics", "evaluation_skills"], + "preferred_experience": ["relevant_domain"], + "calibration_level": "standard", + "suggested_interviewers": ["experienced_interviewer"] + } + + return requirements + + def _suggest_interviewer_profiles(self, round_type: str) -> List[str]: + """Suggest specific interviewer profiles for different round types.""" + profile_mapping = { + "technical_phone_screen": ["senior_engineer", "tech_lead"], + "coding_deep_dive": ["senior_engineer", "staff_engineer"], + "system_design": ["senior_architect", "staff_engineer"], + "behavioral": ["hiring_manager", "people_manager"], + "technical_leadership": ["engineering_manager", "senior_staff"], + "product_sense": ["senior_pm", "product_leader"], + "analytical_thinking": ["senior_analyst", "data_scientist"], + "design_challenge": ["senior_designer", "design_manager"] + } + + return profile_mapping.get(round_type, ["experienced_interviewer"]) + + def _generate_calibration_notes(self, role_key: str, level_key: str) -> Dict[str, Any]: + """Generate calibration notes and best practices.""" + return { + "hiring_bar_notes": f"Calibrated for {level_key} level {role_key.replace('_', ' ')} role", + "common_pitfalls": [ + "Avoid comparing candidates to each other rather than to the role standard", + "Don't let one strong/weak area overshadow overall assessment", + "Ensure consistent application of evaluation criteria" + ], + "calibration_checkpoints": [ + "Review score distribution after every 5 candidates", + "Conduct monthly interviewer calibration sessions", + "Track correlation with 6-month performance reviews" + ], + "escalation_criteria": [ + "Any candidate receiving all 4s 
or all 1s", + "Significant disagreement between interviewers (>1.5 point spread)", + "Unusual circumstances or accommodations needed" + ] + } + + def _generate_logistics_notes(self, rounds: List[Tuple[str, Dict]]) -> List[str]: + """Generate logistics and coordination notes.""" + notes = [ + "Coordinate interviewer availability before scheduling", + "Ensure all interviewers have access to job description and competency requirements", + "Prepare interview rooms/virtual links for all rounds", + "Share candidate resume and application with all interviewers" + ] + + # Add format-specific notes + formats_used = {round_info["format"] for _, round_info in rounds} + + if "virtual" in formats_used: + notes.append("Test video conferencing setup before virtual interviews") + notes.append("Share virtual meeting links with candidate 24 hours in advance") + + if "collaborative_whiteboard" in formats_used: + notes.append("Prepare whiteboard or collaborative online tool for design sessions") + + if "hands_on_design" in formats_used: + notes.append("Provide design tools access or ensure candidate can screen share their preferred tools") + + return notes + + +def format_human_readable(loop_data: Dict[str, Any]) -> str: + """Format the interview loop data in a human-readable format.""" + output = [] + + # Header + output.append(f"Interview Loop Design for {loop_data['role']} ({loop_data['level'].title()} Level)") + output.append("=" * 60) + + if loop_data.get('team'): + output.append(f"Team: {loop_data['team']}") + + output.append(f"Generated: {loop_data['generated_at']}") + output.append(f"Total Duration: {loop_data['total_duration_minutes']} minutes ({loop_data['total_duration_minutes']//60}h {loop_data['total_duration_minutes']%60}m)") + output.append(f"Total Rounds: {loop_data['total_rounds']}") + output.append("") + + # Interview Rounds + output.append("INTERVIEW ROUNDS") + output.append("-" * 40) + + sorted_rounds = sorted(loop_data['rounds'].items(), key=lambda x: 
def format_human_readable(loop_data: Dict[str, Any]) -> str:
    """Format the interview loop data in a human-readable format.

    Renders the dict produced by ``InterviewLoopDesigner.generate_interview_loop``
    as a plain-text report: header, rounds, schedule, interviewer requirements,
    scorecard and calibration notes.

    NOTE(review): assumes every key emitted by ``generate_interview_loop`` is
    present — a partial dict raises KeyError. Confirm callers never pass a
    trimmed payload.
    """
    output = []

    # Header
    output.append(f"Interview Loop Design for {loop_data['role']} ({loop_data['level'].title()} Level)")
    output.append("=" * 60)

    if loop_data.get('team'):
        output.append(f"Team: {loop_data['team']}")

    output.append(f"Generated: {loop_data['generated_at']}")
    output.append(f"Total Duration: {loop_data['total_duration_minutes']} minutes ({loop_data['total_duration_minutes']//60}h {loop_data['total_duration_minutes']%60}m)")
    output.append(f"Total Rounds: {loop_data['total_rounds']}")
    output.append("")

    # Interview Rounds — listed in their designed order.
    output.append("INTERVIEW ROUNDS")
    output.append("-" * 40)

    sorted_rounds = sorted(loop_data['rounds'].items(), key=lambda x: x[1]['order'])
    for round_name, round_info in sorted_rounds:
        output.append(f"\nRound {round_info['order']}: {round_info['name']}")
        output.append(f"Duration: {round_info['duration_minutes']} minutes")
        output.append(f"Format: {round_info['format'].replace('_', ' ').title()}")

        output.append("Objectives:")
        for obj in round_info['objectives']:
            output.append(f" • {obj}")

        output.append("Focus Areas:")
        for area in round_info['focus_areas']:
            output.append(f" • {area.replace('_', ' ').title()}")

    # Suggested Schedule — interviews and breaks render slightly differently.
    output.append("\nSUGGESTED SCHEDULE")
    output.append("-" * 40)

    schedule = loop_data['suggested_schedule']
    output.append(f"Schedule Type: {schedule['type'].replace('_', ' ').title()}")

    for day_name, day_info in schedule['day_structure'].items():
        output.append(f"\n{day_name.replace('_', ' ').title()}:")
        output.append(f"Time: {day_info['start_time']} - {day_info['end_time']}")

        for item in day_info['rounds']:
            if item['type'] == 'interview':
                output.append(f" {item['start_time']}-{item['end_time']}: {item['title']} ({item['duration_minutes']}min)")
            else:
                # Non-interview entries (breaks) use their type as the label.
                output.append(f" {item['start_time']}-{item['end_time']}: {item['type'].title()} ({item['duration_minutes']}min)")

    # Interviewer Requirements — round type recovered from the "round_N_..." key.
    output.append("\nINTERVIEWER REQUIREMENTS")
    output.append("-" * 40)

    for round_name, requirements in loop_data['interviewer_requirements'].items():
        round_display = round_name.split("_", 2)[-1].replace("_", " ").title()
        output.append(f"\n{round_display}:")
        output.append(f"Required Skills: {', '.join(requirements['required_skills'])}")
        output.append(f"Suggested Interviewers: {', '.join(requirements['suggested_interviewers'])}")
        output.append(f"Calibration Level: {requirements['calibration_level'].title()}")

    # Scorecard Overview
    output.append("\nSCORECARD TEMPLATE")
    output.append("-" * 40)

    scorecard = loop_data['scorecard_template']
    output.append("Scoring Scale:")
    for score, description in scorecard['scoring_scale'].items():
        output.append(f" {score}: {description}")

    output.append("\nEvaluation Dimensions:")
    for dim in scorecard['dimensions']:
        output.append(f" • {dim['dimension'].replace('_', ' ').title()} (Weight: {dim['weight']})")

    # Calibration Notes
    output.append("\nCALIBRATION NOTES")
    output.append("-" * 40)

    calibration = loop_data['calibration_notes']
    output.append(f"Hiring Bar: {calibration['hiring_bar_notes']}")

    output.append("\nCommon Pitfalls:")
    for pitfall in calibration['common_pitfalls']:
        output.append(f" • {pitfall}")

    return "\n".join(output)
def _output_base_name(role: str, level: str) -> str:
    """Filesystem-safe base filename derived from role and level."""
    safe_role = "".join(c for c in role.lower() if c.isalnum() or c in (' ', '-', '_')).replace(' ', '_')
    return f"{safe_role}_{level}_interview_loop"


def main():
    """CLI entry point: parse arguments, generate an interview loop, and
    write JSON and/or text output.

    Improvements over the original:
    - ``--competencies`` entries are stripped of whitespace and empty entries
      are dropped ("a, b," previously produced [" b", ""] items).
    - An ``--input`` file whose role/title is missing or empty now fails with
      a clear error instead of silently generating output for an empty role.
    - The duplicated safe-filename construction is factored into
      ``_output_base_name`` (behavior unchanged).
    """
    parser = argparse.ArgumentParser(description="Generate calibrated interview loops for specific roles and levels")
    parser.add_argument("--role", type=str, help="Job role title (e.g., 'Senior Software Engineer')")
    parser.add_argument("--level", type=str, help="Experience level (junior, mid, senior, staff, principal)")
    parser.add_argument("--team", type=str, help="Team or department (optional)")
    parser.add_argument("--competencies", type=str, help="Comma-separated list of specific competencies to focus on")
    parser.add_argument("--input", type=str, help="Input JSON file with role definition")
    parser.add_argument("--output", type=str, help="Output directory or file path")
    parser.add_argument("--format", choices=["json", "text", "both"], default="both", help="Output format")

    args = parser.parse_args()

    designer = InterviewLoopDesigner()

    # Resolve role/level/team/competencies either from a JSON file or flags.
    if args.input:
        try:
            with open(args.input, 'r') as f:
                role_data = json.load(f)
            role = role_data.get('role') or role_data.get('title', '')
            level = role_data.get('level', 'senior')
            team = role_data.get('team')
            competencies = role_data.get('competencies')
        except Exception as e:
            print(f"Error reading input file: {e}")
            sys.exit(1)
        if not role:
            # Fail fast instead of generating output for an empty role name.
            print("Error: input file must define a non-empty 'role' or 'title'")
            sys.exit(1)
    else:
        if not args.role or not args.level:
            print("Error: --role and --level are required when not using --input")
            sys.exit(1)

        role = args.role
        level = args.level
        team = args.team
        # Strip whitespace and drop empties so "a, b," -> ["a", "b"].
        competencies = [c.strip() for c in args.competencies.split(',') if c.strip()] if args.competencies else None

    # Generate interview loop and write the requested outputs.
    try:
        loop_data = designer.generate_interview_loop(role, level, team, competencies)

        # Decide where the JSON/text files go.
        if args.output:
            output_path = args.output
            if os.path.isdir(output_path):
                base_filename = _output_base_name(role, level)
                json_path = os.path.join(output_path, f"{base_filename}.json")
                text_path = os.path.join(output_path, f"{base_filename}.txt")
            else:
                # Use provided path as base.
                json_path = output_path if output_path.endswith('.json') else f"{output_path}.json"
                text_path = output_path.replace('.json', '.txt') if output_path.endswith('.json') else f"{output_path}.txt"
        else:
            base_filename = _output_base_name(role, level)
            json_path = f"{base_filename}.json"
            text_path = f"{base_filename}.txt"

        # Write outputs in the selected format(s).
        if args.format in ["json", "both"]:
            with open(json_path, 'w') as f:
                json.dump(loop_data, f, indent=2, default=str)
            print(f"JSON output written to: {json_path}")

        if args.format in ["text", "both"]:
            with open(text_path, 'w') as f:
                f.write(format_human_readable(loop_data))
            print(f"Text output written to: {text_path}")

        # Always print summary to stdout.
        print("\nInterview Loop Summary:")
        print(f"Role: {loop_data['role']} ({loop_data['level'].title()})")
        print(f"Total Duration: {loop_data['total_duration_minutes']} minutes")
        print(f"Number of Rounds: {loop_data['total_rounds']}")
        print(f"Schedule Type: {loop_data['suggested_schedule']['type'].replace('_', ' ').title()}")

    except Exception as e:
        # CLI boundary: report and exit non-zero rather than traceback.
        print(f"Error generating interview loop: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()
file