From daace789540ee6563fa94d1ecfa14e858033958a Mon Sep 17 00:00:00 2001 From: Leo Date: Mon, 16 Feb 2026 12:43:38 +0000 Subject: [PATCH 1/4] feat: Add comprehensive incident-commander skill - Add SKILL.md with 300+ lines of incident response playbook - Implement incident_classifier.py: severity classification and response recommendations - Implement timeline_reconstructor.py: event timeline reconstruction with phase analysis - Implement pir_generator.py: comprehensive PIR generation with multiple RCA frameworks - Add reference documentation: severity matrix, RCA frameworks, communication templates - Add sample data files and expected outputs for testing - All scripts are standalone with zero external dependencies - Dual output formats: JSON + human-readable text - Professional, opinionated defaults based on SRE best practices This POWERFUL-tier skill provides end-to-end incident response capabilities from detection through post-incident review. --- engineering-team/incident-commander/README.md | 252 +++ engineering-team/incident-commander/SKILL.md | 668 +++++++ .../sample_incident_classification.json | 14 + .../assets/sample_incident_pir_data.json | 74 + .../assets/sample_timeline_events.json | 263 +++ .../assets/simple_incident.json | 6 + .../assets/simple_timeline_events.json | 30 + .../incident_classification_text_output.txt | 44 + .../expected_outputs/pir_markdown_output.md | 88 + .../simple_incident_classification.txt | 44 + .../timeline_reconstruction_text_output.txt | 110 ++ .../references/communication_templates.md | 591 ++++++ .../references/incident_severity_matrix.md | 292 +++ .../references/rca_frameworks_guide.md | 562 ++++++ .../scripts/incident_classifier.py | 914 +++++++++ .../scripts/pir_generator.py | 1638 +++++++++++++++++ .../scripts/timeline_reconstructor.py | 1007 ++++++++++ 17 files changed, 6597 insertions(+) create mode 100644 engineering-team/incident-commander/README.md create mode 100644 engineering-team/incident-commander/SKILL.md create 
mode 100644 engineering-team/incident-commander/assets/sample_incident_classification.json create mode 100644 engineering-team/incident-commander/assets/sample_incident_pir_data.json create mode 100644 engineering-team/incident-commander/assets/sample_timeline_events.json create mode 100644 engineering-team/incident-commander/assets/simple_incident.json create mode 100644 engineering-team/incident-commander/assets/simple_timeline_events.json create mode 100644 engineering-team/incident-commander/expected_outputs/incident_classification_text_output.txt create mode 100644 engineering-team/incident-commander/expected_outputs/pir_markdown_output.md create mode 100644 engineering-team/incident-commander/expected_outputs/simple_incident_classification.txt create mode 100644 engineering-team/incident-commander/expected_outputs/timeline_reconstruction_text_output.txt create mode 100644 engineering-team/incident-commander/references/communication_templates.md create mode 100644 engineering-team/incident-commander/references/incident_severity_matrix.md create mode 100644 engineering-team/incident-commander/references/rca_frameworks_guide.md create mode 100644 engineering-team/incident-commander/scripts/incident_classifier.py create mode 100644 engineering-team/incident-commander/scripts/pir_generator.py create mode 100644 engineering-team/incident-commander/scripts/timeline_reconstructor.py diff --git a/engineering-team/incident-commander/README.md b/engineering-team/incident-commander/README.md new file mode 100644 index 0000000..27fde44 --- /dev/null +++ b/engineering-team/incident-commander/README.md @@ -0,0 +1,252 @@ +# Incident Commander Skill + +A comprehensive incident response framework providing structured tools for managing technology incidents from detection through resolution and post-incident review. 
+ +## Overview + +This skill implements battle-tested practices from SRE and DevOps teams at scale, providing: + +- **Automated Severity Classification** - Intelligent incident triage +- **Timeline Reconstruction** - Transform scattered events into coherent narratives +- **Post-Incident Review Generation** - Structured PIRs with RCA frameworks +- **Communication Templates** - Pre-built stakeholder communication +- **Comprehensive Documentation** - Reference guides for incident response + +## Quick Start + +### Classify an Incident + +```bash +# From JSON file +python scripts/incident_classifier.py --input incident.json --format text + +# From stdin text +echo "Database is down affecting all users" | python scripts/incident_classifier.py --format text + +# Interactive mode +python scripts/incident_classifier.py --interactive +``` + +### Reconstruct Timeline + +```bash +# Analyze event timeline +python scripts/timeline_reconstructor.py --input events.json --format text + +# With gap analysis +python scripts/timeline_reconstructor.py --input events.json --gap-analysis --format markdown +``` + +### Generate PIR Document + +```bash +# Basic PIR +python scripts/pir_generator.py --incident incident.json --format markdown + +# Comprehensive PIR with timeline +python scripts/pir_generator.py --incident incident.json --timeline timeline.json --rca-method fishbone +``` + +## Scripts + +### incident_classifier.py + +**Purpose:** Analyzes incident descriptions and provides severity classification, team recommendations, and response templates. 
+ +**Input:** JSON object with incident details or plain text description +**Output:** JSON + human-readable classification report + +**Example Input:** +```json +{ + "description": "Database connection timeouts causing 500 errors", + "service": "payment-api", + "affected_users": "80%", + "business_impact": "high" +} +``` + +**Key Features:** +- SEV1-4 severity classification +- Recommended response teams +- Initial action prioritization +- Communication templates +- Response timelines + +### timeline_reconstructor.py + +**Purpose:** Reconstructs incident timelines from timestamped events, identifies phases, and performs gap analysis. + +**Input:** JSON array of timestamped events +**Output:** Formatted timeline with phase analysis and metrics + +**Example Input:** +```json +[ + { + "timestamp": "2024-01-01T12:00:00Z", + "source": "monitoring", + "message": "High error rate detected", + "severity": "critical", + "actor": "system" + } +] +``` + +**Key Features:** +- Phase detection (detection → triage → mitigation → resolution) +- Duration analysis +- Gap identification +- Communication effectiveness analysis +- Response metrics + +### pir_generator.py + +**Purpose:** Generates comprehensive Post-Incident Review documents with multiple RCA frameworks. 
+ +**Input:** Incident data JSON, optional timeline data +**Output:** Structured PIR document with RCA analysis + +**Key Features:** +- Multiple RCA methods (5 Whys, Fishbone, Timeline, Bow Tie) +- Automated action item generation +- Lessons learned categorization +- Follow-up planning +- Completeness assessment + +## Sample Data + +The `assets/` directory contains sample data files for testing: + +- `sample_incident_classification.json` - Database connection pool exhaustion incident +- `sample_timeline_events.json` - Complete timeline with 21 events across phases +- `sample_incident_pir_data.json` - Comprehensive incident data for PIR generation +- `simple_incident.json` - Minimal incident for basic testing +- `simple_timeline_events.json` - Simple 4-event timeline + +## Expected Outputs + +The `expected_outputs/` directory contains reference outputs showing what each script produces: + +- `incident_classification_text_output.txt` - Detailed classification report +- `timeline_reconstruction_text_output.txt` - Complete timeline analysis +- `pir_markdown_output.md` - Full PIR document +- `simple_incident_classification.txt` - Basic classification example + +## Reference Documentation + +### references/incident_severity_matrix.md +Complete severity classification system with: +- SEV1-4 definitions and criteria +- Response requirements and timelines +- Escalation paths +- Communication requirements +- Decision trees and examples + +### references/rca_frameworks_guide.md +Detailed guide for root cause analysis: +- 5 Whys methodology +- Fishbone (Ishikawa) diagram analysis +- Timeline analysis techniques +- Bow Tie analysis for high-risk incidents +- Framework selection guidelines + +### references/communication_templates.md +Standardized communication templates: +- Severity-specific notification templates +- Stakeholder-specific messaging +- Escalation communications +- Resolution notifications +- Customer communication guidelines + +## Usage Patterns + +### End-to-End 
Incident Workflow + +1. **Initial Classification** +```bash +echo "Payment API returning 500 errors for 70% of requests" | \ + python scripts/incident_classifier.py --format text +``` + +2. **Timeline Reconstruction** (after collecting events) +```bash +python scripts/timeline_reconstructor.py \ + --input events.json \ + --gap-analysis \ + --format json \ + --output timeline.json +``` + +3. **PIR Generation** (after incident resolution) +```bash +python scripts/pir_generator.py \ + --incident incident.json \ + --timeline timeline.json \ + --rca-method fishbone \ + --output pir.md +``` + +### Integration Examples + +**CI/CD Pipeline Integration:** +```bash +# Classify deployment issues +cat deployment_error.log | python scripts/incident_classifier.py --format json +``` + +**Monitoring Integration:** +```bash +# Process alert events +curl -s "monitoring-api/events" | python scripts/timeline_reconstructor.py --format text +``` + +**Runbook Generation:** +Use classification output to automatically select appropriate runbooks and escalation procedures. 
+ +## Quality Standards + +- **Zero External Dependencies** - All scripts use only Python standard library +- **Dual Output Format** - Both JSON (machine-readable) and text (human-readable) +- **Robust Input Handling** - Graceful handling of missing or malformed data +- **Professional Defaults** - Opinionated, battle-tested configurations +- **Comprehensive Testing** - Sample data and expected outputs included + +## Technical Requirements + +- Python 3.6+ +- No external dependencies required +- Works with standard Unix tools (pipes, redirection) +- Cross-platform compatible + +## Severity Classification Reference + +| Severity | Description | Response Time | Update Frequency | +|----------|-------------|---------------|------------------| +| **SEV1** | Complete outage | 5 minutes | Every 15 minutes | +| **SEV2** | Major degradation | 15 minutes | Every 30 minutes | +| **SEV3** | Minor impact | 2 hours | At milestones | +| **SEV4** | Low impact | 1-2 days | Weekly | + +## Getting Help + +Each script includes comprehensive help: +```bash +python scripts/incident_classifier.py --help +python scripts/timeline_reconstructor.py --help +python scripts/pir_generator.py --help +``` + +For methodology questions, refer to the reference documentation in the `references/` directory. + +## Contributing + +When adding new features: +1. Maintain zero external dependencies +2. Add comprehensive examples to `assets/` +3. Update expected outputs in `expected_outputs/` +4. Follow the established patterns for argument parsing and output formatting + +## License + +This skill is part of the claude-skills repository. See the main repository LICENSE for details. 
\ No newline at end of file diff --git a/engineering-team/incident-commander/SKILL.md b/engineering-team/incident-commander/SKILL.md new file mode 100644 index 0000000..0895eb9 --- /dev/null +++ b/engineering-team/incident-commander/SKILL.md @@ -0,0 +1,668 @@ +# Incident Commander Skill + +**Category:** Engineering Team +**Tier:** POWERFUL +**Author:** Claude Skills Team +**Version:** 1.0.0 +**Last Updated:** February 2026 + +## Overview + +The Incident Commander skill provides a comprehensive incident response framework for managing technology incidents from detection through resolution and post-incident review. This skill implements battle-tested practices from SRE and DevOps teams at scale, providing structured tools for severity classification, timeline reconstruction, and thorough post-incident analysis. + +## Key Features + +- **Automated Severity Classification** - Intelligent incident triage based on impact and urgency metrics +- **Timeline Reconstruction** - Transform scattered logs and events into coherent incident narratives +- **Post-Incident Review Generation** - Structured PIRs with multiple RCA frameworks +- **Communication Templates** - Pre-built templates for stakeholder updates and escalations +- **Runbook Integration** - Generate actionable runbooks from incident patterns + +## Skills Included + +### Core Tools + +1. **Incident Classifier** (`incident_classifier.py`) + - Analyzes incident descriptions and outputs severity levels + - Recommends response teams and initial actions + - Generates communication templates based on severity + +2. **Timeline Reconstructor** (`timeline_reconstructor.py`) + - Processes timestamped events from multiple sources + - Reconstructs chronological incident timeline + - Identifies gaps and provides duration analysis + +3. 
**PIR Generator** (`pir_generator.py`) + - Creates comprehensive Post-Incident Review documents + - Applies multiple RCA frameworks (5 Whys, Fishbone, Timeline) + - Generates actionable follow-up items + +## Incident Response Framework + +### Severity Classification System + +#### SEV1 - Critical Outage +**Definition:** Complete service failure affecting all users or critical business functions + +**Characteristics:** +- Customer-facing services completely unavailable +- Data loss or corruption affecting users +- Security breaches with customer data exposure +- Revenue-generating systems down +- SLA violations with financial penalties + +**Response Requirements:** +- Immediate escalation to on-call engineer +- Incident Commander assigned within 5 minutes +- Executive notification within 15 minutes +- Public status page update within 15 minutes +- War room established +- All hands on deck if needed + +**Communication Frequency:** Every 15 minutes until resolution + +#### SEV2 - Major Impact +**Definition:** Significant degradation affecting subset of users or non-critical functions + +**Characteristics:** +- Partial service degradation (>25% of users affected) +- Performance issues causing user frustration +- Non-critical features unavailable +- Internal tools impacting productivity +- Data inconsistencies not affecting user experience + +**Response Requirements:** +- On-call engineer response within 15 minutes +- Incident Commander assigned within 30 minutes +- Status page update within 30 minutes +- Stakeholder notification within 1 hour +- Regular team updates + +**Communication Frequency:** Every 30 minutes during active response + +#### SEV3 - Minor Impact +**Definition:** Limited impact with workarounds available + +**Characteristics:** +- Single feature or component affected +- <25% of users impacted +- Workarounds available +- Performance degradation not significantly impacting UX +- Non-urgent monitoring alerts + +**Response Requirements:** +- Response 
within 2 hours during business hours +- Next business day response acceptable outside hours +- Internal team notification +- Optional status page update + +**Communication Frequency:** At key milestones only + +#### SEV4 - Low Impact +**Definition:** Minimal impact, cosmetic issues, or planned maintenance + +**Characteristics:** +- Cosmetic bugs +- Documentation issues +- Logging or monitoring gaps +- Performance issues with no user impact +- Development/test environment issues + +**Response Requirements:** +- Response within 1-2 business days +- Standard ticket/issue tracking +- No special escalation required + +**Communication Frequency:** Standard development cycle updates + +### Incident Commander Role + +#### Primary Responsibilities + +1. **Command and Control** + - Own the incident response process + - Make critical decisions about resource allocation + - Coordinate between technical teams and stakeholders + - Maintain situational awareness across all response streams + +2. **Communication Hub** + - Provide regular updates to stakeholders + - Manage external communications (status pages, customer notifications) + - Facilitate effective communication between response teams + - Shield responders from external distractions + +3. **Process Management** + - Ensure proper incident tracking and documentation + - Drive toward resolution while maintaining quality + - Coordinate handoffs between team members + - Plan and execute rollback strategies if needed + +4. 
**Post-Incident Leadership** + - Ensure thorough post-incident reviews are conducted + - Drive implementation of preventive measures + - Share learnings with broader organization + +#### Decision-Making Framework + +**Emergency Decisions (SEV1/2):** +- Incident Commander has full authority +- Bias toward action over analysis +- Document decisions for later review +- Consult subject matter experts but don't get blocked + +**Resource Allocation:** +- Can pull in any necessary team members +- Authority to escalate to senior leadership +- Can approve emergency spend for external resources +- Make call on communication channels and timing + +**Technical Decisions:** +- Lean on technical leads for implementation details +- Make final calls on trade-offs between speed and risk +- Approve rollback vs. fix-forward strategies +- Coordinate testing and validation approaches + +### Communication Templates + +#### Initial Incident Notification (SEV1/2) + +``` +Subject: [SEV{severity}] {Service Name} - {Brief Description} + +Incident Details: +- Start Time: {timestamp} +- Severity: SEV{level} +- Impact: {user impact description} +- Current Status: {investigating/mitigating/resolved} + +Technical Details: +- Affected Services: {service list} +- Symptoms: {what users are experiencing} +- Initial Assessment: {suspected root cause if known} + +Response Team: +- Incident Commander: {name} +- Technical Lead: {name} +- SMEs Engaged: {list} + +Next Update: {timestamp} +Status Page: {link} +War Room: {bridge/chat link} + +--- +{Incident Commander Name} +{Contact Information} +``` + +#### Executive Summary (SEV1) + +``` +Subject: URGENT - Customer-Impacting Outage - {Service Name} + +Executive Summary: +{2-3 sentence description of customer impact and business implications} + +Key Metrics: +- Time to Detection: {X minutes} +- Time to Engagement: {X minutes} +- Estimated Customer Impact: {number/percentage} +- Current Status: {status} +- ETA to Resolution: {time or "investigating"} + 
+Leadership Actions Required: +- [ ] Customer communication approval +- [ ] PR/Communications coordination +- [ ] Resource allocation decisions +- [ ] External vendor engagement + +Incident Commander: {name} ({contact}) +Next Update: {time} + +--- +This is an automated alert from our incident response system. +``` + +#### Customer Communication Template + +``` +We are currently experiencing {brief description of issue} affecting {scope of impact}. + +Our engineering team was alerted at {time} and is actively working to resolve the issue. We will provide updates every {frequency} until resolved. + +What we know: +- {factual statement of impact} +- {factual statement of scope} +- {brief status of response} + +What we're doing: +- {primary response action} +- {secondary response action} + +Workaround (if available): +{workaround steps or "No workaround currently available"} + +We apologize for the inconvenience and will share more information as it becomes available. + +Next update: {time} +Status page: {link} +``` + +### Stakeholder Management + +#### Stakeholder Classification + +**Internal Stakeholders:** +- **Engineering Leadership** - Technical decisions and resource allocation +- **Product Management** - Customer impact assessment and feature implications +- **Customer Support** - User communication and support ticket management +- **Sales/Account Management** - Customer relationship management for enterprise clients +- **Executive Team** - Business impact decisions and external communication approval +- **Legal/Compliance** - Regulatory reporting and liability assessment + +**External Stakeholders:** +- **Customers** - Service availability and impact communication +- **Partners** - API availability and integration impacts +- **Vendors** - Third-party service dependencies and support escalation +- **Regulators** - Compliance reporting for regulated industries +- **Public/Media** - Transparency for public-facing outages + +#### Communication Cadence by 
Stakeholder + +| Stakeholder | SEV1 | SEV2 | SEV3 | SEV4 | +|-------------|------|------|------|------| +| Engineering Leadership | Real-time | 30min | 4hrs | Daily | +| Executive Team | 15min | 1hr | EOD | Weekly | +| Customer Support | Real-time | 30min | 2hrs | As needed | +| Customers | 15min | 1hr | Optional | None | +| Partners | 30min | 2hrs | Optional | None | + +### Runbook Generation Framework + +#### Dynamic Runbook Components + +1. **Detection Playbooks** + - Monitoring alert definitions + - Triage decision trees + - Escalation trigger points + - Initial response actions + +2. **Response Playbooks** + - Step-by-step mitigation procedures + - Rollback instructions + - Validation checkpoints + - Communication checkpoints + +3. **Recovery Playbooks** + - Service restoration procedures + - Data consistency checks + - Performance validation + - User notification processes + +#### Runbook Template Structure + +```markdown +# {Service/Component} Incident Response Runbook + +## Quick Reference +- **Severity Indicators:** {list of conditions for each severity level} +- **Key Contacts:** {on-call rotations and escalation paths} +- **Critical Commands:** {list of emergency commands with descriptions} + +## Detection +### Monitoring Alerts +- {Alert name}: {description and thresholds} +- {Alert name}: {description and thresholds} + +### Manual Detection Signs +- {Symptom}: {what to look for and where} +- {Symptom}: {what to look for and where} + +## Initial Response (0-15 minutes) +1. **Assess Severity** + - [ ] Check {primary metric} + - [ ] Verify {secondary indicator} + - [ ] Classify as SEV{level} based on {criteria} + +2. **Establish Command** + - [ ] Page Incident Commander if SEV1/2 + - [ ] Create incident tracking ticket + - [ ] Join war room: {link/bridge info} + +3. 
**Initial Investigation** + - [ ] Check recent deployments: {deployment log location} + - [ ] Review error logs: {log location and queries} + - [ ] Verify dependencies: {dependency check commands} + +## Mitigation Strategies +### Strategy 1: {Name} +**Use when:** {conditions} +**Steps:** +1. {detailed step with commands} +2. {detailed step with expected outcomes} +3. {validation step} + +**Rollback Plan:** +1. {rollback step} +2. {verification step} + +### Strategy 2: {Name} +{similar structure} + +## Recovery and Validation +1. **Service Restoration** + - [ ] {restoration step} + - [ ] Wait for {metric} to return to normal + - [ ] Validate end-to-end functionality + +2. **Communication** + - [ ] Update status page + - [ ] Notify stakeholders + - [ ] Schedule PIR + +## Common Pitfalls +- **{Pitfall}:** {description and how to avoid} +- **{Pitfall}:** {description and how to avoid} + +## Reference Information +- **Architecture Diagram:** {link} +- **Monitoring Dashboard:** {link} +- **Related Runbooks:** {links to dependent service runbooks} +``` + +### Post-Incident Review (PIR) Framework + +#### PIR Timeline and Ownership + +**Timeline:** +- **24 hours:** Initial PIR draft completed by Incident Commander +- **3 business days:** Final PIR published with all stakeholder input +- **1 week:** Action items assigned with owners and due dates +- **4 weeks:** Follow-up review on action item progress + +**Roles:** +- **PIR Owner:** Incident Commander (can delegate writing but owns completion) +- **Technical Contributors:** All engineers involved in response +- **Review Committee:** Engineering leadership, affected product teams +- **Action Item Owners:** Assigned based on expertise and capacity + +#### Root Cause Analysis Frameworks + +#### 1. 
Five Whys Method + +The Five Whys technique involves asking "why" repeatedly to drill down to root causes: + +**Example Application:** +- **Problem:** Database became unresponsive during peak traffic +- **Why 1:** Why did the database become unresponsive? → Connection pool was exhausted +- **Why 2:** Why was the connection pool exhausted? → Application was creating more connections than usual +- **Why 3:** Why was the application creating more connections? → New feature wasn't using connection pooling properly +- **Why 4:** Why wasn't the feature using connection pooling properly? → Code review missed this pattern +- **Why 5:** Why did code review miss this? → No automated checks for connection pooling patterns + +**Best Practices:** +- Ask "why" at least 3 times; 5 or more iterations are often needed +- Focus on process failures, not individual blame +- Each "why" should point to an actionable system improvement +- Consider multiple root cause paths, not just one linear chain + +#### 2. Fishbone (Ishikawa) Diagram + +Systematic analysis across multiple categories of potential causes: + +**Categories:** +- **People:** Training, experience, communication, handoffs +- **Process:** Procedures, change management, review processes +- **Technology:** Architecture, tooling, monitoring, automation +- **Environment:** Infrastructure, dependencies, external factors + +**Application Method:** +1. State the problem clearly at the "head" of the fishbone +2. For each category, brainstorm potential contributing factors +3. For each factor, ask what caused that factor (sub-causes) +4. Identify the factors most likely to be root causes +5. Validate root causes with evidence from the incident + +#### 3. Timeline Analysis + +Reconstruct the incident chronologically to identify decision points and missed opportunities: + +**Timeline Elements:** +- **Detection:** When was the issue first observable? When was it first detected? +- **Notification:** How quickly were the right people informed? 
+- **Response:** What actions were taken and how effective were they? +- **Communication:** When were stakeholders updated? +- **Resolution:** What finally resolved the issue? + +**Analysis Questions:** +- Where were there delays and what caused them? +- What decisions would we make differently with perfect information? +- Where did communication break down? +- What automation could have detected/resolved faster? + +### Escalation Paths + +#### Technical Escalation + +**Level 1:** On-call engineer +- **Responsibility:** Initial response and common issue resolution +- **Escalation Trigger:** Issue not resolved within SLA timeframe +- **Timeframe:** 15 minutes (SEV1), 30 minutes (SEV2) + +**Level 2:** Senior engineer/Team lead +- **Responsibility:** Complex technical issues requiring deeper expertise +- **Escalation Trigger:** Level 1 requests help or timeout occurs +- **Timeframe:** 30 minutes (SEV1), 1 hour (SEV2) + +**Level 3:** Engineering Manager/Staff Engineer +- **Responsibility:** Cross-team coordination and architectural decisions +- **Escalation Trigger:** Issue spans multiple systems or teams +- **Timeframe:** 45 minutes (SEV1), 2 hours (SEV2) + +**Level 4:** Director of Engineering/CTO +- **Responsibility:** Resource allocation and business impact decisions +- **Escalation Trigger:** Extended outage or significant business impact +- **Timeframe:** 1 hour (SEV1), 4 hours (SEV2) + +#### Business Escalation + +**Customer Impact Assessment:** +- **High:** Revenue loss, SLA breaches, customer churn risk +- **Medium:** User experience degradation, support ticket volume +- **Low:** Internal tools, development impact only + +**Escalation Matrix:** + +| Severity | Duration | Business Escalation | +|----------|----------|-------------------| +| SEV1 | Immediate | VP Engineering | +| SEV1 | 30 minutes | CTO + Customer Success VP | +| SEV1 | 1 hour | CEO + Full Executive Team | +| SEV2 | 2 hours | VP Engineering | +| SEV2 | 4 hours | CTO | +| SEV3 | 1 business day | 
Engineering Manager | + +### Status Page Management + +#### Update Principles + +1. **Transparency:** Provide factual information without speculation +2. **Timeliness:** Update within committed timeframes +3. **Clarity:** Use customer-friendly language, avoid technical jargon +4. **Completeness:** Include impact scope, status, and next update time + +#### Status Categories + +- **Operational:** All systems functioning normally +- **Degraded Performance:** Some users may experience slowness +- **Partial Outage:** Subset of features unavailable +- **Major Outage:** Service unavailable for most/all users +- **Under Maintenance:** Planned maintenance window + +#### Update Template + +``` +{Timestamp} - {Status Category} + +{Brief description of current state} + +Impact: {who is affected and how} +Cause: {root cause if known, "under investigation" if not} +Resolution: {what's being done to fix it} + +Next update: {specific time} + +We apologize for any inconvenience this may cause. +``` + +### Action Item Framework + +#### Action Item Categories + +1. **Immediate Fixes** + - Critical bugs discovered during incident + - Security vulnerabilities exposed + - Data integrity issues + +2. **Process Improvements** + - Communication gaps + - Escalation procedure updates + - Runbook additions/updates + +3. **Technical Debt** + - Architecture improvements + - Monitoring enhancements + - Automation opportunities + +4. 
**Organizational Changes** + - Team structure adjustments + - Training requirements + - Tool/platform investments + +#### Action Item Template + +``` +**Title:** {Concise description of the action} +**Priority:** {Critical/High/Medium/Low} +**Category:** {Fix/Process/Technical/Organizational} +**Owner:** {Assigned person} +**Due Date:** {Specific date} +**Success Criteria:** {How will we know this is complete} +**Dependencies:** {What needs to happen first} +**Related PIRs:** {Links to other incidents this addresses} + +**Description:** +{Detailed description of what needs to be done and why} + +**Implementation Plan:** +1. {Step 1} +2. {Step 2} +3. {Validation step} + +**Progress Updates:** +- {Date}: {Progress update} +- {Date}: {Progress update} +``` + +## Usage Examples + +### Example 1: Database Connection Pool Exhaustion + +```bash +# Classify the incident +echo '{"description": "Users reporting 500 errors, database connections timing out", "affected_users": "80%", "business_impact": "high"}' | python scripts/incident_classifier.py + +# Reconstruct timeline from logs +python scripts/timeline_reconstructor.py --input assets/sample_timeline_events.json --output timeline.md + +# Generate PIR after resolution +python scripts/pir_generator.py --incident assets/sample_incident_pir_data.json --timeline timeline.md --output pir.md +``` + +### Example 2: API Rate Limiting Incident + +```bash +# Quick classification from stdin +echo "API rate limits causing customer API calls to fail" | python scripts/incident_classifier.py --format text + +# Build timeline from multiple sources +python scripts/timeline_reconstructor.py --input assets/api_incident_logs.json --detect-phases --gap-analysis + +# Generate comprehensive PIR +python scripts/pir_generator.py --incident assets/api_incident_summary.json --rca-method fishbone --action-items +``` + +## Best Practices + +### During Incident Response + +1. 
**Maintain Calm Leadership** + - Stay composed under pressure + - Make decisive calls with incomplete information + - Communicate confidence while acknowledging uncertainty + +2. **Document Everything** + - All actions taken and their outcomes + - Decision rationale, especially for controversial calls + - Timeline of events as they happen + +3. **Effective Communication** + - Use clear, jargon-free language + - Provide regular updates even when there's no new information + - Manage stakeholder expectations proactively + +4. **Technical Excellence** + - Prefer rollbacks to risky fixes under pressure + - Validate fixes before declaring resolution + - Plan for secondary failures and cascading effects + +### Post-Incident + +1. **Blameless Culture** + - Focus on system failures, not individual mistakes + - Encourage honest reporting of what went wrong + - Celebrate learning and improvement opportunities + +2. **Action Item Discipline** + - Assign specific owners and due dates + - Track progress publicly + - Prioritize based on risk and effort + +3. **Knowledge Sharing** + - Share PIRs broadly within the organization + - Update runbooks based on lessons learned + - Conduct training sessions for common failure modes + +4. **Continuous Improvement** + - Look for patterns across multiple incidents + - Invest in tooling and automation + - Regularly review and update processes + +## Integration with Existing Tools + +### Monitoring and Alerting +- PagerDuty/Opsgenie integration for escalation +- Datadog/Grafana for metrics and dashboards +- ELK/Splunk for log analysis and correlation + +### Communication Platforms +- Slack/Teams for war room coordination +- Zoom/Meet for video bridges +- Status page providers (Statuspage.io, etc.) 
+ +### Documentation Systems +- Confluence/Notion for PIR storage +- GitHub/GitLab for runbook version control +- JIRA/Linear for action item tracking + +### Change Management +- CI/CD pipeline integration +- Deployment tracking systems +- Feature flag platforms for quick rollbacks + +## Conclusion + +The Incident Commander skill provides a comprehensive framework for managing incidents from detection through post-incident review. By implementing structured processes, clear communication templates, and thorough analysis tools, teams can improve their incident response capabilities and build more resilient systems. + +The key to successful incident management is preparation, practice, and continuous learning. Use this framework as a starting point, but adapt it to your organization's specific needs, culture, and technical environment. + +Remember: The goal isn't to prevent all incidents (which is impossible), but to detect them quickly, respond effectively, communicate clearly, and learn continuously. \ No newline at end of file diff --git a/engineering-team/incident-commander/assets/sample_incident_classification.json b/engineering-team/incident-commander/assets/sample_incident_classification.json new file mode 100644 index 0000000..00a7677 --- /dev/null +++ b/engineering-team/incident-commander/assets/sample_incident_classification.json @@ -0,0 +1,14 @@ +{ + "description": "Database connection timeouts causing 500 errors for payment processing API. Users unable to complete checkout. Error rate spiked from 0.1% to 45% starting at 14:30 UTC. 
Database monitoring shows connection pool exhaustion with 200/200 connections active.", + "service": "payment-api", + "affected_users": "80%", + "business_impact": "high", + "duration_minutes": 95, + "metadata": { + "error_rate": "45%", + "connection_pool_utilization": "100%", + "affected_regions": ["us-west", "us-east", "eu-west"], + "detection_method": "monitoring_alert", + "customer_escalations": 12 + } +} \ No newline at end of file diff --git a/engineering-team/incident-commander/assets/sample_incident_pir_data.json b/engineering-team/incident-commander/assets/sample_incident_pir_data.json new file mode 100644 index 0000000..c04749d --- /dev/null +++ b/engineering-team/incident-commander/assets/sample_incident_pir_data.json @@ -0,0 +1,74 @@ +{ + "incident_id": "INC-2024-0315-001", + "title": "Payment API Database Connection Pool Exhaustion", + "description": "Database connection pool exhaustion caused widespread 500 errors in payment processing API, preventing users from completing purchases. Root cause was an inefficient database query introduced in deployment v2.3.1.", + "severity": "sev2", + "start_time": "2024-03-15T14:30:00Z", + "end_time": "2024-03-15T15:35:00Z", + "duration": "1h 5m", + "affected_services": ["payment-api", "checkout-service", "subscription-billing"], + "customer_impact": "80% of users unable to complete payments or checkout. Approximately 2,400 failed payment attempts during the incident. Users experienced immediate 500 errors when attempting to pay.", + "business_impact": "Estimated revenue loss of $45,000 during outage period. No SLA breaches as resolution was within 2-hour window. 
12 customer escalations through support channels.", + "incident_commander": "Mike Rodriguez", + "responders": [ + "Sarah Chen - On-call Engineer, Primary Responder", + "Tom Wilson - Database Team Lead", + "Lisa Park - Database Engineer", + "Mike Rodriguez - Incident Commander", + "David Kumar - DevOps Engineer" + ], + "status": "resolved", + "detection_details": { + "detection_method": "automated_monitoring", + "detection_time": "2024-03-15T14:30:00Z", + "alert_source": "Datadog error rate threshold", + "time_to_detection": "immediate" + }, + "response_details": { + "time_to_response": "5 minutes", + "time_to_escalation": "10 minutes", + "time_to_resolution": "65 minutes", + "war_room_established": "2024-03-15T14:45:00Z", + "executives_notified": false, + "status_page_updated": true + }, + "technical_details": { + "root_cause": "Inefficient database query introduced in deployment v2.3.1 caused each payment validation to take 15 seconds instead of normal 0.1 seconds, exhausting the 200-connection database pool", + "affected_regions": ["us-west", "us-east", "eu-west"], + "error_metrics": { + "peak_error_rate": "45%", + "normal_error_rate": "0.1%", + "connection_pool_max": 200, + "connections_exhausted_at": "100%" + }, + "resolution_method": "rollback", + "rollback_target": "v2.2.9", + "rollback_duration": "7 minutes" + }, + "communication_log": [ + { + "timestamp": "2024-03-15T14:50:00Z", + "type": "status_page", + "message": "Investigating payment processing issues", + "audience": "customers" + }, + { + "timestamp": "2024-03-15T15:35:00Z", + "type": "status_page", + "message": "Payment processing issues resolved", + "audience": "customers" + } + ], + "lessons_learned_preview": [ + "Deployment v2.3.1 code review missed performance implications of query change", + "Load testing didn't include realistic database query patterns", + "Connection pool monitoring could have provided earlier warning", + "Rollback procedure worked effectively - 7 minute rollback time" + ], + 
"preliminary_action_items": [ + "Fix inefficient query for v2.3.2 deployment", + "Add database query performance checks to CI pipeline", + "Improve load testing to include database performance scenarios", + "Add connection pool utilization alerts" + ] +} \ No newline at end of file diff --git a/engineering-team/incident-commander/assets/sample_timeline_events.json b/engineering-team/incident-commander/assets/sample_timeline_events.json new file mode 100644 index 0000000..18438da --- /dev/null +++ b/engineering-team/incident-commander/assets/sample_timeline_events.json @@ -0,0 +1,263 @@ +[ + { + "timestamp": "2024-03-15T14:30:00Z", + "source": "datadog", + "type": "alert", + "message": "High error rate detected on payment-api: 45% error rate (threshold: 5%)", + "severity": "critical", + "actor": "monitoring-system", + "metadata": { + "alert_id": "ALT-001", + "metric_value": "45%", + "threshold": "5%" + } + }, + { + "timestamp": "2024-03-15T14:32:00Z", + "source": "pagerduty", + "type": "escalation", + "message": "Paged on-call engineer Sarah Chen for payment-api alerts", + "severity": "high", + "actor": "pagerduty-system", + "metadata": { + "incident_id": "PD-12345", + "responder": "sarah.chen@company.com" + } + }, + { + "timestamp": "2024-03-15T14:35:00Z", + "source": "slack", + "type": "communication", + "message": "Sarah Chen acknowledged the alert and is investigating payment-api issues", + "severity": "medium", + "actor": "sarah.chen", + "metadata": { + "channel": "#incidents", + "message_id": "1234567890.123456" + } + }, + { + "timestamp": "2024-03-15T14:38:00Z", + "source": "application_logs", + "type": "log", + "message": "Database connection pool exhausted: 200/200 connections active, unable to acquire new connections", + "severity": "critical", + "actor": "payment-api", + "metadata": { + "log_level": "ERROR", + "component": "database_pool", + "connection_count": 200, + "max_connections": 200 + } + }, + { + "timestamp": "2024-03-15T14:40:00Z", + "source": 
"slack", + "type": "escalation", + "message": "Sarah Chen: Escalating to incident commander - database connection pool exhausted, need database team", + "severity": "high", + "actor": "sarah.chen", + "metadata": { + "channel": "#incidents", + "escalation_reason": "database_expertise_needed" + } + }, + { + "timestamp": "2024-03-15T14:42:00Z", + "source": "pagerduty", + "type": "escalation", + "message": "Incident commander Mike Rodriguez assigned to incident PD-12345", + "severity": "high", + "actor": "pagerduty-system", + "metadata": { + "incident_commander": "mike.rodriguez@company.com", + "role": "incident_commander" + } + }, + { + "timestamp": "2024-03-15T14:45:00Z", + "source": "slack", + "type": "communication", + "message": "Mike Rodriguez: War room established in #war-room-payment-api. Engaging database team.", + "severity": "high", + "actor": "mike.rodriguez", + "metadata": { + "channel": "#incidents", + "war_room": "#war-room-payment-api" + } + }, + { + "timestamp": "2024-03-15T14:47:00Z", + "source": "pagerduty", + "type": "escalation", + "message": "Database team engineers paged: Tom Wilson, Lisa Park", + "severity": "medium", + "actor": "pagerduty-system", + "metadata": { + "team": "database-team", + "responders": ["tom.wilson@company.com", "lisa.park@company.com"] + } + }, + { + "timestamp": "2024-03-15T14:50:00Z", + "source": "statuspage", + "type": "communication", + "message": "Status page updated: Investigating payment processing issues", + "severity": "medium", + "actor": "mike.rodriguez", + "metadata": { + "status": "investigating", + "affected_systems": ["payment-api"] + } + }, + { + "timestamp": "2024-03-15T14:52:00Z", + "source": "slack", + "type": "communication", + "message": "Tom Wilson: Joining war room. Looking at database metrics now. 
Seeing unusual query patterns from recent deployment.", + "severity": "medium", + "actor": "tom.wilson", + "metadata": { + "channel": "#war-room-payment-api", + "investigation_focus": "database_metrics" + } + }, + { + "timestamp": "2024-03-15T14:55:00Z", + "source": "database_monitoring", + "type": "log", + "message": "Identified slow query introduced in deployment v2.3.1: payment validation taking 15s per request", + "severity": "critical", + "actor": "database-monitor", + "metadata": { + "deployment_version": "v2.3.1", + "query_time": "15s", + "normal_query_time": "0.1s" + } + }, + { + "timestamp": "2024-03-15T15:00:00Z", + "source": "slack", + "type": "communication", + "message": "Tom Wilson: Root cause identified - inefficient query in v2.3.1 deployment. Recommending immediate rollback.", + "severity": "high", + "actor": "tom.wilson", + "metadata": { + "channel": "#war-room-payment-api", + "root_cause": "inefficient_query", + "recommendation": "rollback" + } + }, + { + "timestamp": "2024-03-15T15:02:00Z", + "source": "slack", + "type": "communication", + "message": "Mike Rodriguez: Approved rollback to v2.2.9. 
Sarah initiating rollback procedure.", + "severity": "high", + "actor": "mike.rodriguez", + "metadata": { + "channel": "#war-room-payment-api", + "decision": "rollback_approved", + "target_version": "v2.2.9" + } + }, + { + "timestamp": "2024-03-15T15:05:00Z", + "source": "deployment_system", + "type": "action", + "message": "Rollback initiated: payment-api v2.3.1 → v2.2.9", + "severity": "medium", + "actor": "sarah.chen", + "metadata": { + "from_version": "v2.3.1", + "to_version": "v2.2.9", + "deployment_type": "rollback" + } + }, + { + "timestamp": "2024-03-15T15:12:00Z", + "source": "deployment_system", + "type": "action", + "message": "Rollback completed successfully: payment-api now running v2.2.9 across all regions", + "severity": "medium", + "actor": "deployment-system", + "metadata": { + "deployment_status": "completed", + "regions": ["us-west", "us-east", "eu-west"] + } + }, + { + "timestamp": "2024-03-15T15:15:00Z", + "source": "datadog", + "type": "log", + "message": "Error rate decreasing: payment-api error rate dropped to 8% and continuing to decline", + "severity": "medium", + "actor": "monitoring-system", + "metadata": { + "error_rate": "8%", + "trend": "decreasing" + } + }, + { + "timestamp": "2024-03-15T15:18:00Z", + "source": "database_monitoring", + "type": "log", + "message": "Connection pool utilization normalizing: 45/200 connections active", + "severity": "low", + "actor": "database-monitor", + "metadata": { + "connection_count": 45, + "max_connections": 200, + "utilization": "22.5%" + } + }, + { + "timestamp": "2024-03-15T15:25:00Z", + "source": "datadog", + "type": "log", + "message": "Error rate returned to normal: payment-api error rate now 0.2% (within normal range)", + "severity": "low", + "actor": "monitoring-system", + "metadata": { + "error_rate": "0.2%", + "status": "normal" + } + }, + { + "timestamp": "2024-03-15T15:30:00Z", + "source": "slack", + "type": "communication", + "message": "Mike Rodriguez: All metrics returned to normal. 
Declaring incident resolved. Thanks to all responders.", + "severity": "low", + "actor": "mike.rodriguez", + "metadata": { + "channel": "#war-room-payment-api", + "status": "resolved" + } + }, + { + "timestamp": "2024-03-15T15:35:00Z", + "source": "statuspage", + "type": "communication", + "message": "Status page updated: Payment processing issues resolved. All systems operational.", + "severity": "low", + "actor": "mike.rodriguez", + "metadata": { + "status": "resolved", + "duration": "65 minutes" + } + }, + { + "timestamp": "2024-03-15T15:40:00Z", + "source": "slack", + "type": "communication", + "message": "Mike Rodriguez: PIR scheduled for tomorrow 10am. Action item: fix the inefficient query in v2.3.2", + "severity": "low", + "actor": "mike.rodriguez", + "metadata": { + "channel": "#incidents", + "pir_time": "2024-03-16T10:00:00Z", + "action_item": "fix_query_v2.3.2" + } + } +] \ No newline at end of file diff --git a/engineering-team/incident-commander/assets/simple_incident.json b/engineering-team/incident-commander/assets/simple_incident.json new file mode 100644 index 0000000..b1af6a3 --- /dev/null +++ b/engineering-team/incident-commander/assets/simple_incident.json @@ -0,0 +1,6 @@ +{ + "description": "Users reporting slow page loads on the main website", + "service": "web-frontend", + "affected_users": "25%", + "business_impact": "medium" +} \ No newline at end of file diff --git a/engineering-team/incident-commander/assets/simple_timeline_events.json b/engineering-team/incident-commander/assets/simple_timeline_events.json new file mode 100644 index 0000000..75b1126 --- /dev/null +++ b/engineering-team/incident-commander/assets/simple_timeline_events.json @@ -0,0 +1,30 @@ +[ + { + "timestamp": "2024-03-10T09:00:00Z", + "source": "monitoring", + "message": "High CPU utilization detected on web servers", + "severity": "medium", + "actor": "system" + }, + { + "timestamp": "2024-03-10T09:05:00Z", + "source": "slack", + "message": "Engineer investigating high 
CPU alerts", + "severity": "medium", + "actor": "john.doe" + }, + { + "timestamp": "2024-03-10T09:15:00Z", + "source": "deployment", + "message": "Deployed hotfix to reduce CPU usage", + "severity": "low", + "actor": "john.doe" + }, + { + "timestamp": "2024-03-10T09:25:00Z", + "source": "monitoring", + "message": "CPU utilization returned to normal levels", + "severity": "low", + "actor": "system" + } +] \ No newline at end of file diff --git a/engineering-team/incident-commander/expected_outputs/incident_classification_text_output.txt b/engineering-team/incident-commander/expected_outputs/incident_classification_text_output.txt new file mode 100644 index 0000000..0182b8e --- /dev/null +++ b/engineering-team/incident-commander/expected_outputs/incident_classification_text_output.txt @@ -0,0 +1,44 @@ +============================================================ +INCIDENT CLASSIFICATION REPORT +============================================================ + +CLASSIFICATION: + Severity: SEV1 + Confidence: 100.0% + Reasoning: Classified as SEV1 based on: keywords: timeout, 500 error; user impact: 80% + Timestamp: 2026-02-16T12:41:46.644096+00:00 + +RECOMMENDED RESPONSE: + Primary Team: Analytics Team + Supporting Teams: SRE, API Team, Backend Engineering, Finance Engineering, Payments Team, DevOps, Compliance Team, Database Team, Platform Team, Data Engineering + Response Time: 5 minutes + +INITIAL ACTIONS: + 1. Establish incident command (Priority 1) + Timeout: 5 minutes + Page incident commander and establish war room + + 2. Create incident ticket (Priority 1) + Timeout: 2 minutes + Create tracking ticket with all known details + + 3. Update status page (Priority 2) + Timeout: 15 minutes + Post initial status page update acknowledging incident + + 4. Notify executives (Priority 2) + Timeout: 15 minutes + Alert executive team of customer-impacting outage + + 5. 
Engage subject matter experts (Priority 3) + Timeout: 10 minutes + Page relevant SMEs based on affected systems + +COMMUNICATION: + Subject: 🚨 [SEV1] payment-api - Database connection timeouts causing 500 errors fo... + Urgency: SEV1 + Recipients: on-call, engineering-leadership, executives, customer-success + Channels: pager, phone, slack, email, status-page + Update Frequency: Every 15 minutes + +============================================================ \ No newline at end of file diff --git a/engineering-team/incident-commander/expected_outputs/pir_markdown_output.md b/engineering-team/incident-commander/expected_outputs/pir_markdown_output.md new file mode 100644 index 0000000..c9f46ac --- /dev/null +++ b/engineering-team/incident-commander/expected_outputs/pir_markdown_output.md @@ -0,0 +1,88 @@ +# Post-Incident Review: Payment API Database Connection Pool Exhaustion + +## Executive Summary +On March 15, 2024, we experienced a sev2 incident affecting ['payment-api', 'checkout-service', 'subscription-billing']. The incident lasted 1h 5m and had the following impact: 80% of users unable to complete payments or checkout. Approximately 2,400 failed payment attempts during the incident. Users experienced immediate 500 errors when attempting to pay. The incident has been resolved and we have identified specific actions to prevent recurrence. + +## Incident Overview +- **Incident ID:** INC-2024-0315-001 +- **Date & Time:** 2024-03-15 14:30:00 UTC +- **Duration:** 1h 5m +- **Severity:** SEV2 +- **Status:** Resolved +- **Incident Commander:** Mike Rodriguez +- **Responders:** Sarah Chen - On-call Engineer, Primary Responder, Tom Wilson - Database Team Lead, Lisa Park - Database Engineer, Mike Rodriguez - Incident Commander, David Kumar - DevOps Engineer + +### Customer Impact +80% of users unable to complete payments or checkout. Approximately 2,400 failed payment attempts during the incident. Users experienced immediate 500 errors when attempting to pay. 
+ +### Business Impact +Estimated revenue loss of $45,000 during outage period. No SLA breaches as resolution was within 2-hour window. 12 customer escalations through support channels. + +## Timeline +No detailed timeline available. + +## Root Cause Analysis +### Analysis Method: 5 Whys Analysis + +#### Why Analysis + +**Why 1:** Why did Database connection pool exhaustion caused widespread 500 errors in payment processing API, preventing users from completing purchases. Root cause was an inefficient database query introduced in deployment v2.3.1.? +**Answer:** New deployment introduced a regression + +**Why 2:** Why wasn't this detected earlier? +**Answer:** Code review process missed the issue + +**Why 3:** Why didn't existing safeguards prevent this? +**Answer:** Testing environment didn't match production + +**Why 4:** Why wasn't there a backup mechanism? +**Answer:** Further investigation needed + +**Why 5:** Why wasn't this scenario anticipated? +**Answer:** Further investigation needed + + +## What Went Well +- The incident was successfully resolved +- Incident command was established +- Multiple team members collaborated on resolution + +## What Didn't Go Well +- Analysis in progress + +## Lessons Learned +Lessons learned to be documented following detailed analysis. + +## Action Items +Action items to be defined. 
+ +## Follow-up and Prevention +### Prevention Measures + +Based on the root cause analysis, the following preventive measures have been identified: + +- Implement comprehensive testing for similar scenarios +- Improve monitoring and alerting coverage +- Enhance error handling and resilience patterns + +### Follow-up Schedule + +- 1 week: Review action item progress +- 1 month: Evaluate effectiveness of implemented changes +- 3 months: Conduct follow-up assessment and update preventive measures + +## Appendix +### Additional Information + +- Incident ID: INC-2024-0315-001 +- Severity Classification: sev2 +- Affected Services: payment-api, checkout-service, subscription-billing + +### References + +- Incident tracking ticket: [Link TBD] +- Monitoring dashboards: [Link TBD] +- Communication thread: [Link TBD] + +--- +*Generated on 2026-02-16 by PIR Generator* \ No newline at end of file diff --git a/engineering-team/incident-commander/expected_outputs/simple_incident_classification.txt b/engineering-team/incident-commander/expected_outputs/simple_incident_classification.txt new file mode 100644 index 0000000..75d747d --- /dev/null +++ b/engineering-team/incident-commander/expected_outputs/simple_incident_classification.txt @@ -0,0 +1,44 @@ +============================================================ +INCIDENT CLASSIFICATION REPORT +============================================================ + +CLASSIFICATION: + Severity: SEV2 + Confidence: 100.0% + Reasoning: Classified as SEV2 based on: keywords: slow; user impact: 25% + Timestamp: 2026-02-16T12:42:41.889774+00:00 + +RECOMMENDED RESPONSE: + Primary Team: UX Engineering + Supporting Teams: Product Engineering, Frontend Team + Response Time: 15 minutes + +INITIAL ACTIONS: + 1. Assign incident commander (Priority 1) + Timeout: 30 minutes + Assign IC and establish coordination channel + + 2. Create incident tracking (Priority 1) + Timeout: 5 minutes + Create incident ticket with details and timeline + + 3. 
Assess customer impact (Priority 2) + Timeout: 15 minutes + Determine scope and severity of user impact + + 4. Engage response team (Priority 2) + Timeout: 30 minutes + Page appropriate technical responders + + 5. Begin investigation (Priority 3) + Timeout: 15 minutes + Start technical analysis and debugging + +COMMUNICATION: + Subject: ⚠️ [SEV2] web-frontend - Users reporting slow page loads on the main websit... + Urgency: SEV2 + Recipients: on-call, engineering-leadership, product-team + Channels: pager, slack, email + Update Frequency: Every 30 minutes + +============================================================ \ No newline at end of file diff --git a/engineering-team/incident-commander/expected_outputs/timeline_reconstruction_text_output.txt b/engineering-team/incident-commander/expected_outputs/timeline_reconstruction_text_output.txt new file mode 100644 index 0000000..f772409 --- /dev/null +++ b/engineering-team/incident-commander/expected_outputs/timeline_reconstruction_text_output.txt @@ -0,0 +1,110 @@ +================================================================================ +INCIDENT TIMELINE RECONSTRUCTION +================================================================================ + +OVERVIEW: + Time Range: 2024-03-15T14:30:00+00:00 to 2024-03-15T15:40:00+00:00 + Total Duration: 70 minutes + Total Events: 21 + Phases Detected: 12 + +PHASES: + DETECTION: + Start: 2024-03-15T14:30:00+00:00 + Duration: 0.0 minutes + Events: 1 + Description: Initial detection of the incident through monitoring or observation + + ESCALATION: + Start: 2024-03-15T14:32:00+00:00 + Duration: 0.0 minutes + Events: 1 + Description: Escalation to additional resources or higher severity response + + TRIAGE: + Start: 2024-03-15T14:35:00+00:00 + Duration: 0.0 minutes + Events: 1 + Description: Assessment and initial investigation of the incident + + ESCALATION: + Start: 2024-03-15T14:38:00+00:00 + Duration: 9.0 minutes + Events: 5 + Description: Escalation to 
additional resources or higher severity response + + TRIAGE: + Start: 2024-03-15T14:50:00+00:00 + Duration: 0.0 minutes + Events: 1 + Description: Assessment and initial investigation of the incident + + ESCALATION: + Start: 2024-03-15T14:52:00+00:00 + Duration: 10.0 minutes + Events: 4 + Description: Escalation to additional resources or higher severity response + + TRIAGE: + Start: 2024-03-15T15:05:00+00:00 + Duration: 7.0 minutes + Events: 2 + Description: Assessment and initial investigation of the incident + + DETECTION: + Start: 2024-03-15T15:15:00+00:00 + Duration: 0.0 minutes + Events: 1 + Description: Initial detection of the incident through monitoring or observation + + RESOLUTION: + Start: 2024-03-15T15:18:00+00:00 + Duration: 0.0 minutes + Events: 1 + Description: Confirmation that the incident has been resolved + + DETECTION: + Start: 2024-03-15T15:25:00+00:00 + Duration: 0.0 minutes + Events: 1 + Description: Initial detection of the incident through monitoring or observation + + RESOLUTION: + Start: 2024-03-15T15:30:00+00:00 + Duration: 5.0 minutes + Events: 2 + Description: Confirmation that the incident has been resolved + + TRIAGE: + Start: 2024-03-15T15:40:00+00:00 + Duration: 0.0 minutes + Events: 1 + Description: Assessment and initial investigation of the incident + +KEY METRICS: + Time to Mitigation: 0 minutes + Time to Resolution: 48.0 minutes + Events per Hour: 18.0 + Unique Sources: 7 + +INCIDENT NARRATIVE: +Incident Timeline Summary: +The incident began at 2024-03-15 14:30:00 UTC and concluded at 2024-03-15 15:40:00 UTC, lasting approximately 70 minutes. + +The incident progressed through 12 distinct phases: detection, escalation, triage, escalation, triage, escalation, triage, detection, resolution, detection, resolution, triage. 
+ +Key milestones: +- Detection: 14:30 (0 min) +- Escalation: 14:32 (0 min) +- Triage: 14:35 (0 min) +- Escalation: 14:38 (9 min) +- Triage: 14:50 (0 min) +- Escalation: 14:52 (10 min) +- Triage: 15:05 (7 min) +- Detection: 15:15 (0 min) +- Resolution: 15:18 (0 min) +- Detection: 15:25 (0 min) +- Resolution: 15:30 (5 min) +- Triage: 15:40 (0 min) + +================================================================================ \ No newline at end of file diff --git a/engineering-team/incident-commander/references/communication_templates.md b/engineering-team/incident-commander/references/communication_templates.md new file mode 100644 index 0000000..c3b370f --- /dev/null +++ b/engineering-team/incident-commander/references/communication_templates.md @@ -0,0 +1,591 @@ +# Incident Communication Templates + +## Overview + +This document provides standardized communication templates for incident response. These templates ensure consistent, clear communication across different severity levels and stakeholder groups. + +## Template Usage Guidelines + +### General Principles +1. **Be Clear and Concise** - Use simple language, avoid jargon +2. **Be Factual** - Only state what is known, avoid speculation +3. **Be Timely** - Send updates at committed intervals +4. **Be Actionable** - Include next steps and expected timelines +5. 
**Be Accountable** - Include contact information for follow-up + +### Template Selection +- Choose templates based on incident severity and audience +- Customize templates with specific incident details +- Always include next update time and contact information +- Escalate template types as severity increases + +--- + +## SEV1 Templates + +### Initial Alert - Internal Teams + +**Subject:** 🚨 [SEV1] CRITICAL: {Service} Complete Outage - Immediate Response Required + +``` +CRITICAL INCIDENT ALERT - IMMEDIATE ATTENTION REQUIRED + +Incident Summary: +- Service: {Service Name} +- Status: Complete Outage +- Start Time: {Timestamp} +- Customer Impact: {Impact Description} +- Estimated Affected Users: {Number/Percentage} + +Immediate Actions Needed: +✓ Incident Commander: {Name} - ASSIGNED +✓ War Room: {Bridge/Chat Link} - JOIN NOW +✓ On-Call Response: {Team} - PAGED +⏳ Executive Notification: In progress +⏳ Status Page Update: Within 15 minutes + +Current Situation: +{Brief description of what we know} + +What We're Doing: +{Immediate response actions being taken} + +Next Update: {Timestamp - 15 minutes from now} + +Incident Commander: {Name} +Contact: {Phone/Slack} + +THIS IS A CUSTOMER-IMPACTING INCIDENT REQUIRING IMMEDIATE ATTENTION +``` + +### Executive Notification - SEV1 + +**Subject:** 🚨 URGENT: Customer-Impacting Outage - {Service} + +``` +EXECUTIVE ALERT: Critical customer-facing incident + +Service: {Service Name} +Impact: {Customer impact description} +Duration: {Current duration} (started {start time}) +Business Impact: {Revenue/SLA/compliance implications} + +Customer Impact Summary: +- Affected Users: {Number/percentage} +- Revenue Impact: {$ amount if known} +- SLA Status: {Breach status} +- Customer Escalations: {Number if any} + +Response Status: +- Incident Commander: {Name} ({contact}) +- Response Team Size: {Number of engineers} +- Root Cause: {If known, otherwise "Under investigation"} +- ETA to Resolution: {If known, otherwise "Investigating"} + 
+Executive Actions Required: +- [ ] Customer communication approval needed +- [ ] Legal/compliance notification: {If applicable} +- [ ] PR/Media response preparation: {If needed} +- [ ] Resource allocation decisions: {If escalation needed} + +War Room: {Link} +Next Update: {15 minutes from now} + +This incident meets SEV1 criteria and requires executive oversight. + +{Incident Commander contact information} +``` + +### Customer Communication - SEV1 + +**Subject:** Service Disruption - Immediate Action Being Taken + +``` +We are currently experiencing a service disruption affecting {service description}. + +What's Happening: +{Clear, customer-friendly description of the issue} + +Impact: +{What customers are experiencing - be specific} + +What We're Doing: +We detected this issue at {time} and immediately mobilized our engineering team. We are actively working to resolve this issue and will provide updates every 15 minutes. + +Current Actions: +• {Action 1 - customer-friendly description} +• {Action 2 - customer-friendly description} +• {Action 3 - customer-friendly description} + +Workaround: +{If available, provide clear steps} +{If not available: "We are working on alternative solutions and will share them as soon as available."} + +Next Update: {Timestamp} +Status Page: {Link} +Support: {Contact information if different from usual} + +We sincerely apologize for the inconvenience and are committed to resolving this as quickly as possible. + +{Company Name} Team +``` + +### Status Page Update - SEV1 + +**Status:** Major Outage + +``` +{Timestamp} - Investigating + +We are currently investigating reports of {service} being unavailable. Our team has been alerted and is actively investigating the cause. + +Affected Services: {List of affected services} +Impact: {Customer-facing impact description} + +We will provide an update within 15 minutes. +``` + +``` +{Timestamp} - Identified + +We have identified the cause of the {service} outage. 
Our engineering team is implementing a fix. + +Root Cause: {Brief, customer-friendly explanation} +Expected Resolution: {Timeline if known} + +Next update in 15 minutes. +``` + +``` +{Timestamp} - Monitoring + +The fix has been implemented and we are monitoring the service recovery. + +Current Status: {Recovery progress} +Next Steps: {What we're monitoring} + +We expect full service restoration within {timeframe}. +``` + +``` +{Timestamp} - Resolved + +{Service} is now fully operational. We have confirmed that all functionality is working as expected. + +Total Duration: {Duration} +Root Cause: {Brief summary} + +We apologize for the inconvenience. A full post-incident review will be conducted and shared within 24 hours. +``` + +--- + +## SEV2 Templates + +### Team Notification - SEV2 + +**Subject:** ⚠️ [SEV2] {Service} Performance Issues - Response Team Mobilizing + +``` +SEV2 INCIDENT: Performance degradation requiring active response + +Incident Details: +- Service: {Service Name} +- Issue: {Description of performance issue} +- Start Time: {Timestamp} +- Affected Users: {Percentage/description} +- Business Impact: {Impact on business operations} + +Current Status: +{What we know about the issue} + +Response Team: +- Incident Commander: {Name} ({contact}) +- Primary Responder: {Name} ({team}) +- Supporting Teams: {List of engaged teams} + +Immediate Actions: +✓ {Action 1 - completed} +⏳ {Action 2 - in progress} +⏳ {Action 3 - next step} + +Metrics: +- Error Rate: {Current vs normal} +- Response Time: {Current vs normal} +- Throughput: {Current vs normal} + +Communication Plan: +- Internal Updates: Every 30 minutes +- Stakeholder Notification: {If needed} +- Status Page Update: {Planned/not needed} + +Coordination Channel: {Slack channel} +Next Update: {30 minutes from now} + +Incident Commander: {Name} | {Contact} +``` + +### Stakeholder Update - SEV2 + +**Subject:** [SEV2] Service Performance Update - {Service} + +``` +Service Performance Incident Update + 
+Service: {Service Name} +Duration: {Current duration} +Impact: {Description of user impact} + +Current Status: +{Brief status of the incident and response efforts} + +What We Know: +• {Key finding 1} +• {Key finding 2} +• {Key finding 3} + +What We're Doing: +• {Response action 1} +• {Response action 2} +• {Monitoring/verification steps} + +Customer Impact: +{Realistic assessment of what users are experiencing} + +Workaround: +{If available, provide steps} + +Expected Resolution: +{Timeline if known, otherwise "Continuing investigation"} + +Next Update: {30 minutes from now} +Contact: {Incident Commander information} + +This incident is being actively managed and does not currently require escalation. +``` + +### Customer Communication - SEV2 (Optional) + +**Subject:** Temporary Service Performance Issues + +``` +We are currently experiencing performance issues with {service name} that may affect your experience. + +What You Might Notice: +{Specific symptoms users might experience} + +What We're Doing: +Our team identified this issue at {time} and is actively working on a resolution. We expect to have this resolved within {timeframe}. + +Workaround: +{If applicable, provide simple workaround steps} + +We will update our status page at {link} with progress information. + +Thank you for your patience as we work to resolve this issue quickly. + +{Company Name} Support Team +``` + +--- + +## SEV3 Templates + +### Team Assignment - SEV3 + +**Subject:** [SEV3] Issue Assignment - {Component} Issue + +``` +SEV3 Issue Assignment + +Service/Component: {Affected component} +Issue: {Description} +Reported: {Timestamp} +Reporter: {Person/system that reported} + +Issue Details: +{Detailed description of the problem} + +Impact Assessment: +- Affected Users: {Scope} +- Business Impact: {Assessment} +- Urgency: {Business hours response appropriate} + +Assignment: +- Primary: {Engineer name} +- Team: {Responsible team} +- Expected Response: {Within 2-4 hours} + +Investigation Plan: +1. 
{Investigation step 1} +2. {Investigation step 2} +3. {Communication checkpoint} + +Workaround: +{If known, otherwise "Investigating alternatives"} + +This issue will be tracked in {ticket system} as {ticket number}. + +Team Lead: {Name} | {Contact} +``` + +### Status Update - SEV3 + +**Subject:** [SEV3] Progress Update - {Component} + +``` +SEV3 Issue Progress Update + +Issue: {Brief description} +Assigned to: {Engineer/Team} +Investigation Status: {Current progress} + +Findings So Far: +{What has been discovered during investigation} + +Next Steps: +{Planned actions and timeline} + +Impact Update: +{Any changes to scope or urgency} + +Expected Resolution: +{Timeline if known} + +This issue continues to be tracked as SEV3 with no escalation required. + +Contact: {Assigned engineer} | {Team lead} +``` + +--- + +## SEV4 Templates + +### Issue Documentation - SEV4 + +**Subject:** [SEV4] Issue Documented - {Description} + +``` +SEV4 Issue Logged + +Description: {Clear description of the issue} +Reporter: {Name/system} +Date: {Date reported} + +Impact: +{Minimal impact description} + +Priority Assessment: +This issue has been classified as SEV4 and will be addressed in the normal development cycle. + +Assignment: +- Team: {Responsible team} +- Sprint: {Target sprint} +- Estimated Effort: {Story points/hours} + +This issue is tracked as {ticket number} in {system}. 
+ +Product Owner: {Name} +``` + +--- + +## Escalation Templates + +### Severity Escalation + +**Subject:** ESCALATION: {Original Severity} → {New Severity} - {Service} + +``` +SEVERITY ESCALATION NOTIFICATION + +Original Classification: {Original severity} +New Classification: {New severity} +Escalation Time: {Timestamp} +Escalated By: {Name and role} + +Escalation Reasons: +• {Reason 1 - scope expansion/duration/impact} +• {Reason 2} +• {Reason 3} + +Updated Impact: +{New assessment of customer/business impact} + +Updated Response Requirements: +{New response team, communication frequency, etc.} + +Previous Response Actions: +{Summary of actions taken under previous severity} + +New Incident Commander: {If changed} +Updated Communication Plan: {New frequency/recipients} + +All stakeholders should adjust response according to {new severity} protocols. + +Incident Commander: {Name} | {Contact} +``` + +### Management Escalation + +**Subject:** MANAGEMENT ESCALATION: Extended {Severity} Incident - {Service} + +``` +Management Escalation Required + +Incident: {Service} {brief description} +Original Severity: {Severity} +Duration: {Current duration} +Escalation Trigger: {Duration threshold/scope change/customer escalation} + +Current Status: +{Brief status of incident response} + +Challenges Encountered: +• {Challenge 1} +• {Challenge 2} +• {Resource/expertise needs} + +Business Impact: +{Updated assessment of business implications} + +Management Decision Required: +• {Decision 1 - resource allocation/external expertise/communication} +• {Decision 2} + +Recommended Actions: +{Incident Commander's recommendations} + +This escalation follows standard procedures for {trigger type}. 
+ +Incident Commander: {Name} +Contact: {Phone/Slack} +War Room: {Link} +``` + +--- + +## Resolution Templates + +### Resolution Confirmation - All Severities + +**Subject:** RESOLVED: [{Severity}] {Service} Incident - {Brief Description} + +``` +INCIDENT RESOLVED + +Service: {Service Name} +Issue: {Brief description} +Duration: {Total duration} +Resolution Time: {Timestamp} + +Resolution Summary: +{Brief description of how the issue was resolved} + +Root Cause: +{Brief explanation - detailed PIR to follow} + +Impact Summary: +- Users Affected: {Final count/percentage} +- Business Impact: {Final assessment} +- Services Affected: {List} + +Resolution Actions Taken: +• {Action 1} +• {Action 2} +• {Verification steps} + +Monitoring: +We will continue monitoring {service} for {duration} to ensure stability. + +Next Steps: +• Post-incident review scheduled for {date} +• Action items to be tracked in {system} +• Follow-up communication: {If needed} + +Thank you to everyone who participated in the incident response. + +Incident Commander: {Name} +``` + +### Customer Resolution Communication + +**Subject:** Service Restored - Thank You for Your Patience + +``` +Service Update: Issue Resolved + +We're pleased to report that the {service} issues have been fully resolved as of {timestamp}. + +What Was Fixed: +{Customer-friendly explanation of the resolution} + +Duration: +The issue lasted {duration} from {start time} to {end time}. + +What We Learned: +{Brief, high-level takeaway} + +Our Commitment: +We are conducting a thorough review of this incident and will implement improvements to prevent similar issues in the future. A summary of our findings and improvements will be shared {timeframe}. + +We sincerely apologize for any inconvenience this may have caused and appreciate your patience while we worked to resolve the issue. + +If you continue to experience any problems, please contact our support team at {contact information}. 
+ +Thank you, +{Company Name} Team +``` + +--- + +## Template Customization Guidelines + +### Placeholders to Always Replace +- `{Service}` / `{Service Name}` - Specific service or component +- `{Timestamp}` - Specific date/time in consistent format +- `{Name}` / `{Contact}` - Actual names and contact information +- `{Duration}` - Actual time durations +- `{Link}` - Real URLs to war rooms, status pages, etc. + +### Language Guidelines +- Use active voice ("We are investigating" not "The issue is being investigated") +- Be specific about timelines ("within 30 minutes" not "soon") +- Avoid technical jargon in customer communications +- Include empathy in customer-facing messages +- Use consistent terminology throughout incident lifecycle + +### Timing Guidelines +| Severity | Initial Notification | Update Frequency | Resolution Notification | +|----------|---------------------|------------------|------------------------| +| SEV1 | Immediate (< 5 min) | Every 15 minutes | Immediate | +| SEV2 | Within 15 minutes | Every 30 minutes | Within 15 minutes | +| SEV3 | Within 2 hours | At milestones | Within 1 hour | +| SEV4 | Within 1 business day | Weekly | When resolved | + +### Audience-Specific Considerations + +#### Engineering Teams +- Include technical details +- Provide specific metrics and logs +- Include coordination channels +- List specific actions and owners + +#### Executive/Business +- Focus on business impact +- Include customer and revenue implications +- Provide clear timeline and resource needs +- Highlight any external factors (PR, legal, compliance) + +#### Customers +- Use plain language +- Focus on customer impact and workarounds +- Provide realistic timelines +- Include support contact information +- Show empathy and accountability + +--- + +**Last Updated:** February 2026 +**Next Review:** May 2026 +**Owner:** Incident Management Team \ No newline at end of file diff --git a/engineering-team/incident-commander/references/incident_severity_matrix.md 
b/engineering-team/incident-commander/references/incident_severity_matrix.md new file mode 100644 index 0000000..7ab1265 --- /dev/null +++ b/engineering-team/incident-commander/references/incident_severity_matrix.md @@ -0,0 +1,292 @@ +# Incident Severity Classification Matrix + +## Overview + +This document defines the severity classification system used for incident response. The classification determines response requirements, escalation paths, and communication frequency. + +## Severity Levels + +### SEV1 - Critical Outage + +**Definition:** Complete service failure affecting all users or critical business functions + +#### Impact Criteria +- Customer-facing services completely unavailable +- Data loss or corruption affecting users +- Security breaches with customer data exposure +- Revenue-generating systems down +- SLA violations with financial penalties +- > 75% of users affected + +#### Response Requirements +| Metric | Requirement | +|--------|-------------| +| **Response Time** | Immediate (0-5 minutes) | +| **Incident Commander** | Assigned within 5 minutes | +| **War Room** | Established within 10 minutes | +| **Executive Notification** | Within 15 minutes | +| **Public Status Page** | Updated within 15 minutes | +| **Customer Communication** | Within 30 minutes | + +#### Escalation Path +1. **Immediate**: On-call Engineer → Incident Commander +2. **15 minutes**: VP Engineering + Customer Success VP +3. **30 minutes**: CTO +4. 
**60 minutes**: CEO + Full Executive Team + +#### Communication Requirements +- **Frequency**: Every 15 minutes until resolution +- **Channels**: PagerDuty, Phone, Slack, Email, Status Page +- **Recipients**: All engineering, executives, customer success +- **Template**: SEV1 Executive Alert Template + +--- + +### SEV2 - Major Impact + +**Definition:** Significant degradation affecting subset of users or non-critical functions + +#### Impact Criteria +- Partial service degradation (25-75% of users affected) +- Performance issues causing user frustration +- Non-critical features unavailable +- Internal tools impacting productivity +- Data inconsistencies not affecting user experience +- API errors affecting integrations + +#### Response Requirements +| Metric | Requirement | +|--------|-------------| +| **Response Time** | 15 minutes | +| **Incident Commander** | Assigned within 30 minutes | +| **Status Page Update** | Within 30 minutes | +| **Stakeholder Notification** | Within 1 hour | +| **Team Assembly** | Within 30 minutes | + +#### Escalation Path +1. **Immediate**: On-call Engineer → Team Lead +2. **30 minutes**: Engineering Manager +3. **2 hours**: VP Engineering +4. 
**4 hours**: CTO (if unresolved) + +#### Communication Requirements +- **Frequency**: Every 30 minutes during active response +- **Channels**: PagerDuty, Slack, Email +- **Recipients**: Engineering team, product team, relevant stakeholders +- **Template**: SEV2 Major Impact Template + +--- + +### SEV3 - Minor Impact + +**Definition:** Limited impact with workarounds available + +#### Impact Criteria +- Single feature or component affected +- < 25% of users impacted +- Workarounds available +- Performance degradation not significantly impacting UX +- Non-urgent monitoring alerts +- Development/test environment issues + +#### Response Requirements +| Metric | Requirement | +|--------|-------------| +| **Response Time** | 2 hours (business hours) | +| **After Hours Response** | Next business day | +| **Team Assignment** | Within 4 hours | +| **Status Page Update** | Optional | +| **Internal Notification** | Within 2 hours | + +#### Escalation Path +1. **Immediate**: Assigned Engineer +2. **4 hours**: Team Lead +3. 
**1 business day**: Engineering Manager (if needed) + +#### Communication Requirements +- **Frequency**: At key milestones only +- **Channels**: Slack, Email +- **Recipients**: Assigned team, team lead +- **Template**: SEV3 Minor Impact Template + +--- + +### SEV4 - Low Impact + +**Definition:** Minimal impact, cosmetic issues, or planned maintenance + +#### Impact Criteria +- Cosmetic bugs +- Documentation issues +- Logging or monitoring gaps +- Performance issues with no user impact +- Development/test environment issues +- Feature requests or enhancements + +#### Response Requirements +| Metric | Requirement | +|--------|-------------| +| **Response Time** | 1-2 business days | +| **Assignment** | Next sprint planning | +| **Tracking** | Standard ticket system | +| **Escalation** | None required | + +#### Communication Requirements +- **Frequency**: Standard development cycle updates +- **Channels**: Ticket system +- **Recipients**: Product owner, assigned developer +- **Template**: Standard issue template + +## Classification Guidelines + +### User Impact Assessment + +| Impact Scope | Description | Typical Severity | +|--------------|-------------|------------------| +| **All or Most Users** | > 75% of users affected | SEV1 | +| **Major Subset** | 50-75% of users affected | SEV1/SEV2 | +| **Significant Subset** | 25-50% of users affected | SEV2 | +| **Limited Users** | 5-25% of users affected | SEV2/SEV3 | +| **Few Users** | < 5% of users affected | SEV3/SEV4 | +| **No User Impact** | Internal only | SEV4 | + +### Business Impact Assessment + +| Business Impact | Description | Severity Boost (toward SEV1) | +|-----------------|-------------|----------------| +| **Revenue Loss** | Direct revenue impact | +1 severity level | +| **SLA Breach** | Contract violations | +1 severity level | +| **Regulatory** | Compliance implications | +1 severity level | +| **Brand Damage** | Public-facing issues | +1 severity level | +| **Security** | Data or system security | +2 severity levels | + +### 
Duration Considerations + +| Duration | Impact on Classification | +|----------|--------------------------| +| **< 15 minutes** | May reduce severity by 1 level | +| **15-60 minutes** | Standard classification | +| **1-4 hours** | May increase severity by 1 level | +| **> 4 hours** | Significant severity increase | + +## Decision Tree + +``` +1. Is this a security incident with data exposure? + → YES: SEV1 (regardless of user count) + → NO: Continue to step 2 + +2. Are revenue-generating services completely down? + → YES: SEV1 + → NO: Continue to step 3 + +3. What percentage of users are affected? + → > 75%: SEV1 + → 25-75%: SEV2 + → 5-25%: SEV3 + → < 5%: SEV4 + +4. Apply business impact modifiers +5. Consider duration factors +6. When in doubt, err on higher severity +``` + +## Examples + +### SEV1 Examples +- Payment processing system completely down +- All user authentication failing +- Database corruption causing data loss +- Security breach with customer data exposed +- Website returning 500 errors for all users + +### SEV2 Examples +- Payment processing slow (30-second delays) +- Search functionality returning incomplete results +- API rate limits causing partner integration issues +- Dashboard displaying stale data (> 1 hour old) +- Mobile app crashing for 40% of users + +### SEV3 Examples +- Single feature in admin panel not working +- Email notifications delayed by 1 hour +- Non-critical API endpoint returning errors +- Cosmetic UI bug in settings page +- Development environment deployment failing + +### SEV4 Examples +- Typo in help documentation +- Log format change needed for analysis +- Non-critical performance optimization +- Internal tool enhancement request +- Test data cleanup needed + +## Escalation Triggers + +### Automatic Escalation +- SEV1 incidents automatically escalate every 30 minutes if unresolved +- SEV2 incidents escalate after 2 hours without significant progress +- Any incident with expanding scope increases severity +- Customer 
escalation to support triggers severity review + +### Manual Escalation +- Incident Commander can escalate at any time +- Technical leads can request escalation +- Business stakeholders can request severity review +- External factors (media attention, regulatory) trigger escalation + +## Communication Templates + +### SEV1 Executive Alert +``` +Subject: 🚨 CRITICAL INCIDENT - [Service] Complete Outage + +URGENT: Customer-facing service outage requiring immediate attention + +Service: [Service Name] +Start Time: [Timestamp] +Impact: [Description of customer impact] +Estimated Affected Users: [Number/Percentage] +Business Impact: [Revenue/SLA/Brand implications] + +Incident Commander: [Name] ([Contact]) +Response Team: [Team members engaged] + +Current Status: [Brief status update] +Next Update: [Timestamp - 15 minutes from now] +War Room: [Bridge/Chat link] + +This is a customer-impacting incident requiring executive awareness. +``` + +### SEV2 Major Impact +``` +Subject: ⚠️ [SEV2] [Service] - Major Performance Impact + +Major service degradation affecting user experience + +Service: [Service Name] +Start Time: [Timestamp] +Impact: [Description of user impact] +Scope: [Affected functionality/users] + +Response Team: [Team Lead] + [Team members] +Status: [Current mitigation efforts] +Workaround: [If available] + +Next Update: 30 minutes +Status Page: [Link if updated] +``` + +## Review and Updates + +This severity matrix should be reviewed quarterly and updated based on: +- Incident response learnings +- Business priority changes +- Service architecture evolution +- Regulatory requirement changes +- Customer feedback and SLA updates + +**Last Updated:** February 2026 +**Next Review:** May 2026 +**Owner:** Engineering Leadership \ No newline at end of file diff --git a/engineering-team/incident-commander/references/rca_frameworks_guide.md b/engineering-team/incident-commander/references/rca_frameworks_guide.md new file mode 100644 index 0000000..4c62fc8 --- /dev/null 
+++ b/engineering-team/incident-commander/references/rca_frameworks_guide.md @@ -0,0 +1,562 @@ +# Root Cause Analysis (RCA) Frameworks Guide + +## Overview + +This guide provides detailed instructions for applying various Root Cause Analysis frameworks during Post-Incident Reviews. Each framework offers a different perspective and approach to identifying underlying causes of incidents. + +## Framework Selection Guidelines + +| Incident Type | Recommended Framework | Why | +|---------------|----------------------|-----| +| **Process Failure** | 5 Whys | Simple, direct cause-effect chain | +| **Complex System Failure** | Fishbone + Timeline | Multiple contributing factors | +| **Human Error** | Fishbone | Systematic analysis of contributing factors | +| **Extended Incidents** | Timeline Analysis | Understanding decision points | +| **High-Risk Incidents** | Bow Tie | Comprehensive barrier analysis | +| **Recurring Issues** | 5 Whys + Fishbone | Deep dive into systemic issues | + +--- + +## 5 Whys Analysis Framework + +### Purpose +Iteratively drill down through cause-effect relationships to identify root causes. + +### When to Use +- Simple, linear cause-effect chains +- Time-pressured analysis +- Process-related failures +- Individual component failures + +### Process Steps + +#### Step 1: Problem Statement +Write a clear, specific problem statement. + +**Good Example:** +> "The payment API returned 500 errors for 2 hours on March 15, affecting 80% of checkout attempts." + +**Poor Example:** +> "The system was broken." + +#### Step 2: First Why +Ask why the problem occurred. Focus on immediate, observable causes. + +**Example:** +- **Why 1:** Why did the payment API return 500 errors? +- **Answer:** The database connection pool was exhausted. + +#### Step 3: Subsequent Whys +For each answer, ask "why" again. Continue until you reach a root cause. + +**Example Chain:** +- **Why 2:** Why was the database connection pool exhausted? 
+- **Answer:** The application was creating more connections than usual. + +- **Why 3:** Why was the application creating more connections? +- **Answer:** A new feature wasn't properly closing connections. + +- **Why 4:** Why wasn't the feature properly closing connections? +- **Answer:** Code review missed the connection leak pattern. + +- **Why 5:** Why did code review miss this pattern? +- **Answer:** We don't have automated checks for connection pooling best practices. + +#### Step 4: Validation +Verify that addressing the root cause would prevent the original problem. + +### Best Practices + +1. **Ask at least 3 "whys"** - Surface causes are rarely root causes +2. **Focus on process failures, not people** - Avoid blame, focus on system improvements +3. **Use evidence** - Support each answer with data or observations +4. **Consider multiple paths** - Some problems have multiple root causes +5. **Test the logic** - Work backwards from root cause to problem + +### Common Pitfalls + +- **Stopping too early** - First few whys often reveal symptoms, not causes +- **Single-cause assumption** - Complex systems often have multiple contributing factors +- **Blame focus** - Focusing on individual mistakes rather than system failures +- **Vague answers** - Use specific, actionable answers + +### 5 Whys Template + +```markdown +## 5 Whys Analysis + +**Problem Statement:** [Clear description of the incident] + +**Why 1:** [First why question] +**Answer:** [Specific, evidence-based answer] +**Evidence:** [Supporting data, logs, observations] + +**Why 2:** [Second why question] +**Answer:** [Specific answer based on Why 1] +**Evidence:** [Supporting evidence] + +[Continue for 3-7 iterations] + +**Root Cause(s) Identified:** +1. [Primary root cause] +2. 
[Secondary root cause if applicable] + +**Validation:** [Confirm that addressing root causes would prevent recurrence] +``` + +--- + +## Fishbone (Ishikawa) Diagram Framework + +### Purpose +Systematically analyze potential causes across multiple categories to identify contributing factors. + +### When to Use +- Complex incidents with multiple potential causes +- When human factors are suspected +- Systemic or organizational issues +- When 5 Whys doesn't reveal clear root causes + +### Categories + +#### People (Human Factors) +- **Training and Skills** + - Insufficient training on new systems + - Lack of domain expertise + - Skill gaps in team + - Knowledge not shared across team + +- **Communication** + - Poor communication between teams + - Unclear responsibilities + - Information not reaching right people + - Language/cultural barriers + +- **Decision Making** + - Decisions made under pressure + - Insufficient information for decisions + - Risk assessment inadequate + - Approval processes bypassed + +#### Process (Procedures and Workflows) +- **Documentation** + - Outdated procedures + - Missing runbooks + - Unclear instructions + - Process not documented + +- **Change Management** + - Inadequate change review + - Rushed deployments + - Insufficient testing + - Rollback procedures unclear + +- **Review and Approval** + - Code review gaps + - Architecture review skipped + - Security review insufficient + - Performance review missing + +#### Technology (Systems and Tools) +- **Architecture** + - Single points of failure + - Insufficient redundancy + - Scalability limitations + - Tight coupling between systems + +- **Monitoring and Alerting** + - Missing monitoring + - Alert fatigue + - Inadequate thresholds + - Poor alert routing + +- **Tools and Automation** + - Manual processes prone to error + - Tool limitations + - Automation gaps + - Integration issues + +#### Environment (External Factors) +- **Infrastructure** + - Hardware failures + - Network issues + - 
Capacity limitations + - Geographic dependencies + +- **Dependencies** + - Third-party service failures + - External API changes + - Vendor issues + - Supply chain problems + +- **External Pressure** + - Time pressure from business + - Resource constraints + - Regulatory changes + - Market conditions + +### Process Steps + +#### Step 1: Define the Problem +Place the incident at the "head" of the fishbone diagram. + +#### Step 2: Brainstorm Causes +For each category, brainstorm potential contributing factors. + +#### Step 3: Drill Down +For each factor, ask what caused that factor (sub-causes). + +#### Step 4: Identify Primary Causes +Mark the most likely contributing factors based on evidence. + +#### Step 5: Validate +Gather evidence to support or refute each suspected cause. + +### Fishbone Template + +```markdown +## Fishbone Analysis + +**Problem:** [Incident description] + +### People +**Training/Skills:** +- [Factor 1]: [Evidence/likelihood] +- [Factor 2]: [Evidence/likelihood] + +**Communication:** +- [Factor 1]: [Evidence/likelihood] + +**Decision Making:** +- [Factor 1]: [Evidence/likelihood] + +### Process +**Documentation:** +- [Factor 1]: [Evidence/likelihood] + +**Change Management:** +- [Factor 1]: [Evidence/likelihood] + +**Review/Approval:** +- [Factor 1]: [Evidence/likelihood] + +### Technology +**Architecture:** +- [Factor 1]: [Evidence/likelihood] + +**Monitoring:** +- [Factor 1]: [Evidence/likelihood] + +**Tools:** +- [Factor 1]: [Evidence/likelihood] + +### Environment +**Infrastructure:** +- [Factor 1]: [Evidence/likelihood] + +**Dependencies:** +- [Factor 1]: [Evidence/likelihood] + +**External Pressure:** +- [Factor 1]: [Evidence/likelihood] + +### Primary Contributing Factors +1. [Factor with highest evidence/impact] +2. [Second most significant factor] +3. 
[Third most significant factor] + +### Root Cause Hypothesis +[Synthesized explanation of how factors combined to cause incident] +``` + +--- + +## Timeline Analysis Framework + +### Purpose +Analyze the chronological sequence of events to identify decision points, missed opportunities, and process gaps. + +### When to Use +- Extended incidents (> 1 hour) +- Complex multi-phase incidents +- When response effectiveness is questioned +- Communication or coordination failures + +### Analysis Dimensions + +#### Detection Analysis +- **Time to Detection:** How long from onset to first alert? +- **Detection Method:** How was the incident first identified? +- **Alert Effectiveness:** Were the right people notified quickly? +- **False Negatives:** What signals were missed? + +#### Response Analysis +- **Time to Response:** How long from detection to first response action? +- **Escalation Timing:** Were escalations timely and appropriate? +- **Resource Mobilization:** How quickly were the right people engaged? +- **Decision Points:** What key decisions were made and when? + +#### Communication Analysis +- **Internal Communication:** How effective was team coordination? +- **External Communication:** Were stakeholders informed appropriately? +- **Communication Gaps:** Where did information flow break down? +- **Update Frequency:** Were updates provided at appropriate intervals? + +#### Resolution Analysis +- **Mitigation Strategy:** Was the chosen approach optimal? +- **Alternative Paths:** What other options were considered? +- **Resource Allocation:** Were resources used effectively? +- **Verification:** How was resolution confirmed? + +### Process Steps + +#### Step 1: Event Reconstruction +Create comprehensive timeline with all available events. + +#### Step 2: Phase Identification +Identify distinct phases (detection, triage, escalation, mitigation, resolution). + +#### Step 3: Gap Analysis +Identify time gaps and analyze their causes. 
+ +#### Step 4: Decision Point Analysis +Examine key decision points and alternative paths. + +#### Step 5: Effectiveness Assessment +Evaluate the overall effectiveness of the response. + +### Timeline Template + +```markdown +## Timeline Analysis + +### Incident Phases +1. **Detection** ([start] - [end], [duration]) +2. **Triage** ([start] - [end], [duration]) +3. **Escalation** ([start] - [end], [duration]) +4. **Mitigation** ([start] - [end], [duration]) +5. **Resolution** ([start] - [end], [duration]) + +### Key Decision Points +**[Timestamp]:** [Decision made] +- **Context:** [Situation at time of decision] +- **Alternatives:** [Other options considered] +- **Outcome:** [Result of decision] +- **Assessment:** [Was this optimal?] + +### Communication Timeline +**[Timestamp]:** [Communication event] +- **Channel:** [Slack/Email/Phone/etc.] +- **Audience:** [Who was informed] +- **Content:** [What was communicated] +- **Effectiveness:** [Assessment] + +### Gaps and Delays +**[Time Period]:** [Description of gap] +- **Duration:** [Length of gap] +- **Cause:** [Why did gap occur] +- **Impact:** [Effect on incident response] + +### Response Effectiveness +**Strengths:** +- [What went well] +- [Effective decisions/actions] + +**Weaknesses:** +- [What could be improved] +- [Missed opportunities] + +### Root Causes from Timeline +1. [Process-based root cause] +2. [Communication-based root cause] +3. [Decision-making root cause] +``` + +--- + +## Bow Tie Analysis Framework + +### Purpose +Analyze both preventive measures (left side) and protective measures (right side) around an incident. + +### When to Use +- High-severity incidents (SEV1) +- Security incidents +- Safety-critical systems +- When comprehensive barrier analysis is needed + +### Components + +#### Hazards +What conditions create the potential for incidents? 
+ +**Examples:** +- High traffic loads +- Software deployments +- Human interactions with critical systems +- Third-party dependencies + +#### Top Event +What actually went wrong? This is the center of the bow tie. + +**Examples:** +- "Database became unresponsive" +- "Payment processing failed" +- "User authentication service crashed" + +#### Threats (Left Side) +What specific causes could lead to the top event? + +**Examples:** +- Code defects in new deployment +- Database connection pool exhaustion +- Network connectivity issues +- DDoS attack + +#### Consequences (Right Side) +What are the potential impacts of the top event? + +**Examples:** +- Revenue loss +- Customer churn +- Regulatory violations +- Brand damage +- Data loss + +#### Barriers +What controls exist (or could exist) to prevent threats or mitigate consequences? + +**Preventive Barriers (Left Side):** +- Code reviews +- Automated testing +- Load testing +- Input validation +- Rate limiting + +**Protective Barriers (Right Side):** +- Circuit breakers +- Failover systems +- Backup procedures +- Customer communication +- Rollback capabilities + +### Process Steps + +#### Step 1: Define the Top Event +Clearly state what went wrong. + +#### Step 2: Identify Threats +Brainstorm all possible causes that could lead to the top event. + +#### Step 3: Identify Consequences +List all potential impacts of the top event. + +#### Step 4: Map Existing Barriers +Identify current controls for each threat and consequence. + +#### Step 5: Assess Barrier Effectiveness +Evaluate how well each barrier worked (or failed). + +#### Step 6: Recommend Additional Barriers +Identify new controls needed to prevent recurrence. + +### Bow Tie Template + +```markdown +## Bow Tie Analysis + +**Top Event:** [What went wrong] + +### Threats (Potential Causes) +1. **[Threat 1]** + - Likelihood: [High/Medium/Low] + - Current Barriers: [Preventive controls] + - Barrier Effectiveness: [Assessment] + +2. 
**[Threat 2]** + - Likelihood: [High/Medium/Low] + - Current Barriers: [Preventive controls] + - Barrier Effectiveness: [Assessment] + +### Consequences (Potential Impacts) +1. **[Consequence 1]** + - Severity: [High/Medium/Low] + - Current Barriers: [Protective controls] + - Barrier Effectiveness: [Assessment] + +2. **[Consequence 2]** + - Severity: [High/Medium/Low] + - Current Barriers: [Protective controls] + - Barrier Effectiveness: [Assessment] + +### Barrier Analysis +**Effective Barriers:** +- [Barrier that worked well] +- [Why it was effective] + +**Failed Barriers:** +- [Barrier that failed] +- [Why it failed] +- [How to improve] + +**Missing Barriers:** +- [Needed preventive control] +- [Needed protective control] + +### Recommendations +**Preventive Measures:** +1. [New barrier to prevent threat] +2. [Improvement to existing barrier] + +**Protective Measures:** +1. [New barrier to mitigate consequence] +2. [Improvement to existing barrier] +``` + +--- + +## Framework Comparison + +| Framework | Time Required | Complexity | Best For | Output | +|-----------|---------------|------------|----------|---------| +| **5 Whys** | 30-60 minutes | Low | Simple, linear causes | Clear cause chain | +| **Fishbone** | 1-2 hours | Medium | Complex, multi-factor | Comprehensive factor map | +| **Timeline** | 2-3 hours | Medium | Extended incidents | Process improvements | +| **Bow Tie** | 2-4 hours | High | High-risk incidents | Barrier strategy | + +## Combining Frameworks + +### 5 Whys + Fishbone +Use 5 Whys for initial analysis, then Fishbone to explore contributing factors. + +### Timeline + 5 Whys +Use Timeline to identify key decision points, then 5 Whys on critical failures. + +### Fishbone + Bow Tie +Use Fishbone to identify causes, then Bow Tie to develop comprehensive prevention strategy. 
+ +## Quality Checklist + +- [ ] Root causes address systemic issues, not symptoms +- [ ] Analysis is backed by evidence, not assumptions +- [ ] Multiple perspectives considered (technical, process, human) +- [ ] Recommendations are specific and actionable +- [ ] Analysis focuses on prevention, not blame +- [ ] Findings are validated against incident timeline +- [ ] Contributing factors are prioritized by impact +- [ ] Root causes link clearly to preventive actions + +## Common Anti-Patterns + +- **Human Error as Root Cause** - Dig deeper into why human error occurred +- **Single Root Cause** - Complex systems usually have multiple contributing factors +- **Technology-Only Focus** - Consider process and organizational factors +- **Blame Assignment** - Focus on system improvements, not individual fault +- **Generic Recommendations** - Provide specific, measurable actions +- **Surface-Level Analysis** - Ensure you've reached true root causes + +--- + +**Last Updated:** February 2026 +**Next Review:** August 2026 +**Owner:** SRE Team + Engineering Leadership \ No newline at end of file diff --git a/engineering-team/incident-commander/scripts/incident_classifier.py b/engineering-team/incident-commander/scripts/incident_classifier.py new file mode 100644 index 0000000..8814e99 --- /dev/null +++ b/engineering-team/incident-commander/scripts/incident_classifier.py @@ -0,0 +1,914 @@ +#!/usr/bin/env python3 +""" +Incident Classifier + +Analyzes incident descriptions and outputs severity levels, recommended response teams, +initial actions, and communication templates. + +This tool uses pattern matching and keyword analysis to classify incidents according to +SEV1-4 criteria and provide structured response guidance. 
+ +Usage: + python incident_classifier.py --input incident.json + echo "Database is down" | python incident_classifier.py --format text + python incident_classifier.py --interactive +""" + +import argparse +import json +import sys +import re +from datetime import datetime, timezone +from typing import Dict, List, Tuple, Optional, Any + + +class IncidentClassifier: + """ + Classifies incidents based on description, impact metrics, and business context. + Provides severity assessment, team recommendations, and response templates. + """ + + def __init__(self): + """Initialize the classifier with rules and templates.""" + self.severity_rules = self._load_severity_rules() + self.team_mappings = self._load_team_mappings() + self.communication_templates = self._load_communication_templates() + self.action_templates = self._load_action_templates() + + def _load_severity_rules(self) -> Dict[str, Dict]: + """Load severity classification rules and keywords.""" + return { + "sev1": { + "keywords": [ + "down", "outage", "offline", "unavailable", "crashed", "failed", + "critical", "emergency", "dead", "broken", "timeout", "500 error", + "data loss", "corrupted", "breach", "security incident", + "revenue impact", "customer facing", "all users", "complete failure" + ], + "impact_indicators": [ + "100%", "all users", "entire service", "complete", + "revenue loss", "sla violation", "customer churn", + "security breach", "data corruption", "regulatory" + ], + "duration_threshold": 0, # Immediate classification + "response_time": 300, # 5 minutes + "description": "Complete service failure affecting all users or critical business functions" + }, + "sev2": { + "keywords": [ + "degraded", "slow", "performance", "errors", "partial", + "intermittent", "high latency", "timeouts", "some users", + "feature broken", "api errors", "database slow" + ], + "impact_indicators": [ + "50%", "25-75%", "many users", "significant", + "performance degradation", "feature unavailable", + "support tickets", 
"user complaints" + ], + "duration_threshold": 300, # 5 minutes + "response_time": 900, # 15 minutes + "description": "Significant degradation affecting subset of users or non-critical functions" + }, + "sev3": { + "keywords": [ + "minor", "cosmetic", "single feature", "workaround available", + "edge case", "rare issue", "non-critical", "internal tool", + "logging issue", "monitoring gap" + ], + "impact_indicators": [ + "<25%", "few users", "limited impact", + "workaround exists", "internal only", + "development environment" + ], + "duration_threshold": 3600, # 1 hour + "response_time": 7200, # 2 hours + "description": "Limited impact with workarounds available" + }, + "sev4": { + "keywords": [ + "cosmetic", "documentation", "typo", "minor bug", + "enhancement", "nice to have", "low priority", + "test environment", "dev tools" + ], + "impact_indicators": [ + "no impact", "cosmetic only", "documentation", + "development", "testing", "non-production" + ], + "duration_threshold": 86400, # 24 hours + "response_time": 172800, # 2 days + "description": "Minimal impact, cosmetic issues, or planned maintenance" + } + } + + def _load_team_mappings(self) -> Dict[str, List[str]]: + """Load team assignment rules based on service/component keywords.""" + return { + "database": ["Database Team", "SRE", "Backend Engineering"], + "frontend": ["Frontend Team", "UX Engineering", "Product Engineering"], + "api": ["API Team", "Backend Engineering", "Platform Team"], + "infrastructure": ["SRE", "DevOps", "Platform Team"], + "security": ["Security Team", "SRE", "Compliance Team"], + "network": ["Network Engineering", "SRE", "Infrastructure Team"], + "authentication": ["Identity Team", "Security Team", "Backend Engineering"], + "payment": ["Payments Team", "Finance Engineering", "Compliance Team"], + "mobile": ["Mobile Team", "API Team", "QA Engineering"], + "monitoring": ["SRE", "Platform Team", "DevOps"], + "deployment": ["DevOps", "Release Engineering", "SRE"], + "data": ["Data 
Engineering", "Analytics Team", "Backend Engineering"] + } + + def _load_communication_templates(self) -> Dict[str, Dict]: + """Load communication templates for each severity level.""" + return { + "sev1": { + "subject": "🚨 [SEV1] {service} - {brief_description}", + "body": """CRITICAL INCIDENT ALERT + +Incident Details: +- Start Time: {timestamp} +- Severity: SEV1 - Critical Outage +- Service: {service} +- Impact: {impact_description} +- Current Status: Investigating + +Customer Impact: +{customer_impact} + +Response Team: +- Incident Commander: TBD (assigning now) +- Primary Responder: {primary_responder} +- SMEs Required: {subject_matter_experts} + +Immediate Actions Taken: +{initial_actions} + +War Room: {war_room_link} +Status Page: Will be updated within 15 minutes +Next Update: {next_update_time} + +This is a customer-impacting incident requiring immediate attention. + +{incident_commander_contact}""" + }, + "sev2": { + "subject": "⚠️ [SEV2] {service} - {brief_description}", + "body": """MAJOR INCIDENT NOTIFICATION + +Incident Details: +- Start Time: {timestamp} +- Severity: SEV2 - Major Impact +- Service: {service} +- Impact: {impact_description} +- Current Status: Investigating + +User Impact: +{customer_impact} + +Response Team: +- Primary Responder: {primary_responder} +- Supporting Team: {supporting_teams} +- Incident Commander: {incident_commander} + +Initial Assessment: +{initial_assessment} + +Next Steps: +{next_steps} + +Updates will be provided every 30 minutes. 
+Status page: {status_page_link} + +{contact_information}""" + }, + "sev3": { + "subject": "ℹ️ [SEV3] {service} - {brief_description}", + "body": """MINOR INCIDENT NOTIFICATION + +Incident Details: +- Start Time: {timestamp} +- Severity: SEV3 - Minor Impact +- Service: {service} +- Impact: {impact_description} +- Status: {current_status} + +Details: +{incident_details} + +Assigned Team: {assigned_team} +Estimated Resolution: {eta} + +Workaround: {workaround} + +This incident has limited customer impact and is being addressed during normal business hours. + +{team_contact}""" + }, + "sev4": { + "subject": "[SEV4] {service} - {brief_description}", + "body": """LOW PRIORITY ISSUE + +Issue Details: +- Reported: {timestamp} +- Severity: SEV4 - Low Impact +- Component: {service} +- Description: {description} + +This issue will be addressed in the normal development cycle. + +Assigned to: {assigned_team} +Target Resolution: {target_date} + +{standard_contact}""" + } + } + + def _load_action_templates(self) -> Dict[str, List[Dict]]: + """Load initial action templates for each severity level.""" + return { + "sev1": [ + { + "action": "Establish incident command", + "priority": 1, + "timeout_minutes": 5, + "description": "Page incident commander and establish war room" + }, + { + "action": "Create incident ticket", + "priority": 1, + "timeout_minutes": 2, + "description": "Create tracking ticket with all known details" + }, + { + "action": "Update status page", + "priority": 2, + "timeout_minutes": 15, + "description": "Post initial status page update acknowledging incident" + }, + { + "action": "Notify executives", + "priority": 2, + "timeout_minutes": 15, + "description": "Alert executive team of customer-impacting outage" + }, + { + "action": "Engage subject matter experts", + "priority": 3, + "timeout_minutes": 10, + "description": "Page relevant SMEs based on affected systems" + }, + { + "action": "Begin technical investigation", + "priority": 3, + "timeout_minutes": 5, 
+ "description": "Start technical diagnosis and mitigation efforts" + } + ], + "sev2": [ + { + "action": "Assign incident commander", + "priority": 1, + "timeout_minutes": 30, + "description": "Assign IC and establish coordination channel" + }, + { + "action": "Create incident tracking", + "priority": 1, + "timeout_minutes": 5, + "description": "Create incident ticket with details and timeline" + }, + { + "action": "Assess customer impact", + "priority": 2, + "timeout_minutes": 15, + "description": "Determine scope and severity of user impact" + }, + { + "action": "Engage response team", + "priority": 2, + "timeout_minutes": 30, + "description": "Page appropriate technical responders" + }, + { + "action": "Begin investigation", + "priority": 3, + "timeout_minutes": 15, + "description": "Start technical analysis and debugging" + }, + { + "action": "Plan status communication", + "priority": 3, + "timeout_minutes": 30, + "description": "Determine if status page update is needed" + } + ], + "sev3": [ + { + "action": "Assign to appropriate team", + "priority": 1, + "timeout_minutes": 120, + "description": "Route to team with relevant expertise" + }, + { + "action": "Create tracking ticket", + "priority": 1, + "timeout_minutes": 30, + "description": "Document issue in standard ticketing system" + }, + { + "action": "Assess scope and impact", + "priority": 2, + "timeout_minutes": 60, + "description": "Understand full scope of the issue" + }, + { + "action": "Identify workarounds", + "priority": 2, + "timeout_minutes": 60, + "description": "Find temporary solutions if possible" + }, + { + "action": "Plan resolution approach", + "priority": 3, + "timeout_minutes": 120, + "description": "Develop plan for permanent fix" + } + ], + "sev4": [ + { + "action": "Create backlog item", + "priority": 1, + "timeout_minutes": 1440, # 24 hours + "description": "Add to team backlog for future sprint planning" + }, + { + "action": "Triage and prioritize", + "priority": 2, + 
"timeout_minutes": 2880, # 2 days + "description": "Review and prioritize against other work" + }, + { + "action": "Assign owner", + "priority": 3, + "timeout_minutes": 4320, # 3 days + "description": "Assign to appropriate developer when capacity allows" + } + ] + } + + def classify_incident(self, incident_data: Dict[str, Any]) -> Dict[str, Any]: + """ + Main classification method that analyzes incident data and returns + comprehensive response recommendations. + + Args: + incident_data: Dictionary containing incident information + + Returns: + Dictionary with classification results and recommendations + """ + # Extract key information from incident data + description = incident_data.get('description', '').lower() + affected_users = incident_data.get('affected_users', '0%') + business_impact = incident_data.get('business_impact', 'unknown') + service = incident_data.get('service', 'unknown service') + duration = incident_data.get('duration_minutes', 0) + + # Classify severity + severity = self._classify_severity(description, affected_users, business_impact, duration) + + # Determine response teams + response_teams = self._determine_teams(description, service) + + # Generate initial actions + initial_actions = self._generate_initial_actions(severity, incident_data) + + # Create communication template + communication = self._generate_communication(severity, incident_data) + + # Calculate response timeline + timeline = self._generate_timeline(severity) + + # Determine escalation path + escalation = self._determine_escalation(severity, business_impact) + + return { + "classification": { + "severity": severity.upper(), + "confidence": self._calculate_confidence(description, affected_users, business_impact), + "reasoning": self._explain_classification(severity, description, affected_users), + "timestamp": datetime.now(timezone.utc).isoformat() + }, + "response": { + "primary_team": response_teams[0] if response_teams else "General Engineering", + "supporting_teams": 
def _classify_severity(self, description: str, affected_users: str,
                       business_impact: str, duration: int) -> str:
    """Score the incident against every severity level and return the winner.

    Args:
        description: Incident description text (matched case-insensitively).
        affected_users: Free-form user impact, e.g. "80%" or "all users".
        business_impact: Impact label such as "high", "medium" or "low".
        duration: Incident duration in minutes (the ``duration_minutes`` input).

    Returns:
        The severity key ("sev1".."sev4") with the highest score. On a tie the
        more severe level wins, because ``max`` keeps the first maximum and the
        score table is ordered from sev1 to sev4.
    """
    scores = {"sev1": 0, "sev2": 0, "sev3": 0, "sev4": 0}

    # Values originate from user-supplied JSON and may be numbers or null;
    # normalise once so the string operations below cannot raise.
    description = str(description or "").lower()
    users_text = str(affected_users if affected_users is not None else "").lower()
    impact = str(business_impact or "").strip().lower()

    # Keyword hits are worth 2 points, impact-indicator hits 3 points.
    for severity, rules in self.severity_rules.items():
        scores[severity] += 2 * sum(kw in description for kw in rules["keywords"])
        for indicator in rules["impact_indicators"]:
            needle = indicator.lower()
            if needle in description or needle in users_text:
                scores[severity] += 3

    # Business impact weighting.
    if impact in ("critical", "high", "severe"):
        scores["sev1"] += 5
        scores["sev2"] += 3
    elif impact in ("medium", "moderate"):
        scores["sev2"] += 3
        scores["sev3"] += 2
    elif impact in ("low", "minimal"):
        scores["sev3"] += 2
        scores["sev4"] += 3

    # User impact: the first integer in a percentage string picks the bucket.
    if "%" in users_text:
        digits = re.search(r"\d+", users_text)
        if digits:
            percentage = float(digits.group())
            if percentage >= 75:
                scores["sev1"] += 4
            elif percentage >= 25:
                scores["sev2"] += 4
            elif percentage >= 5:
                scores["sev3"] += 3
            else:
                scores["sev4"] += 2

    # Duration is expressed in minutes, so the thresholds are 60 and 30
    # (comparing minutes against 3600/1800 would never trigger in practice).
    try:
        minutes = float(duration)
    except (TypeError, ValueError):
        minutes = 0.0
    if minutes >= 60:  # 1 hour
        scores["sev1"] += 2
        scores["sev2"] += 1
    elif minutes >= 30:  # 30 minutes
        scores["sev2"] += 2
        scores["sev3"] += 1

    return max(scores, key=scores.get)


def _determine_teams(self, description: str, service: str) -> List[str]:
    """Return the responding teams for the affected components.

    Teams are collected in ``team_mappings`` order without duplicates, so the
    first entry (used by callers as the primary team) is stable across runs;
    a set would make that choice depend on string hash randomisation.
    """
    text_to_analyze = f"{description} {service}".lower()

    teams: List[str] = []
    for component, team_list in self.team_mappings.items():
        if component in text_to_analyze:
            for team in team_list:
                if team not in teams:
                    teams.append(team)

    # Default teams if no component keyword matched.
    if not teams:
        teams = ["General Engineering", "SRE"]

    return teams


def _generate_initial_actions(self, severity: str, incident_data: Dict) -> List[Dict]:
    """Return the action checklist for *severity*, each tagged with an urgency.

    Every action is copied before the ``urgency`` key is added so the shared
    templates in ``self.action_templates`` are never mutated.
    """
    urgency = {"sev1": "immediate", "sev2": "high", "sev3": "normal"}.get(severity, "low")
    return [{**action, "urgency": urgency} for action in self.action_templates[severity]]
def _generate_timeline(self, severity: str) -> Dict:
    """Return the expected response milestones for *severity*.

    Returns:
        A dict with ``response_time_minutes`` (the rule's ``response_time``
        converted from seconds), the ``milestones`` list, and
        ``update_frequency_minutes``. Severities other than sev1-sev3 use the
        sev4 milestones.

    Raises:
        KeyError: If *severity* has no entry in ``self.severity_rules``.
    """
    rules = self.severity_rules[severity]

    sev4_milestones = [
        {"milestone": "Backlog creation", "minutes": 1440},
        {"milestone": "Priority assessment", "minutes": 2880},
    ]
    milestones_by_severity = {
        "sev1": [
            {"milestone": "Incident Commander assigned", "minutes": 5},
            {"milestone": "War room established", "minutes": 10},
            {"milestone": "Initial status page update", "minutes": 15},
            {"milestone": "Executive notification", "minutes": 15},
            {"milestone": "First customer update", "minutes": 30},
        ],
        "sev2": [
            {"milestone": "Response team assembled", "minutes": 15},
            {"milestone": "Initial assessment complete", "minutes": 30},
            {"milestone": "Stakeholder notification", "minutes": 60},
            {"milestone": "Status page update (if needed)", "minutes": 60},
        ],
        "sev3": [
            {"milestone": "Team assignment", "minutes": 120},
            {"milestone": "Initial triage complete", "minutes": 240},
            {"milestone": "Resolution plan created", "minutes": 480},
        ],
    }

    return {
        "response_time_minutes": rules["response_time"] // 60,
        "milestones": milestones_by_severity.get(severity, sev4_milestones),
        "update_frequency_minutes": self._get_update_frequency(severity),
    }


def _determine_escalation(self, severity: str, business_impact: str) -> Dict:
    """Return who to escalate to, keyed by elapsed time, plus escalation triggers.

    ``business_impact`` is accepted for interface compatibility but is not
    consulted; unknown severities fall back to the sev4 rules.
    """
    escalation_rules = {
        "sev1": {
            "immediate": ["Incident Commander", "Engineering Manager"],
            "15_minutes": ["VP Engineering", "Customer Success"],
            "30_minutes": ["CTO"],
            "60_minutes": ["CEO", "All C-Suite"],
            "triggers": ["Extended outage", "Revenue impact", "Media attention"],
        },
        "sev2": {
            "immediate": ["Team Lead", "On-call Engineer"],
            "30_minutes": ["Engineering Manager"],
            "120_minutes": ["VP Engineering"],
            "triggers": ["No progress", "Expanding scope", "Customer escalation"],
        },
        "sev3": {
            "immediate": ["Assigned Engineer"],
            "240_minutes": ["Team Lead"],
            "triggers": ["Issue complexity", "Multiple teams needed"],
        },
        "sev4": {
            "immediate": ["Product Owner"],
            "triggers": ["Customer request", "Stakeholder priority"],
        },
    }

    return escalation_rules.get(severity, escalation_rules["sev4"])


def _determine_recipients(self, severity: str) -> List[str]:
    """Return the notification audience for *severity* (default: sev4 audience)."""
    recipients = {
        "sev1": ["on-call", "engineering-leadership", "executives", "customer-success"],
        "sev2": ["on-call", "engineering-leadership", "product-team"],
        "sev3": ["assigned-team", "team-lead"],
        "sev4": ["assigned-engineer"],
    }
    return recipients.get(severity, recipients["sev4"])


def _determine_channels(self, severity: str) -> List[str]:
    """Return the communication channels for *severity* (default: sev4 channels)."""
    channels = {
        "sev1": ["pager", "phone", "slack", "email", "status-page"],
        "sev2": ["pager", "slack", "email"],
        "sev3": ["slack", "email"],
        "sev4": ["ticket-system"],
    }
    return channels.get(severity, channels["sev4"])


def _get_update_frequency(self, severity: str) -> int:
    """Return the recommended update interval in minutes; 0 means no schedule."""
    frequencies = {"sev1": 15, "sev2": 30, "sev3": 240, "sev4": 0}
    return frequencies.get(severity, 0)
def _explain_classification(self, severity: str, description: str, affected_users: str) -> str:
    """Build a one-line, human-readable justification for the chosen severity.

    Lists up to three of the severity's keywords found in *description*
    (case-insensitive) and the user impact when it is a percentage; otherwise
    states that a default classification was applied.

    Raises:
        KeyError: If *severity* has no entry in ``self.severity_rules``.
    """
    rules = self.severity_rules[severity]

    # Lower-case once instead of on every keyword comparison.
    lowered = str(description or "").lower()
    matched_keywords = [keyword for keyword in rules["keywords"] if keyword in lowered]

    reasons = []
    if matched_keywords:
        reasons.append(f"keywords: {', '.join(matched_keywords[:3])}")

    # affected_users may arrive as a number from JSON; only strings can hold "%".
    if "%" in str(affected_users):
        reasons.append(f"user impact: {affected_users}")

    if not reasons:
        reasons.append("default classification based on available information")

    return f"Classified as {severity.upper()} based on: " + "; ".join(reasons)


def format_json_output(result: Dict) -> str:
    """Serialise *result* as indented JSON, keeping non-ASCII characters as-is."""
    return json.dumps(result, indent=2, ensure_ascii=False)
def parse_input_text(text: str) -> Dict[str, Any]:
    """Parse a free-form incident report into structured incident data.

    This is a best-effort heuristic: it guesses the service name, the user
    impact and the business impact from simple patterns and keywords.

    Args:
        text: Raw incident description, e.g. "Database is down for all users".

    Returns:
        A dict with ``description``, ``service``, ``affected_users`` and
        ``business_impact``; fields that cannot be inferred keep their
        "unknown" placeholders.
    """
    lowered = text.lower()
    incident_data = {
        "description": text.strip(),
        "service": "unknown service",
        "affected_users": "unknown",
        "business_impact": "unknown"
    }

    # Words the patterns can capture that are never a service name
    # ("database is down" would otherwise yield the service "is").
    filler_words = {"is", "has", "was", "are", "down", "failed", "broken"}

    service_patterns = [
        r'(?:service|api|database|server|application)\s+(\w+)',
        r'(\w+)(?:\s+(?:is|has|service|api|database))',
        r'(?:^|\s)(\w+)\s+(?:down|failed|broken)'
    ]
    for pattern in service_patterns:
        match = re.search(pattern, lowered)
        if match and match.group(1) not in filler_words:
            incident_data["service"] = match.group(1)
            break

    impact_patterns = [
        r'(\d+%)\s+(?:of\s+)?(?:users?|customers?)',
        r'(?:all|every|100%)\s+(?:users?|customers?)',
        r'(?:some|many|several)\s+(?:users?|customers?)'
    ]
    for pattern in impact_patterns:
        match = re.search(pattern, lowered)
        if match:
            # Only the first pattern has a capture group; calling group(1) on
            # the others raises IndexError, so fall back to the whole match.
            incident_data["affected_users"] = match.group(1) if match.lastindex else match.group(0)
            break

    # Infer business impact from the strongest wording present.
    if any(word in lowered for word in ['critical', 'urgent', 'emergency', 'down', 'outage']):
        incident_data["business_impact"] = "high"
    elif any(word in lowered for word in ['slow', 'degraded', 'performance']):
        incident_data["business_impact"] = "medium"
    elif any(word in lowered for word in ['minor', 'cosmetic', 'small']):
        incident_data["business_impact"] = "low"

    return incident_data


def interactive_mode():
    """Prompt for incident details in a loop and print each classification.

    The loop ends on 'quit'/'exit'/'q', on Ctrl-C, or when stdin reaches EOF.
    """
    classifier = IncidentClassifier()

    print("🚨 Incident Classifier - Interactive Mode")
    print("=" * 50)
    print("Enter incident details (or 'quit' to exit):")
    print()

    while True:
        try:
            description = input("Incident description: ").strip()
            if description.lower() in ['quit', 'exit', 'q']:
                break

            if not description:
                print("Please provide an incident description.")
                continue

            service = input("Affected service (optional): ").strip() or "unknown"
            affected_users = input("Affected users (e.g., '50%', 'all users'): ").strip() or "unknown"
            business_impact = input("Business impact (high/medium/low): ").strip() or "unknown"

            incident_data = {
                "description": description,
                "service": service,
                "affected_users": affected_users,
                "business_impact": business_impact
            }

            result = classifier.classify_incident(incident_data)
            print("\n" + "=" * 50)
            print(format_text_output(result))
            print("=" * 50)
            print()

        except (KeyboardInterrupt, EOFError):
            # EOF must end the session: input() would raise it again on every
            # retry, so treating it as a generic error loops forever.
            print("\n\nExiting...")
            break
        except Exception as e:
            print(f"Error: {e}")
incident_classifier.py --input incident.json + echo "Database is down" | python incident_classifier.py --format text + python incident_classifier.py --interactive + +Input JSON format: + { + "description": "Database connection timeouts", + "service": "user-service", + "affected_users": "80%", + "business_impact": "high" + } + """ + ) + + parser.add_argument( + "--input", "-i", + help="Input file path (JSON format) or '-' for stdin" + ) + + parser.add_argument( + "--format", "-f", + choices=["json", "text"], + default="json", + help="Output format (default: json)" + ) + + parser.add_argument( + "--interactive", + action="store_true", + help="Run in interactive mode" + ) + + parser.add_argument( + "--output", "-o", + help="Output file path (default: stdout)" + ) + + args = parser.parse_args() + + # Interactive mode + if args.interactive: + interactive_mode() + return + + classifier = IncidentClassifier() + + try: + # Read input + if args.input == "-" or (not args.input and not sys.stdin.isatty()): + # Read from stdin + input_text = sys.stdin.read().strip() + if not input_text: + parser.error("No input provided") + + # Try to parse as JSON first, then as text + try: + incident_data = json.loads(input_text) + except json.JSONDecodeError: + incident_data = parse_input_text(input_text) + + elif args.input: + # Read from file + with open(args.input, 'r') as f: + incident_data = json.load(f) + else: + parser.error("No input specified. 
Use --input, --interactive, or pipe data to stdin.") + + # Validate required fields + if not isinstance(incident_data, dict): + parser.error("Input must be a JSON object") + + if "description" not in incident_data: + parser.error("Input must contain 'description' field") + + # Classify incident + result = classifier.classify_incident(incident_data) + + # Format output + if args.format == "json": + output = format_json_output(result) + else: + output = format_text_output(result) + + # Write output + if args.output: + with open(args.output, 'w') as f: + f.write(output) + f.write('\n') + else: + print(output) + + except FileNotFoundError as e: + print(f"Error: File not found - {e}", file=sys.stderr) + sys.exit(1) + except json.JSONDecodeError as e: + print(f"Error: Invalid JSON - {e}", file=sys.stderr) + sys.exit(1) + except Exception as e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(1) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/engineering-team/incident-commander/scripts/pir_generator.py b/engineering-team/incident-commander/scripts/pir_generator.py new file mode 100644 index 0000000..ee31b95 --- /dev/null +++ b/engineering-team/incident-commander/scripts/pir_generator.py @@ -0,0 +1,1638 @@ +#!/usr/bin/env python3 +""" +PIR (Post-Incident Review) Generator + +Generates comprehensive Post-Incident Review documents from incident data, timelines, +and actions taken. Applies multiple RCA frameworks including 5 Whys, Fishbone diagram, +and Timeline analysis. + +This tool creates structured PIR documents with root cause analysis, lessons learned, +action items, and follow-up recommendations. 
+ +Usage: + python pir_generator.py --incident incident.json --timeline timeline.json --output pir.md + python pir_generator.py --incident incident.json --rca-method fishbone --action-items + cat incident.json | python pir_generator.py --format markdown +""" + +import argparse +import json +import sys +import re +from datetime import datetime, timezone, timedelta +from typing import Dict, List, Optional, Any, Tuple +from collections import defaultdict, Counter + + +class PIRGenerator: + """ + Generates comprehensive Post-Incident Review documents with multiple + RCA frameworks, lessons learned, and actionable follow-up items. + """ + + def __init__(self): + """Initialize the PIR generator with templates and frameworks.""" + self.rca_frameworks = self._load_rca_frameworks() + self.pir_templates = self._load_pir_templates() + self.severity_guidelines = self._load_severity_guidelines() + self.action_item_types = self._load_action_item_types() + self.lessons_learned_categories = self._load_lessons_learned_categories() + + def _load_rca_frameworks(self) -> Dict[str, Dict]: + """Load root cause analysis framework definitions.""" + return { + "five_whys": { + "name": "5 Whys Analysis", + "description": "Iterative questioning technique to explore cause-and-effect relationships", + "steps": [ + "State the problem clearly", + "Ask why the problem occurred", + "For each answer, ask why again", + "Continue until root cause is identified", + "Verify the root cause addresses the original problem" + ], + "min_iterations": 3, + "max_iterations": 7 + }, + "fishbone": { + "name": "Fishbone (Ishikawa) Diagram", + "description": "Systematic analysis across multiple categories of potential causes", + "categories": [ + { + "name": "People", + "description": "Human factors, training, communication, experience", + "examples": ["Training gaps", "Communication failures", "Skill deficits", "Staffing issues"] + }, + { + "name": "Process", + "description": "Procedures, workflows, change 
management, review processes", + "examples": ["Missing procedures", "Inadequate reviews", "Change management gaps", "Documentation issues"] + }, + { + "name": "Technology", + "description": "Systems, tools, architecture, automation", + "examples": ["Architecture limitations", "Tool deficiencies", "Automation gaps", "Infrastructure issues"] + }, + { + "name": "Environment", + "description": "External factors, dependencies, infrastructure", + "examples": ["Third-party dependencies", "Network issues", "Hardware failures", "External service outages"] + } + ] + }, + "timeline": { + "name": "Timeline Analysis", + "description": "Chronological analysis of events to identify decision points and missed opportunities", + "focus_areas": [ + "Detection timing and effectiveness", + "Response time and escalation paths", + "Decision points and alternative paths", + "Communication effectiveness", + "Mitigation strategy effectiveness" + ] + }, + "bow_tie": { + "name": "Bow Tie Analysis", + "description": "Analysis of both preventive and protective measures around an incident", + "components": [ + "Hazards (what could go wrong)", + "Top events (what actually went wrong)", + "Threats (what caused it)", + "Consequences (what was the impact)", + "Barriers (what preventive/protective measures exist or could exist)" + ] + } + } + + def _load_pir_templates(self) -> Dict[str, str]: + """Load PIR document templates for different severity levels.""" + return { + "comprehensive": """# Post-Incident Review: {incident_title} + +## Executive Summary +{executive_summary} + +## Incident Overview +- **Incident ID:** {incident_id} +- **Date & Time:** {incident_date} +- **Duration:** {duration} +- **Severity:** {severity} +- **Status:** {status} +- **Incident Commander:** {incident_commander} +- **Responders:** {responders} + +### Customer Impact +{customer_impact} + +### Business Impact +{business_impact} + +## Timeline +{timeline_section} + +## Root Cause Analysis +{rca_section} + +## What Went 
Well +{what_went_well} + +## What Didn't Go Well +{what_went_wrong} + +## Lessons Learned +{lessons_learned} + +## Action Items +{action_items} + +## Follow-up and Prevention +{prevention_measures} + +## Appendix +{appendix_section} + +--- +*Generated on {generation_date} by PIR Generator* +""", + "standard": """# Post-Incident Review: {incident_title} + +## Summary +{executive_summary} + +## Incident Details +- **Date:** {incident_date} +- **Duration:** {duration} +- **Severity:** {severity} +- **Impact:** {customer_impact} + +## Timeline +{timeline_section} + +## Root Cause +{rca_section} + +## Action Items +{action_items} + +## Lessons Learned +{lessons_learned} + +--- +*Generated on {generation_date}* +""", + "brief": """# Incident Review: {incident_title} + +**Date:** {incident_date} | **Duration:** {duration} | **Severity:** {severity} + +## What Happened +{executive_summary} + +## Root Cause +{rca_section} + +## Actions +{action_items} + +--- +*{generation_date}* +""" + } + + def _load_severity_guidelines(self) -> Dict[str, Dict]: + """Load severity-specific PIR guidelines.""" + return { + "sev1": { + "required_sections": ["executive_summary", "timeline", "rca", "action_items", "lessons_learned"], + "required_attendees": ["incident_commander", "technical_leads", "engineering_manager", "product_manager"], + "timeline_requirement": "Complete timeline with 15-minute intervals", + "rca_methods": ["five_whys", "fishbone", "timeline"], + "review_deadline_hours": 24, + "follow_up_weeks": 4 + }, + "sev2": { + "required_sections": ["summary", "timeline", "rca", "action_items"], + "required_attendees": ["incident_commander", "technical_leads", "team_lead"], + "timeline_requirement": "Key milestone timeline", + "rca_methods": ["five_whys", "timeline"], + "review_deadline_hours": 72, + "follow_up_weeks": 2 + }, + "sev3": { + "required_sections": ["summary", "rca", "action_items"], + "required_attendees": ["technical_lead", "team_member"], + "timeline_requirement": 
"Basic timeline", + "rca_methods": ["five_whys"], + "review_deadline_hours": 168, # 1 week + "follow_up_weeks": 1 + }, + "sev4": { + "required_sections": ["summary", "action_items"], + "required_attendees": ["assigned_engineer"], + "timeline_requirement": "Optional", + "rca_methods": ["brief_analysis"], + "review_deadline_hours": 336, # 2 weeks + "follow_up_weeks": 0 + } + } + + def _load_action_item_types(self) -> Dict[str, Dict]: + """Load action item categorization and templates.""" + return { + "immediate_fix": { + "priority": "P0", + "timeline": "24-48 hours", + "description": "Critical bugs or security issues that need immediate attention", + "template": "Fix {issue_description} to prevent recurrence of {incident_type}", + "owners": ["engineer", "team_lead"] + }, + "process_improvement": { + "priority": "P1", + "timeline": "1-2 weeks", + "description": "Process gaps or communication issues identified", + "template": "Improve {process_area} to address {gap_description}", + "owners": ["team_lead", "process_owner"] + }, + "monitoring_alerting": { + "priority": "P1", + "timeline": "1 week", + "description": "Missing monitoring or alerting capabilities", + "template": "Implement {monitoring_type} for {system_component}", + "owners": ["sre", "engineer"] + }, + "documentation": { + "priority": "P2", + "timeline": "2-3 weeks", + "description": "Documentation gaps or runbook updates", + "template": "Update {documentation_type} to include {missing_information}", + "owners": ["technical_writer", "engineer"] + }, + "training": { + "priority": "P2", + "timeline": "1 month", + "description": "Training needs or knowledge gaps", + "template": "Provide {training_type} training on {topic}", + "owners": ["training_coordinator", "subject_matter_expert"] + }, + "architectural": { + "priority": "P1-P3", + "timeline": "1-3 months", + "description": "System design or architecture improvements", + "template": "Redesign {system_component} to improve {quality_attribute}", + "owners": 
["architect", "engineering_manager"] + }, + "tooling": { + "priority": "P2", + "timeline": "2-4 weeks", + "description": "Tool improvements or new tool requirements", + "template": "Implement {tool_type} to support {use_case}", + "owners": ["devops", "engineer"] + } + } + + def _load_lessons_learned_categories(self) -> Dict[str, List[str]]: + """Load categories for organizing lessons learned.""" + return { + "detection_and_monitoring": [ + "Monitoring gaps identified", + "Alert fatigue issues", + "Detection timing improvements", + "Observability enhancements" + ], + "response_and_escalation": [ + "Response time improvements", + "Escalation path optimization", + "Communication effectiveness", + "Resource allocation lessons" + ], + "technical_systems": [ + "Architecture resilience", + "Failure mode analysis", + "Performance bottlenecks", + "Dependency management" + ], + "process_and_procedures": [ + "Runbook effectiveness", + "Change management gaps", + "Review process improvements", + "Documentation quality" + ], + "team_and_culture": [ + "Training needs identified", + "Cross-team collaboration", + "Knowledge sharing gaps", + "Decision-making processes" + ] + } + + def generate_pir(self, incident_data: Dict[str, Any], timeline_data: Optional[Dict] = None, + rca_method: str = "five_whys", template_type: str = "comprehensive") -> Dict[str, Any]: + """ + Generate a comprehensive PIR document from incident data. 
+ + Args: + incident_data: Core incident information + timeline_data: Optional timeline reconstruction data + rca_method: RCA framework to use + template_type: PIR template type (comprehensive, standard, brief) + + Returns: + Dictionary containing PIR document and metadata + """ + # Extract incident information + incident_info = self._extract_incident_info(incident_data) + + # Generate root cause analysis + rca_results = self._perform_rca(incident_data, timeline_data, rca_method) + + # Generate lessons learned + lessons_learned = self._generate_lessons_learned(incident_data, timeline_data, rca_results) + + # Generate action items + action_items = self._generate_action_items(incident_data, rca_results, lessons_learned) + + # Create timeline section + timeline_section = self._create_timeline_section(timeline_data, incident_info["severity"]) + + # Generate document sections + sections = self._generate_document_sections( + incident_info, rca_results, lessons_learned, action_items, timeline_section + ) + + # Build final document + template = self.pir_templates[template_type] + pir_document = template.format(**sections) + + # Generate metadata + metadata = self._generate_metadata(incident_info, rca_results, action_items) + + return { + "pir_document": pir_document, + "metadata": metadata, + "incident_info": incident_info, + "rca_results": rca_results, + "lessons_learned": lessons_learned, + "action_items": action_items, + "generation_timestamp": datetime.now(timezone.utc).isoformat() + } + + def _extract_incident_info(self, incident_data: Dict) -> Dict[str, Any]: + """Extract and normalize incident information.""" + return { + "incident_id": incident_data.get("incident_id", "INC-" + datetime.now().strftime("%Y%m%d-%H%M")), + "title": incident_data.get("title", incident_data.get("description", "Incident")[:50]), + "description": incident_data.get("description", "No description provided"), + "severity": incident_data.get("severity", "unknown").lower(), + "start_time": 
self._parse_timestamp(incident_data.get("start_time", incident_data.get("timestamp", ""))), + "end_time": self._parse_timestamp(incident_data.get("end_time", "")), + "duration": self._calculate_duration(incident_data), + "affected_services": incident_data.get("affected_services", []), + "customer_impact": incident_data.get("customer_impact", "Unknown impact"), + "business_impact": incident_data.get("business_impact", "Unknown business impact"), + "incident_commander": incident_data.get("incident_commander", "TBD"), + "responders": incident_data.get("responders", []), + "status": incident_data.get("status", "resolved") + } + + def _parse_timestamp(self, timestamp_str: str) -> Optional[datetime]: + """Parse timestamp string to datetime object.""" + if not timestamp_str: + return None + + formats = [ + "%Y-%m-%dT%H:%M:%S.%fZ", + "%Y-%m-%dT%H:%M:%SZ", + "%Y-%m-%d %H:%M:%S", + "%m/%d/%Y %H:%M:%S" + ] + + for fmt in formats: + try: + dt = datetime.strptime(timestamp_str, fmt) + if dt.tzinfo is None: + dt = dt.replace(tzinfo=timezone.utc) + return dt + except ValueError: + continue + + return None + + def _calculate_duration(self, incident_data: Dict) -> str: + """Calculate incident duration in human-readable format.""" + start_time = self._parse_timestamp(incident_data.get("start_time", "")) + end_time = self._parse_timestamp(incident_data.get("end_time", "")) + + if start_time and end_time: + duration = end_time - start_time + total_minutes = int(duration.total_seconds() / 60) + + if total_minutes < 60: + return f"{total_minutes} minutes" + elif total_minutes < 1440: # Less than 24 hours + hours = total_minutes // 60 + minutes = total_minutes % 60 + return f"{hours}h {minutes}m" + else: + days = total_minutes // 1440 + hours = (total_minutes % 1440) // 60 + return f"{days}d {hours}h" + + return incident_data.get("duration", "Unknown duration") + + def _perform_rca(self, incident_data: Dict, timeline_data: Optional[Dict], method: str) -> Dict[str, Any]: + """Perform root 
def _five_whys_analysis(self, incident_data: Dict, timeline_data: Optional[Dict]) -> Dict[str, Any]:
    """Run a 5 Whys analysis over the incident.

    Five fixed questions are paired with answers inferred from the incident
    description; questions without an inferred answer are marked as needing
    further investigation.

    Args:
        incident_data: Incident record; ``description`` is the problem statement.
        timeline_data: Optional timeline reconstruction, passed through to the
            answer-inference and evidence helpers.

    Returns:
        Dictionary with ``method``, ``problem_statement``, ``why_analysis``
        (question/answer/evidence entries), ``root_causes`` and ``confidence``.
    """
    problem = incident_data.get("description", "Incident occurred")

    questions = (
        f"Why did {problem}?",
        "Why wasn't this detected earlier?",
        "Why didn't existing safeguards prevent this?",
        "Why wasn't there a backup mechanism?",
        "Why wasn't this scenario anticipated?",
    )

    # Pad the inferred answers so every question has a counterpart.
    inferred = self._infer_why_answers(incident_data, timeline_data)
    answers = list(inferred[:len(questions)])
    answers += ["Further investigation needed"] * (len(questions) - len(answers))

    chain = [
        {
            "question": question,
            "answer": reply,
            "evidence": self._find_supporting_evidence(reply, incident_data, timeline_data),
        }
        for question, reply in zip(questions, answers)
    ]

    causes = self._extract_root_causes(chain)

    return {
        "method": "five_whys",
        "problem_statement": problem,
        "why_analysis": chain,
        "root_causes": causes,
        "confidence": self._calculate_rca_confidence(chain, incident_data),
    }
cause analysis.""" + if not timeline_data: + return {"method": "timeline", "error": "No timeline data provided"} + + # Extract key decision points + decision_points = self._extract_decision_points(timeline_data) + + # Identify missed opportunities + missed_opportunities = self._identify_missed_opportunities(timeline_data) + + # Analyze response effectiveness + response_analysis = self._analyze_response_effectiveness(timeline_data) + + # Generate timeline-based root causes + root_causes = self._extract_timeline_root_causes( + decision_points, missed_opportunities, response_analysis + ) + + return { + "method": "timeline", + "decision_points": decision_points, + "missed_opportunities": missed_opportunities, + "response_analysis": response_analysis, + "root_causes": root_causes, + "confidence": self._calculate_rca_confidence(timeline_data, incident_data) + } + + def _bow_tie_analysis(self, incident_data: Dict, timeline_data: Optional[Dict]) -> Dict[str, Any]: + """Perform Bow Tie analysis.""" + # Identify the top event (what went wrong) + top_event = incident_data.get("description", "Service failure") + + # Identify threats (what caused it) + threats = self._identify_threats(incident_data, timeline_data) + + # Identify consequences (impact) + consequences = self._identify_consequences(incident_data) + + # Identify existing barriers + existing_barriers = self._identify_existing_barriers(incident_data, timeline_data) + + # Recommend additional barriers + recommended_barriers = self._recommend_additional_barriers(threats, consequences) + + return { + "method": "bow_tie", + "top_event": top_event, + "threats": threats, + "consequences": consequences, + "existing_barriers": existing_barriers, + "recommended_barriers": recommended_barriers, + "confidence": self._calculate_rca_confidence(threats, incident_data) + } + + def _infer_why_answers(self, incident_data: Dict, timeline_data: Optional[Dict]) -> List[str]: + """Infer potential answers to why questions from available 
data.""" + answers = [] + + # Look for clues in incident description + description = incident_data.get("description", "").lower() + + # Common patterns and their inferred answers + if "database" in description and ("timeout" in description or "slow" in description): + answers.append("Database connection pool was exhausted") + answers.append("Connection pool configuration was insufficient for peak load") + answers.append("Load testing didn't include realistic database scenarios") + elif "deployment" in description or "release" in description: + answers.append("New deployment introduced a regression") + answers.append("Code review process missed the issue") + answers.append("Testing environment didn't match production") + elif "network" in description or "connectivity" in description: + answers.append("Network infrastructure had unexpected load") + answers.append("Network monitoring wasn't comprehensive enough") + answers.append("Redundancy mechanisms failed simultaneously") + else: + # Generic answers based on common root causes + answers.extend([ + "System couldn't handle the load/request volume", + "Monitoring didn't detect the issue early enough", + "Error handling mechanisms were insufficient", + "Dependencies failed without proper circuit breakers", + "System lacked sufficient redundancy/resilience" + ]) + + return answers[:5] # Return up to 5 answers + + def _find_supporting_evidence(self, answer: str, incident_data: Dict, timeline_data: Optional[Dict]) -> List[str]: + """Find supporting evidence for RCA answers.""" + evidence = [] + + # Look for supporting information in incident data + if timeline_data and "timeline" in timeline_data: + events = timeline_data["timeline"].get("events", []) + for event in events: + event_message = event.get("message", "").lower() + if any(keyword in event_message for keyword in answer.lower().split()): + evidence.append(f"Timeline event: {event['message']}") + + # Check incident metadata for supporting info + metadata = 
incident_data.get("metadata", {}) + for key, value in metadata.items(): + if isinstance(value, str) and any(keyword in value.lower() for keyword in answer.lower().split()): + evidence.append(f"Incident metadata: {key} = {value}") + + return evidence[:3] # Return top 3 pieces of evidence + + def _extract_root_causes(self, whys: List[Dict]) -> List[Dict]: + """Extract root causes from 5 Whys analysis.""" + root_causes = [] + + # The deepest "why" answers are typically closest to root causes + if len(whys) >= 3: + for i, why in enumerate(whys[-2:]): # Look at last 2 whys + if "further investigation needed" not in why["answer"].lower(): + root_causes.append({ + "cause": why["answer"], + "category": self._categorize_root_cause(why["answer"]), + "evidence": why["evidence"], + "confidence": "high" if len(why["evidence"]) > 1 else "medium" + }) + + return root_causes + + def _categorize_root_cause(self, cause: str) -> str: + """Categorize a root cause into standard categories.""" + cause_lower = cause.lower() + + if any(keyword in cause_lower for keyword in ["process", "procedure", "review", "change management"]): + return "Process" + elif any(keyword in cause_lower for keyword in ["training", "knowledge", "skill", "experience"]): + return "People" + elif any(keyword in cause_lower for keyword in ["system", "architecture", "code", "configuration"]): + return "Technology" + elif any(keyword in cause_lower for keyword in ["network", "infrastructure", "dependency", "third-party"]): + return "Environment" + else: + return "Unknown" + + def _identify_category_factors(self, category: str, incident_data: Dict, timeline_data: Optional[Dict]) -> List[Dict]: + """Identify contributing factors for a Fishbone category.""" + factors = [] + description = incident_data.get("description", "").lower() + + if category == "People": + if "misconfigured" in description or "human error" in description: + factors.append({"factor": "Configuration error", "likelihood": "high"}) + if timeline_data 
and self._has_delayed_response(timeline_data): + factors.append({"factor": "Delayed incident response", "likelihood": "medium"}) + + elif category == "Process": + if "deployment" in description: + factors.append({"factor": "Insufficient deployment validation", "likelihood": "high"}) + if "code review" in incident_data.get("context", "").lower(): + factors.append({"factor": "Code review process gaps", "likelihood": "medium"}) + + elif category == "Technology": + if "database" in description: + factors.append({"factor": "Database performance limitations", "likelihood": "high"}) + if "timeout" in description or "latency" in description: + factors.append({"factor": "System performance bottlenecks", "likelihood": "high"}) + + elif category == "Environment": + if "network" in description: + factors.append({"factor": "Network infrastructure issues", "likelihood": "medium"}) + if "third-party" in description or "external" in description: + factors.append({"factor": "External service dependencies", "likelihood": "medium"}) + + return factors + + def _identify_primary_factors(self, categories: Dict) -> List[Dict]: + """Identify primary contributing factors across all categories.""" + primary_factors = [] + + for category_name, category_data in categories.items(): + high_likelihood_factors = [ + f for f in category_data["factors"] + if f.get("likelihood") == "high" + ] + primary_factors.extend([ + {**factor, "category": category_name} + for factor in high_likelihood_factors + ]) + + return primary_factors + + def _synthesize_fishbone_root_causes(self, categories: Dict, primary_factors: List[Dict]) -> List[Dict]: + """Synthesize root causes from Fishbone analysis.""" + root_causes = [] + + # Group primary factors by category + category_factors = defaultdict(list) + for factor in primary_factors: + category_factors[factor["category"]].append(factor) + + # Create root causes from categories with multiple factors + for category, factors in category_factors.items(): + if 
len(factors) > 1: + root_causes.append({ + "cause": f"Multiple {category.lower()} issues contributed to the incident", + "category": category, + "contributing_factors": [f["factor"] for f in factors], + "confidence": "high" + }) + elif len(factors) == 1: + root_causes.append({ + "cause": factors[0]["factor"], + "category": category, + "confidence": "medium" + }) + + return root_causes + + def _has_delayed_response(self, timeline_data: Dict) -> bool: + """Check if timeline shows delayed response patterns.""" + if not timeline_data or "gap_analysis" not in timeline_data: + return False + + gaps = timeline_data["gap_analysis"].get("gaps", []) + return any(gap.get("type") == "phase_transition" for gap in gaps) + + def _extract_decision_points(self, timeline_data: Dict) -> List[Dict]: + """Extract key decision points from timeline.""" + decision_points = [] + + if "timeline" in timeline_data and "phases" in timeline_data["timeline"]: + phases = timeline_data["timeline"]["phases"] + + for i, phase in enumerate(phases): + if phase["name"] in ["escalation", "mitigation"]: + decision_points.append({ + "timestamp": phase["start_time"], + "decision": f"Initiated {phase['name']} phase", + "phase": phase["name"], + "duration": phase["duration_minutes"] + }) + + return decision_points + + def _identify_missed_opportunities(self, timeline_data: Dict) -> List[Dict]: + """Identify missed opportunities from gap analysis.""" + missed_opportunities = [] + + if "gap_analysis" in timeline_data: + gaps = timeline_data["gap_analysis"].get("gaps", []) + + for gap in gaps: + if gap.get("severity") == "critical": + missed_opportunities.append({ + "opportunity": f"Earlier {gap['type'].replace('_', ' ')}", + "gap_minutes": gap["gap_minutes"], + "potential_impact": "Could have reduced incident duration" + }) + + return missed_opportunities + + def _analyze_response_effectiveness(self, timeline_data: Dict) -> Dict[str, Any]: + """Analyze the effectiveness of incident response.""" + effectiveness 
= { + "overall_rating": "unknown", + "strengths": [], + "weaknesses": [], + "metrics": {} + } + + if "metrics" in timeline_data: + metrics = timeline_data["metrics"] + duration_metrics = metrics.get("duration_metrics", {}) + + # Analyze response times + time_to_mitigation = duration_metrics.get("time_to_mitigation_minutes", 0) + time_to_resolution = duration_metrics.get("time_to_resolution_minutes", 0) + + if time_to_mitigation <= 30: + effectiveness["strengths"].append("Quick mitigation response") + else: + effectiveness["weaknesses"].append("Slow mitigation response") + + if time_to_resolution <= 120: + effectiveness["strengths"].append("Fast resolution") + else: + effectiveness["weaknesses"].append("Extended resolution time") + + effectiveness["metrics"] = { + "time_to_mitigation": time_to_mitigation, + "time_to_resolution": time_to_resolution + } + + # Overall rating based on strengths vs weaknesses + if len(effectiveness["strengths"]) > len(effectiveness["weaknesses"]): + effectiveness["overall_rating"] = "effective" + elif len(effectiveness["weaknesses"]) > len(effectiveness["strengths"]): + effectiveness["overall_rating"] = "needs_improvement" + else: + effectiveness["overall_rating"] = "mixed" + + return effectiveness + + def _extract_timeline_root_causes(self, decision_points: List, missed_opportunities: List, + response_analysis: Dict) -> List[Dict]: + """Extract root causes from timeline analysis.""" + root_causes = [] + + # Root causes from missed opportunities + for opportunity in missed_opportunities: + if opportunity["gap_minutes"] > 60: # Significant gaps + root_causes.append({ + "cause": f"Delayed response: {opportunity['opportunity']}", + "category": "Process", + "evidence": f"{opportunity['gap_minutes']} minute gap identified", + "confidence": "high" + }) + + # Root causes from response effectiveness + for weakness in response_analysis.get("weaknesses", []): + root_causes.append({ + "cause": weakness, + "category": "Process", + "evidence": 
"Timeline analysis", + "confidence": "medium" + }) + + return root_causes + + def _identify_threats(self, incident_data: Dict, timeline_data: Optional[Dict]) -> List[Dict]: + """Identify threats for Bow Tie analysis.""" + threats = [] + description = incident_data.get("description", "").lower() + + if "deployment" in description: + threats.append({"threat": "Defective code deployment", "likelihood": "medium"}) + if "load" in description or "traffic" in description: + threats.append({"threat": "Unexpected load increase", "likelihood": "high"}) + if "database" in description: + threats.append({"threat": "Database performance degradation", "likelihood": "medium"}) + + return threats + + def _identify_consequences(self, incident_data: Dict) -> List[Dict]: + """Identify consequences for Bow Tie analysis.""" + consequences = [] + + customer_impact = incident_data.get("customer_impact", "").lower() + business_impact = incident_data.get("business_impact", "").lower() + + if "all users" in customer_impact or "complete outage" in customer_impact: + consequences.append({"consequence": "Complete service unavailability", "severity": "critical"}) + + if "revenue" in business_impact: + consequences.append({"consequence": "Revenue loss", "severity": "high"}) + + return consequences + + def _identify_existing_barriers(self, incident_data: Dict, timeline_data: Optional[Dict]) -> List[Dict]: + """Identify existing preventive/protective barriers.""" + barriers = [] + + # Look for evidence of existing controls + if timeline_data and "timeline" in timeline_data: + events = timeline_data["timeline"].get("events", []) + + for event in events: + message = event.get("message", "").lower() + if "alert" in message or "monitoring" in message: + barriers.append({ + "barrier": "Monitoring and alerting system", + "type": "detective", + "effectiveness": "partial" + }) + elif "rollback" in message: + barriers.append({ + "barrier": "Rollback capability", + "type": "corrective", + "effectiveness": 
"effective" + }) + + return barriers + + def _recommend_additional_barriers(self, threats: List[Dict], consequences: List[Dict]) -> List[Dict]: + """Recommend additional barriers based on threats and consequences.""" + recommendations = [] + + for threat in threats: + if "deployment" in threat["threat"].lower(): + recommendations.append({ + "barrier": "Enhanced pre-deployment testing", + "type": "preventive", + "justification": "Prevent defective deployments reaching production" + }) + elif "load" in threat["threat"].lower(): + recommendations.append({ + "barrier": "Auto-scaling and load shedding", + "type": "preventive", + "justification": "Handle unexpected load increases automatically" + }) + + return recommendations + + def _calculate_rca_confidence(self, analysis_data: Any, incident_data: Dict) -> str: + """Calculate confidence level for RCA results.""" + # Simple heuristic based on available data + confidence_score = 0 + + # More detailed incident data increases confidence + if incident_data.get("description") and len(incident_data["description"]) > 50: + confidence_score += 1 + + if incident_data.get("timeline") or incident_data.get("events"): + confidence_score += 2 + + if incident_data.get("logs") or incident_data.get("monitoring_data"): + confidence_score += 2 + + # Analysis data completeness + if isinstance(analysis_data, list) and len(analysis_data) > 3: + confidence_score += 1 + elif isinstance(analysis_data, dict) and len(analysis_data) > 5: + confidence_score += 1 + + if confidence_score >= 4: + return "high" + elif confidence_score >= 2: + return "medium" + else: + return "low" + + def _generate_lessons_learned(self, incident_data: Dict, timeline_data: Optional[Dict], + rca_results: Dict) -> Dict[str, List[str]]: + """Generate categorized lessons learned.""" + lessons = defaultdict(list) + + # Lessons from RCA + root_causes = rca_results.get("root_causes", []) + for root_cause in root_causes: + category = root_cause.get("category", 
"technical_systems").lower() + category_key = self._map_to_lessons_category(category) + + lesson = f"Identified: {root_cause['cause']}" + lessons[category_key].append(lesson) + + # Lessons from timeline analysis + if timeline_data and "gap_analysis" in timeline_data: + gaps = timeline_data["gap_analysis"].get("gaps", []) + for gap in gaps: + if gap.get("severity") == "critical": + lessons["response_and_escalation"].append( + f"Response time gap: {gap['type'].replace('_', ' ')} took {gap['gap_minutes']} minutes" + ) + + # Generic lessons based on incident characteristics + severity = incident_data.get("severity", "").lower() + if severity in ["sev1", "critical"]: + lessons["detection_and_monitoring"].append( + "Critical incidents require immediate detection and alerting" + ) + + return dict(lessons) + + def _map_to_lessons_category(self, category: str) -> str: + """Map RCA category to lessons learned category.""" + mapping = { + "people": "team_and_culture", + "process": "process_and_procedures", + "technology": "technical_systems", + "environment": "technical_systems", + "unknown": "process_and_procedures" + } + return mapping.get(category, "technical_systems") + + def _generate_action_items(self, incident_data: Dict, rca_results: Dict, + lessons_learned: Dict) -> List[Dict]: + """Generate actionable follow-up items.""" + action_items = [] + + # Actions from root causes + root_causes = rca_results.get("root_causes", []) + for root_cause in root_causes: + action_type = self._determine_action_type(root_cause) + action_template = self.action_item_types[action_type] + + action_items.append({ + "title": f"Address: {root_cause['cause'][:50]}...", + "description": root_cause["cause"], + "type": action_type, + "priority": action_template["priority"], + "timeline": action_template["timeline"], + "owner": "TBD", + "success_criteria": f"Prevent recurrence of {root_cause['cause'][:30]}...", + "related_root_cause": root_cause + }) + + # Actions from lessons learned + for 
category, lessons in lessons_learned.items(): + if len(lessons) > 1: # Multiple lessons in same category indicate systematic issue + action_items.append({ + "title": f"Improve {category.replace('_', ' ')}", + "description": f"Address multiple issues identified in {category}", + "type": "process_improvement", + "priority": "P1", + "timeline": "2-3 weeks", + "owner": "TBD", + "success_criteria": f"Comprehensive review and improvement of {category}" + }) + + # Standard actions based on severity + severity = incident_data.get("severity", "").lower() + if severity in ["sev1", "critical"]: + action_items.append({ + "title": "Conduct comprehensive post-incident review", + "description": "Schedule PIR meeting with all stakeholders", + "type": "process_improvement", + "priority": "P0", + "timeline": "24-48 hours", + "owner": incident_data.get("incident_commander", "TBD"), + "success_criteria": "PIR completed and documented" + }) + + return action_items + + def _determine_action_type(self, root_cause: Dict) -> str: + """Determine action item type based on root cause.""" + cause_text = root_cause.get("cause", "").lower() + category = root_cause.get("category", "").lower() + + if any(keyword in cause_text for keyword in ["bug", "error", "failure", "crash"]): + return "immediate_fix" + elif any(keyword in cause_text for keyword in ["monitor", "alert", "detect"]): + return "monitoring_alerting" + elif any(keyword in cause_text for keyword in ["process", "procedure", "review"]): + return "process_improvement" + elif any(keyword in cause_text for keyword in ["document", "runbook", "knowledge"]): + return "documentation" + elif any(keyword in cause_text for keyword in ["training", "skill", "knowledge"]): + return "training" + elif any(keyword in cause_text for keyword in ["architecture", "design", "system"]): + return "architectural" + else: + return "process_improvement" # Default + + def _create_timeline_section(self, timeline_data: Optional[Dict], severity: str) -> str: + 
"""Create timeline section for PIR document.""" + if not timeline_data: + return "No detailed timeline available." + + timeline_content = [] + + if "timeline" in timeline_data and "phases" in timeline_data["timeline"]: + timeline_content.append("### Phase Timeline") + timeline_content.append("") + + phases = timeline_data["timeline"]["phases"] + for phase in phases: + timeline_content.append(f"**{phase['name'].title()} Phase**") + timeline_content.append(f"- Start: {phase['start_time']}") + timeline_content.append(f"- Duration: {phase['duration_minutes']} minutes") + timeline_content.append(f"- Events: {phase['event_count']}") + timeline_content.append("") + + if "metrics" in timeline_data: + metrics = timeline_data["metrics"] + duration_metrics = metrics.get("duration_metrics", {}) + + timeline_content.append("### Key Metrics") + timeline_content.append("") + timeline_content.append(f"- Total Duration: {duration_metrics.get('total_duration_minutes', 'N/A')} minutes") + timeline_content.append(f"- Time to Mitigation: {duration_metrics.get('time_to_mitigation_minutes', 'N/A')} minutes") + timeline_content.append(f"- Time to Resolution: {duration_metrics.get('time_to_resolution_minutes', 'N/A')} minutes") + timeline_content.append("") + + return "\n".join(timeline_content) + + def _generate_document_sections(self, incident_info: Dict, rca_results: Dict, + lessons_learned: Dict, action_items: List[Dict], + timeline_section: str) -> Dict[str, str]: + """Generate all document sections for PIR template.""" + sections = {} + + # Basic information + sections["incident_title"] = incident_info["title"] + sections["incident_id"] = incident_info["incident_id"] + sections["incident_date"] = incident_info["start_time"].strftime("%Y-%m-%d %H:%M:%S UTC") if incident_info["start_time"] else "Unknown" + sections["duration"] = incident_info["duration"] + sections["severity"] = incident_info["severity"].upper() + sections["status"] = incident_info["status"].title() + 
sections["incident_commander"] = incident_info["incident_commander"] + sections["responders"] = ", ".join(incident_info["responders"]) if incident_info["responders"] else "TBD" + sections["generation_date"] = datetime.now().strftime("%Y-%m-%d") + + # Impact sections + sections["customer_impact"] = incident_info["customer_impact"] + sections["business_impact"] = incident_info["business_impact"] + + # Executive summary + sections["executive_summary"] = self._create_executive_summary(incident_info, rca_results) + + # Timeline + sections["timeline_section"] = timeline_section + + # RCA section + sections["rca_section"] = self._create_rca_section(rca_results) + + # What went well/wrong + sections["what_went_well"] = self._create_what_went_well_section(incident_info, rca_results) + sections["what_went_wrong"] = self._create_what_went_wrong_section(rca_results, lessons_learned) + + # Lessons learned + sections["lessons_learned"] = self._create_lessons_learned_section(lessons_learned) + + # Action items + sections["action_items"] = self._create_action_items_section(action_items) + + # Prevention and appendix + sections["prevention_measures"] = self._create_prevention_section(rca_results, action_items) + sections["appendix_section"] = self._create_appendix_section(incident_info) + + return sections + + def _create_executive_summary(self, incident_info: Dict, rca_results: Dict) -> str: + """Create executive summary section.""" + summary_parts = [] + + # Incident description + summary_parts.append(f"On {incident_info['start_time'].strftime('%B %d, %Y') if incident_info['start_time'] else 'an unknown date'}, we experienced a {incident_info['severity']} incident affecting {incident_info.get('affected_services', ['our services'])}.") + + # Duration and impact + summary_parts.append(f"The incident lasted {incident_info['duration']} and had the following impact: {incident_info['customer_impact']}") + + # Root cause summary + root_causes = rca_results.get("root_causes", []) + if 
root_causes: + primary_cause = root_causes[0]["cause"] + summary_parts.append(f"Root cause analysis identified the primary issue as: {primary_cause}") + + # Resolution + summary_parts.append(f"The incident has been {incident_info['status']} and we have identified specific actions to prevent recurrence.") + + return " ".join(summary_parts) + + def _create_rca_section(self, rca_results: Dict) -> str: + """Create RCA section content.""" + rca_content = [] + + method = rca_results.get("method", "unknown") + rca_content.append(f"### Analysis Method: {self.rca_frameworks.get(method, {}).get('name', method)}") + rca_content.append("") + + if method == "five_whys" and "why_analysis" in rca_results: + rca_content.append("#### Why Analysis") + rca_content.append("") + + for i, why in enumerate(rca_results["why_analysis"], 1): + rca_content.append(f"**Why {i}:** {why['question']}") + rca_content.append(f"**Answer:** {why['answer']}") + if why["evidence"]: + rca_content.append(f"**Evidence:** {', '.join(why['evidence'])}") + rca_content.append("") + + elif method == "fishbone" and "categories" in rca_results: + rca_content.append("#### Contributing Factor Analysis") + rca_content.append("") + + for category, data in rca_results["categories"].items(): + if data["factors"]: + rca_content.append(f"**{category}:**") + for factor in data["factors"]: + rca_content.append(f"- {factor['factor']} (likelihood: {factor.get('likelihood', 'unknown')})") + rca_content.append("") + + # Root causes summary + root_causes = rca_results.get("root_causes", []) + if root_causes: + rca_content.append("#### Identified Root Causes") + rca_content.append("") + + for i, cause in enumerate(root_causes, 1): + rca_content.append(f"{i}. 
**{cause['cause']}**") + rca_content.append(f" - Category: {cause.get('category', 'Unknown')}") + rca_content.append(f" - Confidence: {cause.get('confidence', 'Unknown')}") + if cause.get("evidence"): + rca_content.append(f" - Evidence: {cause['evidence']}") + rca_content.append("") + + return "\n".join(rca_content) + + def _create_what_went_well_section(self, incident_info: Dict, rca_results: Dict) -> str: + """Create what went well section.""" + positives = [] + + # Generic positive aspects + if incident_info["status"] == "resolved": + positives.append("The incident was successfully resolved") + + if incident_info["incident_commander"] != "TBD": + positives.append("Incident command was established") + + if len(incident_info.get("responders", [])) > 1: + positives.append("Multiple team members collaborated on resolution") + + # Analysis-specific positives + if rca_results.get("confidence") == "high": + positives.append("Root cause analysis provided clear insights") + + if not positives: + positives.append("Incident response process was followed") + + return "\n".join([f"- {positive}" for positive in positives]) + + def _create_what_went_wrong_section(self, rca_results: Dict, lessons_learned: Dict) -> str: + """Create what went wrong section.""" + issues = [] + + # Issues from RCA + root_causes = rca_results.get("root_causes", []) + for cause in root_causes[:3]: # Show top 3 + issues.append(cause["cause"]) + + # Issues from lessons learned + for category, lessons in lessons_learned.items(): + if lessons: + issues.append(f"{category.replace('_', ' ').title()}: {lessons[0]}") + + if not issues: + issues.append("Analysis in progress") + + return "\n".join([f"- {issue}" for issue in issues]) + + def _create_lessons_learned_section(self, lessons_learned: Dict) -> str: + """Create lessons learned section.""" + content = [] + + for category, lessons in lessons_learned.items(): + if lessons: + content.append(f"### {category.replace('_', ' ').title()}") + content.append("") 
+ + for lesson in lessons: + content.append(f"- {lesson}") + + content.append("") + + if not content: + content.append("Lessons learned to be documented following detailed analysis.") + + return "\n".join(content) + + def _create_action_items_section(self, action_items: List[Dict]) -> str: + """Create action items section.""" + if not action_items: + return "Action items to be defined." + + content = [] + + # Group by priority + priority_groups = defaultdict(list) + for item in action_items: + priority_groups[item.get("priority", "P3")].append(item) + + for priority in ["P0", "P1", "P2", "P3"]: + items = priority_groups.get(priority, []) + if items: + content.append(f"### {priority} - {self._get_priority_description(priority)}") + content.append("") + + for item in items: + content.append(f"**{item['title']}**") + content.append(f"- Owner: {item.get('owner', 'TBD')}") + content.append(f"- Timeline: {item.get('timeline', 'TBD')}") + content.append(f"- Success Criteria: {item.get('success_criteria', 'TBD')}") + content.append("") + + return "\n".join(content) + + def _get_priority_description(self, priority: str) -> str: + """Get human-readable priority description.""" + descriptions = { + "P0": "Critical - Immediate Action Required", + "P1": "High Priority - Complete Within 1-2 Weeks", + "P2": "Medium Priority - Complete Within 1 Month", + "P3": "Low Priority - Complete When Capacity Allows" + } + return descriptions.get(priority, "Unknown Priority") + + def _create_prevention_section(self, rca_results: Dict, action_items: List[Dict]) -> str: + """Create prevention and follow-up section.""" + content = [] + + content.append("### Prevention Measures") + content.append("") + content.append("Based on the root cause analysis, the following preventive measures have been identified:") + content.append("") + + # Extract prevention-focused action items + prevention_items = [item for item in action_items if "prevent" in item.get("description", "").lower()] + + if 
prevention_items: + for item in prevention_items: + content.append(f"- {item['title']}: {item.get('description', '')}") + else: + content.append("- Implement comprehensive testing for similar scenarios") + content.append("- Improve monitoring and alerting coverage") + content.append("- Enhance error handling and resilience patterns") + + content.append("") + content.append("### Follow-up Schedule") + content.append("") + content.append("- 1 week: Review action item progress") + content.append("- 1 month: Evaluate effectiveness of implemented changes") + content.append("- 3 months: Conduct follow-up assessment and update preventive measures") + + return "\n".join(content) + + def _create_appendix_section(self, incident_info: Dict) -> str: + """Create appendix section.""" + content = [] + + content.append("### Additional Information") + content.append("") + content.append(f"- Incident ID: {incident_info['incident_id']}") + content.append(f"- Severity Classification: {incident_info['severity']}") + + if incident_info.get("affected_services"): + content.append(f"- Affected Services: {', '.join(incident_info['affected_services'])}") + + content.append("") + content.append("### References") + content.append("") + content.append("- Incident tracking ticket: [Link TBD]") + content.append("- Monitoring dashboards: [Link TBD]") + content.append("- Communication thread: [Link TBD]") + + return "\n".join(content) + + def _generate_metadata(self, incident_info: Dict, rca_results: Dict, action_items: List[Dict]) -> Dict[str, Any]: + """Generate PIR metadata for tracking and analysis.""" + return { + "pir_id": f"PIR-{incident_info['incident_id']}", + "incident_severity": incident_info["severity"], + "rca_method": rca_results.get("method", "unknown"), + "rca_confidence": rca_results.get("confidence", "unknown"), + "total_action_items": len(action_items), + "critical_action_items": len([item for item in action_items if item.get("priority") == "P0"]), + 
"estimated_prevention_timeline": self._estimate_prevention_timeline(action_items), + "categories_affected": list(set(item.get("type", "unknown") for item in action_items)), + "review_completeness": self._assess_review_completeness(incident_info, rca_results, action_items) + } + + def _estimate_prevention_timeline(self, action_items: List[Dict]) -> str: + """Estimate timeline for implementing all prevention measures.""" + if not action_items: + return "unknown" + + # Find the longest timeline among action items + max_weeks = 0 + for item in action_items: + timeline = item.get("timeline", "") + if "week" in timeline: + try: + weeks = int(re.findall(r'\d+', timeline)[0]) + max_weeks = max(max_weeks, weeks) + except (IndexError, ValueError): + pass + elif "month" in timeline: + try: + months = int(re.findall(r'\d+', timeline)[0]) + max_weeks = max(max_weeks, months * 4) + except (IndexError, ValueError): + pass + + if max_weeks == 0: + return "1-2 weeks" + elif max_weeks <= 4: + return f"{max_weeks} weeks" + else: + return f"{max_weeks // 4} months" + + def _assess_review_completeness(self, incident_info: Dict, rca_results: Dict, action_items: List[Dict]) -> float: + """Assess completeness of the PIR (0-1 score).""" + score = 0.0 + + # Basic information completeness + if incident_info.get("description"): + score += 0.1 + if incident_info.get("start_time"): + score += 0.1 + if incident_info.get("customer_impact"): + score += 0.1 + + # RCA completeness + if rca_results.get("root_causes"): + score += 0.2 + if rca_results.get("confidence") in ["medium", "high"]: + score += 0.1 + + # Action items completeness + if action_items: + score += 0.2 + if any(item.get("owner") and item["owner"] != "TBD" for item in action_items): + score += 0.1 + + # Additional factors + if incident_info.get("incident_commander") != "TBD": + score += 0.1 + if len(action_items) >= 3: # Multiple action items show thorough analysis + score += 0.1 + + return min(score, 1.0) + + +def 
def format_text_output(result: Dict) -> str:
    """Format result as human-readable summary.

    Args:
        result: PIR generation result. When it contains an "error" key
            only the error line is produced; otherwise "metadata",
            "incident_info", "rca_results" and "action_items" are
            summarised. Missing or null entries are shown as "Unknown"
            (or 0 for the completeness score) instead of raising.

    Returns:
        The multi-line summary text.
    """
    if "error" in result:
        return f"Error: {result['error']}"

    def text_of(value, default="Unknown"):
        # JSON nulls arrive as None; show the placeholder instead of crashing
        # on .upper()/.title() or printing "None".
        return default if value is None else str(value)

    def shorten(value, limit):
        # Only mark text with "..." when something was actually cut off.
        text = text_of(value)
        return text if len(text) <= limit else f"{text[:limit]}..."

    metadata = result.get("metadata") or {}
    incident_info = result.get("incident_info") or {}
    rca_results = result.get("rca_results") or {}
    action_items = result.get("action_items") or []

    rule = "=" * 60
    output = [rule, "POST-INCIDENT REVIEW SUMMARY", rule, ""]

    # Basic info
    output.append("INCIDENT INFORMATION:")
    output.append(f" PIR ID: {text_of(metadata.get('pir_id'))}")
    output.append(f" Severity: {text_of(incident_info.get('severity')).upper()}")
    output.append(f" Duration: {text_of(incident_info.get('duration'))}")
    output.append(f" Status: {text_of(incident_info.get('status')).title()}")
    output.append("")

    # RCA summary
    output.append("ROOT CAUSE ANALYSIS:")
    output.append(f" Method: {text_of(rca_results.get('method'))}")
    output.append(f" Confidence: {text_of(rca_results.get('confidence')).title()}")

    root_causes = rca_results.get("root_causes") or []
    if root_causes:
        output.append(f" Root Causes Identified: {len(root_causes)}")
        for i, cause in enumerate(root_causes[:3], 1):
            output.append(f" {i}. {shorten(cause.get('cause'), 60)}")
    output.append("")

    # Action items summary
    output.append("ACTION ITEMS:")
    output.append(f" Total Actions: {len(action_items)}")
    output.append(f" Critical (P0): {metadata.get('critical_action_items', 0)}")
    output.append(f" Prevention Timeline: {text_of(metadata.get('estimated_prevention_timeline'))}")

    if action_items:
        output.append(" Top Actions:")
        for item in action_items[:3]:
            output.append(f" - {shorten(item.get('title'), 50)}")
    output.append("")

    # Completeness
    completeness = (metadata.get("review_completeness") or 0) * 100
    output.append(f"REVIEW COMPLETENESS: {completeness:.0f}%")
    output.append("")

    output.append(rule)

    return "\n".join(output)
stdout)" + ) + + parser.add_argument( + "--format", "-f", + choices=["json", "markdown", "text"], + default="markdown", + help="Output format (default: markdown)" + ) + + parser.add_argument( + "--rca-method", + choices=["five_whys", "fishbone", "timeline", "bow_tie"], + default="five_whys", + help="Root cause analysis method (default: five_whys)" + ) + + parser.add_argument( + "--template-type", + choices=["comprehensive", "standard", "brief"], + default="comprehensive", + help="PIR template type (default: comprehensive)" + ) + + parser.add_argument( + "--action-items", + action="store_true", + help="Generate detailed action items" + ) + + args = parser.parse_args() + + generator = PIRGenerator() + + try: + # Read incident data + if args.incident == "-" or (not args.incident and not sys.stdin.isatty()): + # Read from stdin + input_text = sys.stdin.read().strip() + if not input_text: + parser.error("No incident data provided") + incident_data = json.loads(input_text) + elif args.incident: + # Read from file + with open(args.incident, 'r') as f: + incident_data = json.load(f) + else: + parser.error("No incident data specified. 
Use --incident or pipe data to stdin.") + + # Read timeline data if provided + timeline_data = None + if args.timeline: + with open(args.timeline, 'r') as f: + timeline_data = json.load(f) + + # Validate incident data + if not isinstance(incident_data, dict): + parser.error("Incident data must be a JSON object") + + if not incident_data.get("description") and not incident_data.get("title"): + parser.error("Incident data must contain 'description' or 'title'") + + # Generate PIR + result = generator.generate_pir( + incident_data=incident_data, + timeline_data=timeline_data, + rca_method=args.rca_method, + template_type=args.template_type + ) + + # Format output + if args.format == "json": + output = format_json_output(result) + elif args.format == "markdown": + output = format_markdown_output(result) + else: + output = format_text_output(result) + + # Write output + if args.output: + with open(args.output, 'w') as f: + f.write(output) + f.write('\n') + else: + print(output) + + except FileNotFoundError as e: + print(f"Error: File not found - {e}", file=sys.stderr) + sys.exit(1) + except json.JSONDecodeError as e: + print(f"Error: Invalid JSON - {e}", file=sys.stderr) + sys.exit(1) + except Exception as e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(1) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/engineering-team/incident-commander/scripts/timeline_reconstructor.py b/engineering-team/incident-commander/scripts/timeline_reconstructor.py new file mode 100644 index 0000000..ec60c5b --- /dev/null +++ b/engineering-team/incident-commander/scripts/timeline_reconstructor.py @@ -0,0 +1,1007 @@ +#!/usr/bin/env python3 +""" +Timeline Reconstructor + +Reconstructs incident timelines from timestamped events (logs, alerts, Slack messages). +Identifies incident phases, calculates durations, and performs gap analysis. 
+ +This tool processes chronological event data and creates a coherent narrative +of how an incident progressed from detection through resolution. + +Usage: + python timeline_reconstructor.py --input events.json --output timeline.md + python timeline_reconstructor.py --input events.json --detect-phases --gap-analysis + cat events.json | python timeline_reconstructor.py --format text +""" + +import argparse +import json +import sys +import re +from datetime import datetime, timezone, timedelta +from typing import Dict, List, Optional, Any, Tuple +from collections import defaultdict, namedtuple + + +# Event data structure +Event = namedtuple('Event', ['timestamp', 'source', 'type', 'message', 'severity', 'actor', 'metadata']) + +# Phase data structure +Phase = namedtuple('Phase', ['name', 'start_time', 'end_time', 'duration', 'events', 'description']) + + +class TimelineReconstructor: + """ + Reconstructs incident timelines from disparate event sources. + Identifies phases, calculates metrics, and performs gap analysis. 
+ """ + + def __init__(self): + """Initialize the reconstructor with phase detection rules and templates.""" + self.phase_patterns = self._load_phase_patterns() + self.event_types = self._load_event_types() + self.severity_mapping = self._load_severity_mapping() + self.gap_thresholds = self._load_gap_thresholds() + + def _load_phase_patterns(self) -> Dict[str, Dict]: + """Load patterns for identifying incident phases.""" + return { + "detection": { + "keywords": [ + "alert", "alarm", "triggered", "fired", "detected", "noticed", + "monitoring", "threshold exceeded", "anomaly", "spike", + "error rate", "latency increase", "timeout", "failure" + ], + "event_types": ["alert", "monitoring", "notification"], + "priority": 1, + "description": "Initial detection of the incident through monitoring or observation" + }, + "triage": { + "keywords": [ + "investigating", "triaging", "assessing", "evaluating", + "checking", "looking into", "analyzing", "reviewing", + "diagnosis", "troubleshooting", "examining" + ], + "event_types": ["investigation", "communication", "action"], + "priority": 2, + "description": "Assessment and initial investigation of the incident" + }, + "escalation": { + "keywords": [ + "escalating", "paging", "calling in", "requesting help", + "engaging", "involving", "notifying", "alerting team", + "incident commander", "war room", "all hands" + ], + "event_types": ["escalation", "communication", "notification"], + "priority": 3, + "description": "Escalation to additional resources or higher severity response" + }, + "mitigation": { + "keywords": [ + "fixing", "patching", "deploying", "rolling back", "restarting", + "scaling", "rerouting", "bypassing", "workaround", + "implementing fix", "applying solution", "remediation" + ], + "event_types": ["deployment", "action", "fix"], + "priority": 4, + "description": "Active mitigation efforts to resolve the incident" + }, + "resolution": { + "keywords": [ + "resolved", "fixed", "restored", "recovered", "back online", 
+ "working", "normal", "stable", "healthy", "operational", + "incident closed", "service restored" + ], + "event_types": ["resolution", "confirmation"], + "priority": 5, + "description": "Confirmation that the incident has been resolved" + }, + "review": { + "keywords": [ + "post-mortem", "retrospective", "review", "lessons learned", + "pir", "post-incident", "analysis", "follow-up", + "action items", "improvements" + ], + "event_types": ["review", "documentation"], + "priority": 6, + "description": "Post-incident review and documentation activities" + } + } + + def _load_event_types(self) -> Dict[str, Dict]: + """Load event type classification rules.""" + return { + "alert": { + "sources": ["monitoring", "nagios", "datadog", "newrelic", "prometheus"], + "indicators": ["alert", "alarm", "threshold", "metric"], + "severity_boost": 2 + }, + "log": { + "sources": ["application", "server", "container", "system"], + "indicators": ["error", "exception", "warn", "fail"], + "severity_boost": 1 + }, + "communication": { + "sources": ["slack", "teams", "email", "chat"], + "indicators": ["message", "notification", "update"], + "severity_boost": 0 + }, + "deployment": { + "sources": ["ci/cd", "jenkins", "github", "gitlab", "deploy"], + "indicators": ["deploy", "release", "build", "merge"], + "severity_boost": 3 + }, + "action": { + "sources": ["manual", "script", "automation", "operator"], + "indicators": ["executed", "ran", "performed", "applied"], + "severity_boost": 2 + }, + "escalation": { + "sources": ["pagerduty", "opsgenie", "oncall", "escalation"], + "indicators": ["paged", "escalated", "notified", "assigned"], + "severity_boost": 3 + } + } + + def _load_severity_mapping(self) -> Dict[str, int]: + """Load severity level mappings.""" + return { + "critical": 5, "crit": 5, "sev1": 5, "p1": 5, + "high": 4, "major": 4, "sev2": 4, "p2": 4, + "medium": 3, "moderate": 3, "sev3": 3, "p3": 3, + "low": 2, "minor": 2, "sev4": 2, "p4": 2, + "info": 1, "informational": 1, "debug": 
def reconstruct_timeline(self, events_data: List[Dict]) -> Dict[str, Any]:
    """
    Build the full timeline report from raw event dictionaries.

    The events are parsed, ordered by timestamp and then passed through
    phase detection, metric calculation, gap analysis, narrative
    generation and summary generation, in that order.

    Args:
        events_data: List of event dictionaries

    Returns:
        ``{"error": "No valid events found"}`` when no event could be
        parsed; otherwise a dictionary with the keys "timeline",
        "metrics", "gap_analysis", "narrative", "summary" and
        "reconstruction_timestamp" (current UTC time, ISO formatted).
    """
    # Parse and normalize events
    events = self._parse_events(events_data)
    if not events:
        return {"error": "No valid events found"}

    # Chronological order is a precondition for everything below.
    events.sort(key=lambda entry: entry.timestamp)

    phases = self._detect_phases(events)
    metrics = self._calculate_metrics(events, phases)
    gap_analysis = self._analyze_gaps(events, phases)
    narrative = self._generate_narrative(events, phases)
    summary = self._generate_summary(events, phases, metrics)

    first_seen = events[0].timestamp
    last_seen = events[-1].timestamp
    time_range = {
        "start": first_seen.isoformat(),
        "end": last_seen.isoformat(),
        # Whole minutes between first and last event (truncated).
        "duration_minutes": int((last_seen - first_seen).total_seconds() / 60),
    }

    timeline = {
        "total_events": len(events),
        "time_range": time_range,
        "phases": [self._phase_to_dict(phase) for phase in phases],
        "events": [self._event_to_dict(event) for event in events],
    }

    report = {
        "timeline": timeline,
        "metrics": metrics,
        "gap_analysis": gap_analysis,
        "narrative": narrative,
        "summary": summary,
    }
    report["reconstruction_timestamp"] = datetime.now(timezone.utc).isoformat()
    return report
_parse_events(self, events_data: List[Dict]) -> List[Event]: + """Parse raw event data into normalized Event objects.""" + events = [] + + for event_dict in events_data: + try: + # Parse timestamp + timestamp_str = event_dict.get("timestamp", event_dict.get("time", "")) + if not timestamp_str: + continue + + timestamp = self._parse_timestamp(timestamp_str) + if not timestamp: + continue + + # Extract other fields + source = event_dict.get("source", "unknown") + event_type = self._classify_event_type(event_dict) + message = event_dict.get("message", event_dict.get("description", "")) + severity = self._parse_severity(event_dict.get("severity", event_dict.get("level", "unknown"))) + actor = event_dict.get("actor", event_dict.get("user", "system")) + + # Extract metadata + metadata = {k: v for k, v in event_dict.items() + if k not in ["timestamp", "time", "source", "type", "message", "severity", "actor"]} + + event = Event( + timestamp=timestamp, + source=source, + type=event_type, + message=message, + severity=severity, + actor=actor, + metadata=metadata + ) + + events.append(event) + + except Exception as e: + # Skip invalid events but log them + continue + + return events + + def _parse_timestamp(self, timestamp_str: str) -> Optional[datetime]: + """Parse various timestamp formats.""" + # Common timestamp formats + formats = [ + "%Y-%m-%dT%H:%M:%S.%fZ", # ISO with microseconds + "%Y-%m-%dT%H:%M:%SZ", # ISO without microseconds + "%Y-%m-%d %H:%M:%S", # Standard format + "%m/%d/%Y %H:%M:%S", # US format + "%d/%m/%Y %H:%M:%S", # EU format + "%Y-%m-%d %H:%M:%S.%f", # With microseconds + "%Y%m%d_%H%M%S", # Compact format + ] + + for fmt in formats: + try: + dt = datetime.strptime(timestamp_str, fmt) + # Ensure timezone awareness + if dt.tzinfo is None: + dt = dt.replace(tzinfo=timezone.utc) + return dt + except ValueError: + continue + + # Try parsing as Unix timestamp + try: + timestamp_float = float(timestamp_str) + return datetime.fromtimestamp(timestamp_float, 
tz=timezone.utc) + except ValueError: + pass + + return None + + def _classify_event_type(self, event_dict: Dict) -> str: + """Classify event type based on source and content.""" + source = event_dict.get("source", "").lower() + message = event_dict.get("message", "").lower() + event_type = event_dict.get("type", "").lower() + + # Check explicit type first + if event_type in self.event_types: + return event_type + + # Classify based on source and content + for type_name, type_info in self.event_types.items(): + # Check source patterns + if any(src in source for src in type_info["sources"]): + return type_name + + # Check message indicators + if any(indicator in message for indicator in type_info["indicators"]): + return type_name + + return "unknown" + + def _parse_severity(self, severity_str: str) -> int: + """Parse severity string to numeric value.""" + severity_clean = str(severity_str).lower().strip() + return self.severity_mapping.get(severity_clean, 0) + + def _detect_phases(self, events: List[Event]) -> List[Phase]: + """Detect incident phases based on event patterns.""" + phases = [] + current_phase = None + phase_events = [] + + for event in events: + detected_phase = self._identify_phase(event) + + if detected_phase != current_phase: + # End current phase if exists + if current_phase and phase_events: + phase_obj = Phase( + name=current_phase, + start_time=phase_events[0].timestamp, + end_time=phase_events[-1].timestamp, + duration=(phase_events[-1].timestamp - phase_events[0].timestamp).total_seconds() / 60, + events=phase_events.copy(), + description=self.phase_patterns[current_phase]["description"] + ) + phases.append(phase_obj) + + # Start new phase + current_phase = detected_phase + phase_events = [event] + else: + phase_events.append(event) + + # Add final phase + if current_phase and phase_events: + phase_obj = Phase( + name=current_phase, + start_time=phase_events[0].timestamp, + end_time=phase_events[-1].timestamp, + 
duration=(phase_events[-1].timestamp - phase_events[0].timestamp).total_seconds() / 60, + events=phase_events, + description=self.phase_patterns[current_phase]["description"] + ) + phases.append(phase_obj) + + return self._merge_adjacent_phases(phases) + + def _identify_phase(self, event: Event) -> str: + """Identify which phase an event belongs to.""" + message_lower = event.message.lower() + + # Score each phase based on keywords and event type + phase_scores = {} + + for phase_name, pattern_info in self.phase_patterns.items(): + score = 0 + + # Keyword matching + for keyword in pattern_info["keywords"]: + if keyword in message_lower: + score += 2 + + # Event type matching + if event.type in pattern_info["event_types"]: + score += 3 + + # Severity boost for certain phases + if phase_name == "escalation" and event.severity >= 4: + score += 2 + + phase_scores[phase_name] = score + + # Return highest scoring phase, default to triage + if phase_scores and max(phase_scores.values()) > 0: + return max(phase_scores, key=phase_scores.get) + + return "triage" # Default phase + + def _merge_adjacent_phases(self, phases: List[Phase]) -> List[Phase]: + """Merge adjacent phases of the same type.""" + if not phases: + return phases + + merged = [] + current_phase = phases[0] + + for next_phase in phases[1:]: + if (next_phase.name == current_phase.name and + (next_phase.start_time - current_phase.end_time).total_seconds() < 300): # 5 min gap + # Merge phases + merged_events = current_phase.events + next_phase.events + current_phase = Phase( + name=current_phase.name, + start_time=current_phase.start_time, + end_time=next_phase.end_time, + duration=(next_phase.end_time - current_phase.start_time).total_seconds() / 60, + events=merged_events, + description=current_phase.description + ) + else: + merged.append(current_phase) + current_phase = next_phase + + merged.append(current_phase) + return merged + + def _calculate_metrics(self, events: List[Event], phases: List[Phase]) -> 
Dict[str, Any]: + """Calculate timeline metrics and KPIs.""" + if not events or not phases: + return {} + + start_time = events[0].timestamp + end_time = events[-1].timestamp + total_duration = (end_time - start_time).total_seconds() / 60 + + # Phase timing metrics + phase_durations = {phase.name: phase.duration for phase in phases} + + # Detection metrics + detection_time = 0 + if phases and phases[0].name == "detection": + detection_time = phases[0].duration + + # Time to mitigation + mitigation_start = None + for phase in phases: + if phase.name == "mitigation": + mitigation_start = (phase.start_time - start_time).total_seconds() / 60 + break + + # Time to resolution + resolution_time = None + for phase in phases: + if phase.name == "resolution": + resolution_time = (phase.start_time - start_time).total_seconds() / 60 + break + + # Communication frequency + comm_events = [e for e in events if e.type == "communication"] + comm_frequency = len(comm_events) / (total_duration / 60) if total_duration > 0 else 0 + + # Action frequency + action_events = [e for e in events if e.type == "action"] + action_frequency = len(action_events) / (total_duration / 60) if total_duration > 0 else 0 + + # Event source distribution + source_counts = defaultdict(int) + for event in events: + source_counts[event.source] += 1 + + return { + "duration_metrics": { + "total_duration_minutes": round(total_duration, 1), + "detection_duration_minutes": round(detection_time, 1), + "time_to_mitigation_minutes": round(mitigation_start or 0, 1), + "time_to_resolution_minutes": round(resolution_time or 0, 1), + "phase_durations": {k: round(v, 1) for k, v in phase_durations.items()} + }, + "activity_metrics": { + "total_events": len(events), + "events_per_hour": round((len(events) / (total_duration / 60)) if total_duration > 0 else 0, 1), + "communication_frequency": round(comm_frequency, 1), + "action_frequency": round(action_frequency, 1), + "unique_sources": len(source_counts), + 
"unique_actors": len(set(e.actor for e in events)) + }, + "phase_metrics": { + "total_phases": len(phases), + "phase_sequence": [p.name for p in phases], + "longest_phase": max(phases, key=lambda p: p.duration).name if phases else None, + "shortest_phase": min(phases, key=lambda p: p.duration).name if phases else None + }, + "source_distribution": dict(source_counts) + } + + def _analyze_gaps(self, events: List[Event], phases: List[Phase]) -> Dict[str, Any]: + """Perform gap analysis to identify potential issues.""" + gaps = [] + warnings = [] + + # Check phase transition timing + for i in range(len(phases) - 1): + current_phase = phases[i] + next_phase = phases[i + 1] + + transition_gap = (next_phase.start_time - current_phase.end_time).total_seconds() / 60 + threshold_key = f"{current_phase.name}_to_{next_phase.name}" + threshold = self.gap_thresholds.get(threshold_key, self.gap_thresholds["phase_transition"]) + + if transition_gap > threshold: + gaps.append({ + "type": "phase_transition", + "from_phase": current_phase.name, + "to_phase": next_phase.name, + "gap_minutes": round(transition_gap, 1), + "threshold_minutes": threshold, + "severity": "warning" if transition_gap < threshold * 2 else "critical" + }) + + # Check communication gaps + comm_events = [e for e in events if e.type == "communication"] + for i in range(len(comm_events) - 1): + gap_minutes = (comm_events[i+1].timestamp - comm_events[i].timestamp).total_seconds() / 60 + if gap_minutes > self.gap_thresholds["communication_gap"]: + gaps.append({ + "type": "communication_gap", + "gap_minutes": round(gap_minutes, 1), + "threshold_minutes": self.gap_thresholds["communication_gap"], + "severity": "warning" if gap_minutes < self.gap_thresholds["communication_gap"] * 2 else "critical" + }) + + # Check for missing phases + expected_phases = ["detection", "triage", "mitigation", "resolution"] + actual_phases = [p.name for p in phases] + missing_phases = [p for p in expected_phases if p not in actual_phases] 
+ + for missing_phase in missing_phases: + warnings.append({ + "type": "missing_phase", + "phase": missing_phase, + "message": f"Expected phase '{missing_phase}' not detected in timeline" + }) + + # Check for unusually long phases + for phase in phases: + if phase.duration > 180: # 3 hours + warnings.append({ + "type": "long_phase", + "phase": phase.name, + "duration_minutes": round(phase.duration, 1), + "message": f"Phase '{phase.name}' lasted {phase.duration:.0f} minutes, which is unusually long" + }) + + return { + "gaps": gaps, + "warnings": warnings, + "gap_summary": { + "total_gaps": len(gaps), + "critical_gaps": len([g for g in gaps if g.get("severity") == "critical"]), + "warning_gaps": len([g for g in gaps if g.get("severity") == "warning"]), + "missing_phases": len(missing_phases) + } + } + + def _generate_narrative(self, events: List[Event], phases: List[Phase]) -> Dict[str, Any]: + """Generate human-readable incident narrative.""" + if not events or not phases: + return {"error": "Insufficient data for narrative generation"} + + # Create phase-based narrative + phase_narratives = [] + for phase in phases: + key_events = self._extract_key_events(phase.events) + narrative_text = self._create_phase_narrative(phase, key_events) + + phase_narratives.append({ + "phase": phase.name, + "start_time": phase.start_time.isoformat(), + "duration_minutes": round(phase.duration, 1), + "narrative": narrative_text, + "key_events": len(key_events), + "total_events": len(phase.events) + }) + + # Create overall summary + start_time = events[0].timestamp + end_time = events[-1].timestamp + total_duration = (end_time - start_time).total_seconds() / 60 + + summary = f"""Incident Timeline Summary: +The incident began at {start_time.strftime('%Y-%m-%d %H:%M:%S UTC')} and concluded at {end_time.strftime('%Y-%m-%d %H:%M:%S UTC')}, lasting approximately {total_duration:.0f} minutes. 
+ +The incident progressed through {len(phases)} distinct phases: {', '.join(p.name for p in phases)}. + +Key milestones:""" + + for phase in phases: + summary += f"\n- {phase.name.title()}: {phase.start_time.strftime('%H:%M')} ({phase.duration:.0f} min)" + + return { + "summary": summary, + "phase_narratives": phase_narratives, + "timeline_type": self._classify_timeline_pattern(phases), + "complexity_score": self._calculate_complexity_score(events, phases) + } + + def _extract_key_events(self, events: List[Event]) -> List[Event]: + """Extract the most important events from a phase.""" + # Sort by severity and timestamp + sorted_events = sorted(events, key=lambda e: (e.severity, e.timestamp), reverse=True) + + # Take top events, but ensure chronological representation + key_events = [] + + # Always include first and last events + if events: + key_events.append(events[0]) + if len(events) > 1: + key_events.append(events[-1]) + + # Add high-severity events + high_severity_events = [e for e in events if e.severity >= 4] + key_events.extend(high_severity_events[:3]) + + # Remove duplicates while preserving order + seen = set() + unique_events = [] + for event in key_events: + event_key = (event.timestamp, event.message) + if event_key not in seen: + seen.add(event_key) + unique_events.append(event) + + return sorted(unique_events, key=lambda e: e.timestamp) + + def _create_phase_narrative(self, phase: Phase, key_events: List[Event]) -> str: + """Create narrative text for a phase.""" + phase_templates = { + "detection": "The incident was first detected when {first_event}. {additional_details}", + "triage": "Initial investigation began with {first_event}. The team {investigation_actions}", + "escalation": "The incident was escalated when {escalation_trigger}. {escalation_actions}", + "mitigation": "Mitigation efforts started with {first_action}. {mitigation_steps}", + "resolution": "The incident was resolved when {resolution_event}. 
{confirmation_steps}", + "review": "Post-incident review activities included {review_activities}" + } + + template = phase_templates.get(phase.name, "During the {phase_name} phase, {activities}") + + if not key_events: + return f"The {phase.name} phase lasted {phase.duration:.0f} minutes with {len(phase.events)} events." + + first_event = key_events[0].message + + # Customize based on phase + if phase.name == "detection": + return template.format( + first_event=first_event, + additional_details=f"This phase lasted {phase.duration:.0f} minutes with {len(phase.events)} total events." + ) + elif phase.name == "triage": + actions = [e.message for e in key_events if "investigating" in e.message.lower() or "checking" in e.message.lower()] + investigation_text = "performed various diagnostic activities" if not actions else f"focused on {actions[0]}" + return template.format( + first_event=first_event, + investigation_actions=investigation_text + ) + else: + return f"During the {phase.name} phase ({phase.duration:.0f} minutes), key activities included: {first_event}" + + def _classify_timeline_pattern(self, phases: List[Phase]) -> str: + """Classify the overall timeline pattern.""" + phase_names = [p.name for p in phases] + + if "escalation" in phase_names and phases[0].name == "detection": + return "standard_escalation" + elif len(phases) <= 3: + return "simple_resolution" + elif "review" in phase_names: + return "comprehensive_response" + else: + return "complex_incident" + + def _calculate_complexity_score(self, events: List[Event], phases: List[Phase]) -> float: + """Calculate incident complexity score (0-10).""" + score = 0.0 + + # Phase count contributes to complexity + score += min(len(phases) * 1.5, 6.0) + + # Event count contributes to complexity + score += min(len(events) / 20, 2.0) + + # Duration contributes to complexity + if events: + duration_hours = (events[-1].timestamp - events[0].timestamp).total_seconds() / 3600 + score += min(duration_hours / 2, 2.0) + 
+ return min(score, 10.0) + + def _generate_summary(self, events: List[Event], phases: List[Phase], metrics: Dict) -> Dict[str, Any]: + """Generate comprehensive incident summary.""" + if not events: + return {} + + # Key statistics + start_time = events[0].timestamp + end_time = events[-1].timestamp + duration_minutes = metrics.get("duration_metrics", {}).get("total_duration_minutes", 0) + + # Phase analysis + phase_analysis = {} + for phase in phases: + phase_analysis[phase.name] = { + "duration_minutes": round(phase.duration, 1), + "event_count": len(phase.events), + "start_time": phase.start_time.isoformat(), + "end_time": phase.end_time.isoformat() + } + + # Actor involvement + actors = defaultdict(int) + for event in events: + actors[event.actor] += 1 + + return { + "incident_overview": { + "start_time": start_time.isoformat(), + "end_time": end_time.isoformat(), + "total_duration_minutes": round(duration_minutes, 1), + "total_events": len(events), + "phases_detected": len(phases) + }, + "phase_analysis": phase_analysis, + "key_participants": dict(actors), + "event_sources": dict(defaultdict(int, {e.source: 1 for e in events})), + "complexity_indicators": { + "unique_sources": len(set(e.source for e in events)), + "unique_actors": len(set(e.actor for e in events)), + "high_severity_events": len([e for e in events if e.severity >= 4]), + "phase_transitions": len(phases) - 1 if phases else 0 + } + } + + def _event_to_dict(self, event: Event) -> Dict: + """Convert Event namedtuple to dictionary.""" + return { + "timestamp": event.timestamp.isoformat(), + "source": event.source, + "type": event.type, + "message": event.message, + "severity": event.severity, + "actor": event.actor, + "metadata": event.metadata + } + + def _phase_to_dict(self, phase: Phase) -> Dict: + """Convert Phase namedtuple to dictionary.""" + return { + "name": phase.name, + "start_time": phase.start_time.isoformat(), + "end_time": phase.end_time.isoformat(), + "duration_minutes": 
round(phase.duration, 1), + "event_count": len(phase.events), + "description": phase.description + } + + +def format_json_output(result: Dict) -> str: + """Format result as pretty JSON.""" + return json.dumps(result, indent=2, ensure_ascii=False) + + +def format_text_output(result: Dict) -> str: + """Format result as human-readable text.""" + if "error" in result: + return f"Error: {result['error']}" + + timeline = result["timeline"] + metrics = result["metrics"] + narrative = result["narrative"] + + output = [] + output.append("=" * 80) + output.append("INCIDENT TIMELINE RECONSTRUCTION") + output.append("=" * 80) + output.append("") + + # Overview + time_range = timeline["time_range"] + output.append("OVERVIEW:") + output.append(f" Time Range: {time_range['start']} to {time_range['end']}") + output.append(f" Total Duration: {time_range['duration_minutes']} minutes") + output.append(f" Total Events: {timeline['total_events']}") + output.append(f" Phases Detected: {len(timeline['phases'])}") + output.append("") + + # Phase summary + output.append("PHASES:") + for phase in timeline["phases"]: + output.append(f" {phase['name'].upper()}:") + output.append(f" Start: {phase['start_time']}") + output.append(f" Duration: {phase['duration_minutes']} minutes") + output.append(f" Events: {phase['event_count']}") + output.append(f" Description: {phase['description']}") + output.append("") + + # Key metrics + if "duration_metrics" in metrics: + duration_metrics = metrics["duration_metrics"] + output.append("KEY METRICS:") + output.append(f" Time to Mitigation: {duration_metrics.get('time_to_mitigation_minutes', 'N/A')} minutes") + output.append(f" Time to Resolution: {duration_metrics.get('time_to_resolution_minutes', 'N/A')} minutes") + + if "activity_metrics" in metrics: + activity = metrics["activity_metrics"] + output.append(f" Events per Hour: {activity.get('events_per_hour', 'N/A')}") + output.append(f" Unique Sources: {activity.get('unique_sources', 'N/A')}") + 
output.append("") + + # Narrative + if "summary" in narrative: + output.append("INCIDENT NARRATIVE:") + output.append(narrative["summary"]) + output.append("") + + # Gap analysis + if "gap_analysis" in result and result["gap_analysis"]["gaps"]: + output.append("GAP ANALYSIS:") + for gap in result["gap_analysis"]["gaps"][:5]: # Show first 5 gaps + output.append(f" {gap['type'].replace('_', ' ').title()}: {gap['gap_minutes']} min gap (threshold: {gap['threshold_minutes']} min)") + output.append("") + + output.append("=" * 80) + + return "\n".join(output) + + +def format_markdown_output(result: Dict) -> str: + """Format result as Markdown timeline.""" + if "error" in result: + return f"# Error\n\n{result['error']}" + + timeline = result["timeline"] + narrative = result.get("narrative", {}) + + output = [] + output.append("# Incident Timeline") + output.append("") + + # Overview + time_range = timeline["time_range"] + output.append("## Overview") + output.append("") + output.append(f"- **Duration:** {time_range['duration_minutes']} minutes") + output.append(f"- **Start Time:** {time_range['start']}") + output.append(f"- **End Time:** {time_range['end']}") + output.append(f"- **Total Events:** {timeline['total_events']}") + output.append("") + + # Narrative summary + if "summary" in narrative: + output.append("## Summary") + output.append("") + output.append(narrative["summary"]) + output.append("") + + # Phase timeline + output.append("## Phase Timeline") + output.append("") + + for phase in timeline["phases"]: + output.append(f"### {phase['name'].title()} Phase") + output.append("") + output.append(f"**Duration:** {phase['duration_minutes']} minutes ") + output.append(f"**Start:** {phase['start_time']} ") + output.append(f"**Events:** {phase['event_count']} ") + output.append("") + output.append(phase["description"]) + output.append("") + + # Detailed timeline + output.append("## Detailed Event Timeline") + output.append("") + + for event in timeline["events"]: + 
timestamp = datetime.fromisoformat(event["timestamp"].replace('Z', '+00:00')) + output.append(f"**{timestamp.strftime('%H:%M:%S')}** [{event['source']}] {event['message']}") + output.append("") + + return "\n".join(output) + + +def main(): + """Main function with argument parsing and execution.""" + parser = argparse.ArgumentParser( + description="Reconstruct incident timeline from timestamped events", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + python timeline_reconstructor.py --input events.json --output timeline.md + python timeline_reconstructor.py --input events.json --detect-phases --gap-analysis + cat events.json | python timeline_reconstructor.py --format text + +Input JSON format: + [ + { + "timestamp": "2024-01-01T12:00:00Z", + "source": "monitoring", + "type": "alert", + "message": "High error rate detected", + "severity": "critical", + "actor": "system" + } + ] + """ + ) + + parser.add_argument( + "--input", "-i", + help="Input file path (JSON format) or '-' for stdin" + ) + + parser.add_argument( + "--output", "-o", + help="Output file path (default: stdout)" + ) + + parser.add_argument( + "--format", "-f", + choices=["json", "text", "markdown"], + default="json", + help="Output format (default: json)" + ) + + parser.add_argument( + "--detect-phases", + action="store_true", + help="Enable advanced phase detection" + ) + + parser.add_argument( + "--gap-analysis", + action="store_true", + help="Perform gap analysis on timeline" + ) + + parser.add_argument( + "--min-events", + type=int, + default=1, + help="Minimum number of events required (default: 1)" + ) + + args = parser.parse_args() + + reconstructor = TimelineReconstructor() + + try: + # Read input + if args.input == "-" or (not args.input and not sys.stdin.isatty()): + # Read from stdin + input_text = sys.stdin.read().strip() + if not input_text: + parser.error("No input provided") + events_data = json.loads(input_text) + elif args.input: + # Read from file + 
with open(args.input, 'r') as f: + events_data = json.load(f) + else: + parser.error("No input specified. Use --input or pipe data to stdin.") + + # Validate input + if not isinstance(events_data, list): + parser.error("Input must be a JSON array of events") + + if len(events_data) < args.min_events: + parser.error(f"Minimum {args.min_events} events required") + + # Reconstruct timeline + result = reconstructor.reconstruct_timeline(events_data) + + # Format output + if args.format == "json": + output = format_json_output(result) + elif args.format == "markdown": + output = format_markdown_output(result) + else: + output = format_text_output(result) + + # Write output + if args.output: + with open(args.output, 'w') as f: + f.write(output) + f.write('\n') + else: + print(output) + + except FileNotFoundError as e: + print(f"Error: File not found - {e}", file=sys.stderr) + sys.exit(1) + except json.JSONDecodeError as e: + print(f"Error: Invalid JSON - {e}", file=sys.stderr) + sys.exit(1) + except Exception as e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(1) + + +if __name__ == "__main__": + main() \ No newline at end of file From f6131b24d628f46f130eab14f0aefe02f5f09a64 Mon Sep 17 00:00:00 2001 From: alirezarezvani <5697919+alirezarezvani@users.noreply.github.com> Date: Mon, 16 Feb 2026 13:24:40 +0000 Subject: [PATCH 2/4] chore: sync codex skills symlinks [automated] --- .codex/skills-index.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.codex/skills-index.json b/.codex/skills-index.json index 068e78a..99ba724 100644 --- a/.codex/skills-index.json +++ b/.codex/skills-index.json @@ -51,7 +51,7 @@ "name": "incident-commander", "source": "../../engineering-team/incident-commander", "category": "engineering", - "description": "Production incident management with structured timeline analysis, severity classification (SEV1-4), automated postmortem generation, and SLA tracking. 
Features communication templates, escalation routing, 5-Whys root cause analysis, and MTTR/MTTD metrics for high-reliability engineering teams." + "description": "Skill from engineering-team" }, { "name": "ms365-tenant-manager", From 6707dd18c28aed1920b085e32539fbd0d2aa0900 Mon Sep 17 00:00:00 2001 From: Leo Date: Mon, 16 Feb 2026 13:30:58 +0000 Subject: [PATCH 3/4] feat: add interview-system-designer skill - Comprehensive interview system design toolkit - Interview Loop Designer: generates calibrated loops for any role/level - Question Bank Generator: creates competency-based questions with rubrics - Hiring Calibrator: analyzes interview data for bias and calibration issues - Complete reference materials: competency matrices, bias mitigation, debrief guides - Sample data and expected outputs for testing - Supports all major roles: SWE, PM, Designer, Data, DevOps, Leadership - Zero external dependencies, Python standard library only - Dual output: JSON + human-readable text formats --- .../interview-system-designer/README.md | 309 ++++ .../interview-system-designer/SKILL.md | 458 ++++++ .../assets/sample_interview_results.json | 382 +++++ .../assets/sample_role_definitions.json | 170 +++ .../product_manager_senior_questions.json | 622 ++++++++ .../product_manager_senior_questions.txt | 177 +++ ...ftware_engineer_senior_interview_loop.json | 435 ++++++ ...oftware_engineer_senior_interview_loop.txt | 151 ++ .../hiring_calibrator.py | 1306 +++++++++++++++++ .../loop_designer.py | 908 ++++++++++++ .../question_bank_generator.py | 1060 +++++++++++++ .../references/bias_mitigation_checklist.md | 308 ++++ .../references/competency_matrix_templates.md | 171 +++ .../references/debrief_facilitation_guide.md | 319 ++++ 14 files changed, 6776 insertions(+) create mode 100644 engineering/interview-system-designer/README.md create mode 100644 engineering/interview-system-designer/SKILL.md create mode 100644 
engineering/interview-system-designer/assets/sample_interview_results.json create mode 100644 engineering/interview-system-designer/assets/sample_role_definitions.json create mode 100644 engineering/interview-system-designer/expected_outputs/product_manager_senior_questions.json create mode 100644 engineering/interview-system-designer/expected_outputs/product_manager_senior_questions.txt create mode 100644 engineering/interview-system-designer/expected_outputs/senior_software_engineer_senior_interview_loop.json create mode 100644 engineering/interview-system-designer/expected_outputs/senior_software_engineer_senior_interview_loop.txt create mode 100644 engineering/interview-system-designer/hiring_calibrator.py create mode 100644 engineering/interview-system-designer/loop_designer.py create mode 100644 engineering/interview-system-designer/question_bank_generator.py create mode 100644 engineering/interview-system-designer/references/bias_mitigation_checklist.md create mode 100644 engineering/interview-system-designer/references/competency_matrix_templates.md create mode 100644 engineering/interview-system-designer/references/debrief_facilitation_guide.md diff --git a/engineering/interview-system-designer/README.md b/engineering/interview-system-designer/README.md new file mode 100644 index 0000000..10d37c4 --- /dev/null +++ b/engineering/interview-system-designer/README.md @@ -0,0 +1,309 @@ +# Interview System Designer + +A comprehensive toolkit for designing, optimizing, and calibrating interview processes. This skill provides tools to create role-specific interview loops, generate competency-based question banks, and analyze hiring data for bias and calibration issues. + +## Overview + +The Interview System Designer skill includes three powerful Python tools and comprehensive reference materials to help you build fair, effective, and scalable hiring processes: + +1. **Interview Loop Designer** - Generate calibrated interview loops for any role and level +2. 
**Question Bank Generator** - Create competency-based interview questions with scoring rubrics +3. **Hiring Calibrator** - Analyze interview data to detect bias and calibration issues + +## Tools + +### 1. Interview Loop Designer (`loop_designer.py`) + +Generates complete interview loops tailored to specific roles, levels, and teams. + +**Features:** +- Role-specific competency mapping (SWE, PM, Designer, Data, DevOps, Leadership) +- Level-appropriate interview rounds (junior through principal) +- Optimized scheduling and time allocation +- Interviewer skill requirements +- Standardized scorecard templates + +**Usage:** +```bash +# Basic usage +python3 loop_designer.py --role "Senior Software Engineer" --level senior + +# With team and custom competencies +python3 loop_designer.py --role "Product Manager" --level mid --team growth --competencies leadership,strategy,analytics + +# Using JSON input file +python3 loop_designer.py --input assets/sample_role_definitions.json --output loops/ + +# Specify output format +python3 loop_designer.py --role "Staff Data Scientist" --level staff --format json --output data_scientist_loop.json +``` + +**Input Options:** +- `--role`: Job role title (e.g., "Senior Software Engineer", "Product Manager") +- `--level`: Experience level (junior, mid, senior, staff, principal) +- `--team`: Team or department (optional) +- `--competencies`: Comma-separated list of specific competencies to focus on +- `--input`: JSON file with role definition +- `--output`: Output directory or file path +- `--format`: Output format (json, text, both) - default: both + +**Example Output:** +``` +Interview Loop Design for Senior Software Engineer (Senior Level) +============================================================ +Total Duration: 300 minutes (5h 0m) +Total Rounds: 5 + +INTERVIEW ROUNDS +---------------------------------------- +Round 1: Technical Phone Screen +Duration: 45 minutes +Format: Virtual +Focus Areas: Coding Fundamentals, Problem Solving + 
+Round 2: System Design +Duration: 75 minutes +Format: Collaborative Whiteboard +Focus Areas: System Thinking, Architectural Reasoning +... +``` + +### 2. Question Bank Generator (`question_bank_generator.py`) + +Creates comprehensive interview question banks organized by competency area. + +**Features:** +- Competency-based question organization +- Level-appropriate difficulty progression +- Multiple question types (technical, behavioral, situational) +- Detailed scoring rubrics with calibration examples +- Follow-up probes and conversation guides + +**Usage:** +```bash +# Generate questions for specific competencies +python3 question_bank_generator.py --role "Frontend Engineer" --competencies react,typescript,system-design + +# Create behavioral question bank +python3 question_bank_generator.py --role "Product Manager" --question-types behavioral,leadership --num-questions 15 + +# Generate questions for multiple levels +python3 question_bank_generator.py --role "DevOps Engineer" --levels junior,mid,senior --output questions/ +``` + +**Input Options:** +- `--role`: Job role title +- `--level`: Experience level (default: senior) +- `--competencies`: Comma-separated list of competencies to focus on +- `--question-types`: Types to include (technical, behavioral, situational) +- `--num-questions`: Number of questions to generate (default: 20) +- `--input`: JSON file with role requirements +- `--output`: Output directory or file path +- `--format`: Output format (json, text, both) - default: both + +**Question Types:** +- **Technical**: Coding problems, system design, domain-specific challenges +- **Behavioral**: STAR method questions focusing on past experiences +- **Situational**: Hypothetical scenarios testing decision-making + +### 3. Hiring Calibrator (`hiring_calibrator.py`) + +Analyzes interview scores to detect bias and calibration issues, and provides recommendations.
+ +**Features:** +- Statistical bias detection across demographics +- Interviewer calibration analysis +- Score distribution and trending analysis +- Specific coaching recommendations +- Comprehensive reporting with actionable insights + +**Usage:** +```bash +# Comprehensive analysis +python3 hiring_calibrator.py --input assets/sample_interview_results.json --analysis-type comprehensive + +# Focus on specific areas +python3 hiring_calibrator.py --input interview_data.json --analysis-type bias --competencies technical,leadership + +# Trend analysis over time +python3 hiring_calibrator.py --input historical_data.json --trend-analysis --period quarterly +``` + +**Input Options:** +- `--input`: JSON file with interview results data (required) +- `--analysis-type`: Type of analysis (comprehensive, bias, calibration, interviewer, scoring) +- `--competencies`: Comma-separated list of competencies to focus on +- `--trend-analysis`: Enable trend analysis over time +- `--period`: Time period for trends (daily, weekly, monthly, quarterly) +- `--output`: Output file path +- `--format`: Output format (json, text, both) - default: both + +**Analysis Types:** +- **Comprehensive**: Full analysis including bias, calibration, and recommendations +- **Bias**: Focus on demographic and interviewer bias patterns +- **Calibration**: Interviewer consistency and agreement analysis +- **Interviewer**: Individual interviewer performance and coaching needs +- **Scoring**: Score distribution and pattern analysis + +## Data Formats + +### Role Definition Input (JSON) +```json +{ + "role": "Senior Software Engineer", + "level": "senior", + "team": "platform", + "competencies": ["system_design", "technical_leadership", "mentoring"], + "requirements": { + "years_experience": "5-8", + "technical_skills": ["Python", "AWS", "Kubernetes"], + "leadership_experience": true + } +} +``` + +### Interview Results Input (JSON) +```json +[ + { + "candidate_id": "candidate_001", + "role": "Senior Software 
Engineer", + "interviewer_id": "interviewer_alice", + "date": "2024-01-15T09:00:00Z", + "scores": { + "coding_fundamentals": 3.5, + "system_design": 4.0, + "technical_leadership": 3.0, + "communication": 3.5 + }, + "overall_recommendation": "Hire", + "gender": "male", + "ethnicity": "asian", + "years_experience": 6 + } +] +``` + +## Reference Materials + +### Competency Matrix Templates (`references/competency_matrix_templates.md`) +- Comprehensive competency matrices for all engineering roles +- Level-specific expectations (junior through principal) +- Assessment criteria and growth paths +- Customization guidelines for different company stages and industries + +### Bias Mitigation Checklist (`references/bias_mitigation_checklist.md`) +- Pre-interview preparation checklist +- Interview process bias prevention strategies +- Real-time bias interruption techniques +- Legal compliance reminders +- Emergency response protocols + +### Debrief Facilitation Guide (`references/debrief_facilitation_guide.md`) +- Structured debrief meeting frameworks +- Evidence-based discussion techniques +- Bias interruption strategies +- Decision documentation standards +- Common challenges and solutions + +## Sample Data + +The `assets/` directory contains sample data for testing: + +- `sample_role_definitions.json`: Example role definitions for various positions +- `sample_interview_results.json`: Sample interview data with multiple candidates and interviewers + +## Expected Outputs + +The `expected_outputs/` directory contains examples of tool outputs: + +- Interview loop designs in both JSON and human-readable formats +- Question banks with scoring rubrics and calibration examples +- Calibration analysis reports with bias detection and recommendations + +## Best Practices + +### Interview Loop Design +1. **Competency Focus**: Align interview rounds with role-critical competencies +2. **Level Calibration**: Adjust expectations and question difficulty based on experience level +3. 
**Time Optimization**: Balance thoroughness with candidate experience +4. **Interviewer Training**: Ensure interviewers are qualified and calibrated + +### Question Bank Development +1. **Evidence-Based**: Focus on observable behaviors and concrete examples +2. **Bias Mitigation**: Use structured questions that minimize subjective interpretation +3. **Calibration**: Include examples of different quality responses for consistency +4. **Continuous Improvement**: Regularly update questions based on predictive validity + +### Calibration Analysis +1. **Regular Monitoring**: Analyze hiring data quarterly for bias patterns +2. **Prompt Action**: Address calibration issues immediately with targeted coaching +3. **Data Quality**: Ensure complete and consistent data collection +4. **Legal Compliance**: Monitor for discriminatory patterns and document corrections + +## Installation & Setup + +No external dependencies required - uses Python 3 standard library only. + +```bash +# Clone or download the skill directory +cd interview-system-designer/ + +# Make scripts executable (optional) +chmod +x *.py + +# Test with sample data +python3 loop_designer.py --role "Senior Software Engineer" --level senior +python3 question_bank_generator.py --role "Product Manager" --level mid +python3 hiring_calibrator.py --input assets/sample_interview_results.json +``` + +## Integration + +### With Existing Systems +- **ATS Integration**: Export interview loops as structured data for applicant tracking systems +- **Calendar Systems**: Use scheduling outputs to auto-create interview blocks +- **HR Analytics**: Import calibration reports into broader diversity and inclusion dashboards + +### Custom Workflows +- **Batch Processing**: Process multiple roles or historical data sets +- **Automated Reporting**: Schedule regular calibration analysis +- **Custom Competencies**: Extend frameworks with company-specific competencies + +## Troubleshooting + +### Common Issues + +**"Role not found" errors:** 
+- The tool will map common variations (engineer → software_engineer) +- For custom roles, use the closest standard role and specify custom competencies + +**"Insufficient data" errors:** +- Minimum 5 interviews required for statistical analysis +- Ensure interview data includes required fields (candidate_id, interviewer_id, scores, date) + +**Missing output files:** +- Check file permissions in output directory +- Ensure adequate disk space +- Verify JSON input file format is valid + +### Performance Considerations + +- Interview loop generation: < 1 second +- Question bank generation: 1-3 seconds for 20 questions +- Calibration analysis: 1-5 seconds for 50 interviews, scales linearly + +## Contributing + +To extend this skill: + +1. **New Roles**: Add competency frameworks in `_init_competency_frameworks()` +2. **New Question Types**: Extend question templates in respective generators +3. **New Analysis Types**: Add analysis methods to hiring calibrator +4. **Custom Outputs**: Modify formatting functions for different output needs + +## License & Usage + +This skill is designed for internal company use in hiring process optimization. All bias detection and mitigation features should be reviewed with legal counsel to ensure compliance with local employment laws. + +For questions or support, refer to the comprehensive documentation in each script's docstring and the reference materials provided. 
\ No newline at end of file diff --git a/engineering/interview-system-designer/SKILL.md b/engineering/interview-system-designer/SKILL.md new file mode 100644 index 0000000..adb7d8c --- /dev/null +++ b/engineering/interview-system-designer/SKILL.md @@ -0,0 +1,458 @@ +--- +name: interview-system-designer +description: This skill should be used when the user asks to "design interview processes", "create hiring pipelines", "calibrate interview loops", "generate interview questions", "design competency matrices", "analyze interviewer bias", "create scoring rubrics", "build question banks", or "optimize hiring systems". Use for designing role-specific interview loops, competency assessments, and hiring calibration systems. +--- + +# Interview System Designer + +Comprehensive interview system design, competency assessment, and hiring process optimization. + +## Table of Contents + +- [Quick Start](#quick-start) +- [Tools Overview](#tools-overview) + - [Interview Loop Designer](#1-interview-loop-designer) + - [Question Bank Generator](#2-question-bank-generator) + - [Hiring Calibrator](#3-hiring-calibrator) +- [Interview System Workflows](#interview-system-workflows) + - [Role-Specific Loop Design](#role-specific-loop-design) + - [Competency Matrix Development](#competency-matrix-development) + - [Question Bank Creation](#question-bank-creation) + - [Bias Mitigation Framework](#bias-mitigation-framework) + - [Hiring Bar Calibration](#hiring-bar-calibration) +- [Competency Frameworks](#competency-frameworks) +- [Scoring & Calibration](#scoring--calibration) +- [Reference Documentation](#reference-documentation) +- [Industry Standards](#industry-standards) + +--- + +## Quick Start + +```bash +# Design a complete interview loop for a senior software engineer role +python loop_designer.py --role "Senior Software Engineer" --level senior --team platform --output loops/ + +# Generate a comprehensive question bank for a product manager position +python question_bank_generator.py 
--role "Product Manager" --level senior --competencies leadership,strategy,analytics --output questions/ + +# Analyze interview calibration across multiple candidates and interviewers +python hiring_calibrator.py --input interview_data.json --output calibration_report.json --analysis-type comprehensive +``` + +--- + +## Tools Overview + +### 1. Interview Loop Designer + +Generates calibrated interview loops tailored to specific roles, levels, and teams. + +**Input:** Role definition (title, level, team, competency requirements) +**Output:** Complete interview loop with rounds, focus areas, time allocation, scorecard templates + +**Key Features:** +- Role-specific competency mapping +- Level-appropriate question difficulty +- Interviewer skill requirements +- Time-optimized scheduling +- Standardized scorecards + +**Usage:** +```bash +# Design loop for a specific role +python loop_designer.py --role "Staff Data Scientist" --level staff --team ml-platform + +# Generate loop with specific focus areas +python loop_designer.py --role "Engineering Manager" --level senior --competencies leadership,technical,strategy + +# Create loop for multiple levels +python loop_designer.py --role "Backend Engineer" --levels junior,mid,senior --output loops/backend/ +``` + +### 2. Question Bank Generator + +Creates comprehensive, competency-based interview questions with detailed scoring criteria. 
+ +**Input:** Role requirements, competency areas, experience level +**Output:** Structured question bank with scoring rubrics, follow-up probes, and calibration examples + +**Key Features:** +- Competency-based question organization +- Level-appropriate difficulty progression +- Behavioral and technical question types +- Anti-bias question design +- Calibration examples (poor/good/great answers) + +**Usage:** +```bash +# Generate questions for technical competencies +python question_bank_generator.py --role "Frontend Engineer" --competencies react,typescript,system-design + +# Create behavioral question bank +python question_bank_generator.py --role "Product Manager" --question-types behavioral,leadership --output pm_questions/ + +# Generate questions for all levels +python question_bank_generator.py --role "DevOps Engineer" --levels junior,mid,senior,staff +``` + +### 3. Hiring Calibrator + +Analyzes interview scores to detect bias and calibration issues, and recommends improvements. + +**Input:** Interview results data (candidate scores, interviewer feedback, demographics) +**Output:** Calibration analysis, bias detection report, interviewer coaching recommendations + +**Key Features:** +- Statistical bias detection +- Interviewer calibration analysis +- Score distribution analysis +- Recommendation engine +- Trend tracking over time + +**Usage:** +```bash +# Analyze calibration across all interviews +python hiring_calibrator.py --input interview_results.json --analysis-type comprehensive + +# Focus on specific competency areas +python hiring_calibrator.py --input data.json --competencies technical,leadership --output bias_report.json + +# Track calibration trends over time +python hiring_calibrator.py --input historical_data.json --trend-analysis --period quarterly +``` + +--- + +## Interview System Workflows + +### Role-Specific Loop Design + +#### Software Engineering Roles + +**Junior/Mid Software Engineer (2-4 years)** +- **Duration:** 3-4 hours across 3-4 
rounds +- **Focus Areas:** Coding fundamentals, debugging, system understanding, growth mindset +- **Rounds:** + 1. Technical Phone Screen (45min) - Coding fundamentals, algorithms + 2. Coding Deep Dive (60min) - Problem-solving, code quality, testing + 3. System Design Basics (45min) - Component interaction, basic scalability + 4. Behavioral & Values (30min) - Team collaboration, learning agility + +**Senior Software Engineer (5-8 years)** +- **Duration:** 4-5 hours across 4-5 rounds +- **Focus Areas:** System design, technical leadership, mentoring capability, domain expertise +- **Rounds:** + 1. Technical Phone Screen (45min) - Advanced algorithms, optimization + 2. System Design (60min) - Scalability, trade-offs, architectural decisions + 3. Coding Excellence (60min) - Code quality, testing strategies, refactoring + 4. Technical Leadership (45min) - Mentoring, technical decisions, cross-team collaboration + 5. Behavioral & Culture (30min) - Leadership examples, conflict resolution + +**Staff+ Engineer (8+ years)** +- **Duration:** 5-6 hours across 5-6 rounds +- **Focus Areas:** Architectural vision, organizational impact, technical strategy, cross-functional leadership +- **Rounds:** + 1. Technical Phone Screen (45min) - System architecture, complex problem-solving + 2. Architecture Design (90min) - Large-scale systems, technology choices, evolution patterns + 3. Technical Strategy (60min) - Technical roadmaps, technology adoption, risk assessment + 4. Leadership & Influence (60min) - Cross-team impact, technical vision, stakeholder management + 5. Coding & Best Practices (45min) - Code quality standards, development processes + 6. Cultural & Strategic Fit (30min) - Company values, strategic thinking + +#### Product Management Roles + +**Product Manager (3-6 years)** +- **Duration:** 3-4 hours across 4 rounds +- **Focus Areas:** Product sense, analytical thinking, stakeholder management, execution +- **Rounds:** + 1. 
Product Sense (60min) - Feature prioritization, user empathy, market understanding + 2. Analytical Thinking (45min) - Data interpretation, metrics design, experimentation + 3. Execution & Process (45min) - Project management, cross-functional collaboration + 4. Behavioral & Leadership (30min) - Stakeholder management, conflict resolution + +**Senior Product Manager (6-10 years)** +- **Duration:** 4-5 hours across 4-5 rounds +- **Focus Areas:** Product strategy, team leadership, business impact, market analysis +- **Rounds:** + 1. Product Strategy (75min) - Market analysis, competitive positioning, roadmap planning + 2. Leadership & Influence (60min) - Team building, stakeholder management, decision-making + 3. Data & Analytics (45min) - Advanced metrics, experimentation design, business intelligence + 4. Technical Collaboration (45min) - Technical trade-offs, engineering partnership + 5. Case Study Presentation (45min) - Past impact, lessons learned, strategic thinking + +#### Design Roles + +**UX Designer (2-5 years)** +- **Duration:** 3-4 hours across 3-4 rounds +- **Focus Areas:** Design process, user research, visual design, collaboration +- **Rounds:** + 1. Portfolio Review (60min) - Design process, problem-solving approach, visual skills + 2. Design Challenge (90min) - User-centered design, wireframing, iteration + 3. Collaboration & Process (45min) - Cross-functional work, feedback incorporation + 4. Behavioral & Values (30min) - User advocacy, creative problem-solving + +**Senior UX Designer (5+ years)** +- **Duration:** 4-5 hours across 4-5 rounds +- **Focus Areas:** Design leadership, system thinking, research methodology, business impact +- **Rounds:** + 1. Portfolio Deep Dive (75min) - Design impact, methodology, leadership examples + 2. Design System Challenge (90min) - Systems thinking, scalability, consistency + 3. Research & Strategy (60min) - User research methods, data-driven design decisions + 4. 
Leadership & Mentoring (45min) - Design team leadership, process improvement + 5. Business & Strategy (30min) - Design's business impact, stakeholder management + +### Competency Matrix Development + +#### Technical Competencies + +**Software Engineering** +- **Coding Proficiency:** Algorithm design, data structures, language expertise +- **System Design:** Architecture patterns, scalability, performance optimization +- **Testing & Quality:** Unit testing, integration testing, code review practices +- **DevOps & Tools:** CI/CD, monitoring, debugging, development workflows + +**Data Science & Analytics** +- **Statistical Analysis:** Statistical methods, hypothesis testing, experimental design +- **Machine Learning:** Algorithm selection, model evaluation, feature engineering +- **Data Engineering:** ETL processes, data pipeline design, data quality +- **Business Intelligence:** Metrics design, dashboard creation, stakeholder communication + +**Product Management** +- **Product Strategy:** Market analysis, competitive research, roadmap planning +- **User Research:** User interviews, usability testing, persona development +- **Data Analysis:** Metrics interpretation, A/B testing, cohort analysis +- **Technical Understanding:** API design, database concepts, system architecture + +#### Behavioral Competencies + +**Leadership & Influence** +- **Team Building:** Hiring, onboarding, team culture development +- **Mentoring & Coaching:** Skill development, career guidance, feedback delivery +- **Strategic Thinking:** Long-term planning, vision setting, decision-making frameworks +- **Change Management:** Process improvement, organizational change, resistance handling + +**Communication & Collaboration** +- **Stakeholder Management:** Expectation setting, conflict resolution, alignment building +- **Cross-Functional Partnership:** Engineering-Product-Design collaboration +- **Presentation Skills:** Technical communication, executive briefings, documentation +- **Active 
Listening:** Empathy, question asking, perspective taking + +**Problem-Solving & Innovation** +- **Analytical Thinking:** Problem decomposition, root cause analysis, hypothesis formation +- **Creative Problem-Solving:** Alternative solution generation, constraint navigation +- **Learning Agility:** Skill acquisition, adaptation to change, knowledge transfer +- **Risk Assessment:** Uncertainty navigation, trade-off analysis, mitigation planning + +### Question Bank Creation + +#### Technical Questions by Level + +**Junior Level Questions** +- **Coding:** "Implement a function to find the second largest element in an array" +- **System Design:** "How would you design a simple URL shortener for 1000 users?" +- **Debugging:** "Walk through how you would debug a slow-loading web page" + +**Senior Level Questions** +- **Architecture:** "Design a real-time chat system supporting 1M concurrent users" +- **Leadership:** "Describe how you would onboard a new team member in your area" +- **Trade-offs:** "Compare microservices vs monolith for a rapidly scaling startup" + +**Staff+ Level Questions** +- **Strategy:** "How would you evaluate and introduce a new programming language to the organization?" +- **Influence:** "Describe a time you drove technical consensus across multiple teams" +- **Vision:** "How do you balance technical debt against feature development?" 
+ +#### Behavioral Questions Framework + +**STAR Method Implementation** +- **Situation:** Context and background of the scenario +- **Task:** Specific challenge or goal that needed to be addressed +- **Action:** Concrete steps taken to address the challenge +- **Result:** Measurable outcomes and lessons learned + +**Sample Questions:** +- "Tell me about a time you had to influence a decision without formal authority" +- "Describe a situation where you had to deliver difficult feedback to a colleague" +- "Give an example of when you had to adapt your communication style for different audiences" +- "Walk me through a time when you had to make a decision with incomplete information" + +### Bias Mitigation Framework + +#### Structural Bias Prevention + +**Interview Panel Composition** +- Diverse interviewer panels (gender, ethnicity, experience level) +- Rotating panel assignments to prevent pattern bias +- Anonymous resume screening for initial phone screens +- Standardized question sets to ensure consistency + +**Process Standardization** +- Structured interview guides with required probing questions +- Consistent time allocation across all candidates +- Standardized evaluation criteria and scoring rubrics +- Required justification for all scoring decisions + +#### Cognitive Bias Recognition + +**Common Interview Biases** +- **Halo Effect:** One strong impression influences overall assessment +- **Confirmation Bias:** Seeking information that confirms initial impressions +- **Similarity Bias:** Favoring candidates with similar backgrounds/experiences +- **Contrast Effect:** Comparing candidates against each other rather than against the defined standard +- **Anchoring Bias:** Over-relying on the first piece of information received + +**Mitigation Strategies** +- Pre-interview bias awareness training for all interviewers +- Structured debrief sessions with independent score recording +- Regular calibration sessions with example candidate discussions +- Statistical monitoring of scoring 
patterns by interviewer and demographic + +### Hiring Bar Calibration + +#### Calibration Methodology + +**Regular Calibration Sessions** +- Monthly interviewer calibration meetings +- Shadow interviewing for new interviewers (minimum 5 sessions) +- Quarterly cross-team calibration reviews +- Annual hiring bar review and adjustment process + +**Performance Tracking** +- New hire performance correlation with interview scores +- Interviewer accuracy tracking (prediction vs actual performance) +- False positive/negative analysis +- Offer acceptance rate analysis by interviewer + +**Feedback Loops** +- Six-month new hire performance reviews +- Manager feedback on interview process effectiveness +- Candidate experience surveys and feedback integration +- Continuous process improvement based on data analysis + +--- + +## Competency Frameworks + +### Engineering Competency Levels + +#### Level 1-2: Individual Contributor (Junior/Mid) +- **Technical Skills:** Language proficiency, testing basics, code review participation +- **Problem Solving:** Structured approach to debugging, logical thinking +- **Communication:** Clear status updates, effective question asking +- **Learning:** Proactive skill development, mentorship seeking + +#### Level 3-4: Senior Individual Contributor +- **Technical Leadership:** Architecture decisions, code quality advocacy +- **Mentoring:** Junior developer guidance, knowledge sharing +- **Project Ownership:** End-to-end feature delivery, stakeholder communication +- **Innovation:** Process improvement, technology evaluation + +#### Level 5-6: Staff+ Engineer +- **Organizational Impact:** Cross-team technical leadership, strategic planning +- **Technical Vision:** Long-term architectural planning, technology roadmap +- **People Development:** Team growth, hiring contribution, culture building +- **External Influence:** Industry contribution, thought leadership + +### Product Management Competency Levels + +#### Level 1-2: Associate/Product 
Manager +- **Product Execution:** Feature specification, requirements gathering +- **User Focus:** User research participation, feedback collection +- **Data Analysis:** Basic metrics analysis, experiment interpretation +- **Stakeholder Management:** Cross-functional collaboration, communication + +#### Level 3-4: Senior Product Manager +- **Strategic Thinking:** Market analysis, competitive positioning +- **Leadership:** Cross-functional team leadership, decision making +- **Business Impact:** Revenue impact, market share growth +- **Process Innovation:** Product development process improvement + +#### Level 5-6: Principal Product Manager +- **Vision Setting:** Product strategy, market direction +- **Organizational Influence:** Executive communication, team building +- **Innovation Leadership:** New market creation, disruptive thinking +- **Talent Development:** PM team growth, hiring leadership + +--- + +## Scoring & Calibration + +### Scoring Rubric Framework + +#### 4-Point Scoring Scale +- **4 - Exceeds Expectations:** Demonstrates mastery beyond required level +- **3 - Meets Expectations:** Solid performance meeting all requirements +- **2 - Partially Meets:** Shows potential but has development areas +- **1 - Does Not Meet:** Significant gaps in required competencies + +#### Competency-Specific Scoring + +**Technical Competencies** +- Code Quality (4): Clean, maintainable, well-tested code with excellent documentation +- Code Quality (3): Functional code with good structure and basic testing +- Code Quality (2): Working code with some structural issues or missing tests +- Code Quality (1): Non-functional or poorly structured code with significant issues + +**Leadership Competencies** +- Team Influence (4): Drives team success, develops others, creates lasting positive change +- Team Influence (3): Contributes positively to team dynamics and outcomes +- Team Influence (2): Shows leadership potential with some effective examples +- Team Influence (1): Limited 
evidence of leadership ability or negative team impact + +### Calibration Standards + +#### Statistical Benchmarks +- Target score distribution: 20% (4s), 40% (3s), 30% (2s), 10% (1s) +- Interviewer consistency target: <0.5 standard deviation from team average +- Pass rate target: 15-25% for most roles (varies by level and market conditions) +- Time to hire target: 2-3 weeks from first interview to offer + +#### Quality Metrics +- New hire 6-month performance correlation: >0.6 with interview scores +- Interviewer agreement rate: >80% within 1 point on final recommendations +- Candidate experience satisfaction: >4.0/5.0 average rating +- Offer acceptance rate: >85% for preferred candidates + +--- + +## Reference Documentation + +### Interview Templates +- Role-specific interview guides and question banks +- Scorecard templates for consistent evaluation +- Debrief facilitation guides for effective team discussions + +### Bias Mitigation Resources +- Unconscious bias training materials and exercises +- Structured interviewing best practices checklist +- Demographic diversity tracking and reporting templates + +### Calibration Tools +- Interview performance correlation analysis templates +- Interviewer coaching and development frameworks +- Hiring pipeline metrics and dashboard specifications + +--- + +## Industry Standards + +### Best Practices Integration +- Google's structured interviewing methodology +- Amazon's Leadership Principles assessment framework +- Microsoft's competency-based evaluation system +- Netflix's culture fit assessment approach + +### Compliance & Legal Considerations +- EEOC compliance requirements and documentation +- ADA accommodation procedures and guidelines +- International hiring law considerations +- Privacy and data protection requirements (GDPR, CCPA) + +### Continuous Improvement Framework +- Regular process auditing and refinement cycles +- Industry benchmarking and comparative analysis +- Technology integration for interview 
optimization +- Candidate experience enhancement initiatives + +This comprehensive interview system design framework provides the structure and tools necessary to build fair, effective, and scalable hiring processes that consistently identify top talent while minimizing bias and maximizing candidate experience. \ No newline at end of file diff --git a/engineering/interview-system-designer/assets/sample_interview_results.json b/engineering/interview-system-designer/assets/sample_interview_results.json new file mode 100644 index 0000000..8646b27 --- /dev/null +++ b/engineering/interview-system-designer/assets/sample_interview_results.json @@ -0,0 +1,382 @@ +[ + { + "candidate_id": "candidate_001", + "role": "Senior Software Engineer", + "interviewer_id": "interviewer_alice", + "date": "2024-01-15T09:00:00Z", + "scores": { + "coding_fundamentals": 3.5, + "system_design": 4.0, + "technical_leadership": 3.0, + "communication": 3.5, + "problem_solving": 4.0 + }, + "overall_recommendation": "Hire", + "gender": "male", + "ethnicity": "asian", + "years_experience": 6, + "university_tier": "tier_1", + "previous_company_size": "large" + }, + { + "candidate_id": "candidate_001", + "role": "Senior Software Engineer", + "interviewer_id": "interviewer_bob", + "date": "2024-01-15T11:00:00Z", + "scores": { + "system_design": 3.5, + "technical_leadership": 3.5, + "mentoring": 3.0, + "cross_team_collaboration": 4.0, + "strategic_thinking": 3.5 + }, + "overall_recommendation": "Hire", + "gender": "male", + "ethnicity": "asian", + "years_experience": 6, + "university_tier": "tier_1", + "previous_company_size": "large" + }, + { + "candidate_id": "candidate_002", + "role": "Senior Software Engineer", + "interviewer_id": "interviewer_alice", + "date": "2024-01-16T09:00:00Z", + "scores": { + "coding_fundamentals": 2.5, + "system_design": 3.0, + "technical_leadership": 2.0, + "communication": 3.0, + "problem_solving": 3.0 + }, + "overall_recommendation": "No Hire", + "gender": "female", + 
"ethnicity": "hispanic", + "years_experience": 5, + "university_tier": "tier_2", + "previous_company_size": "startup" + }, + { + "candidate_id": "candidate_002", + "role": "Senior Software Engineer", + "interviewer_id": "interviewer_charlie", + "date": "2024-01-16T11:00:00Z", + "scores": { + "system_design": 2.0, + "technical_leadership": 2.5, + "mentoring": 2.0, + "cross_team_collaboration": 3.0, + "strategic_thinking": 2.5 + }, + "overall_recommendation": "No Hire", + "gender": "female", + "ethnicity": "hispanic", + "years_experience": 5, + "university_tier": "tier_2", + "previous_company_size": "startup" + }, + { + "candidate_id": "candidate_003", + "role": "Senior Software Engineer", + "interviewer_id": "interviewer_david", + "date": "2024-01-17T14:00:00Z", + "scores": { + "coding_fundamentals": 4.0, + "system_design": 3.5, + "technical_leadership": 4.0, + "communication": 4.0, + "problem_solving": 3.5 + }, + "overall_recommendation": "Strong Hire", + "gender": "male", + "ethnicity": "white", + "years_experience": 8, + "university_tier": "tier_1", + "previous_company_size": "large" + }, + { + "candidate_id": "candidate_003", + "role": "Senior Software Engineer", + "interviewer_id": "interviewer_alice", + "date": "2024-01-17T16:00:00Z", + "scores": { + "system_design": 4.0, + "technical_leadership": 4.0, + "mentoring": 3.5, + "cross_team_collaboration": 4.0, + "strategic_thinking": 3.5 + }, + "overall_recommendation": "Hire", + "gender": "male", + "ethnicity": "white", + "years_experience": 8, + "university_tier": "tier_1", + "previous_company_size": "large" + }, + { + "candidate_id": "candidate_004", + "role": "Product Manager", + "interviewer_id": "interviewer_emma", + "date": "2024-01-18T10:00:00Z", + "scores": { + "product_strategy": 3.0, + "user_research": 3.5, + "data_analysis": 4.0, + "stakeholder_management": 3.0, + "communication": 3.5 + }, + "overall_recommendation": "Hire", + "gender": "female", + "ethnicity": "black", + "years_experience": 4, + 
"university_tier": "tier_2", + "previous_company_size": "medium" + }, + { + "candidate_id": "candidate_005", + "role": "Product Manager", + "interviewer_id": "interviewer_frank", + "date": "2024-01-19T13:00:00Z", + "scores": { + "product_strategy": 2.5, + "user_research": 2.0, + "data_analysis": 3.0, + "stakeholder_management": 2.5, + "communication": 3.0 + }, + "overall_recommendation": "No Hire", + "gender": "male", + "ethnicity": "white", + "years_experience": 3, + "university_tier": "tier_3", + "previous_company_size": "startup" + }, + { + "candidate_id": "candidate_006", + "role": "Junior Software Engineer", + "interviewer_id": "interviewer_alice", + "date": "2024-01-20T09:00:00Z", + "scores": { + "coding_fundamentals": 3.0, + "debugging": 3.5, + "testing_basics": 3.0, + "collaboration": 4.0, + "learning_agility": 3.5 + }, + "overall_recommendation": "Hire", + "gender": "female", + "ethnicity": "asian", + "years_experience": 1, + "university_tier": "bootcamp", + "previous_company_size": "none" + }, + { + "candidate_id": "candidate_007", + "role": "Junior Software Engineer", + "interviewer_id": "interviewer_bob", + "date": "2024-01-21T10:30:00Z", + "scores": { + "coding_fundamentals": 2.0, + "debugging": 2.5, + "testing_basics": 2.0, + "collaboration": 3.0, + "learning_agility": 3.0 + }, + "overall_recommendation": "No Hire", + "gender": "male", + "ethnicity": "hispanic", + "years_experience": 0, + "university_tier": "tier_2", + "previous_company_size": "none" + }, + { + "candidate_id": "candidate_008", + "role": "Staff Frontend Engineer", + "interviewer_id": "interviewer_grace", + "date": "2024-01-22T14:00:00Z", + "scores": { + "frontend_architecture": 4.0, + "system_design": 4.0, + "technical_leadership": 4.0, + "team_building": 3.5, + "strategic_thinking": 3.5 + }, + "overall_recommendation": "Strong Hire", + "gender": "female", + "ethnicity": "white", + "years_experience": 9, + "university_tier": "tier_1", + "previous_company_size": "large" + }, + { + 
"candidate_id": "candidate_008", + "role": "Staff Frontend Engineer", + "interviewer_id": "interviewer_henry", + "date": "2024-01-22T16:00:00Z", + "scores": { + "frontend_architecture": 3.5, + "technical_leadership": 4.0, + "team_building": 4.0, + "cross_functional_collaboration": 4.0, + "organizational_impact": 3.5 + }, + "overall_recommendation": "Hire", + "gender": "female", + "ethnicity": "white", + "years_experience": 9, + "university_tier": "tier_1", + "previous_company_size": "large" + }, + { + "candidate_id": "candidate_009", + "role": "Data Scientist", + "interviewer_id": "interviewer_ivan", + "date": "2024-01-23T11:00:00Z", + "scores": { + "statistical_analysis": 3.5, + "machine_learning": 4.0, + "data_engineering": 3.0, + "business_acumen": 3.5, + "communication": 3.0 + }, + "overall_recommendation": "Hire", + "gender": "male", + "ethnicity": "indian", + "years_experience": 5, + "university_tier": "tier_1", + "previous_company_size": "medium" + }, + { + "candidate_id": "candidate_010", + "role": "DevOps Engineer", + "interviewer_id": "interviewer_jane", + "date": "2024-01-24T15:00:00Z", + "scores": { + "infrastructure_automation": 3.5, + "ci_cd_design": 4.0, + "monitoring_observability": 3.0, + "security_implementation": 3.5, + "incident_management": 4.0 + }, + "overall_recommendation": "Hire", + "gender": "female", + "ethnicity": "black", + "years_experience": 6, + "university_tier": "tier_2", + "previous_company_size": "startup" + }, + { + "candidate_id": "candidate_011", + "role": "UX Designer", + "interviewer_id": "interviewer_karl", + "date": "2024-01-25T10:00:00Z", + "scores": { + "design_process": 4.0, + "user_research": 3.5, + "design_systems": 4.0, + "cross_functional_collaboration": 3.5, + "design_leadership": 3.0 + }, + "overall_recommendation": "Hire", + "gender": "non_binary", + "ethnicity": "white", + "years_experience": 7, + "university_tier": "tier_1", + "previous_company_size": "medium" + }, + { + "candidate_id": "candidate_012", + 
"role": "Engineering Manager", + "interviewer_id": "interviewer_lisa", + "date": "2024-01-26T13:30:00Z", + "scores": { + "people_leadership": 4.0, + "technical_background": 3.5, + "strategic_thinking": 3.5, + "performance_management": 4.0, + "cross_functional_leadership": 3.5 + }, + "overall_recommendation": "Hire", + "gender": "male", + "ethnicity": "white", + "years_experience": 8, + "university_tier": "tier_1", + "previous_company_size": "large" + }, + { + "candidate_id": "candidate_013", + "role": "Senior Software Engineer", + "interviewer_id": "interviewer_alice", + "date": "2024-01-27T09:00:00Z", + "scores": { + "coding_fundamentals": 4.0, + "system_design": 4.0, + "technical_leadership": 4.0, + "communication": 4.0, + "problem_solving": 4.0 + }, + "overall_recommendation": "Strong Hire", + "gender": "female", + "ethnicity": "asian", + "years_experience": 7, + "university_tier": "tier_1", + "previous_company_size": "large" + }, + { + "candidate_id": "candidate_013", + "role": "Senior Software Engineer", + "interviewer_id": "interviewer_charlie", + "date": "2024-01-27T11:00:00Z", + "scores": { + "system_design": 3.5, + "technical_leadership": 3.5, + "mentoring": 4.0, + "cross_team_collaboration": 4.0, + "strategic_thinking": 3.5 + }, + "overall_recommendation": "Hire", + "gender": "female", + "ethnicity": "asian", + "years_experience": 7, + "university_tier": "tier_1", + "previous_company_size": "large" + }, + { + "candidate_id": "candidate_014", + "role": "Senior Software Engineer", + "interviewer_id": "interviewer_david", + "date": "2024-01-28T14:00:00Z", + "scores": { + "coding_fundamentals": 1.5, + "system_design": 2.0, + "technical_leadership": 1.0, + "communication": 2.0, + "problem_solving": 2.0 + }, + "overall_recommendation": "Strong No Hire", + "gender": "male", + "ethnicity": "white", + "years_experience": 4, + "university_tier": "tier_3", + "previous_company_size": "startup" + }, + { + "candidate_id": "candidate_015", + "role": "Product Manager", + 
"interviewer_id": "interviewer_emma", + "date": "2024-01-29T11:00:00Z", + "scores": { + "product_strategy": 4.0, + "user_research": 3.5, + "data_analysis": 4.0, + "stakeholder_management": 4.0, + "communication": 3.5 + }, + "overall_recommendation": "Strong Hire", + "gender": "male", + "ethnicity": "black", + "years_experience": 5, + "university_tier": "tier_2", + "previous_company_size": "medium" + } +] \ No newline at end of file diff --git a/engineering/interview-system-designer/assets/sample_role_definitions.json b/engineering/interview-system-designer/assets/sample_role_definitions.json new file mode 100644 index 0000000..405f052 --- /dev/null +++ b/engineering/interview-system-designer/assets/sample_role_definitions.json @@ -0,0 +1,170 @@ +[ + { + "role": "Senior Software Engineer", + "level": "senior", + "team": "platform", + "department": "engineering", + "competencies": [ + "system_design", + "coding_fundamentals", + "technical_leadership", + "mentoring", + "cross_team_collaboration" + ], + "requirements": { + "years_experience": "5-8", + "technical_skills": ["Python", "Java", "Docker", "Kubernetes", "AWS"], + "leadership_experience": true, + "mentoring_required": true + }, + "hiring_bar": "high", + "interview_focus": ["technical_depth", "system_architecture", "leadership_potential"] + }, + { + "role": "Product Manager", + "level": "mid", + "team": "growth", + "department": "product", + "competencies": [ + "product_strategy", + "user_research", + "data_analysis", + "stakeholder_management", + "cross_functional_leadership" + ], + "requirements": { + "years_experience": "3-5", + "domain_knowledge": ["user_analytics", "experimentation", "product_metrics"], + "leadership_experience": false, + "technical_background": "preferred" + }, + "hiring_bar": "medium-high", + "interview_focus": ["product_sense", "analytical_thinking", "execution_ability"] + }, + { + "role": "Staff Frontend Engineer", + "level": "staff", + "team": "consumer", + "department": 
"engineering", + "competencies": [ + "frontend_architecture", + "system_design", + "technical_leadership", + "team_building", + "cross_functional_collaboration" + ], + "requirements": { + "years_experience": "8+", + "technical_skills": ["React", "TypeScript", "GraphQL", "Webpack", "Performance Optimization"], + "leadership_experience": true, + "architecture_experience": true + }, + "hiring_bar": "very-high", + "interview_focus": ["architectural_vision", "technical_strategy", "organizational_impact"] + }, + { + "role": "Data Scientist", + "level": "mid", + "team": "ml_platform", + "department": "data", + "competencies": [ + "statistical_analysis", + "machine_learning", + "data_engineering", + "business_acumen", + "communication" + ], + "requirements": { + "years_experience": "3-6", + "technical_skills": ["Python", "SQL", "TensorFlow", "Spark", "Statistics"], + "domain_knowledge": ["ML algorithms", "experimentation", "data_pipelines"], + "leadership_experience": false + }, + "hiring_bar": "high", + "interview_focus": ["technical_depth", "problem_solving", "business_impact"] + }, + { + "role": "DevOps Engineer", + "level": "senior", + "team": "infrastructure", + "department": "engineering", + "competencies": [ + "infrastructure_automation", + "ci_cd_design", + "monitoring_observability", + "security_implementation", + "incident_management" + ], + "requirements": { + "years_experience": "5-7", + "technical_skills": ["Kubernetes", "Terraform", "AWS", "Docker", "Monitoring"], + "security_background": "required", + "leadership_experience": "preferred" + }, + "hiring_bar": "high", + "interview_focus": ["system_reliability", "automation_expertise", "operational_excellence"] + }, + { + "role": "UX Designer", + "level": "senior", + "team": "design_systems", + "department": "design", + "competencies": [ + "design_process", + "user_research", + "design_systems", + "cross_functional_collaboration", + "design_leadership" + ], + "requirements": { + "years_experience": "5-8", + 
"portfolio_quality": "high", + "research_experience": true, + "systems_thinking": true + }, + "hiring_bar": "high", + "interview_focus": ["design_process", "systems_thinking", "user_advocacy"] + }, + { + "role": "Engineering Manager", + "level": "senior", + "team": "backend", + "department": "engineering", + "competencies": [ + "people_leadership", + "technical_background", + "strategic_thinking", + "performance_management", + "cross_functional_leadership" + ], + "requirements": { + "years_experience": "6-10", + "management_experience": "2+ years", + "technical_background": "required", + "hiring_experience": true + }, + "hiring_bar": "very-high", + "interview_focus": ["people_leadership", "technical_judgment", "organizational_impact"] + }, + { + "role": "Junior Software Engineer", + "level": "junior", + "team": "web", + "department": "engineering", + "competencies": [ + "coding_fundamentals", + "debugging", + "testing_basics", + "collaboration", + "learning_agility" + ], + "requirements": { + "years_experience": "0-2", + "technical_skills": ["JavaScript", "HTML/CSS", "Git", "Basic Algorithms"], + "education": "CS degree or bootcamp", + "growth_mindset": true + }, + "hiring_bar": "medium", + "interview_focus": ["coding_ability", "problem_solving", "potential_assessment"] + } +] \ No newline at end of file diff --git a/engineering/interview-system-designer/expected_outputs/product_manager_senior_questions.json b/engineering/interview-system-designer/expected_outputs/product_manager_senior_questions.json new file mode 100644 index 0000000..744ffa1 --- /dev/null +++ b/engineering/interview-system-designer/expected_outputs/product_manager_senior_questions.json @@ -0,0 +1,622 @@ +{ + "role": "Product Manager", + "level": "senior", + "competencies": [ + "strategy", + "analytics", + "business_strategy", + "product_strategy", + "stakeholder_management", + "p&l_responsibility", + "leadership", + "team_leadership", + "user_research", + "data_analysis" + ], + "question_types": 
[ + "technical", + "behavioral", + "situational" + ], + "generated_at": "2026-02-16T13:27:41.303329", + "total_questions": 20, + "questions": [ + { + "question": "What challenges have you faced related to p&l responsibility and how did you overcome them?", + "competency": "p&l_responsibility", + "type": "challenge_based", + "focus_areas": [ + "problem_solving", + "learning_from_experience" + ] + }, + { + "question": "Analyze conversion funnel data to identify the biggest drop-off point and propose solutions.", + "competency": "data_analysis", + "type": "analytical", + "difficulty": "medium", + "time_limit": 45, + "key_concepts": [ + "funnel_analysis", + "conversion_optimization", + "statistical_significance" + ] + }, + { + "question": "What challenges have you faced related to team leadership and how did you overcome them?", + "competency": "team_leadership", + "type": "challenge_based", + "focus_areas": [ + "problem_solving", + "learning_from_experience" + ] + }, + { + "question": "Design a go-to-market strategy for a new B2B SaaS product entering a competitive market.", + "competency": "product_strategy", + "type": "strategic", + "difficulty": "hard", + "time_limit": 60, + "key_concepts": [ + "market_analysis", + "competitive_positioning", + "pricing_strategy", + "channel_strategy" + ] + }, + { + "question": "What challenges have you faced related to business strategy and how did you overcome them?", + "competency": "business_strategy", + "type": "challenge_based", + "focus_areas": [ + "problem_solving", + "learning_from_experience" + ] + }, + { + "question": "Describe your experience with business strategy in your current or previous role.", + "competency": "business_strategy", + "type": "experience", + "focus_areas": [ + "experience_depth", + "practical_application" + ] + }, + { + "question": "Describe your experience with team leadership in your current or previous role.", + "competency": "team_leadership", + "type": "experience", + "focus_areas": [ + 
"experience_depth", + "practical_application" + ] + }, + { + "question": "Describe a situation where you had to influence someone without having direct authority over them.", + "competency": "leadership", + "type": "behavioral", + "method": "STAR", + "focus_areas": [ + "influence", + "persuasion", + "stakeholder_management" + ] + }, + { + "question": "Given a dataset of user activities, calculate the daily active users for the past month.", + "competency": "data_analysis", + "type": "analytical", + "difficulty": "easy", + "time_limit": 30, + "key_concepts": [ + "sql_basics", + "date_functions", + "aggregation" + ] + }, + { + "question": "Describe your experience with analytics in your current or previous role.", + "competency": "analytics", + "type": "experience", + "focus_areas": [ + "experience_depth", + "practical_application" + ] + }, + { + "question": "How would you prioritize features for a mobile app with limited engineering resources?", + "competency": "product_strategy", + "type": "case_study", + "difficulty": "medium", + "time_limit": 45, + "key_concepts": [ + "prioritization_frameworks", + "resource_allocation", + "impact_estimation" + ] + }, + { + "question": "Describe your experience with stakeholder management in your current or previous role.", + "competency": "stakeholder_management", + "type": "experience", + "focus_areas": [ + "experience_depth", + "practical_application" + ] + }, + { + "question": "What challenges have you faced related to stakeholder management and how did you overcome them?", + "competency": "stakeholder_management", + "type": "challenge_based", + "focus_areas": [ + "problem_solving", + "learning_from_experience" + ] + }, + { + "question": "What challenges have you faced related to user research and how did you overcome them?", + "competency": "user_research", + "type": "challenge_based", + "focus_areas": [ + "problem_solving", + "learning_from_experience" + ] + }, + { + "question": "What challenges have you faced related to 
strategy and how did you overcome them?", + "competency": "strategy", + "type": "challenge_based", + "focus_areas": [ + "problem_solving", + "learning_from_experience" + ] + }, + { + "question": "Describe your experience with user research in your current or previous role.", + "competency": "user_research", + "type": "experience", + "focus_areas": [ + "experience_depth", + "practical_application" + ] + }, + { + "question": "Describe your experience with p&l responsibility in your current or previous role.", + "competency": "p&l_responsibility", + "type": "experience", + "focus_areas": [ + "experience_depth", + "practical_application" + ] + }, + { + "question": "Describe your experience with strategy in your current or previous role.", + "competency": "strategy", + "type": "experience", + "focus_areas": [ + "experience_depth", + "practical_application" + ] + }, + { + "question": "Tell me about a time when you had to lead a team through a significant change or challenge.", + "competency": "leadership", + "type": "behavioral", + "method": "STAR", + "focus_areas": [ + "change_management", + "team_motivation", + "communication" + ] + }, + { + "question": "What challenges have you faced related to analytics and how did you overcome them?", + "competency": "analytics", + "type": "challenge_based", + "focus_areas": [ + "problem_solving", + "learning_from_experience" + ] + } + ], + "scoring_rubrics": { + "question_8": { + "question": "Describe a situation where you had to influence someone without having direct authority over them.", + "competency": "leadership", + "type": "behavioral", + "scoring_criteria": { + "situation_clarity": { + "4": "Clear, specific situation with relevant context and stakes", + "3": "Good situation description with adequate context", + "2": "Situation described but lacks some specifics", + "1": "Vague or unclear situation description" + }, + "action_quality": { + "4": "Specific, thoughtful actions showing strong competency", + "3": "Good actions 
demonstrating competency", + "2": "Adequate actions but could be stronger", + "1": "Weak or inappropriate actions" + }, + "result_impact": { + "4": "Significant positive impact with measurable results", + "3": "Good positive impact with clear outcomes", + "2": "Some positive impact demonstrated", + "1": "Little or no positive impact shown" + }, + "self_awareness": { + "4": "Excellent self-reflection, learns from experience, acknowledges growth areas", + "3": "Good self-awareness and learning orientation", + "2": "Some self-reflection demonstrated", + "1": "Limited self-awareness or reflection" + } + }, + "weight": "high", + "time_limit": 30 + }, + "question_19": { + "question": "Tell me about a time when you had to lead a team through a significant change or challenge.", + "competency": "leadership", + "type": "behavioral", + "scoring_criteria": { + "situation_clarity": { + "4": "Clear, specific situation with relevant context and stakes", + "3": "Good situation description with adequate context", + "2": "Situation described but lacks some specifics", + "1": "Vague or unclear situation description" + }, + "action_quality": { + "4": "Specific, thoughtful actions showing strong competency", + "3": "Good actions demonstrating competency", + "2": "Adequate actions but could be stronger", + "1": "Weak or inappropriate actions" + }, + "result_impact": { + "4": "Significant positive impact with measurable results", + "3": "Good positive impact with clear outcomes", + "2": "Some positive impact demonstrated", + "1": "Little or no positive impact shown" + }, + "self_awareness": { + "4": "Excellent self-reflection, learns from experience, acknowledges growth areas", + "3": "Good self-awareness and learning orientation", + "2": "Some self-reflection demonstrated", + "1": "Limited self-awareness or reflection" + } + }, + "weight": "high", + "time_limit": 30 + } + }, + "follow_up_probes": { + "question_1": [ + "Can you provide more specific details about your approach?", + 
"What would you do differently if you had to do this again?", + "What challenges did you face and how did you overcome them?" + ], + "question_2": [ + "Can you provide more specific details about your approach?", + "What would you do differently if you had to do this again?", + "What challenges did you face and how did you overcome them?" + ], + "question_3": [ + "Can you provide more specific details about your approach?", + "What would you do differently if you had to do this again?", + "What challenges did you face and how did you overcome them?" + ], + "question_4": [ + "Can you provide more specific details about your approach?", + "What would you do differently if you had to do this again?", + "What challenges did you face and how did you overcome them?" + ], + "question_5": [ + "Can you provide more specific details about your approach?", + "What would you do differently if you had to do this again?", + "What challenges did you face and how did you overcome them?" + ], + "question_6": [ + "Can you provide more specific details about your approach?", + "What would you do differently if you had to do this again?", + "What challenges did you face and how did you overcome them?" + ], + "question_7": [ + "Can you provide more specific details about your approach?", + "What would you do differently if you had to do this again?", + "What challenges did you face and how did you overcome them?" + ], + "question_8": [ + "What would you do differently if you faced this situation again?", + "How did you handle team members who were resistant to the change?", + "What metrics did you use to measure success?", + "How did you communicate progress to stakeholders?", + "What did you learn from this experience?" + ], + "question_9": [ + "Can you provide more specific details about your approach?", + "What would you do differently if you had to do this again?", + "What challenges did you face and how did you overcome them?" 
+ ], + "question_10": [ + "Can you provide more specific details about your approach?", + "What would you do differently if you had to do this again?", + "What challenges did you face and how did you overcome them?" + ], + "question_11": [ + "Can you provide more specific details about your approach?", + "What would you do differently if you had to do this again?", + "What challenges did you face and how did you overcome them?" + ], + "question_12": [ + "Can you provide more specific details about your approach?", + "What would you do differently if you had to do this again?", + "What challenges did you face and how did you overcome them?" + ], + "question_13": [ + "Can you provide more specific details about your approach?", + "What would you do differently if you had to do this again?", + "What challenges did you face and how did you overcome them?" + ], + "question_14": [ + "Can you provide more specific details about your approach?", + "What would you do differently if you had to do this again?", + "What challenges did you face and how did you overcome them?" + ], + "question_15": [ + "Can you provide more specific details about your approach?", + "What would you do differently if you had to do this again?", + "What challenges did you face and how did you overcome them?" + ], + "question_16": [ + "Can you provide more specific details about your approach?", + "What would you do differently if you had to do this again?", + "What challenges did you face and how did you overcome them?" + ], + "question_17": [ + "Can you provide more specific details about your approach?", + "What would you do differently if you had to do this again?", + "What challenges did you face and how did you overcome them?" + ], + "question_18": [ + "Can you provide more specific details about your approach?", + "What would you do differently if you had to do this again?", + "What challenges did you face and how did you overcome them?" 
+ ], + "question_19": [ + "What would you do differently if you faced this situation again?", + "How did you handle team members who were resistant to the change?", + "What metrics did you use to measure success?", + "How did you communicate progress to stakeholders?", + "What did you learn from this experience?" + ], + "question_20": [ + "Can you provide more specific details about your approach?", + "What would you do differently if you had to do this again?", + "What challenges did you face and how did you overcome them?" + ] + }, + "calibration_examples": { + "question_1": { + "question": "What challenges have you faced related to p&l responsibility and how did you overcome them?", + "competency": "p&l_responsibility", + "sample_answers": { + "poor_answer": { + "answer": "Sample poor answer for p&l_responsibility question - lacks detail, specificity, or demonstrates weak competency", + "score": "1-2", + "issues": [ + "Vague response", + "Limited evidence of competency", + "Poor structure" + ] + }, + "good_answer": { + "answer": "Sample good answer for p&l_responsibility question - adequate detail, demonstrates competency clearly", + "score": "3", + "strengths": [ + "Clear structure", + "Demonstrates competency", + "Adequate detail" + ] + }, + "great_answer": { + "answer": "Sample excellent answer for p&l_responsibility question - exceptional detail, strong evidence, goes above and beyond", + "score": "4", + "strengths": [ + "Exceptional detail", + "Strong evidence", + "Strategic thinking", + "Goes beyond requirements" + ] + } + }, + "scoring_rationale": { + "key_indicators": "Look for evidence of p&l responsibility competency", + "red_flags": "Vague answers, lack of specifics, negative outcomes without learning", + "green_flags": "Specific examples, clear impact, demonstrates growth and learning" + } + }, + "question_2": { + "question": "Analyze conversion funnel data to identify the biggest drop-off point and propose solutions.", + "competency": 
"data_analysis", + "sample_answers": { + "poor_answer": { + "answer": "Sample poor answer for data_analysis question - lacks detail, specificity, or demonstrates weak competency", + "score": "1-2", + "issues": [ + "Vague response", + "Limited evidence of competency", + "Poor structure" + ] + }, + "good_answer": { + "answer": "Sample good answer for data_analysis question - adequate detail, demonstrates competency clearly", + "score": "3", + "strengths": [ + "Clear structure", + "Demonstrates competency", + "Adequate detail" + ] + }, + "great_answer": { + "answer": "Sample excellent answer for data_analysis question - exceptional detail, strong evidence, goes above and beyond", + "score": "4", + "strengths": [ + "Exceptional detail", + "Strong evidence", + "Strategic thinking", + "Goes beyond requirements" + ] + } + }, + "scoring_rationale": { + "key_indicators": "Look for evidence of data analysis competency", + "red_flags": "Vague answers, lack of specifics, negative outcomes without learning", + "green_flags": "Specific examples, clear impact, demonstrates growth and learning" + } + }, + "question_3": { + "question": "What challenges have you faced related to team leadership and how did you overcome them?", + "competency": "team_leadership", + "sample_answers": { + "poor_answer": { + "answer": "Sample poor answer for team_leadership question - lacks detail, specificity, or demonstrates weak competency", + "score": "1-2", + "issues": [ + "Vague response", + "Limited evidence of competency", + "Poor structure" + ] + }, + "good_answer": { + "answer": "Sample good answer for team_leadership question - adequate detail, demonstrates competency clearly", + "score": "3", + "strengths": [ + "Clear structure", + "Demonstrates competency", + "Adequate detail" + ] + }, + "great_answer": { + "answer": "Sample excellent answer for team_leadership question - exceptional detail, strong evidence, goes above and beyond", + "score": "4", + "strengths": [ + "Exceptional detail", + 
"Strong evidence", + "Strategic thinking", + "Goes beyond requirements" + ] + } + }, + "scoring_rationale": { + "key_indicators": "Look for evidence of team leadership competency", + "red_flags": "Vague answers, lack of specifics, negative outcomes without learning", + "green_flags": "Specific examples, clear impact, demonstrates growth and learning" + } + }, + "question_4": { + "question": "Design a go-to-market strategy for a new B2B SaaS product entering a competitive market.", + "competency": "product_strategy", + "sample_answers": { + "poor_answer": { + "answer": "Sample poor answer for product_strategy question - lacks detail, specificity, or demonstrates weak competency", + "score": "1-2", + "issues": [ + "Vague response", + "Limited evidence of competency", + "Poor structure" + ] + }, + "good_answer": { + "answer": "Sample good answer for product_strategy question - adequate detail, demonstrates competency clearly", + "score": "3", + "strengths": [ + "Clear structure", + "Demonstrates competency", + "Adequate detail" + ] + }, + "great_answer": { + "answer": "Sample excellent answer for product_strategy question - exceptional detail, strong evidence, goes above and beyond", + "score": "4", + "strengths": [ + "Exceptional detail", + "Strong evidence", + "Strategic thinking", + "Goes beyond requirements" + ] + } + }, + "scoring_rationale": { + "key_indicators": "Look for evidence of product strategy competency", + "red_flags": "Vague answers, lack of specifics, negative outcomes without learning", + "green_flags": "Specific examples, clear impact, demonstrates growth and learning" + } + }, + "question_5": { + "question": "What challenges have you faced related to business strategy and how did you overcome them?", + "competency": "business_strategy", + "sample_answers": { + "poor_answer": { + "answer": "Sample poor answer for business_strategy question - lacks detail, specificity, or demonstrates weak competency", + "score": "1-2", + "issues": [ + "Vague 
response", + "Limited evidence of competency", + "Poor structure" + ] + }, + "good_answer": { + "answer": "Sample good answer for business_strategy question - adequate detail, demonstrates competency clearly", + "score": "3", + "strengths": [ + "Clear structure", + "Demonstrates competency", + "Adequate detail" + ] + }, + "great_answer": { + "answer": "Sample excellent answer for business_strategy question - exceptional detail, strong evidence, goes above and beyond", + "score": "4", + "strengths": [ + "Exceptional detail", + "Strong evidence", + "Strategic thinking", + "Goes beyond requirements" + ] + } + }, + "scoring_rationale": { + "key_indicators": "Look for evidence of business strategy competency", + "red_flags": "Vague answers, lack of specifics, negative outcomes without learning", + "green_flags": "Specific examples, clear impact, demonstrates growth and learning" + } + } + }, + "usage_guidelines": { + "interview_flow": { + "warm_up": "Start with 1-2 easier questions to build rapport", + "core_assessment": "Focus majority of time on core competency questions", + "closing": "End with questions about candidate's questions/interests" + }, + "time_management": { + "technical_questions": "Allow extra time for coding/design questions", + "behavioral_questions": "Keep to time limits but allow for follow-ups", + "total_recommendation": "45-75 minutes per interview round" + }, + "question_selection": { + "variety": "Mix question types within each competency area", + "difficulty": "Adjust based on candidate responses and energy", + "customization": "Adapt questions based on candidate's background" + }, + "common_mistakes": [ + "Don't ask all questions mechanically", + "Don't skip follow-up questions", + "Don't forget to assess cultural fit alongside competencies", + "Don't let one strong/weak area bias overall assessment" + ], + "calibration_reminders": [ + "Compare against role standard, not other candidates", + "Focus on evidence demonstrated, not potential", + 
"Consider level-appropriate expectations", + "Document specific examples in feedback" + ] + } +} \ No newline at end of file diff --git a/engineering/interview-system-designer/expected_outputs/product_manager_senior_questions.txt b/engineering/interview-system-designer/expected_outputs/product_manager_senior_questions.txt new file mode 100644 index 0000000..3674c1f --- /dev/null +++ b/engineering/interview-system-designer/expected_outputs/product_manager_senior_questions.txt @@ -0,0 +1,177 @@ +Interview Question Bank: Product Manager (Senior Level) +====================================================================== +Generated: 2026-02-16T13:27:41.303329 +Total Questions: 20 +Question Types: technical, behavioral, situational +Target Competencies: strategy, analytics, business_strategy, product_strategy, stakeholder_management, p&l_responsibility, leadership, team_leadership, user_research, data_analysis + +INTERVIEW QUESTIONS +-------------------------------------------------- + +1. What challenges have you faced related to p&l responsibility and how did you overcome them? + Competency: P&L Responsibility + Type: Challenge_Based + Focus Areas: problem_solving, learning_from_experience + +2. Analyze conversion funnel data to identify the biggest drop-off point and propose solutions. + Competency: Data Analysis + Type: Analytical + Time Limit: 45 minutes + +3. What challenges have you faced related to team leadership and how did you overcome them? + Competency: Team Leadership + Type: Challenge_Based + Focus Areas: problem_solving, learning_from_experience + +4. Design a go-to-market strategy for a new B2B SaaS product entering a competitive market. + Competency: Product Strategy + Type: Strategic + Time Limit: 60 minutes + +5. What challenges have you faced related to business strategy and how did you overcome them? + Competency: Business Strategy + Type: Challenge_Based + Focus Areas: problem_solving, learning_from_experience + +6. 
Describe your experience with business strategy in your current or previous role. + Competency: Business Strategy + Type: Experience + Focus Areas: experience_depth, practical_application + +7. Describe your experience with team leadership in your current or previous role. + Competency: Team Leadership + Type: Experience + Focus Areas: experience_depth, practical_application + +8. Describe a situation where you had to influence someone without having direct authority over them. + Competency: Leadership + Type: Behavioral + Focus Areas: influence, persuasion, stakeholder_management + +9. Given a dataset of user activities, calculate the daily active users for the past month. + Competency: Data Analysis + Type: Analytical + Time Limit: 30 minutes + +10. Describe your experience with analytics in your current or previous role. + Competency: Analytics + Type: Experience + Focus Areas: experience_depth, practical_application + +11. How would you prioritize features for a mobile app with limited engineering resources? + Competency: Product Strategy + Type: Case_Study + Time Limit: 45 minutes + +12. Describe your experience with stakeholder management in your current or previous role. + Competency: Stakeholder Management + Type: Experience + Focus Areas: experience_depth, practical_application + +13. What challenges have you faced related to stakeholder management and how did you overcome them? + Competency: Stakeholder Management + Type: Challenge_Based + Focus Areas: problem_solving, learning_from_experience + +14. What challenges have you faced related to user research and how did you overcome them? + Competency: User Research + Type: Challenge_Based + Focus Areas: problem_solving, learning_from_experience + +15. What challenges have you faced related to strategy and how did you overcome them? + Competency: Strategy + Type: Challenge_Based + Focus Areas: problem_solving, learning_from_experience + +16. 
Describe your experience with user research in your current or previous role. + Competency: User Research + Type: Experience + Focus Areas: experience_depth, practical_application + +17. Describe your experience with p&l responsibility in your current or previous role. + Competency: P&L Responsibility + Type: Experience + Focus Areas: experience_depth, practical_application + +18. Describe your experience with strategy in your current or previous role. + Competency: Strategy + Type: Experience + Focus Areas: experience_depth, practical_application + +19. Tell me about a time when you had to lead a team through a significant change or challenge. + Competency: Leadership + Type: Behavioral + Focus Areas: change_management, team_motivation, communication + +20. What challenges have you faced related to analytics and how did you overcome them? + Competency: Analytics + Type: Challenge_Based + Focus Areas: problem_solving, learning_from_experience + + +SCORING RUBRICS +-------------------------------------------------- +Sample Scoring Criteria (behavioral questions): + +Situation Clarity: + 4: Clear, specific situation with relevant context and stakes + 3: Good situation description with adequate context + 2: Situation described but lacks some specifics + 1: Vague or unclear situation description + +Action Quality: + 4: Specific, thoughtful actions showing strong competency + 3: Good actions demonstrating competency + 2: Adequate actions but could be stronger + 1: Weak or inappropriate actions + +Result Impact: + 4: Significant positive impact with measurable results + 3: Good positive impact with clear outcomes + 2: Some positive impact demonstrated + 1: Little or no positive impact shown + +Self Awareness: + 4: Excellent self-reflection, learns from experience, acknowledges growth areas + 3: Good self-awareness and learning orientation + 2: Some self-reflection demonstrated + 1: Limited self-awareness or reflection + + +FOLLOW-UP PROBE EXAMPLES 
+-------------------------------------------------- +Sample follow-up questions: + • Can you provide more specific details about your approach? + • What would you do differently if you had to do this again? + • What challenges did you face and how did you overcome them? + + +USAGE GUIDELINES +-------------------------------------------------- +Interview Flow: + • Warm Up: Start with 1-2 easier questions to build rapport + • Core Assessment: Focus majority of time on core competency questions + • Closing: End with questions about candidate's questions/interests + +Time Management: + • Technical Questions: Allow extra time for coding/design questions + • Behavioral Questions: Keep to time limits but allow for follow-ups + • Total Recommendation: 45-75 minutes per interview round + +Common Mistakes to Avoid: + • Don't ask all questions mechanically + • Don't skip follow-up questions + • Don't forget to assess cultural fit alongside competencies + + +CALIBRATION EXAMPLES +-------------------------------------------------- +Question: What challenges have you faced related to p&l responsibility and how did you overcome them? 
+ +Sample Answer Quality Levels: + Poor Answer (Score 1-2): + Issues: Vague response, Limited evidence of competency, Poor structure + Good Answer (Score 3): + Strengths: Clear structure, Demonstrates competency, Adequate detail + Great Answer (Score 4): + Strengths: Exceptional detail, Strong evidence, Strategic thinking, Goes beyond requirements \ No newline at end of file diff --git a/engineering/interview-system-designer/expected_outputs/senior_software_engineer_senior_interview_loop.json b/engineering/interview-system-designer/expected_outputs/senior_software_engineer_senior_interview_loop.json new file mode 100644 index 0000000..24d36d1 --- /dev/null +++ b/engineering/interview-system-designer/expected_outputs/senior_software_engineer_senior_interview_loop.json @@ -0,0 +1,435 @@ +{ + "role": "Senior Software Engineer", + "level": "senior", + "team": "platform", + "generated_at": "2026-02-16T13:27:37.925680", + "total_duration_minutes": 300, + "total_rounds": 5, + "rounds": { + "round_1_technical_phone_screen": { + "name": "Technical Phone Screen", + "duration_minutes": 45, + "format": "virtual", + "objectives": [ + "Assess coding fundamentals", + "Evaluate problem-solving approach", + "Screen for basic technical competency" + ], + "question_types": [ + "coding_problems", + "technical_concepts", + "experience_questions" + ], + "evaluation_criteria": [ + "technical_accuracy", + "problem_solving_process", + "communication_clarity" + ], + "order": 1, + "focus_areas": [ + "coding_fundamentals", + "problem_solving", + "technical_leadership", + "system_architecture", + "people_development" + ] + }, + "round_2_coding_deep_dive": { + "name": "Coding Deep Dive", + "duration_minutes": 75, + "format": "in_person_or_virtual", + "objectives": [ + "Evaluate coding skills in depth", + "Assess code quality and testing", + "Review debugging approach" + ], + "question_types": [ + "complex_coding_problems", + "code_review", + "testing_strategy" + ], + "evaluation_criteria": [ + 
"code_quality", + "testing_approach", + "debugging_skills", + "optimization_thinking" + ], + "order": 2, + "focus_areas": [ + "technical_execution", + "code_quality", + "technical_leadership", + "system_architecture", + "people_development" + ] + }, + "round_3_system_design": { + "name": "System Design", + "duration_minutes": 75, + "format": "collaborative_whiteboard", + "objectives": [ + "Assess architectural thinking", + "Evaluate scalability considerations", + "Review trade-off analysis" + ], + "question_types": [ + "system_architecture", + "scalability_design", + "trade_off_analysis" + ], + "evaluation_criteria": [ + "architectural_thinking", + "scalability_awareness", + "trade_off_reasoning" + ], + "order": 3, + "focus_areas": [ + "system_thinking", + "architectural_reasoning", + "technical_leadership", + "system_architecture", + "people_development" + ] + }, + "round_4_behavioral": { + "name": "Behavioral Interview", + "duration_minutes": 45, + "format": "conversational", + "objectives": [ + "Assess cultural fit", + "Evaluate past experiences", + "Review leadership examples" + ], + "question_types": [ + "star_method_questions", + "situational_scenarios", + "values_alignment" + ], + "evaluation_criteria": [ + "communication_skills", + "leadership_examples", + "cultural_alignment" + ], + "order": 4, + "focus_areas": [ + "cultural_fit", + "communication", + "teamwork", + "technical_leadership", + "system_architecture" + ] + }, + "round_5_technical_leadership": { + "name": "Technical Leadership", + "duration_minutes": 60, + "format": "discussion_based", + "objectives": [ + "Evaluate mentoring capability", + "Assess technical decision making", + "Review cross-team collaboration" + ], + "question_types": [ + "leadership_scenarios", + "technical_decisions", + "mentoring_examples" + ], + "evaluation_criteria": [ + "leadership_potential", + "technical_judgment", + "influence_skills" + ], + "order": 5, + "focus_areas": [ + "leadership", + "mentoring", + "influence", + 
"technical_leadership", + "system_architecture" + ] + } + }, + "suggested_schedule": { + "type": "multi_day", + "total_duration_minutes": 300, + "recommended_breaks": [ + { + "type": "short_break", + "duration": 15, + "after_minutes": 90 + }, + { + "type": "lunch_break", + "duration": 60, + "after_minutes": 180 + } + ], + "day_structure": { + "day_1": { + "date": "TBD", + "start_time": "09:00", + "end_time": "12:45", + "rounds": [ + { + "type": "interview", + "round_name": "round_1_technical_phone_screen", + "title": "Technical Phone Screen", + "start_time": "09:00", + "end_time": "09:45", + "duration_minutes": 45, + "format": "virtual" + }, + { + "type": "interview", + "round_name": "round_2_coding_deep_dive", + "title": "Coding Deep Dive", + "start_time": "10:00", + "end_time": "11:15", + "duration_minutes": 75, + "format": "in_person_or_virtual" + }, + { + "type": "interview", + "round_name": "round_3_system_design", + "title": "System Design", + "start_time": "11:30", + "end_time": "12:45", + "duration_minutes": 75, + "format": "collaborative_whiteboard" + } + ] + }, + "day_2": { + "date": "TBD", + "start_time": "09:00", + "end_time": "11:00", + "rounds": [ + { + "type": "interview", + "round_name": "round_4_behavioral", + "title": "Behavioral Interview", + "start_time": "09:00", + "end_time": "09:45", + "duration_minutes": 45, + "format": "conversational" + }, + { + "type": "interview", + "round_name": "round_5_technical_leadership", + "title": "Technical Leadership", + "start_time": "10:00", + "end_time": "11:00", + "duration_minutes": 60, + "format": "discussion_based" + } + ] + } + }, + "logistics_notes": [ + "Coordinate interviewer availability before scheduling", + "Ensure all interviewers have access to job description and competency requirements", + "Prepare interview rooms/virtual links for all rounds", + "Share candidate resume and application with all interviewers", + "Test video conferencing setup before virtual interviews", + "Share virtual meeting 
links with candidate 24 hours in advance", + "Prepare whiteboard or collaborative online tool for design sessions" + ] + }, + "scorecard_template": { + "scoring_scale": { + "4": "Exceeds Expectations - Demonstrates mastery beyond required level", + "3": "Meets Expectations - Solid performance meeting all requirements", + "2": "Partially Meets - Shows potential but has development areas", + "1": "Does Not Meet - Significant gaps in required competencies" + }, + "dimensions": [ + { + "dimension": "system_architecture", + "weight": "high", + "scale": "1-4", + "description": "Assessment of system architecture competency" + }, + { + "dimension": "technical_leadership", + "weight": "high", + "scale": "1-4", + "description": "Assessment of technical leadership competency" + }, + { + "dimension": "mentoring", + "weight": "high", + "scale": "1-4", + "description": "Assessment of mentoring competency" + }, + { + "dimension": "cross_team_collab", + "weight": "high", + "scale": "1-4", + "description": "Assessment of cross team collab competency" + }, + { + "dimension": "technology_evaluation", + "weight": "medium", + "scale": "1-4", + "description": "Assessment of technology evaluation competency" + }, + { + "dimension": "process_improvement", + "weight": "medium", + "scale": "1-4", + "description": "Assessment of process improvement competency" + }, + { + "dimension": "hiring_contribution", + "weight": "medium", + "scale": "1-4", + "description": "Assessment of hiring contribution competency" + }, + { + "dimension": "communication", + "weight": "high", + "scale": "1-4" + }, + { + "dimension": "cultural_fit", + "weight": "medium", + "scale": "1-4" + }, + { + "dimension": "learning_agility", + "weight": "medium", + "scale": "1-4" + } + ], + "overall_recommendation": { + "options": [ + "Strong Hire", + "Hire", + "No Hire", + "Strong No Hire" + ], + "criteria": "Based on weighted average and minimum thresholds" + }, + "calibration_notes": { + "required": true, + "min_length": 
100, + "sections": [ + "strengths", + "areas_for_development", + "specific_examples" + ] + } + }, + "interviewer_requirements": { + "round_1_technical_phone_screen": { + "required_skills": [ + "technical_assessment", + "coding_evaluation" + ], + "preferred_experience": [ + "same_domain", + "senior_level" + ], + "calibration_level": "standard", + "suggested_interviewers": [ + "senior_engineer", + "tech_lead" + ] + }, + "round_2_coding_deep_dive": { + "required_skills": [ + "advanced_technical", + "code_quality_assessment" + ], + "preferred_experience": [ + "senior_engineer", + "system_design" + ], + "calibration_level": "high", + "suggested_interviewers": [ + "senior_engineer", + "staff_engineer" + ] + }, + "round_3_system_design": { + "required_skills": [ + "architecture_design", + "scalability_assessment" + ], + "preferred_experience": [ + "senior_architect", + "large_scale_systems" + ], + "calibration_level": "high", + "suggested_interviewers": [ + "senior_architect", + "staff_engineer" + ] + }, + "round_4_behavioral": { + "required_skills": [ + "behavioral_interviewing", + "competency_assessment" + ], + "preferred_experience": [ + "hiring_manager", + "people_leadership" + ], + "calibration_level": "standard", + "suggested_interviewers": [ + "hiring_manager", + "people_manager" + ] + }, + "round_5_technical_leadership": { + "required_skills": [ + "leadership_assessment", + "technical_mentoring" + ], + "preferred_experience": [ + "engineering_manager", + "tech_lead" + ], + "calibration_level": "high", + "suggested_interviewers": [ + "engineering_manager", + "senior_staff" + ] + } + }, + "competency_framework": { + "required": [ + "system_architecture", + "technical_leadership", + "mentoring", + "cross_team_collab" + ], + "preferred": [ + "technology_evaluation", + "process_improvement", + "hiring_contribution" + ], + "focus_areas": [ + "technical_leadership", + "system_architecture", + "people_development" + ] + }, + "calibration_notes": { + "hiring_bar_notes": 
"Calibrated for senior level software engineer role", + "common_pitfalls": [ + "Avoid comparing candidates to each other rather than to the role standard", + "Don't let one strong/weak area overshadow overall assessment", + "Ensure consistent application of evaluation criteria" + ], + "calibration_checkpoints": [ + "Review score distribution after every 5 candidates", + "Conduct monthly interviewer calibration sessions", + "Track correlation with 6-month performance reviews" + ], + "escalation_criteria": [ + "Any candidate receiving all 4s or all 1s", + "Significant disagreement between interviewers (>1.5 point spread)", + "Unusual circumstances or accommodations needed" + ] + } +} \ No newline at end of file diff --git a/engineering/interview-system-designer/expected_outputs/senior_software_engineer_senior_interview_loop.txt b/engineering/interview-system-designer/expected_outputs/senior_software_engineer_senior_interview_loop.txt new file mode 100644 index 0000000..4210f2a --- /dev/null +++ b/engineering/interview-system-designer/expected_outputs/senior_software_engineer_senior_interview_loop.txt @@ -0,0 +1,151 @@ +Interview Loop Design for Senior Software Engineer (Senior Level) +============================================================ +Team: platform +Generated: 2026-02-16T13:27:37.925680 +Total Duration: 300 minutes (5h 0m) +Total Rounds: 5 + +INTERVIEW ROUNDS +---------------------------------------- + +Round 1: Technical Phone Screen +Duration: 45 minutes +Format: Virtual +Objectives: + • Assess coding fundamentals + • Evaluate problem-solving approach + • Screen for basic technical competency +Focus Areas: + • Coding Fundamentals + • Problem Solving + • Technical Leadership + • System Architecture + • People Development + +Round 2: Coding Deep Dive +Duration: 75 minutes +Format: In Person Or Virtual +Objectives: + • Evaluate coding skills in depth + • Assess code quality and testing + • Review debugging approach +Focus Areas: + • Technical Execution + • 
Code Quality + • Technical Leadership + • System Architecture + • People Development + +Round 3: System Design +Duration: 75 minutes +Format: Collaborative Whiteboard +Objectives: + • Assess architectural thinking + • Evaluate scalability considerations + • Review trade-off analysis +Focus Areas: + • System Thinking + • Architectural Reasoning + • Technical Leadership + • System Architecture + • People Development + +Round 4: Behavioral Interview +Duration: 45 minutes +Format: Conversational +Objectives: + • Assess cultural fit + • Evaluate past experiences + • Review leadership examples +Focus Areas: + • Cultural Fit + • Communication + • Teamwork + • Technical Leadership + • System Architecture + +Round 5: Technical Leadership +Duration: 60 minutes +Format: Discussion Based +Objectives: + • Evaluate mentoring capability + • Assess technical decision making + • Review cross-team collaboration +Focus Areas: + • Leadership + • Mentoring + • Influence + • Technical Leadership + • System Architecture + +SUGGESTED SCHEDULE +---------------------------------------- +Schedule Type: Multi Day + +Day 1: +Time: 09:00 - 12:45 + 09:00-09:45: Technical Phone Screen (45min) + 10:00-11:15: Coding Deep Dive (75min) + 11:30-12:45: System Design (75min) + +Day 2: +Time: 09:00 - 11:00 + 09:00-09:45: Behavioral Interview (45min) + 10:00-11:00: Technical Leadership (60min) + +INTERVIEWER REQUIREMENTS +---------------------------------------- + +Technical Phone Screen: +Required Skills: technical_assessment, coding_evaluation +Suggested Interviewers: senior_engineer, tech_lead +Calibration Level: Standard + +Coding Deep Dive: +Required Skills: advanced_technical, code_quality_assessment +Suggested Interviewers: senior_engineer, staff_engineer +Calibration Level: High + +System Design: +Required Skills: architecture_design, scalability_assessment +Suggested Interviewers: senior_architect, staff_engineer +Calibration Level: High + +Behavioral: +Required Skills: behavioral_interviewing, 
competency_assessment +Suggested Interviewers: hiring_manager, people_manager +Calibration Level: Standard + +Technical Leadership: +Required Skills: leadership_assessment, technical_mentoring +Suggested Interviewers: engineering_manager, senior_staff +Calibration Level: High + +SCORECARD TEMPLATE +---------------------------------------- +Scoring Scale: + 4: Exceeds Expectations - Demonstrates mastery beyond required level + 3: Meets Expectations - Solid performance meeting all requirements + 2: Partially Meets - Shows potential but has development areas + 1: Does Not Meet - Significant gaps in required competencies + +Evaluation Dimensions: + • System Architecture (Weight: high) + • Technical Leadership (Weight: high) + • Mentoring (Weight: high) + • Cross Team Collab (Weight: high) + • Technology Evaluation (Weight: medium) + • Process Improvement (Weight: medium) + • Hiring Contribution (Weight: medium) + • Communication (Weight: high) + • Cultural Fit (Weight: medium) + • Learning Agility (Weight: medium) + +CALIBRATION NOTES +---------------------------------------- +Hiring Bar: Calibrated for senior level software engineer role + +Common Pitfalls: + • Avoid comparing candidates to each other rather than to the role standard + • Don't let one strong/weak area overshadow overall assessment + • Ensure consistent application of evaluation criteria \ No newline at end of file diff --git a/engineering/interview-system-designer/hiring_calibrator.py b/engineering/interview-system-designer/hiring_calibrator.py new file mode 100644 index 0000000..c7bcc11 --- /dev/null +++ b/engineering/interview-system-designer/hiring_calibrator.py @@ -0,0 +1,1306 @@ +#!/usr/bin/env python3 +""" +Hiring Calibrator + +Analyzes interview scores from multiple candidates and interviewers to detect bias, +calibration issues, and inconsistent rubric application. Generates calibration reports +with specific recommendations for interviewer coaching and process improvements. 
+ +Usage: + python hiring_calibrator.py --input interview_results.json --analysis-type comprehensive + python hiring_calibrator.py --input data.json --competencies technical,leadership --output report.json + python hiring_calibrator.py --input historical_data.json --trend-analysis --period quarterly +""" + +import os +import sys +import json +import argparse +import statistics +from datetime import datetime, timedelta +from typing import Dict, List, Optional, Any, Tuple +from collections import defaultdict, Counter +import math + + +class HiringCalibrator: + """Analyzes interview data for bias detection and calibration issues.""" + + def __init__(self): + self.bias_thresholds = self._init_bias_thresholds() + self.calibration_standards = self._init_calibration_standards() + self.demographic_categories = self._init_demographic_categories() + + def _init_bias_thresholds(self) -> Dict[str, float]: + """Initialize statistical thresholds for bias detection.""" + return { + "score_variance_threshold": 1.5, # Standard deviations + "pass_rate_difference_threshold": 0.15, # 15% difference + "interviewer_consistency_threshold": 0.8, # Correlation coefficient + "demographic_parity_threshold": 0.10, # 10% difference + "score_inflation_threshold": 0.3, # 30% above historical average + "score_deflation_threshold": 0.3, # 30% below historical average + "minimum_sample_size": 5 # Minimum candidates per analysis + } + + def _init_calibration_standards(self) -> Dict[str, Dict]: + """Initialize expected calibration standards.""" + return { + "score_distribution": { + "target_mean": 2.8, # Expected average score (1-4 scale) + "target_std": 0.9, # Expected standard deviation + "expected_distribution": { + "1": 0.10, # 10% score 1 (does not meet) + "2": 0.25, # 25% score 2 (partially meets) + "3": 0.45, # 45% score 3 (meets expectations) + "4": 0.20 # 20% score 4 (exceeds expectations) + } + }, + "interviewer_agreement": { + "minimum_correlation": 0.70, # Minimum correlation between 
interviewers + "maximum_std_deviation": 0.8, # Maximum std dev in scores for same candidate + "agreement_threshold": 0.75 # % of time interviewers should agree within 1 point + }, + "pass_rates": { + "junior_level": 0.25, # 25% pass rate for junior roles + "mid_level": 0.20, # 20% pass rate for mid roles + "senior_level": 0.15, # 15% pass rate for senior roles + "staff_level": 0.10, # 10% pass rate for staff+ roles + "leadership": 0.12 # 12% pass rate for leadership roles + } + } + + def _init_demographic_categories(self) -> List[str]: + """Initialize demographic categories to analyze for bias.""" + return [ + "gender", "ethnicity", "education_level", "previous_company_size", + "years_experience", "university_tier", "geographic_location" + ] + + def analyze_hiring_calibration(self, interview_data: List[Dict[str, Any]], + analysis_type: str = "comprehensive", + competencies: Optional[List[str]] = None, + trend_analysis: bool = False, + period: str = "monthly") -> Dict[str, Any]: + """Perform comprehensive hiring calibration analysis.""" + + # Validate and preprocess data + processed_data = self._preprocess_interview_data(interview_data) + + if len(processed_data) < self.bias_thresholds["minimum_sample_size"]: + return { + "error": "Insufficient data for analysis", + "minimum_required": self.bias_thresholds["minimum_sample_size"], + "actual_samples": len(processed_data) + } + + # Perform different types of analysis based on request + analysis_results = { + "analysis_type": analysis_type, + "data_summary": self._generate_data_summary(processed_data), + "generated_at": datetime.now().isoformat() + } + + if analysis_type in ["comprehensive", "bias"]: + analysis_results["bias_analysis"] = self._analyze_bias_patterns(processed_data, competencies) + + if analysis_type in ["comprehensive", "calibration"]: + analysis_results["calibration_analysis"] = self._analyze_calibration_consistency(processed_data, competencies) + + if analysis_type in ["comprehensive", "interviewer"]: 
+ analysis_results["interviewer_analysis"] = self._analyze_interviewer_bias(processed_data) + + if analysis_type in ["comprehensive", "scoring"]: + analysis_results["scoring_analysis"] = self._analyze_scoring_patterns(processed_data, competencies) + + if trend_analysis: + analysis_results["trend_analysis"] = self._analyze_trends_over_time(processed_data, period) + + # Generate recommendations + analysis_results["recommendations"] = self._generate_recommendations(analysis_results) + + # Calculate overall calibration health score + analysis_results["calibration_health_score"] = self._calculate_health_score(analysis_results) + + return analysis_results + + def _preprocess_interview_data(self, raw_data: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Clean and validate interview data.""" + processed_data = [] + + for record in raw_data: + if self._validate_interview_record(record): + processed_record = self._standardize_record(record) + processed_data.append(processed_record) + + return processed_data + + def _validate_interview_record(self, record: Dict[str, Any]) -> bool: + """Validate that an interview record has required fields.""" + required_fields = ["candidate_id", "interviewer_id", "scores", "overall_recommendation", "date"] + + for field in required_fields: + if field not in record or record[field] is None: + return False + + # Validate scores format + if not isinstance(record["scores"], dict): + return False + + # Validate score values are numeric and in valid range (1-4) + for competency, score in record["scores"].items(): + if not isinstance(score, (int, float)) or not (1 <= score <= 4): + return False + + return True + + def _standardize_record(self, record: Dict[str, Any]) -> Dict[str, Any]: + """Standardize record format and add computed fields.""" + standardized = record.copy() + + # Calculate average score + scores = list(record["scores"].values()) + standardized["average_score"] = statistics.mean(scores) + + # Standardize recommendation to binary 
+ recommendation = record["overall_recommendation"].lower() + standardized["hire_decision"] = recommendation in ["hire", "strong hire", "yes"] + + # Parse date if string + if isinstance(record["date"], str): + try: + standardized["date"] = datetime.fromisoformat(record["date"].replace("Z", "+00:00")) + except ValueError: + standardized["date"] = datetime.now() + + # Add demographic info if available + for category in self.demographic_categories: + if category not in standardized: + standardized[category] = "unknown" + + # Add level normalization + role = record.get("role", "").lower() + if any(level in role for level in ["junior", "associate", "entry"]): + standardized["normalized_level"] = "junior" + elif any(level in role for level in ["senior", "sr"]): + standardized["normalized_level"] = "senior" + elif any(level in role for level in ["staff", "principal", "lead"]): + standardized["normalized_level"] = "staff" + else: + standardized["normalized_level"] = "mid" + + return standardized + + def _generate_data_summary(self, data: List[Dict[str, Any]]) -> Dict[str, Any]: + """Generate summary statistics for the dataset.""" + if not data: + return {} + + total_candidates = len(data) + unique_interviewers = len(set(record["interviewer_id"] for record in data)) + + # Score statistics + all_scores = [] + all_average_scores = [] + hire_decisions = [] + + for record in data: + all_scores.extend(record["scores"].values()) + all_average_scores.append(record["average_score"]) + hire_decisions.append(record["hire_decision"]) + + # Date range + dates = [record["date"] for record in data if record["date"]] + date_range = { + "start_date": min(dates).isoformat() if dates else None, + "end_date": max(dates).isoformat() if dates else None, + "total_days": (max(dates) - min(dates)).days if len(dates) > 1 else 0 + } + + # Role distribution + roles = [record.get("role", "unknown") for record in data] + role_distribution = dict(Counter(roles)) + + return { + "total_candidates": 
total_candidates, + "unique_interviewers": unique_interviewers, + "candidates_per_interviewer": round(total_candidates / unique_interviewers, 2), + "date_range": date_range, + "score_statistics": { + "mean_individual_scores": round(statistics.mean(all_scores), 2), + "std_individual_scores": round(statistics.stdev(all_scores) if len(all_scores) > 1 else 0, 2), + "mean_average_scores": round(statistics.mean(all_average_scores), 2), + "std_average_scores": round(statistics.stdev(all_average_scores) if len(all_average_scores) > 1 else 0, 2) + }, + "hire_rate": round(sum(hire_decisions) / len(hire_decisions), 3), + "role_distribution": role_distribution + } + + def _analyze_bias_patterns(self, data: List[Dict[str, Any]], + target_competencies: Optional[List[str]]) -> Dict[str, Any]: + """Analyze potential bias patterns in interview decisions.""" + bias_analysis = { + "demographic_bias": {}, + "interviewer_bias": {}, + "competency_bias": {}, + "overall_bias_score": 0 + } + + # Analyze demographic bias + for demographic in self.demographic_categories: + if all(record.get(demographic) == "unknown" for record in data): + continue + + demographic_analysis = self._analyze_demographic_bias(data, demographic) + if demographic_analysis["bias_detected"]: + bias_analysis["demographic_bias"][demographic] = demographic_analysis + + # Analyze interviewer bias + bias_analysis["interviewer_bias"] = self._analyze_interviewer_bias(data) + + # Analyze competency bias if specified + if target_competencies: + bias_analysis["competency_bias"] = self._analyze_competency_bias(data, target_competencies) + + # Calculate overall bias score + bias_analysis["overall_bias_score"] = self._calculate_bias_score(bias_analysis) + + return bias_analysis + + def _analyze_demographic_bias(self, data: List[Dict[str, Any]], + demographic: str) -> Dict[str, Any]: + """Analyze bias for a specific demographic category.""" + # Group data by demographic values + demographic_groups = defaultdict(list) + for record 
in data: + demo_value = record.get(demographic, "unknown") + if demo_value != "unknown": + demographic_groups[demo_value].append(record) + + if len(demographic_groups) < 2: + return {"bias_detected": False, "reason": "insufficient_groups"} + + # Calculate statistics for each group + group_stats = {} + for group, records in demographic_groups.items(): + if len(records) >= self.bias_thresholds["minimum_sample_size"]: + scores = [r["average_score"] for r in records] + hire_rate = sum(r["hire_decision"] for r in records) / len(records) + + group_stats[group] = { + "count": len(records), + "mean_score": statistics.mean(scores), + "hire_rate": hire_rate, + "std_score": statistics.stdev(scores) if len(scores) > 1 else 0 + } + + if len(group_stats) < 2: + return {"bias_detected": False, "reason": "insufficient_sample_sizes"} + + # Detect statistical differences + bias_detected = False + bias_details = {} + + # Check for significant differences in hire rates + hire_rates = [stats["hire_rate"] for stats in group_stats.values()] + max_hire_rate_diff = max(hire_rates) - min(hire_rates) + + if max_hire_rate_diff > self.bias_thresholds["demographic_parity_threshold"]: + bias_detected = True + bias_details["hire_rate_disparity"] = { + "max_difference": round(max_hire_rate_diff, 3), + "threshold": self.bias_thresholds["demographic_parity_threshold"], + "group_stats": group_stats + } + + # Check for significant differences in scoring + mean_scores = [stats["mean_score"] for stats in group_stats.values()] + max_score_diff = max(mean_scores) - min(mean_scores) + + if max_score_diff > 0.5: # Half point difference threshold + bias_detected = True + bias_details["scoring_disparity"] = { + "max_difference": round(max_score_diff, 3), + "group_stats": group_stats + } + + return { + "bias_detected": bias_detected, + "demographic": demographic, + "group_statistics": group_stats, + "bias_details": bias_details, + "recommendation": self._generate_demographic_bias_recommendation(demographic, 
bias_details) if bias_detected else None + } + + def _analyze_interviewer_bias(self, data: List[Dict[str, Any]]) -> Dict[str, Any]: + """Analyze bias patterns across different interviewers.""" + interviewer_stats = defaultdict(list) + + # Group by interviewer + for record in data: + interviewer_id = record["interviewer_id"] + interviewer_stats[interviewer_id].append(record) + + # Calculate statistics per interviewer + interviewer_analysis = {} + for interviewer_id, records in interviewer_stats.items(): + if len(records) >= self.bias_thresholds["minimum_sample_size"]: + scores = [r["average_score"] for r in records] + hire_rate = sum(r["hire_decision"] for r in records) / len(records) + + interviewer_analysis[interviewer_id] = { + "total_interviews": len(records), + "mean_score": statistics.mean(scores), + "std_score": statistics.stdev(scores) if len(scores) > 1 else 0, + "hire_rate": hire_rate, + "score_inflation": self._detect_score_inflation(scores), + "consistency_score": self._calculate_interviewer_consistency(records) + } + + # Identify outlier interviewers + if len(interviewer_analysis) > 1: + overall_mean_score = statistics.mean([stats["mean_score"] for stats in interviewer_analysis.values()]) + overall_hire_rate = statistics.mean([stats["hire_rate"] for stats in interviewer_analysis.values()]) + + outlier_interviewers = {} + for interviewer_id, stats in interviewer_analysis.items(): + issues = [] + + # Check for score inflation/deflation + if stats["mean_score"] > overall_mean_score * (1 + self.bias_thresholds["score_inflation_threshold"]): + issues.append("score_inflation") + elif stats["mean_score"] < overall_mean_score * (1 - self.bias_thresholds["score_deflation_threshold"]): + issues.append("score_deflation") + + # Check for hire rate deviation + hire_rate_diff = abs(stats["hire_rate"] - overall_hire_rate) + if hire_rate_diff > self.bias_thresholds["pass_rate_difference_threshold"]: + issues.append("hire_rate_deviation") + + # Check for low consistency 
+ if stats["consistency_score"] < self.bias_thresholds["interviewer_consistency_threshold"]: + issues.append("low_consistency") + + if issues: + outlier_interviewers[interviewer_id] = { + "issues": issues, + "statistics": stats, + "severity": len(issues) # More issues = higher severity + } + + return { + "interviewer_statistics": interviewer_analysis, + "outlier_interviewers": outlier_interviewers if len(interviewer_analysis) > 1 else {}, + "overall_consistency": self._calculate_overall_interviewer_consistency(data), + "recommendations": self._generate_interviewer_recommendations(outlier_interviewers if len(interviewer_analysis) > 1 else {}) + } + + def _analyze_competency_bias(self, data: List[Dict[str, Any]], + competencies: List[str]) -> Dict[str, Any]: + """Analyze bias patterns within specific competencies.""" + competency_analysis = {} + + for competency in competencies: + # Extract scores for this competency + competency_scores = [] + for record in data: + if competency in record["scores"]: + competency_scores.append({ + "score": record["scores"][competency], + "interviewer": record["interviewer_id"], + "candidate": record["candidate_id"], + "overall_decision": record["hire_decision"] + }) + + if len(competency_scores) < self.bias_thresholds["minimum_sample_size"]: + continue + + # Analyze scoring patterns + scores = [item["score"] for item in competency_scores] + score_variance = statistics.variance(scores) if len(scores) > 1 else 0 + + # Analyze by interviewer + interviewer_competency_scores = defaultdict(list) + for item in competency_scores: + interviewer_competency_scores[item["interviewer"]].append(item["score"]) + + interviewer_variations = {} + if len(interviewer_competency_scores) > 1: + interviewer_means = {interviewer: statistics.mean(scores) + for interviewer, scores in interviewer_competency_scores.items() + if len(scores) >= 3} + + if len(interviewer_means) > 1: + mean_of_means = statistics.mean(interviewer_means.values()) + for interviewer, 
def _analyze_calibration_consistency(self, data: List[Dict[str, Any]],
                                     target_competencies: Optional[List[str]]) -> Dict[str, Any]:
    """Measure how closely interviewers agree on candidates they both saw.

    Only candidates with more than one interview record contribute.  For each
    of them the spread (sample standard deviation) and range of the
    ``average_score`` values is computed; a candidate counts as "agreed" when
    the range is at most one point.

    Args:
        data: Interview records with ``candidate_id``, ``interviewer_id`` and
            ``average_score``.
        target_competencies: Accepted for signature compatibility; not used
            by this method.

    Returns:
        Agreement metrics and a ``calibration_quality`` label, or an
        ``error`` payload with a single-interviewer analysis when no
        candidate was seen by more than one interviewer.
    """
    records_by_candidate = defaultdict(list)
    for row in data:
        records_by_candidate[row["candidate_id"]].append(row)

    shared = {cand: rows for cand, rows in records_by_candidate.items() if len(rows) > 1}
    if not shared:
        return {
            "error": "No candidates with multiple interviewers found",
            "single_interviewer_analysis": self._analyze_single_interviewer_consistency(data),
        }

    spreads = []
    details = []
    for cand, rows in shared.items():
        # Every entry in `shared` has at least two records, so stdev is defined.
        pairs = [(row["average_score"], row["interviewer_id"]) for row in rows]
        marks = [mark for mark, _ in pairs]
        spread = statistics.stdev(marks)
        gap = max(marks) - min(marks)

        spreads.append(spread)
        details.append({
            "candidate": cand,
            "scores": marks,
            "interviewers": [who for _, who in pairs],
            "score_std": spread,
            "score_range": gap,
            "agreement_within_one": gap <= 1.0,
        })

    mean_spread = statistics.mean(spreads)
    agreed = sum(1 for entry in details if entry["agreement_within_one"])
    agreement_rate = agreed / len(details)

    # A too-wide spread outranks a low agreement rate.
    limits = self.calibration_standards["interviewer_agreement"]
    if mean_spread > limits["maximum_std_deviation"]:
        quality = "poor"
    elif agreement_rate < limits["agreement_threshold"]:
        quality = "fair"
    else:
        quality = "good"

    return {
        "multi_interviewer_candidates": len(shared),
        "mean_score_standard_deviation": round(mean_spread, 3),
        "agreement_within_one_point_rate": round(agreement_rate, 3),
        "calibration_quality": quality,
        "candidate_agreement_details": details,
        "target_standards": limits,
        "recommendations": self._generate_calibration_recommendations(mean_spread, agreement_rate),
    }
Calculate distribution percentages + total_scores = sum(score_distribution.values()) + score_percentages = {score: count/total_scores for score, count in score_distribution.items()} + + # Compare against expected distribution + expected_dist = self.calibration_standards["score_distribution"]["expected_distribution"] + distribution_analysis = {} + + for score in ["1", "2", "3", "4"]: + expected_pct = expected_dist.get(score, 0) + actual_pct = score_percentages.get(score, 0) + difference = actual_pct - expected_pct + + distribution_analysis[score] = { + "expected_percentage": expected_pct, + "actual_percentage": round(actual_pct, 3), + "difference": round(difference, 3), + "significant_deviation": abs(difference) > 0.05 # 5% threshold + } + + # Calculate scoring statistics + mean_score = statistics.mean(all_individual_scores) if all_individual_scores else 0 + std_score = statistics.stdev(all_individual_scores) if len(all_individual_scores) > 1 else 0 + + target_mean = self.calibration_standards["score_distribution"]["target_mean"] + target_std = self.calibration_standards["score_distribution"]["target_std"] + + # Analyze pass rates by level + level_pass_rates = {} + level_groups = defaultdict(list) + + for record in data: + level = record.get("normalized_level", "unknown") + level_groups[level].append(record["hire_decision"]) + + for level, decisions in level_groups.items(): + if len(decisions) >= self.bias_thresholds["minimum_sample_size"]: + pass_rate = sum(decisions) / len(decisions) + expected_rate = self.calibration_standards["pass_rates"].get(f"{level}_level", 0.15) + + level_pass_rates[level] = { + "actual_pass_rate": round(pass_rate, 3), + "expected_pass_rate": expected_rate, + "difference": round(pass_rate - expected_rate, 3), + "sample_size": len(decisions) + } + + return { + "score_statistics": { + "mean_score": round(mean_score, 2), + "std_score": round(std_score, 2), + "target_mean": target_mean, + "target_std": target_std, + "mean_deviation": 
round(abs(mean_score - target_mean), 2), + "std_deviation": round(abs(std_score - target_std), 2) + }, + "score_distribution": distribution_analysis, + "level_pass_rates": level_pass_rates, + "overall_assessment": self._assess_scoring_health(distribution_analysis, mean_score, target_mean) + } + + def _analyze_trends_over_time(self, data: List[Dict[str, Any]], period: str) -> Dict[str, Any]: + """Analyze trends in hiring patterns over time.""" + + # Sort data by date + dated_data = [record for record in data if record.get("date")] + dated_data.sort(key=lambda x: x["date"]) + + if len(dated_data) < 10: # Need minimum data for trend analysis + return {"error": "Insufficient data for trend analysis", "minimum_required": 10} + + # Group by time period + period_groups = defaultdict(list) + + for record in dated_data: + date = record["date"] + + if period == "weekly": + period_key = date.strftime("%Y-W%U") + elif period == "monthly": + period_key = date.strftime("%Y-%m") + elif period == "quarterly": + quarter = (date.month - 1) // 3 + 1 + period_key = f"{date.year}-Q{quarter}" + else: # daily + period_key = date.strftime("%Y-%m-%d") + + period_groups[period_key].append(record) + + # Calculate metrics for each period + period_metrics = {} + for period_key, records in period_groups.items(): + if len(records) >= 3: # Minimum for meaningful metrics + scores = [r["average_score"] for r in records] + hire_rate = sum(r["hire_decision"] for r in records) / len(records) + + period_metrics[period_key] = { + "count": len(records), + "mean_score": statistics.mean(scores), + "hire_rate": hire_rate, + "std_score": statistics.stdev(scores) if len(scores) > 1 else 0 + } + + if len(period_metrics) < 3: + return {"error": "Insufficient periods for trend analysis"} + + # Analyze trends + sorted_periods = sorted(period_metrics.keys()) + mean_scores = [period_metrics[p]["mean_score"] for p in sorted_periods] + hire_rates = [period_metrics[p]["hire_rate"] for p in sorted_periods] + + # Simple 
def _calculate_linear_trend(self, values: List[float]) -> float:
    """Return the least-squares slope of *values* against their positions.

    The x-coordinates are the indices ``0 .. len(values) - 1``.  With fewer
    than two points, or a zero x-variance, the slope is reported as 0.
    """
    count = len(values)
    if count < 2:
        return 0

    positions = list(range(count))
    centre_x = statistics.mean(positions)
    centre_y = statistics.mean(values)

    # Accumulate the covariance and x-variance sums in a single pass.
    covariance = 0
    spread_x = 0
    for pos, val in zip(positions, values):
        offset = pos - centre_x
        covariance += offset * (val - centre_y)
        spread_x += offset ** 2

    if spread_x == 0:
        return 0
    return covariance / spread_x
def _calculate_interviewer_consistency(self, records: List[Dict[str, Any]]) -> float:
    """Score how consistently one interviewer evaluates candidates (0-1).

    The result is the average of two components:

    * decision consistency - how far the mean score of hired candidates lies
      above the mean score of rejected ones (a 2-point gap or more scores
      1.0; 0.5 is used when only one kind of decision is present);
    * variance consistency - ``1 - variance / 2`` of the average scores,
      floored at 0.

    Args:
        records: One interviewer's records, each with ``average_score`` and
            ``hire_decision``.

    Returns:
        A float between 0 and 1; 0.5 when fewer than three records exist.
    """
    if len(records) < 3:
        return 0.5  # Neutral score for insufficient data

    # Split the scores by decision in one pass (the original built an unused
    # list of decisions and then scanned the records twice more).
    hire_scores: List[float] = []
    no_hire_scores: List[float] = []
    for record in records:
        bucket = hire_scores if record["hire_decision"] else no_hire_scores
        bucket.append(record["average_score"])

    score_variance = statistics.variance([record["average_score"] for record in records])

    # Good consistency means hires have higher average scores.
    decision_consistency = 0.5
    if hire_scores and no_hire_scores:
        score_gap = statistics.mean(hire_scores) - statistics.mean(no_hire_scores)
        decision_consistency = min(1.0, max(0.0, score_gap / 2.0))  # Normalize to 0-1

    # Lower variance = higher consistency.
    variance_consistency = max(0.0, 1.0 - score_variance / 2.0)

    return (decision_consistency + variance_consistency) / 2
def _calculate_bias_score(self, bias_analysis: Dict[str, Any]) -> float:
    """Fold the bias findings into one score capped at 1 (higher = more biased).

    Contributions:
        * 0.3 for every demographic entry with ``bias_detected``;
        * ``0.1 * total severity`` of outlier interviewers, capped at 0.5;
        * 0.2 for every competency entry with ``bias_detected``.
    """
    demographic = bias_analysis.get("demographic_bias", {})
    penalties = [0.3 for finding in demographic.values() if finding.get("bias_detected")]

    outliers = bias_analysis.get("interviewer_bias", {}).get("outlier_interviewers", {})
    if outliers:
        # Scale by severity and number of outliers.
        severity_sum = sum(entry["severity"] for entry in outliers.values())
        penalties.append(min(0.5, severity_sum * 0.1))

    competency = bias_analysis.get("competency_bias", {})
    penalties.extend(0.2 for finding in competency.values() if finding.get("bias_detected"))

    return min(1.0, sum(penalties))
health_factors = [] + + # Bias score (lower is better) + bias_analysis = analysis.get("bias_analysis", {}) + bias_score = bias_analysis.get("overall_bias_score", 0) + bias_health = max(0, 1 - bias_score) + health_factors.append(("bias", bias_health, 0.3)) + + # Calibration consistency + calibration_analysis = analysis.get("calibration_analysis", {}) + if "calibration_quality" in calibration_analysis: + quality_map = {"good": 1.0, "fair": 0.7, "poor": 0.3} + calibration_health = quality_map.get(calibration_analysis["calibration_quality"], 0.5) + health_factors.append(("calibration", calibration_health, 0.25)) + + # Interviewer consistency + interviewer_analysis = analysis.get("interviewer_analysis", {}) + overall_consistency = interviewer_analysis.get("overall_consistency", {}) + if "mean_consistency" in overall_consistency: + consistency_health = overall_consistency["mean_consistency"] + health_factors.append(("interviewer_consistency", consistency_health, 0.25)) + + # Scoring patterns health + scoring_analysis = analysis.get("scoring_analysis", {}) + if "overall_assessment" in scoring_analysis: + assessment_map = {"healthy": 1.0, "concerning": 0.6, "poor": 0.2} + scoring_health = assessment_map.get(scoring_analysis["overall_assessment"], 0.5) + health_factors.append(("scoring_patterns", scoring_health, 0.2)) + + # Calculate weighted average + if health_factors: + weighted_sum = sum(score * weight for _, score, weight in health_factors) + total_weight = sum(weight for _, _, weight in health_factors) + overall_score = weighted_sum / total_weight + else: + overall_score = 0.5 # Neutral if no data + + # Categorize health + if overall_score >= 0.8: + health_category = "excellent" + elif overall_score >= 0.7: + health_category = "good" + elif overall_score >= 0.5: + health_category = "fair" + else: + health_category = "poor" + + return { + "overall_score": round(overall_score, 3), + "health_category": health_category, + "component_scores": {name: round(score, 3) for 
def _generate_recommendations(self, analysis: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Turn the analysis results into a prioritized list of recommendations.

    Sources, in the order they are examined:
        * demographic groups with ``bias_detected`` (priority "high");
        * outlier interviewers ("high" when severity >= 3, else "medium");
        * calibration quality of "fair" or "poor" ("high");
        * scoring assessment of "concerning" or "poor" ("medium");
        * "bias" listed among the health score's improvement priorities
          ("critical").

    Args:
        analysis: The assembled analysis dict; missing sections are treated
            as empty.

    Returns:
        Recommendation dicts (``priority``, ``category``, ``title``,
        ``description``, ``actions``) sorted critical -> high -> medium ->
        low; the sort is stable, so ties keep the order listed above.
    """
    # Coaching actions per issue label, in the order they should be listed.
    coaching_actions = {
        "score_inflation": [
            "Provide calibration training on scoring standards",
            "Shadow experienced interviewers for recalibration",
            "Review examples of each score level",
        ],
        "score_deflation": [
            "Review expectations for role level",
            "Calibrate against recent successful hires",
            "Discuss evaluation criteria with hiring manager",
        ],
        "hire_rate_deviation": [
            "Review hiring bar standards",
            "Participate in calibration sessions",
            "Compare decision criteria with team",
        ],
        "low_consistency": [
            "Practice structured interviewing techniques",
            "Use standardized scorecards",
            "Document specific examples for each score",
        ],
    }

    recommendations = []

    # Bias-related recommendations
    bias_analysis = analysis.get("bias_analysis", {})

    # Demographic bias recommendations
    for demo, demo_analysis in bias_analysis.get("demographic_bias", {}).items():
        if demo_analysis.get("bias_detected"):
            recommendations.append({
                "priority": "high",
                "category": "bias_mitigation",
                "title": f"Address {demo.replace('_', ' ').title()} Bias",
                "description": demo_analysis.get(
                    "recommendation", f"Implement bias mitigation strategies for {demo}"
                ),
                "actions": [
                    "Conduct unconscious bias training focused on this demographic",
                    "Review and standardize interview questions",
                    "Implement diverse interview panels",
                    "Monitor hiring metrics by demographic group",
                ],
            })

    # Interviewer-specific recommendations
    outlier_interviewers = bias_analysis.get("interviewer_bias", {}).get("outlier_interviewers", {})
    for interviewer_id, outlier_info in outlier_interviewers.items():
        issues = outlier_info["issues"]

        actions = []
        for issue, issue_actions in coaching_actions.items():
            if issue in issues:
                actions.extend(issue_actions)

        recommendations.append({
            "priority": "high" if outlier_info["severity"] >= 3 else "medium",
            "category": "interviewer_coaching",
            "title": f"Coach Interviewer {interviewer_id}",
            "description": f"Address issues: {', '.join(issues)}",
            # dict.fromkeys de-duplicates while keeping the listed order;
            # list(set(...)) shuffled the actions differently on every run.
            "actions": list(dict.fromkeys(actions)),
        })

    # Calibration recommendations
    calibration_analysis = analysis.get("calibration_analysis", {})
    if calibration_analysis.get("calibration_quality") in ["fair", "poor"]:
        recommendations.append({
            "priority": "high",
            "category": "calibration_improvement",
            "title": "Improve Interview Calibration",
            "description": f"Current calibration quality: {calibration_analysis.get('calibration_quality')}",
            "actions": [
                "Conduct monthly calibration sessions",
                "Create shared examples of good/poor answers",
                "Implement mandatory interviewer shadowing",
                "Standardize scoring rubrics across all interviewers",
                "Review and align on role expectations",
            ],
        })

    # Scoring pattern recommendations
    scoring_analysis = analysis.get("scoring_analysis", {})
    if scoring_analysis.get("overall_assessment") in ["concerning", "poor"]:
        recommendations.append({
            "priority": "medium",
            "category": "scoring_standards",
            "title": "Adjust Scoring Standards",
            "description": "Scoring patterns deviate significantly from expected distribution",
            "actions": [
                "Review and communicate target score distributions",
                "Provide examples for each score level",
                "Monitor pass rates by role level",
                "Adjust hiring bar if consistently too high/low",
            ],
        })

    # Health score recommendations
    health_score = analysis.get("calibration_health_score", {})
    if "bias" in health_score.get("improvement_priority", []):
        recommendations.append({
            "priority": "critical",
            "category": "bias_mitigation",
            "title": "Implement Comprehensive Bias Mitigation",
            "description": "Multiple bias indicators detected across the hiring process",
            "actions": [
                "Mandatory unconscious bias training for all interviewers",
                "Implement structured interview protocols",
                "Diversify interview panels",
                "Regular bias audits and monitoring",
                "Create accountability metrics for fair hiring",
            ],
        })

    # Sort by priority (unknown priorities sort with "low").
    priority_order = {"critical": 0, "high": 1, "medium": 2, "low": 3}
    recommendations.sort(key=lambda rec: priority_order.get(rec["priority"], 3))

    return recommendations
def _generate_interviewer_recommendations(self, outlier_interviewers: Dict[str, Any]) -> List[str]:
    """Produce one coaching recommendation per outlier interviewer.

    An interviewer with two or more issues gets a single "comprehensive
    recalibration" line.  Otherwise the line is chosen from the one issue
    present.  The original only handled ``score_inflation`` and
    ``hire_rate_deviation``, so an interviewer flagged solely for
    ``score_deflation`` or ``low_consistency`` silently received no
    recommendation; those cases, and unrecognised labels, are now covered.

    Args:
        outlier_interviewers: Mapping of interviewer id to a dict holding an
            ``issues`` list.

    Returns:
        A list of recommendation strings, or a single "all within expected
        ranges" entry when there are no outliers.
    """
    if not outlier_interviewers:
        return ["All interviewers performing within expected ranges"]

    # Advice for a lone issue.
    single_issue_advice = {
        "score_inflation": "Provide calibration training on scoring standards",
        "hire_rate_deviation": "Review hiring bar standards and decision criteria",
        "score_deflation": "Review role-level expectations and calibrate against recent successful hires",
        "low_consistency": "Practice structured interviewing and use standardized scorecards",
    }

    recommendations = []
    for interviewer, info in outlier_interviewers.items():
        issues = info["issues"]
        if not issues:
            continue  # Nothing flagged, nothing to recommend.

        if len(issues) >= 2:
            advice = "Requires comprehensive recalibration - multiple issues detected"
        else:
            issue = issues[0]
            # Unknown labels still surface instead of being dropped.
            advice = single_issue_advice.get(issue, f"Review flagged issue: {issue}")

        recommendations.append(f"Interviewer {interviewer}: {advice}")

    return recommendations
def _generate_trend_insights(self, score_trend: float, hire_rate_trend: float, period_metrics: Dict[str, Any]) -> List[str]:
    """Describe notable score and hire-rate trends in plain sentences.

    A score slope beyond +/-0.05 or a hire-rate slope beyond +/-0.02 adds a
    headline plus a possible explanation.  A hire-rate variance across
    periods above 0.01 adds a standardization hint.  With nothing to report,
    a single "stable" message is returned.
    """
    notes: List[str] = []

    if abs(score_trend) > 0.05:
        if score_trend > 0:
            heading = "increasing"
            reading = "May indicate score inflation or improving candidate quality"
        else:
            heading = "decreasing"
            reading = "May indicate stricter evaluation or declining candidate quality"
        notes += [f"Significant {heading} trend in average scores over time", reading]

    if abs(hire_rate_trend) > 0.02:
        if hire_rate_trend > 0:
            heading = "increasing"
            reading = "Consider if hiring bar has lowered or candidate pool improved"
        else:
            heading = "decreasing"
            reading = "Consider if hiring bar has raised or candidate pool declined"
        notes += [f"Significant {heading} trend in hire rates over time", reading]

    # Check for consistency: the variance needs at least two periods.
    rates = [metrics["hire_rate"] for metrics in period_metrics.values()]
    if len(rates) > 1 and statistics.variance(rates) > 0.01:
        notes.append("High variance in hire rates across periods - consider process standardization")

    return notes or ["Hiring patterns appear stable over time"]
def format_human_readable(calibration_report: Dict[str, Any]) -> str:
    """Render a calibration report dict as a plain-text summary.

    Sections are emitted only when the matching key is present and
    non-empty: data summary, health score, bias analysis, calibration
    consistency, scoring patterns and the top five recommendations.  A
    report carrying an ``error`` key is rendered as header plus error only.

    Args:
        calibration_report: The analysis result to render.

    Returns:
        The report as a newline-joined string.
    """
    output = []

    # Header
    output.append("HIRING CALIBRATION ANALYSIS REPORT")
    output.append("=" * 60)
    output.append(f"Analysis Type: {calibration_report.get('analysis_type', 'N/A').title()}")
    output.append(f"Generated: {calibration_report.get('generated_at', 'N/A')}")

    if "error" in calibration_report:
        output.append(f"\nError: {calibration_report['error']}")
        return "\n".join(output)

    # Data Summary
    data_summary = calibration_report.get("data_summary", {})
    if data_summary:
        output.append("\nDATA SUMMARY")
        output.append("-" * 30)
        output.append(f"Total Candidates: {data_summary.get('total_candidates', 0)}")
        output.append(f"Unique Interviewers: {data_summary.get('unique_interviewers', 0)}")
        output.append(f"Overall Hire Rate: {data_summary.get('hire_rate', 0):.1%}")

        score_stats = data_summary.get("score_statistics", {})
        output.append(f"Average Score: {score_stats.get('mean_average_scores', 0):.2f}")
        output.append(f"Score Std Dev: {score_stats.get('std_average_scores', 0):.2f}")

    # Health Score
    health_score = calibration_report.get("calibration_health_score", {})
    if health_score:
        output.append("\nCALIBRATION HEALTH SCORE")
        output.append("-" * 30)
        output.append(f"Overall Score: {health_score.get('overall_score', 0):.3f}")
        output.append(f"Health Category: {health_score.get('health_category', 'Unknown').title()}")

        if health_score.get("improvement_priority"):
            output.append(f"Priority Areas: {', '.join(health_score['improvement_priority'])}")

    # Bias Analysis
    bias_analysis = calibration_report.get("bias_analysis", {})
    if bias_analysis:
        output.append("\nBIAS ANALYSIS")
        output.append("-" * 30)
        output.append(f"Overall Bias Score: {bias_analysis.get('overall_bias_score', 0):.3f}")

        # Demographic bias
        demographic_bias = bias_analysis.get("demographic_bias", {})
        if demographic_bias:
            output.append("\nDemographic Bias Issues:")
            for demo, analysis in demographic_bias.items():
                # Join the detail names explicitly: interpolating .keys()
                # printed a "dict_keys([...])" repr, and a None value crashed.
                detail_names = ", ".join(str(key) for key in (analysis.get("bias_details") or {}))
                output.append(f" • {demo.replace('_', ' ').title()}: {detail_names or 'none'}")

        # Interviewer bias
        interviewer_bias = bias_analysis.get("interviewer_bias", {})
        outlier_interviewers = interviewer_bias.get("outlier_interviewers", {})
        if outlier_interviewers:
            output.append("\nOutlier Interviewers:")
            for interviewer, info in outlier_interviewers.items():
                issues = ", ".join(info["issues"])
                output.append(f" • {interviewer}: {issues}")

    # Calibration Analysis (skipped when that analysis reported an error)
    calibration_analysis = calibration_report.get("calibration_analysis", {})
    if calibration_analysis and "error" not in calibration_analysis:
        output.append("\nCALIBRATION CONSISTENCY")
        output.append("-" * 30)
        output.append(f"Quality: {calibration_analysis.get('calibration_quality', 'Unknown').title()}")
        output.append(f"Agreement Rate: {calibration_analysis.get('agreement_within_one_point_rate', 0):.1%}")
        output.append(f"Score Std Dev: {calibration_analysis.get('mean_score_standard_deviation', 0):.3f}")

    # Scoring Analysis
    scoring_analysis = calibration_report.get("scoring_analysis", {})
    if scoring_analysis:
        output.append("\nSCORING PATTERNS")
        output.append("-" * 30)
        output.append(f"Overall Assessment: {scoring_analysis.get('overall_assessment', 'Unknown').title()}")

        score_stats = scoring_analysis.get("score_statistics", {})
        output.append(
            f"Mean Score: {score_stats.get('mean_score', 0):.2f} "
            f"(Target: {score_stats.get('target_mean', 0):.2f})"
        )

        # Distribution analysis
        distribution = scoring_analysis.get("score_distribution", {})
        if distribution:
            output.append("\nScore Distribution vs Expected:")
            for score in ["1", "2", "3", "4"]:
                if score in distribution:
                    actual = distribution[score]["actual_percentage"]
                    expected = distribution[score]["expected_percentage"]
                    output.append(f" Score {score}: {actual:.1%} (Expected: {expected:.1%})")

    # Top Recommendations
    recommendations = calibration_report.get("recommendations", [])
    if recommendations:
        output.append("\nTOP RECOMMENDATIONS")
        output.append("-" * 30)
        for i, rec in enumerate(recommendations[:5], 1):  # Show top 5
            output.append(f"{i}. {rec['title']} ({rec['priority'].title()} Priority)")
            output.append(f" {rec['description']}")
            if rec.get("actions"):
                output.append(f" Actions: {len(rec['actions'])} specific action items")

    return "\n".join(output)
default="both", help="Output format") + + args = parser.parse_args() + + # Load input data + try: + with open(args.input, 'r') as f: + interview_data = json.load(f) + + if not isinstance(interview_data, list): + print("Error: Input data must be a JSON array of interview records") + sys.exit(1) + except FileNotFoundError: + print(f"Error: Input file '{args.input}' not found") + sys.exit(1) + except json.JSONDecodeError as e: + print(f"Error: Invalid JSON in input file: {e}") + sys.exit(1) + except Exception as e: + print(f"Error reading input file: {e}") + sys.exit(1) + + # Initialize calibrator and run analysis + calibrator = HiringCalibrator() + + competencies = args.competencies.split(',') if args.competencies else None + + try: + results = calibrator.analyze_hiring_calibration( + interview_data=interview_data, + analysis_type=args.analysis_type, + competencies=competencies, + trend_analysis=args.trend_analysis, + period=args.period + ) + + # Handle output + if args.output: + output_path = args.output + json_path = output_path if output_path.endswith('.json') else f"{output_path}.json" + text_path = output_path.replace('.json', '.txt') if output_path.endswith('.json') else f"{output_path}.txt" + else: + base_filename = f"calibration_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}" + json_path = f"{base_filename}.json" + text_path = f"{base_filename}.txt" + + # Write outputs + if args.format in ["json", "both"]: + with open(json_path, 'w') as f: + json.dump(results, f, indent=2, default=str) + print(f"JSON report written to: {json_path}") + + if args.format in ["text", "both"]: + with open(text_path, 'w') as f: + f.write(format_human_readable(results)) + print(f"Text report written to: {text_path}") + + # Print summary + print(f"\nCalibration Analysis Summary:") + if "error" in results: + print(f"Error: {results['error']}") + else: + health_score = results.get("calibration_health_score", {}) + print(f"Health Score: {health_score.get('overall_score', 0):.3f} 
({health_score.get('health_category', 'Unknown').title()})") + + bias_score = results.get("bias_analysis", {}).get("overall_bias_score", 0) + print(f"Bias Score: {bias_score:.3f} (Lower is better)") + + recommendations = results.get("recommendations", []) + print(f"Recommendations Generated: {len(recommendations)}") + + if recommendations: + print(f"Top Priority: {recommendations[0]['title']} ({recommendations[0]['priority'].title()})") + + except Exception as e: + print(f"Error during analysis: {e}") + sys.exit(1) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/engineering/interview-system-designer/loop_designer.py b/engineering/interview-system-designer/loop_designer.py new file mode 100644 index 0000000..b6cf046 --- /dev/null +++ b/engineering/interview-system-designer/loop_designer.py @@ -0,0 +1,908 @@ +#!/usr/bin/env python3 +""" +Interview Loop Designer + +Generates calibrated interview loops tailored to specific roles, levels, and teams. +Creates complete interview loops with rounds, focus areas, time allocation, +interviewer skill requirements, and scorecard templates. 
class InterviewLoopDesigner:
    """Designs comprehensive interview loops based on role requirements.

    The designer combines three static lookup tables built in ``__init__``:
    per-role/per-level competency frameworks, per-role round templates, and
    per-round interviewer skill requirements. ``generate_interview_loop`` is
    the public entry point; every other method is an internal helper.
    """

    # Seniority ladder used to pick the nearest level when a role does not
    # define the requested one.
    _LEVEL_ORDER = ("junior", "mid", "senior", "staff", "principal")

    # Duration given to rounds that have no predefined definition.
    _DEFAULT_ROUND_DURATION_MINUTES = 60

    def __init__(self):
        self.competency_frameworks = self._init_competency_frameworks()
        self.role_templates = self._init_role_templates()
        self.interviewer_skills = self._init_interviewer_skills()

    def _init_competency_frameworks(self) -> Dict[str, Dict]:
        """Initialize competency frameworks for different roles.

        Returns:
            Mapping of role key -> level key -> dict with ``required``,
            ``preferred`` and ``focus_areas`` lists.
        """
        return {
            "software_engineer": {
                "junior": {
                    "required": ["coding_fundamentals", "debugging", "testing_basics", "version_control"],
                    "preferred": ["system_understanding", "code_review", "collaboration"],
                    "focus_areas": ["technical_execution", "learning_agility", "team_collaboration"]
                },
                "mid": {
                    "required": ["advanced_coding", "system_design_basics", "testing_strategy", "debugging_complex"],
                    "preferred": ["mentoring_basics", "technical_communication", "project_ownership"],
                    "focus_areas": ["technical_depth", "system_thinking", "ownership"]
                },
                "senior": {
                    "required": ["system_architecture", "technical_leadership", "mentoring", "cross_team_collab"],
                    "preferred": ["technology_evaluation", "process_improvement", "hiring_contribution"],
                    "focus_areas": ["technical_leadership", "system_architecture", "people_development"]
                },
                "staff": {
                    "required": ["architectural_vision", "organizational_impact", "technical_strategy", "team_building"],
                    "preferred": ["industry_influence", "innovation_leadership", "executive_communication"],
                    "focus_areas": ["organizational_impact", "technical_vision", "strategic_influence"]
                },
                "principal": {
                    "required": ["company_wide_impact", "technical_vision", "talent_development", "strategic_planning"],
                    "preferred": ["industry_leadership", "board_communication", "market_influence"],
                    "focus_areas": ["strategic_leadership", "organizational_transformation", "external_influence"]
                }
            },
            "product_manager": {
                "junior": {
                    "required": ["product_execution", "user_research", "data_analysis", "stakeholder_comm"],
                    "preferred": ["market_awareness", "technical_understanding", "project_management"],
                    "focus_areas": ["execution_excellence", "user_focus", "analytical_thinking"]
                },
                "mid": {
                    "required": ["product_strategy", "cross_functional_leadership", "metrics_design", "market_analysis"],
                    "preferred": ["team_building", "technical_collaboration", "competitive_analysis"],
                    "focus_areas": ["strategic_thinking", "leadership", "business_impact"]
                },
                "senior": {
                    "required": ["business_strategy", "team_leadership", "p&l_ownership", "market_positioning"],
                    "preferred": ["hiring_leadership", "board_communication", "partnership_development"],
                    "focus_areas": ["business_leadership", "market_strategy", "organizational_impact"]
                },
                "staff": {
                    "required": ["portfolio_management", "organizational_leadership", "strategic_planning", "market_creation"],
                    "preferred": ["executive_presence", "investor_relations", "acquisition_strategy"],
                    "focus_areas": ["strategic_leadership", "market_innovation", "organizational_transformation"]
                }
            },
            "designer": {
                "junior": {
                    "required": ["design_fundamentals", "user_research", "prototyping", "design_tools"],
                    "preferred": ["user_empathy", "visual_design", "collaboration"],
                    "focus_areas": ["design_execution", "user_research", "creative_problem_solving"]
                },
                "mid": {
                    "required": ["design_systems", "user_testing", "cross_functional_collab", "design_strategy"],
                    "preferred": ["mentoring", "process_improvement", "business_understanding"],
                    "focus_areas": ["design_leadership", "system_thinking", "business_impact"]
                },
                "senior": {
                    "required": ["design_leadership", "team_building", "strategic_design", "stakeholder_management"],
                    "preferred": ["design_culture", "hiring_leadership", "executive_communication"],
                    "focus_areas": ["design_strategy", "team_leadership", "organizational_impact"]
                }
            },
            "data_scientist": {
                "junior": {
                    "required": ["statistical_analysis", "python_r", "data_visualization", "sql"],
                    "preferred": ["machine_learning", "business_understanding", "communication"],
                    "focus_areas": ["analytical_skills", "technical_execution", "business_impact"]
                },
                "mid": {
                    "required": ["advanced_ml", "experiment_design", "data_engineering", "stakeholder_comm"],
                    "preferred": ["mentoring", "project_leadership", "product_collaboration"],
                    "focus_areas": ["advanced_analytics", "project_leadership", "cross_functional_impact"]
                },
                "senior": {
                    "required": ["data_strategy", "team_leadership", "ml_systems", "business_strategy"],
                    "preferred": ["hiring_leadership", "executive_communication", "technology_evaluation"],
                    "focus_areas": ["strategic_leadership", "technical_vision", "organizational_impact"]
                }
            },
            "devops_engineer": {
                "junior": {
                    "required": ["infrastructure_basics", "scripting", "monitoring", "troubleshooting"],
                    "preferred": ["automation", "cloud_platforms", "security_awareness"],
                    "focus_areas": ["operational_excellence", "automation_mindset", "problem_solving"]
                },
                "mid": {
                    "required": ["ci_cd_design", "infrastructure_as_code", "security_implementation", "performance_optimization"],
                    "preferred": ["team_collaboration", "incident_management", "capacity_planning"],
                    "focus_areas": ["system_reliability", "automation_leadership", "cross_team_collaboration"]
                },
                "senior": {
                    "required": ["platform_architecture", "team_leadership", "security_strategy", "organizational_impact"],
                    "preferred": ["hiring_contribution", "technology_evaluation", "executive_communication"],
                    "focus_areas": ["platform_leadership", "strategic_thinking", "organizational_transformation"]
                }
            },
            "engineering_manager": {
                "junior": {
                    "required": ["team_leadership", "technical_background", "people_management", "project_coordination"],
                    "preferred": ["hiring_experience", "performance_management", "technical_mentoring"],
                    "focus_areas": ["people_leadership", "team_building", "execution_excellence"]
                },
                "senior": {
                    "required": ["organizational_leadership", "strategic_planning", "talent_development", "cross_functional_leadership"],
                    "preferred": ["technical_vision", "culture_building", "executive_communication"],
                    "focus_areas": ["organizational_impact", "strategic_leadership", "talent_development"]
                },
                "staff": {
                    "required": ["multi_team_leadership", "organizational_strategy", "executive_presence", "cultural_transformation"],
                    "preferred": ["board_communication", "market_understanding", "acquisition_integration"],
                    "focus_areas": ["organizational_transformation", "strategic_leadership", "cultural_evolution"]
                }
            }
        }

    def _init_role_templates(self) -> Dict[str, Dict]:
        """Initialize role-specific interview templates.

        Returns:
            Mapping of role key -> dict with ``core_rounds``,
            ``optional_rounds``, ``total_duration_range`` (minutes) and
            ``required_competencies``.
        """
        return {
            "software_engineer": {
                "core_rounds": ["technical_phone_screen", "coding_deep_dive", "system_design", "behavioral"],
                "optional_rounds": ["technical_leadership", "domain_expertise", "culture_fit"],
                "total_duration_range": (180, 360),  # 3-6 hours
                "required_competencies": ["coding", "problem_solving", "communication"]
            },
            "product_manager": {
                "core_rounds": ["product_sense", "analytical_thinking", "execution_process", "behavioral"],
                "optional_rounds": ["strategic_thinking", "technical_collaboration", "leadership"],
                "total_duration_range": (180, 300),  # 3-5 hours
                "required_competencies": ["product_strategy", "analytical_thinking", "stakeholder_management"]
            },
            "designer": {
                "core_rounds": ["portfolio_review", "design_challenge", "collaboration_process", "behavioral"],
                "optional_rounds": ["design_system_thinking", "research_methodology", "leadership"],
                "total_duration_range": (180, 300),  # 3-5 hours
                "required_competencies": ["design_process", "user_empathy", "visual_communication"]
            },
            "data_scientist": {
                "core_rounds": ["technical_assessment", "case_study", "statistical_thinking", "behavioral"],
                "optional_rounds": ["ml_systems", "business_strategy", "technical_leadership"],
                "total_duration_range": (210, 330),  # 3.5-5.5 hours
                "required_competencies": ["statistical_analysis", "programming", "business_acumen"]
            },
            "devops_engineer": {
                "core_rounds": ["technical_assessment", "system_design", "troubleshooting", "behavioral"],
                "optional_rounds": ["security_assessment", "automation_design", "leadership"],
                "total_duration_range": (180, 300),  # 3-5 hours
                "required_competencies": ["infrastructure", "automation", "problem_solving"]
            },
            "engineering_manager": {
                "core_rounds": ["leadership_assessment", "technical_background", "people_management", "behavioral"],
                "optional_rounds": ["strategic_thinking", "hiring_assessment", "culture_building"],
                "total_duration_range": (240, 360),  # 4-6 hours
                "required_competencies": ["people_leadership", "technical_understanding", "strategic_thinking"]
            }
        }

    def _init_interviewer_skills(self) -> Dict[str, Dict]:
        """Initialize interviewer skill requirements for different round types.

        Returns:
            Mapping of round type -> dict with ``required_skills``,
            ``preferred_experience`` and ``calibration_level``.
        """
        return {
            "technical_phone_screen": {
                "required_skills": ["technical_assessment", "coding_evaluation"],
                "preferred_experience": ["same_domain", "senior_level"],
                "calibration_level": "standard"
            },
            "coding_deep_dive": {
                "required_skills": ["advanced_technical", "code_quality_assessment"],
                "preferred_experience": ["senior_engineer", "system_design"],
                "calibration_level": "high"
            },
            "system_design": {
                "required_skills": ["architecture_design", "scalability_assessment"],
                "preferred_experience": ["senior_architect", "large_scale_systems"],
                "calibration_level": "high"
            },
            "behavioral": {
                "required_skills": ["behavioral_interviewing", "competency_assessment"],
                "preferred_experience": ["hiring_manager", "people_leadership"],
                "calibration_level": "standard"
            },
            "technical_leadership": {
                "required_skills": ["leadership_assessment", "technical_mentoring"],
                "preferred_experience": ["engineering_manager", "tech_lead"],
                "calibration_level": "high"
            },
            "product_sense": {
                "required_skills": ["product_evaluation", "market_analysis"],
                "preferred_experience": ["product_manager", "product_leadership"],
                "calibration_level": "high"
            },
            "analytical_thinking": {
                "required_skills": ["data_analysis", "metrics_evaluation"],
                "preferred_experience": ["data_analyst", "product_manager"],
                "calibration_level": "standard"
            },
            "design_challenge": {
                "required_skills": ["design_evaluation", "user_experience"],
                "preferred_experience": ["senior_designer", "design_manager"],
                "calibration_level": "high"
            }
        }

    def generate_interview_loop(self, role: str, level: str, team: Optional[str] = None,
                                competencies: Optional[List[str]] = None) -> Dict[str, Any]:
        """Generate a complete interview loop for the specified role and level.

        Args:
            role: Free-form role title, e.g. ``"Senior Software Engineer"``.
            level: Experience level; aliases such as ``"sr"`` are resolved.
            team: Optional team name, echoed back in the result.
            competencies: Optional extra competencies merged into each
                round's focus areas.

        Returns:
            Dict describing the loop: rounds, suggested schedule, scorecard
            template, interviewer requirements, competency framework and
            calibration notes. ``role``, ``level`` and ``team`` are returned
            exactly as passed in.
        """

        # Normalize inputs
        role_key = role.strip().lower().replace(" ", "_").replace("-", "_")
        level_key = level.strip().lower()

        # Get role template and competency requirements
        if role_key not in self.competency_frameworks:
            role_key = self._find_closest_role(role_key)

        if level_key not in self.competency_frameworks[role_key]:
            level_key = self._find_closest_level(role_key, level_key)

        competency_req = self.competency_frameworks[role_key][level_key]
        role_template = self.role_templates.get(role_key, self.role_templates["software_engineer"])

        # Design the interview loop
        rounds = self._design_rounds(role_key, level_key, competency_req, role_template, competencies)
        schedule = self._create_schedule(rounds)
        scorecard = self._generate_scorecard(role_key, level_key, competency_req)
        interviewer_requirements = self._define_interviewer_requirements(rounds)

        return {
            "role": role,
            "level": level,
            "team": team,
            "generated_at": datetime.now().isoformat(),
            "total_duration_minutes": sum(round_info["duration_minutes"] for round_info in rounds.values()),
            "total_rounds": len(rounds),
            "rounds": rounds,
            "suggested_schedule": schedule,
            "scorecard_template": scorecard,
            "interviewer_requirements": interviewer_requirements,
            "competency_framework": competency_req,
            "calibration_notes": self._generate_calibration_notes(role_key, level_key)
        }

    def _find_closest_role(self, role_key: str) -> str:
        """Find the closest matching role template.

        A complete framework key embedded in the title (for example
        ``devops_engineer`` inside ``senior_devops_engineer``) wins over the
        single-token aliases; otherwise the first aliased token decides.
        Falls back to ``software_engineer`` when nothing matches.
        """
        # Pad with underscores so only whole-token sequences match.
        padded_key = f"_{role_key}_"
        for known_role in self.competency_frameworks:
            if f"_{known_role}_" in padded_key:
                return known_role

        role_mappings = {
            "engineer": "software_engineer",
            "developer": "software_engineer",
            "swe": "software_engineer",
            "backend": "software_engineer",
            "frontend": "software_engineer",
            "fullstack": "software_engineer",
            "pm": "product_manager",
            "product": "product_manager",
            "ux": "designer",
            "ui": "designer",
            "graphic": "designer",
            "designer": "designer",
            "design": "designer",
            "data": "data_scientist",
            "analyst": "data_scientist",
            "ml": "data_scientist",
            "scientist": "data_scientist",
            "ops": "devops_engineer",
            "sre": "devops_engineer",
            "infrastructure": "devops_engineer",
            "devops": "devops_engineer",
            "manager": "engineering_manager",
            "lead": "engineering_manager"
        }

        for key_part in role_key.split("_"):
            if key_part in role_mappings:
                return role_mappings[key_part]

        return "software_engineer"  # Default fallback

    def _find_closest_level(self, role_key: str, level_key: str) -> str:
        """Find the closest matching level for the role.

        Aliases (``sr``, ``entry``, ...) are resolved first. If the role does
        not define the resulting level, the nearest defined level on the
        seniority ladder is used, preferring ``senior`` on ties. Unrecognised
        levels fall back to ``senior`` when available, else the role's first
        defined level.
        """
        available_levels = list(self.competency_frameworks[role_key].keys())

        level_mappings = {
            "entry": "junior",
            "associate": "junior",
            "jr": "junior",
            "mid": "mid",
            "middle": "mid",
            "sr": "senior",
            "senior": "senior",
            "staff": "staff",
            "principal": "principal",
            "lead": "senior",
            "manager": "senior"
        }

        mapped_level = level_mappings.get(level_key, level_key)

        if mapped_level in available_levels:
            return mapped_level

        if mapped_level in self._LEVEL_ORDER:
            target_rank = self._LEVEL_ORDER.index(mapped_level)
            ranked_levels = [lvl for lvl in available_levels if lvl in self._LEVEL_ORDER]
            if ranked_levels:
                return min(
                    ranked_levels,
                    key=lambda lvl: (abs(self._LEVEL_ORDER.index(lvl) - target_rank), lvl != "senior"),
                )

        if "senior" in available_levels:
            return "senior"
        return available_levels[0]

    def _design_rounds(self, role_key: str, level_key: str, competency_req: Dict,
                       role_template: Dict, custom_competencies: Optional[List[str]]) -> Dict[str, Dict]:
        """Design the specific interview rounds based on role and level.

        Starts from the template's core rounds and promotes selected optional
        rounds for senior-and-above levels. Round types without a predefined
        definition receive a generic one, so no core round is dropped and
        ``order`` stays consecutive.

        Returns:
            Mapping of ``round_<order>_<round_type>`` -> round definition
            including ``order`` and ``focus_areas``.
        """
        rounds = {}

        # Determine which rounds to include
        core_rounds = role_template["core_rounds"].copy()
        optional_rounds = role_template["optional_rounds"].copy()

        # Add optional rounds based on level
        if level_key in ["senior", "staff", "principal"]:
            if "technical_leadership" in optional_rounds and role_key in ["software_engineer", "engineering_manager"]:
                core_rounds.append("technical_leadership")
            if "strategic_thinking" in optional_rounds and role_key in ["product_manager", "engineering_manager"]:
                core_rounds.append("strategic_thinking")
            if "design_system_thinking" in optional_rounds and role_key == "designer":
                core_rounds.append("design_system_thinking")

        if level_key in ["staff", "principal"]:
            if "domain_expertise" in optional_rounds:
                core_rounds.append("domain_expertise")

        # Define round details
        round_definitions = self._get_round_definitions()

        for i, round_type in enumerate(core_rounds, 1):
            predefined = round_definitions.get(round_type)
            if predefined is None:
                # Many template rounds (e.g. "case_study") have no predefined
                # entry; synthesize one instead of silently skipping them.
                round_def = self._default_round_definition(round_type)
            else:
                round_def = predefined.copy()
            round_def["order"] = i
            round_def["focus_areas"] = self._customize_focus_areas(round_type, competency_req, custom_competencies)
            rounds[f"round_{i}_{round_type}"] = round_def

        return rounds

    def _default_round_definition(self, round_type: str) -> Dict[str, Any]:
        """Build a generic definition for a round type lacking a predefined one."""
        readable = round_type.replace("_", " ")
        return {
            "name": readable.title(),
            "duration_minutes": self._DEFAULT_ROUND_DURATION_MINUTES,
            "format": "in_person_or_virtual",
            "objectives": [f"Assess {readable} against the expectations of the role and level"],
            "question_types": ["experience_questions", "situational_scenarios"],
            "evaluation_criteria": ["depth_of_experience", "problem_solving_process", "communication_clarity"]
        }

    def _get_round_definitions(self) -> Dict[str, Dict]:
        """Get predefined round definitions with standard durations and formats."""
        return {
            "technical_phone_screen": {
                "name": "Technical Phone Screen",
                "duration_minutes": 45,
                "format": "virtual",
                "objectives": ["Assess coding fundamentals", "Evaluate problem-solving approach", "Screen for basic technical competency"],
                "question_types": ["coding_problems", "technical_concepts", "experience_questions"],
                "evaluation_criteria": ["technical_accuracy", "problem_solving_process", "communication_clarity"]
            },
            "coding_deep_dive": {
                "name": "Coding Deep Dive",
                "duration_minutes": 75,
                "format": "in_person_or_virtual",
                "objectives": ["Evaluate coding skills in depth", "Assess code quality and testing", "Review debugging approach"],
                "question_types": ["complex_coding_problems", "code_review", "testing_strategy"],
                "evaluation_criteria": ["code_quality", "testing_approach", "debugging_skills", "optimization_thinking"]
            },
            "system_design": {
                "name": "System Design",
                "duration_minutes": 75,
                "format": "collaborative_whiteboard",
                "objectives": ["Assess architectural thinking", "Evaluate scalability considerations", "Review trade-off analysis"],
                "question_types": ["system_architecture", "scalability_design", "trade_off_analysis"],
                "evaluation_criteria": ["architectural_thinking", "scalability_awareness", "trade_off_reasoning"]
            },
            "behavioral": {
                "name": "Behavioral Interview",
                "duration_minutes": 45,
                "format": "conversational",
                "objectives": ["Assess cultural fit", "Evaluate past experiences", "Review leadership examples"],
                "question_types": ["star_method_questions", "situational_scenarios", "values_alignment"],
                "evaluation_criteria": ["communication_skills", "leadership_examples", "cultural_alignment"]
            },
            "technical_leadership": {
                "name": "Technical Leadership",
                "duration_minutes": 60,
                "format": "discussion_based",
                "objectives": ["Evaluate mentoring capability", "Assess technical decision making", "Review cross-team collaboration"],
                "question_types": ["leadership_scenarios", "technical_decisions", "mentoring_examples"],
                "evaluation_criteria": ["leadership_potential", "technical_judgment", "influence_skills"]
            },
            "product_sense": {
                "name": "Product Sense",
                "duration_minutes": 75,
                "format": "case_study",
                "objectives": ["Assess product intuition", "Evaluate user empathy", "Review market understanding"],
                "question_types": ["product_scenarios", "feature_prioritization", "user_journey_analysis"],
                "evaluation_criteria": ["product_intuition", "user_empathy", "analytical_thinking"]
            },
            "analytical_thinking": {
                "name": "Analytical Thinking",
                "duration_minutes": 60,
                "format": "data_analysis",
                "objectives": ["Evaluate data interpretation", "Assess metric design", "Review experiment planning"],
                "question_types": ["data_interpretation", "metric_design", "experiment_analysis"],
                "evaluation_criteria": ["analytical_rigor", "metric_intuition", "experimental_thinking"]
            },
            "design_challenge": {
                "name": "Design Challenge",
                "duration_minutes": 90,
                "format": "hands_on_design",
                "objectives": ["Assess design process", "Evaluate user-centered thinking", "Review iteration approach"],
                "question_types": ["design_problems", "user_research", "design_critique"],
                "evaluation_criteria": ["design_process", "user_focus", "visual_communication"]
            },
            "portfolio_review": {
                "name": "Portfolio Review",
                "duration_minutes": 75,
                "format": "presentation_discussion",
                "objectives": ["Review past work", "Assess design thinking", "Evaluate impact measurement"],
                "question_types": ["portfolio_walkthrough", "design_decisions", "impact_stories"],
                "evaluation_criteria": ["design_quality", "process_thinking", "business_impact"]
            }
        }

    def _customize_focus_areas(self, round_type: str, competency_req: Dict,
                               custom_competencies: Optional[List[str]]) -> List[str]:
        """Customize focus areas based on role competency requirements.

        Priority order is: the round's own focus areas, then any custom
        competencies, then the level's ``focus_areas``. Duplicates are
        skipped and the result is capped at five entries.
        """
        base_focus_areas = competency_req.get("focus_areas", [])

        round_focus_mapping = {
            "technical_phone_screen": ["coding_fundamentals", "problem_solving"],
            "coding_deep_dive": ["technical_execution", "code_quality"],
            "system_design": ["system_thinking", "architectural_reasoning"],
            "behavioral": ["cultural_fit", "communication", "teamwork"],
            "technical_leadership": ["leadership", "mentoring", "influence"],
            "product_sense": ["product_intuition", "user_empathy"],
            "analytical_thinking": ["data_analysis", "metric_design"],
            "design_challenge": ["design_process", "user_focus"]
        }

        focus_areas = list(round_focus_mapping.get(round_type, []))

        # Add custom competencies if specified
        if custom_competencies:
            focus_areas.extend([comp for comp in custom_competencies if comp not in focus_areas])

        # Add role-specific focus areas
        focus_areas.extend([area for area in base_focus_areas if area not in focus_areas])

        return focus_areas[:5]  # Limit to top 5 focus areas

    def _create_schedule(self, rounds: Dict[str, Dict]) -> Dict[str, Any]:
        """Create a suggested interview schedule.

        Loops totalling 240 minutes or less are placed on a single day;
        longer loops are split across several days.
        """
        sorted_rounds = sorted(rounds.items(), key=lambda x: x[1]["order"])

        # Calculate optimal scheduling
        total_duration = sum(round_info["duration_minutes"] for _, round_info in sorted_rounds)

        if total_duration <= 240:  # 4 hours or less - single day
            schedule_type = "single_day"
            day_structure = self._create_single_day_schedule(sorted_rounds)
        else:  # Multi-day schedule
            schedule_type = "multi_day"
            day_structure = self._create_multi_day_schedule(sorted_rounds)

        return {
            "type": schedule_type,
            "total_duration_minutes": total_duration,
            "recommended_breaks": self._calculate_breaks(total_duration),
            "day_structure": day_structure,
            "logistics_notes": self._generate_logistics_notes(sorted_rounds)
        }

    def _create_single_day_schedule(self, rounds: List[Tuple[str, Dict]]) -> Dict[str, Any]:
        """Create a single-day interview schedule starting at 09:00.

        A 15-minute break is inserted before a round whenever at least 90
        minutes of interviews have already been scheduled.
        """
        start_time = datetime.strptime("09:00", "%H:%M")
        current_time = start_time

        schedule = []
        # NOTE(review): this total is cumulative and never reset, so once it
        # reaches 90 a break precedes every remaining round. Confirm whether
        # "minutes since the last break" was intended.
        interview_minutes = 0

        for round_name, round_info in rounds:
            # Add break if needed (after 90 minutes of interviews)
            if interview_minutes >= 90:
                schedule.append({
                    "type": "break",
                    "start_time": current_time.strftime("%H:%M"),
                    "duration_minutes": 15,
                    "end_time": (current_time + timedelta(minutes=15)).strftime("%H:%M")
                })
                current_time += timedelta(minutes=15)

            # Add the interview round
            end_time = current_time + timedelta(minutes=round_info["duration_minutes"])
            schedule.append({
                "type": "interview",
                "round_name": round_name,
                "title": round_info["name"],
                "start_time": current_time.strftime("%H:%M"),
                "end_time": end_time.strftime("%H:%M"),
                "duration_minutes": round_info["duration_minutes"],
                "format": round_info["format"]
            })
            interview_minutes += round_info["duration_minutes"]
            current_time = end_time

        return {
            "day_1": {
                "date": "TBD",
                "start_time": start_time.strftime("%H:%M"),
                "end_time": current_time.strftime("%H:%M"),
                "rounds": schedule
            }
        }

    def _create_multi_day_schedule(self, rounds: List[Tuple[str, Dict]]) -> Dict[str, Any]:
        """Create a multi-day interview schedule.

        Rounds are packed in order; each counts for its duration plus a
        15-minute buffer, and a new day starts when adding a round would
        exceed 240 minutes.
        """
        # Split rounds across days (max 4 hours per day)
        max_daily_minutes = 240
        days = {}
        current_day = 1
        current_day_duration = 0
        current_day_rounds = []

        for round_name, round_info in rounds:
            duration = round_info["duration_minutes"] + 15  # Add buffer time

            if current_day_duration + duration > max_daily_minutes and current_day_rounds:
                # Finalize current day
                days[f"day_{current_day}"] = self._finalize_day_schedule(current_day_rounds)
                current_day += 1
                current_day_duration = 0
                current_day_rounds = []

            current_day_rounds.append((round_name, round_info))
            current_day_duration += duration

        # Finalize last day
        if current_day_rounds:
            days[f"day_{current_day}"] = self._finalize_day_schedule(current_day_rounds)

        return days

    def _finalize_day_schedule(self, day_rounds: List[Tuple[str, Dict]]) -> Dict[str, Any]:
        """Finalize the schedule for a specific day.

        Rounds start at 09:00 and are separated by 15-minute buffers; the
        reported day end excludes the buffer after the last round.
        """
        start_time = datetime.strptime("09:00", "%H:%M")
        current_time = start_time
        schedule = []

        for round_name, round_info in day_rounds:
            end_time = current_time + timedelta(minutes=round_info["duration_minutes"])
            schedule.append({
                "type": "interview",
                "round_name": round_name,
                "title": round_info["name"],
                "start_time": current_time.strftime("%H:%M"),
                "end_time": end_time.strftime("%H:%M"),
                "duration_minutes": round_info["duration_minutes"],
                "format": round_info["format"]
            })
            current_time = end_time + timedelta(minutes=15)  # 15-min buffer

        return {
            "date": "TBD",
            "start_time": start_time.strftime("%H:%M"),
            "end_time": (current_time - timedelta(minutes=15)).strftime("%H:%M"),
            "rounds": schedule
        }

    def _calculate_breaks(self, total_duration: int) -> List[Dict[str, Any]]:
        """Calculate recommended breaks based on total duration (minutes)."""
        breaks = []

        if total_duration >= 120:  # 2+ hours
            breaks.append({"type": "short_break", "duration": 15, "after_minutes": 90})

        if total_duration >= 240:  # 4+ hours
            breaks.append({"type": "lunch_break", "duration": 60, "after_minutes": 180})

        if total_duration >= 360:  # 6+ hours
            breaks.append({"type": "short_break", "duration": 15, "after_minutes": 300})

        return breaks

    def _generate_scorecard(self, role_key: str, level_key: str, competency_req: Dict) -> Dict[str, Any]:
        """Generate a scorecard template for the interview loop.

        Required competencies become high-weight dimensions, preferred ones
        medium-weight, followed by three standard dimensions.
        """
        scoring_dimensions = []

        # Add competency-based scoring dimensions
        for competency in competency_req["required"]:
            scoring_dimensions.append({
                "dimension": competency,
                "weight": "high",
                "scale": "1-4",
                "description": f"Assessment of {competency.replace('_', ' ')} competency"
            })

        for competency in competency_req.get("preferred", []):
            scoring_dimensions.append({
                "dimension": competency,
                "weight": "medium",
                "scale": "1-4",
                "description": f"Assessment of {competency.replace('_', ' ')} competency"
            })

        # Add standard dimensions
        standard_dimensions = [
            {"dimension": "communication", "weight": "high", "scale": "1-4"},
            {"dimension": "cultural_fit", "weight": "medium", "scale": "1-4"},
            {"dimension": "learning_agility", "weight": "medium", "scale": "1-4"}
        ]

        scoring_dimensions.extend(standard_dimensions)

        return {
            "scoring_scale": {
                "4": "Exceeds Expectations - Demonstrates mastery beyond required level",
                "3": "Meets Expectations - Solid performance meeting all requirements",
                "2": "Partially Meets - Shows potential but has development areas",
                "1": "Does Not Meet - Significant gaps in required competencies"
            },
            "dimensions": scoring_dimensions,
            "overall_recommendation": {
                "options": ["Strong Hire", "Hire", "No Hire", "Strong No Hire"],
                "criteria": "Based on weighted average and minimum thresholds"
            },
            "calibration_notes": {
                "required": True,
                "min_length": 100,
                "sections": ["strengths", "areas_for_development", "specific_examples"]
            }
        }

    def _define_interviewer_requirements(self, rounds: Dict[str, Dict]) -> Dict[str, Dict]:
        """Define interviewer skill requirements for each round.

        Round types without an entry in ``self.interviewer_skills`` receive
        generic default requirements.
        """
        requirements = {}

        for round_name, round_info in rounds.items():
            # Keys look like "round_<n>_<round_type>"; keep everything after
            # the second underscore.
            round_type = round_name.split("_", 2)[-1]  # Extract round type

            if round_type in self.interviewer_skills:
                skill_req = self.interviewer_skills[round_type].copy()
                skill_req["suggested_interviewers"] = self._suggest_interviewer_profiles(round_type)
                requirements[round_name] = skill_req
            else:
                # Default requirements
                requirements[round_name] = {
                    "required_skills": ["interviewing_basics", "evaluation_skills"],
                    "preferred_experience": ["relevant_domain"],
                    "calibration_level": "standard",
                    "suggested_interviewers": ["experienced_interviewer"]
                }

        return requirements

    def _suggest_interviewer_profiles(self, round_type: str) -> List[str]:
        """Suggest specific interviewer profiles for different round types."""
        profile_mapping = {
            "technical_phone_screen": ["senior_engineer", "tech_lead"],
            "coding_deep_dive": ["senior_engineer", "staff_engineer"],
            "system_design": ["senior_architect", "staff_engineer"],
            "behavioral": ["hiring_manager", "people_manager"],
            "technical_leadership": ["engineering_manager", "senior_staff"],
            "product_sense": ["senior_pm", "product_leader"],
            "analytical_thinking": ["senior_analyst", "data_scientist"],
            "design_challenge": ["senior_designer", "design_manager"]
        }

        return profile_mapping.get(round_type, ["experienced_interviewer"])

    def _generate_calibration_notes(self, role_key: str, level_key: str) -> Dict[str, Any]:
        """Generate calibration notes and best practices."""
        return {
            "hiring_bar_notes": f"Calibrated for {level_key} level {role_key.replace('_', ' ')} role",
            "common_pitfalls": [
                "Avoid comparing candidates to each other rather than to the role standard",
                "Don't let one strong/weak area overshadow overall assessment",
                "Ensure consistent application of evaluation criteria"
            ],
            "calibration_checkpoints": [
                "Review score distribution after every 5 candidates",
                "Conduct monthly interviewer calibration sessions",
                "Track correlation with 6-month performance reviews"
            ],
            "escalation_criteria": [
                "Any candidate receiving all 4s or all 1s",
                "Significant disagreement between interviewers (>1.5 point spread)",
                "Unusual circumstances or accommodations needed"
            ]
        }

    def _generate_logistics_notes(self, rounds: List[Tuple[str, Dict]]) -> List[str]:
        """Generate logistics and coordination notes.

        Always returns four general notes, plus extra notes for each special
        round format (virtual, whiteboard, hands-on design) in use.
        """
        notes = [
            "Coordinate interviewer availability before scheduling",
            "Ensure all interviewers have access to job description and competency requirements",
            "Prepare interview rooms/virtual links for all rounds",
            "Share candidate resume and application with all interviewers"
        ]

        # Add format-specific notes
        formats_used = {round_info["format"] for _, round_info in rounds}

        if "virtual" in formats_used:
            notes.append("Test video conferencing setup before virtual interviews")
            notes.append("Share virtual meeting links with candidate 24 hours in advance")

        if "collaborative_whiteboard" in formats_used:
            notes.append("Prepare whiteboard or collaborative online tool for design sessions")

        if "hands_on_design" in formats_used:
            notes.append("Provide design tools access or ensure candidate can screen share their preferred tools")

        return notes
+ output.append(f"Time: {day_info['start_time']} - {day_info['end_time']}") + + for item in day_info['rounds']: + if item['type'] == 'interview': + output.append(f" {item['start_time']}-{item['end_time']}: {item['title']} ({item['duration_minutes']}min)") + else: + output.append(f" {item['start_time']}-{item['end_time']}: {item['type'].title()} ({item['duration_minutes']}min)") + + # Interviewer Requirements + output.append("\nINTERVIEWER REQUIREMENTS") + output.append("-" * 40) + + for round_name, requirements in loop_data['interviewer_requirements'].items(): + round_display = round_name.split("_", 2)[-1].replace("_", " ").title() + output.append(f"\n{round_display}:") + output.append(f"Required Skills: {', '.join(requirements['required_skills'])}") + output.append(f"Suggested Interviewers: {', '.join(requirements['suggested_interviewers'])}") + output.append(f"Calibration Level: {requirements['calibration_level'].title()}") + + # Scorecard Overview + output.append("\nSCORECARD TEMPLATE") + output.append("-" * 40) + + scorecard = loop_data['scorecard_template'] + output.append("Scoring Scale:") + for score, description in scorecard['scoring_scale'].items(): + output.append(f" {score}: {description}") + + output.append("\nEvaluation Dimensions:") + for dim in scorecard['dimensions']: + output.append(f" • {dim['dimension'].replace('_', ' ').title()} (Weight: {dim['weight']})") + + # Calibration Notes + output.append("\nCALIBRATION NOTES") + output.append("-" * 40) + + calibration = loop_data['calibration_notes'] + output.append(f"Hiring Bar: {calibration['hiring_bar_notes']}") + + output.append("\nCommon Pitfalls:") + for pitfall in calibration['common_pitfalls']: + output.append(f" • {pitfall}") + + return "\n".join(output) + + +def main(): + parser = argparse.ArgumentParser(description="Generate calibrated interview loops for specific roles and levels") + parser.add_argument("--role", type=str, help="Job role title (e.g., 'Senior Software Engineer')") + 
parser.add_argument("--level", type=str, help="Experience level (junior, mid, senior, staff, principal)") + parser.add_argument("--team", type=str, help="Team or department (optional)") + parser.add_argument("--competencies", type=str, help="Comma-separated list of specific competencies to focus on") + parser.add_argument("--input", type=str, help="Input JSON file with role definition") + parser.add_argument("--output", type=str, help="Output directory or file path") + parser.add_argument("--format", choices=["json", "text", "both"], default="both", help="Output format") + + args = parser.parse_args() + + designer = InterviewLoopDesigner() + + # Handle input + if args.input: + try: + with open(args.input, 'r') as f: + role_data = json.load(f) + role = role_data.get('role') or role_data.get('title', '') + level = role_data.get('level', 'senior') + team = role_data.get('team') + competencies = role_data.get('competencies') + except Exception as e: + print(f"Error reading input file: {e}") + sys.exit(1) + else: + if not args.role or not args.level: + print("Error: --role and --level are required when not using --input") + sys.exit(1) + + role = args.role + level = args.level + team = args.team + competencies = args.competencies.split(',') if args.competencies else None + + # Generate interview loop + try: + loop_data = designer.generate_interview_loop(role, level, team, competencies) + + # Handle output + if args.output: + output_path = args.output + if os.path.isdir(output_path): + safe_role = "".join(c for c in role.lower() if c.isalnum() or c in (' ', '-', '_')).replace(' ', '_') + base_filename = f"{safe_role}_{level}_interview_loop" + json_path = os.path.join(output_path, f"{base_filename}.json") + text_path = os.path.join(output_path, f"{base_filename}.txt") + else: + # Use provided path as base + json_path = output_path if output_path.endswith('.json') else f"{output_path}.json" + text_path = output_path.replace('.json', '.txt') if output_path.endswith('.json') 
else f"{output_path}.txt" + else: + safe_role = "".join(c for c in role.lower() if c.isalnum() or c in (' ', '-', '_')).replace(' ', '_') + base_filename = f"{safe_role}_{level}_interview_loop" + json_path = f"{base_filename}.json" + text_path = f"{base_filename}.txt" + + # Write outputs + if args.format in ["json", "both"]: + with open(json_path, 'w') as f: + json.dump(loop_data, f, indent=2, default=str) + print(f"JSON output written to: {json_path}") + + if args.format in ["text", "both"]: + with open(text_path, 'w') as f: + f.write(format_human_readable(loop_data)) + print(f"Text output written to: {text_path}") + + # Always print summary to stdout + print("\nInterview Loop Summary:") + print(f"Role: {loop_data['role']} ({loop_data['level'].title()})") + print(f"Total Duration: {loop_data['total_duration_minutes']} minutes") + print(f"Number of Rounds: {loop_data['total_rounds']}") + print(f"Schedule Type: {loop_data['suggested_schedule']['type'].replace('_', ' ').title()}") + + except Exception as e: + print(f"Error generating interview loop: {e}") + sys.exit(1) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/engineering/interview-system-designer/question_bank_generator.py b/engineering/interview-system-designer/question_bank_generator.py new file mode 100644 index 0000000..7febf87 --- /dev/null +++ b/engineering/interview-system-designer/question_bank_generator.py @@ -0,0 +1,1060 @@ +#!/usr/bin/env python3 +""" +Question Bank Generator + +Generates comprehensive, competency-based interview questions with detailed scoring criteria. +Creates structured question banks organized by competency area with scoring rubrics, +follow-up probes, and calibration examples. 
+ +Usage: + python question_bank_generator.py --role "Frontend Engineer" --competencies react,typescript,system-design + python question_bank_generator.py --role "Product Manager" --question-types behavioral,leadership + python question_bank_generator.py --input role_requirements.json --output questions/ +""" + +import os +import sys +import json +import argparse +import random +from datetime import datetime +from typing import Dict, List, Optional, Any, Tuple +from collections import defaultdict + + +class QuestionBankGenerator: + """Generates comprehensive interview question banks with scoring criteria.""" + + def __init__(self): + self.technical_questions = self._init_technical_questions() + self.behavioral_questions = self._init_behavioral_questions() + self.competency_mapping = self._init_competency_mapping() + self.scoring_rubrics = self._init_scoring_rubrics() + self.follow_up_strategies = self._init_follow_up_strategies() + + def _init_technical_questions(self) -> Dict[str, Dict]: + """Initialize technical questions by competency area and level.""" + return { + "coding_fundamentals": { + "junior": [ + { + "question": "Write a function to reverse a string without using built-in reverse methods.", + "competency": "coding_fundamentals", + "type": "coding", + "difficulty": "easy", + "time_limit": 15, + "key_concepts": ["loops", "string_manipulation", "basic_algorithms"] + }, + { + "question": "Implement a function to check if a string is a palindrome.", + "competency": "coding_fundamentals", + "type": "coding", + "difficulty": "easy", + "time_limit": 15, + "key_concepts": ["string_processing", "comparison", "edge_cases"] + }, + { + "question": "Find the largest element in an array without using built-in max functions.", + "competency": "coding_fundamentals", + "type": "coding", + "difficulty": "easy", + "time_limit": 10, + "key_concepts": ["arrays", "iteration", "comparison"] + } + ], + "mid": [ + { + "question": "Implement a function to find the first 
non-repeating character in a string.", + "competency": "coding_fundamentals", + "type": "coding", + "difficulty": "medium", + "time_limit": 20, + "key_concepts": ["hash_maps", "string_processing", "efficiency"] + }, + { + "question": "Write a function to merge two sorted arrays into one sorted array.", + "competency": "coding_fundamentals", + "type": "coding", + "difficulty": "medium", + "time_limit": 25, + "key_concepts": ["merge_algorithms", "two_pointers", "optimization"] + } + ], + "senior": [ + { + "question": "Implement a LRU (Least Recently Used) cache with O(1) operations.", + "competency": "coding_fundamentals", + "type": "coding", + "difficulty": "hard", + "time_limit": 35, + "key_concepts": ["data_structures", "hash_maps", "doubly_linked_lists"] + } + ] + }, + "system_design": { + "mid": [ + { + "question": "Design a URL shortener service like bit.ly for 10K users.", + "competency": "system_design", + "type": "design", + "difficulty": "medium", + "time_limit": 45, + "key_concepts": ["database_design", "hashing", "basic_scalability"] + } + ], + "senior": [ + { + "question": "Design a real-time chat system supporting 1M concurrent users.", + "competency": "system_design", + "type": "design", + "difficulty": "hard", + "time_limit": 60, + "key_concepts": ["websockets", "load_balancing", "database_sharding", "caching"] + }, + { + "question": "Design a distributed cache system like Redis with high availability.", + "competency": "system_design", + "type": "design", + "difficulty": "hard", + "time_limit": 60, + "key_concepts": ["distributed_systems", "replication", "consistency", "partitioning"] + } + ], + "staff": [ + { + "question": "Design the architecture for a global content delivery network (CDN).", + "competency": "system_design", + "type": "design", + "difficulty": "expert", + "time_limit": 75, + "key_concepts": ["global_architecture", "edge_computing", "content_optimization", "network_protocols"] + } + ] + }, + "frontend_development": { + "junior": [ + 
{ + "question": "Create a responsive navigation menu using HTML, CSS, and vanilla JavaScript.", + "competency": "frontend_development", + "type": "coding", + "difficulty": "easy", + "time_limit": 30, + "key_concepts": ["html_css", "responsive_design", "dom_manipulation"] + } + ], + "mid": [ + { + "question": "Build a React component that fetches and displays paginated data from an API.", + "competency": "frontend_development", + "type": "coding", + "difficulty": "medium", + "time_limit": 45, + "key_concepts": ["react_hooks", "api_integration", "state_management", "pagination"] + } + ], + "senior": [ + { + "question": "Design and implement a custom React hook for managing complex form state with validation.", + "competency": "frontend_development", + "type": "coding", + "difficulty": "hard", + "time_limit": 60, + "key_concepts": ["custom_hooks", "form_validation", "state_management", "performance"] + } + ] + }, + "data_analysis": { + "junior": [ + { + "question": "Given a dataset of user activities, calculate the daily active users for the past month.", + "competency": "data_analysis", + "type": "analytical", + "difficulty": "easy", + "time_limit": 30, + "key_concepts": ["sql_basics", "date_functions", "aggregation"] + } + ], + "mid": [ + { + "question": "Analyze conversion funnel data to identify the biggest drop-off point and propose solutions.", + "competency": "data_analysis", + "type": "analytical", + "difficulty": "medium", + "time_limit": 45, + "key_concepts": ["funnel_analysis", "conversion_optimization", "statistical_significance"] + } + ], + "senior": [ + { + "question": "Design an A/B testing framework to measure the impact of a new recommendation algorithm.", + "competency": "data_analysis", + "type": "analytical", + "difficulty": "hard", + "time_limit": 60, + "key_concepts": ["experiment_design", "statistical_power", "bias_mitigation", "causal_inference"] + } + ] + }, + "machine_learning": { + "mid": [ + { + "question": "Explain how you would build a 
recommendation system for an e-commerce platform.", + "competency": "machine_learning", + "type": "conceptual", + "difficulty": "medium", + "time_limit": 45, + "key_concepts": ["collaborative_filtering", "content_based", "cold_start", "evaluation_metrics"] + } + ], + "senior": [ + { + "question": "Design a real-time fraud detection system for financial transactions.", + "competency": "machine_learning", + "type": "design", + "difficulty": "hard", + "time_limit": 60, + "key_concepts": ["anomaly_detection", "real_time_ml", "feature_engineering", "model_monitoring"] + } + ] + }, + "product_strategy": { + "mid": [ + { + "question": "How would you prioritize features for a mobile app with limited engineering resources?", + "competency": "product_strategy", + "type": "case_study", + "difficulty": "medium", + "time_limit": 45, + "key_concepts": ["prioritization_frameworks", "resource_allocation", "impact_estimation"] + } + ], + "senior": [ + { + "question": "Design a go-to-market strategy for a new B2B SaaS product entering a competitive market.", + "competency": "product_strategy", + "type": "strategic", + "difficulty": "hard", + "time_limit": 60, + "key_concepts": ["market_analysis", "competitive_positioning", "pricing_strategy", "channel_strategy"] + } + ] + } + } + + def _init_behavioral_questions(self) -> Dict[str, List[Dict]]: + """Initialize behavioral questions by competency area.""" + return { + "leadership": [ + { + "question": "Tell me about a time when you had to lead a team through a significant change or challenge.", + "competency": "leadership", + "type": "behavioral", + "method": "STAR", + "focus_areas": ["change_management", "team_motivation", "communication"] + }, + { + "question": "Describe a situation where you had to influence someone without having direct authority over them.", + "competency": "leadership", + "type": "behavioral", + "method": "STAR", + "focus_areas": ["influence", "persuasion", "stakeholder_management"] + }, + { + "question": "Give 
me an example of when you had to make a difficult decision that affected your team.", + "competency": "leadership", + "type": "behavioral", + "method": "STAR", + "focus_areas": ["decision_making", "team_impact", "communication"] + } + ], + "collaboration": [ + { + "question": "Describe a time when you had to work with a difficult colleague or stakeholder.", + "competency": "collaboration", + "type": "behavioral", + "method": "STAR", + "focus_areas": ["conflict_resolution", "relationship_building", "professionalism"] + }, + { + "question": "Tell me about a project where you had to coordinate across multiple teams or departments.", + "competency": "collaboration", + "type": "behavioral", + "method": "STAR", + "focus_areas": ["cross_functional_work", "communication", "project_coordination"] + } + ], + "problem_solving": [ + { + "question": "Walk me through a complex problem you solved recently. What was your approach?", + "competency": "problem_solving", + "type": "behavioral", + "method": "STAR", + "focus_areas": ["analytical_thinking", "methodology", "creativity"] + }, + { + "question": "Describe a time when you had to solve a problem with limited information or resources.", + "competency": "problem_solving", + "type": "behavioral", + "method": "STAR", + "focus_areas": ["resourcefulness", "ambiguity_tolerance", "decision_making"] + } + ], + "communication": [ + { + "question": "Tell me about a time when you had to present complex technical information to a non-technical audience.", + "competency": "communication", + "type": "behavioral", + "method": "STAR", + "focus_areas": ["technical_communication", "audience_adaptation", "clarity"] + }, + { + "question": "Describe a situation where you had to deliver difficult feedback to a colleague.", + "competency": "communication", + "type": "behavioral", + "method": "STAR", + "focus_areas": ["feedback_delivery", "empathy", "constructive_criticism"] + } + ], + "adaptability": [ + { + "question": "Tell me about a time when you 
had to quickly learn a new technology or skill for work.", + "competency": "adaptability", + "type": "behavioral", + "method": "STAR", + "focus_areas": ["learning_agility", "growth_mindset", "knowledge_acquisition"] + }, + { + "question": "Describe how you handled a situation when project requirements changed significantly mid-way.", + "competency": "adaptability", + "type": "behavioral", + "method": "STAR", + "focus_areas": ["flexibility", "change_management", "resilience"] + } + ], + "innovation": [ + { + "question": "Tell me about a time when you came up with a creative solution to improve a process or solve a problem.", + "competency": "innovation", + "type": "behavioral", + "method": "STAR", + "focus_areas": ["creative_thinking", "process_improvement", "initiative"] + } + ] + } + + def _init_competency_mapping(self) -> Dict[str, Dict]: + """Initialize role to competency mapping.""" + return { + "software_engineer": { + "core_competencies": ["coding_fundamentals", "system_design", "problem_solving", "collaboration"], + "level_specific": { + "junior": ["coding_fundamentals", "debugging", "learning_agility"], + "mid": ["advanced_coding", "system_design", "mentoring_basics"], + "senior": ["system_architecture", "technical_leadership", "innovation"], + "staff": ["architectural_vision", "organizational_impact", "strategic_thinking"] + } + }, + "frontend_engineer": { + "core_competencies": ["frontend_development", "ui_ux_understanding", "problem_solving", "collaboration"], + "level_specific": { + "junior": ["html_css_js", "responsive_design", "basic_frameworks"], + "mid": ["react_vue_angular", "state_management", "performance_optimization"], + "senior": ["frontend_architecture", "team_leadership", "cross_functional_collaboration"], + "staff": ["frontend_strategy", "technology_evaluation", "organizational_impact"] + } + }, + "backend_engineer": { + "core_competencies": ["backend_development", "database_design", "api_design", "system_design"], + "level_specific": { + 
"junior": ["server_side_programming", "database_basics", "api_consumption"], + "mid": ["microservices", "caching", "security_basics"], + "senior": ["distributed_systems", "performance_optimization", "technical_leadership"], + "staff": ["system_architecture", "technology_strategy", "cross_team_influence"] + } + }, + "product_manager": { + "core_competencies": ["product_strategy", "user_research", "data_analysis", "stakeholder_management"], + "level_specific": { + "junior": ["feature_specification", "user_stories", "basic_analytics"], + "mid": ["product_roadmap", "cross_functional_leadership", "market_research"], + "senior": ["business_strategy", "team_leadership", "p&l_responsibility"], + "staff": ["portfolio_management", "organizational_strategy", "market_creation"] + } + }, + "data_scientist": { + "core_competencies": ["statistical_analysis", "machine_learning", "data_analysis", "business_acumen"], + "level_specific": { + "junior": ["python_r", "sql", "basic_ml", "data_visualization"], + "mid": ["advanced_ml", "experiment_design", "model_evaluation"], + "senior": ["ml_systems", "data_strategy", "stakeholder_communication"], + "staff": ["data_platform", "ai_strategy", "organizational_impact"] + } + }, + "designer": { + "core_competencies": ["design_process", "user_research", "visual_design", "collaboration"], + "level_specific": { + "junior": ["design_tools", "user_empathy", "visual_communication"], + "mid": ["design_systems", "user_testing", "cross_functional_work"], + "senior": ["design_strategy", "team_leadership", "business_impact"], + "staff": ["design_vision", "organizational_design", "strategic_influence"] + } + }, + "devops_engineer": { + "core_competencies": ["infrastructure", "automation", "monitoring", "troubleshooting"], + "level_specific": { + "junior": ["scripting", "basic_cloud", "ci_cd_basics"], + "mid": ["infrastructure_as_code", "container_orchestration", "security"], + "senior": ["platform_design", "reliability_engineering", "team_leadership"], + 
"staff": ["platform_strategy", "organizational_infrastructure", "technology_vision"] + } + } + } + + def _init_scoring_rubrics(self) -> Dict[str, Dict]: + """Initialize scoring rubrics for different question types.""" + return { + "coding": { + "correctness": { + "4": "Solution is completely correct, handles all edge cases, optimal complexity", + "3": "Solution is correct for main cases, good complexity, minor edge case issues", + "2": "Solution works but has some bugs or suboptimal approach", + "1": "Solution has significant issues or doesn't work" + }, + "code_quality": { + "4": "Clean, readable, well-structured code with excellent naming and comments", + "3": "Good code structure, readable with appropriate naming", + "2": "Code works but has style/structure issues", + "1": "Poor code quality, hard to understand" + }, + "problem_solving_approach": { + "4": "Excellent problem breakdown, clear thinking process, considers alternatives", + "3": "Good approach, logical thinking, systematic problem solving", + "2": "Decent approach but some confusion or inefficiency", + "1": "Poor approach, unclear thinking process" + }, + "communication": { + "4": "Excellent explanation of approach, asks clarifying questions, clear reasoning", + "3": "Good communication, explains thinking well", + "2": "Adequate communication, some explanation", + "1": "Poor communication, little explanation" + } + }, + "behavioral": { + "situation_clarity": { + "4": "Clear, specific situation with relevant context and stakes", + "3": "Good situation description with adequate context", + "2": "Situation described but lacks some specifics", + "1": "Vague or unclear situation description" + }, + "action_quality": { + "4": "Specific, thoughtful actions showing strong competency", + "3": "Good actions demonstrating competency", + "2": "Adequate actions but could be stronger", + "1": "Weak or inappropriate actions" + }, + "result_impact": { + "4": "Significant positive impact with measurable results", + 
"3": "Good positive impact with clear outcomes", + "2": "Some positive impact demonstrated", + "1": "Little or no positive impact shown" + }, + "self_awareness": { + "4": "Excellent self-reflection, learns from experience, acknowledges growth areas", + "3": "Good self-awareness and learning orientation", + "2": "Some self-reflection demonstrated", + "1": "Limited self-awareness or reflection" + } + }, + "design": { + "system_thinking": { + "4": "Comprehensive system view, considers all components and interactions", + "3": "Good system understanding with most components identified", + "2": "Basic system thinking with some gaps", + "1": "Limited system thinking, misses key components" + }, + "scalability": { + "4": "Excellent scalability considerations, multiple strategies discussed", + "3": "Good scalability awareness with practical solutions", + "2": "Basic scalability understanding", + "1": "Little to no scalability consideration" + }, + "trade_offs": { + "4": "Excellent trade-off analysis, considers multiple dimensions", + "3": "Good trade-off awareness with clear reasoning", + "2": "Some trade-off consideration", + "1": "Limited trade-off analysis" + }, + "technical_depth": { + "4": "Deep technical knowledge with implementation details", + "3": "Good technical knowledge with solid understanding", + "2": "Adequate technical knowledge", + "1": "Limited technical depth" + } + } + } + + def _init_follow_up_strategies(self) -> Dict[str, List[str]]: + """Initialize follow-up question strategies by competency.""" + return { + "coding_fundamentals": [ + "How would you optimize this solution for better time complexity?", + "What edge cases should we consider for this problem?", + "How would you test this function?", + "What would happen if the input size was very large?" 
+ ], + "system_design": [ + "How would you handle if the system needed to scale 10x?", + "What would you do if one of your services went down?", + "How would you monitor this system in production?", + "What security considerations would you implement?" + ], + "leadership": [ + "What would you do differently if you faced this situation again?", + "How did you handle team members who were resistant to the change?", + "What metrics did you use to measure success?", + "How did you communicate progress to stakeholders?" + ], + "problem_solving": [ + "Walk me through your thought process step by step", + "What alternative approaches did you consider?", + "How did you validate your solution worked?", + "What did you learn from this experience?" + ], + "collaboration": [ + "How did you build consensus among the different stakeholders?", + "What communication channels did you use to keep everyone aligned?", + "How did you handle disagreements or conflicts?", + "What would you do to improve collaboration in the future?" 
+ ] + } + + def generate_question_bank(self, role: str, level: str = "senior", + competencies: Optional[List[str]] = None, + question_types: Optional[List[str]] = None, + num_questions: int = 20) -> Dict[str, Any]: + """Generate a comprehensive question bank for the specified role and competencies.""" + + # Normalize inputs + role_key = self._normalize_role(role) + level_key = level.lower() + + # Get competency requirements + role_competencies = self._get_role_competencies(role_key, level_key, competencies) + + # Determine question types to include + if question_types is None: + question_types = ["technical", "behavioral", "situational"] + + # Generate questions + questions = self._generate_questions(role_competencies, question_types, level_key, num_questions) + + # Create scoring rubrics + scoring_rubrics = self._create_scoring_rubrics(questions) + + # Generate follow-up probes + follow_up_probes = self._generate_follow_up_probes(questions) + + # Create calibration examples + calibration_examples = self._create_calibration_examples(questions[:5]) # Sample for first 5 questions + + return { + "role": role, + "level": level, + "competencies": role_competencies, + "question_types": question_types, + "generated_at": datetime.now().isoformat(), + "total_questions": len(questions), + "questions": questions, + "scoring_rubrics": scoring_rubrics, + "follow_up_probes": follow_up_probes, + "calibration_examples": calibration_examples, + "usage_guidelines": self._generate_usage_guidelines(role_key, level_key) + } + + def _normalize_role(self, role: str) -> str: + """Normalize role name to match competency mapping keys.""" + role_lower = role.lower().replace(" ", "_").replace("-", "_") + + # Map variations to standard roles + role_mappings = { + "software_engineer": ["engineer", "developer", "swe", "software_developer"], + "frontend_engineer": ["frontend", "front_end", "ui_engineer", "web_developer"], + "backend_engineer": ["backend", "back_end", "server_engineer", 
"api_developer"], + "product_manager": ["pm", "product", "product_owner", "po"], + "data_scientist": ["ds", "data", "analyst", "ml_engineer"], + "designer": ["ux", "ui", "ux_ui", "product_designer", "visual_designer"], + "devops_engineer": ["devops", "sre", "platform_engineer", "infrastructure"] + } + + for standard_role, variations in role_mappings.items(): + if any(var in role_lower for var in variations): + return standard_role + + # Default fallback + return "software_engineer" + + def _get_role_competencies(self, role_key: str, level_key: str, + custom_competencies: Optional[List[str]]) -> List[str]: + """Get competencies for the role and level.""" + if role_key not in self.competency_mapping: + role_key = "software_engineer" + + role_mapping = self.competency_mapping[role_key] + competencies = role_mapping["core_competencies"].copy() + + # Add level-specific competencies + if level_key in role_mapping["level_specific"]: + competencies.extend(role_mapping["level_specific"][level_key]) + elif "senior" in role_mapping["level_specific"]: + competencies.extend(role_mapping["level_specific"]["senior"]) + + # Add custom competencies if specified + if custom_competencies: + competencies.extend([comp.strip() for comp in custom_competencies if comp.strip() not in competencies]) + + return list(set(competencies)) # Remove duplicates + + def _generate_questions(self, competencies: List[str], question_types: List[str], + level: str, num_questions: int) -> List[Dict[str, Any]]: + """Generate questions based on competencies and types.""" + questions = [] + questions_per_competency = max(1, num_questions // len(competencies)) + + for competency in competencies: + competency_questions = [] + + # Add technical questions if requested and available + if "technical" in question_types and competency in self.technical_questions: + tech_questions = [] + + # Get questions for current level and below + level_order = ["junior", "mid", "senior", "staff", "principal"] + current_level_idx 
= level_order.index(level) if level in level_order else 2 + + for lvl_idx in range(current_level_idx + 1): + lvl = level_order[lvl_idx] + if lvl in self.technical_questions[competency]: + tech_questions.extend(self.technical_questions[competency][lvl]) + + competency_questions.extend(tech_questions[:questions_per_competency]) + + # Add behavioral questions if requested + if "behavioral" in question_types and competency in self.behavioral_questions: + behavioral_q = self.behavioral_questions[competency][:questions_per_competency] + competency_questions.extend(behavioral_q) + + # Add situational questions (variations of behavioral) + if "situational" in question_types: + situational_q = self._generate_situational_questions(competency, questions_per_competency) + competency_questions.extend(situational_q) + + # Ensure we have enough questions for this competency + while len(competency_questions) < questions_per_competency: + competency_questions.extend(self._generate_fallback_questions(competency, level)) + if len(competency_questions) >= questions_per_competency: + break + + questions.extend(competency_questions[:questions_per_competency]) + + # Shuffle and limit to requested number + random.shuffle(questions) + return questions[:num_questions] + + def _generate_situational_questions(self, competency: str, count: int) -> List[Dict[str, Any]]: + """Generate situational questions for a competency.""" + situational_templates = { + "leadership": [ + { + "question": "You're leading a project that's behind schedule and the client is unhappy. How do you handle this situation?", + "competency": competency, + "type": "situational", + "focus_areas": ["crisis_management", "client_communication", "team_leadership"] + } + ], + "collaboration": [ + { + "question": "You're working on a cross-functional project and two team members have opposing views on the technical approach. 
How do you resolve this?", + "competency": competency, + "type": "situational", + "focus_areas": ["conflict_resolution", "technical_decision_making", "facilitation"] + } + ], + "problem_solving": [ + { + "question": "You've been assigned to improve the performance of a critical system, but you have limited time and budget. Walk me through your approach.", + "competency": competency, + "type": "situational", + "focus_areas": ["prioritization", "resource_constraints", "systematic_approach"] + } + ] + } + + if competency in situational_templates: + return situational_templates[competency][:count] + return [] + + def _generate_fallback_questions(self, competency: str, level: str) -> List[Dict[str, Any]]: + """Generate fallback questions when specific ones aren't available.""" + fallback_questions = [ + { + "question": f"Describe your experience with {competency.replace('_', ' ')} in your current or previous role.", + "competency": competency, + "type": "experience", + "focus_areas": ["experience_depth", "practical_application"] + }, + { + "question": f"What challenges have you faced related to {competency.replace('_', ' ')} and how did you overcome them?", + "competency": competency, + "type": "challenge_based", + "focus_areas": ["problem_solving", "learning_from_experience"] + } + ] + return fallback_questions + + def _create_scoring_rubrics(self, questions: List[Dict[str, Any]]) -> Dict[str, Dict]: + """Create scoring rubrics for the generated questions.""" + rubrics = {} + + for i, question in enumerate(questions, 1): + question_key = f"question_{i}" + question_type = question.get("type", "behavioral") + + if question_type in self.scoring_rubrics: + rubrics[question_key] = { + "question": question["question"], + "competency": question["competency"], + "type": question_type, + "scoring_criteria": self.scoring_rubrics[question_type], + "weight": self._determine_question_weight(question), + "time_limit": question.get("time_limit", 30) + } + + return rubrics + + def 
_determine_question_weight(self, question: Dict[str, Any]) -> str: + """Determine the weight/importance of a question.""" + competency = question.get("competency", "") + question_type = question.get("type", "") + difficulty = question.get("difficulty", "medium") + + # Core competencies get higher weight + core_competencies = ["coding_fundamentals", "system_design", "leadership", "problem_solving"] + + if competency in core_competencies: + return "high" + elif question_type in ["coding", "design"] or difficulty == "hard": + return "high" + elif difficulty == "easy": + return "medium" + else: + return "medium" + + def _generate_follow_up_probes(self, questions: List[Dict[str, Any]]) -> Dict[str, List[str]]: + """Generate follow-up probes for each question.""" + probes = {} + + for i, question in enumerate(questions, 1): + question_key = f"question_{i}" + competency = question.get("competency", "") + + # Get competency-specific follow-ups + if competency in self.follow_up_strategies: + competency_probes = self.follow_up_strategies[competency].copy() + else: + competency_probes = [ + "Can you provide more specific details about your approach?", + "What would you do differently if you had to do this again?", + "What challenges did you face and how did you overcome them?" + ] + + # Add question-type specific probes + question_type = question.get("type", "") + if question_type == "coding": + competency_probes.extend([ + "How would you test this solution?", + "What's the time and space complexity of your approach?", + "Can you think of any optimizations?" + ]) + elif question_type == "behavioral": + competency_probes.extend([ + "What did you learn from this experience?", + "How did others react to your approach?", + "What metrics did you use to measure success?" + ]) + elif question_type == "design": + competency_probes.extend([ + "How would you handle failure scenarios?", + "What monitoring would you implement?", + "How would this scale to 10x the load?" 
+ ]) + + probes[question_key] = competency_probes[:5] # Limit to 5 follow-ups + + return probes + + def _create_calibration_examples(self, sample_questions: List[Dict[str, Any]]) -> Dict[str, Dict]: + """Create calibration examples with poor/good/great answers.""" + examples = {} + + for i, question in enumerate(sample_questions, 1): + question_key = f"question_{i}" + examples[question_key] = { + "question": question["question"], + "competency": question["competency"], + "sample_answers": { + "poor_answer": self._generate_sample_answer(question, "poor"), + "good_answer": self._generate_sample_answer(question, "good"), + "great_answer": self._generate_sample_answer(question, "great") + }, + "scoring_rationale": self._generate_scoring_rationale(question) + } + + return examples + + def _generate_sample_answer(self, question: Dict[str, Any], quality: str) -> Dict[str, str]: + """Generate sample answers of different quality levels.""" + competency = question.get("competency", "") + question_type = question.get("type", "") + + if quality == "poor": + return { + "answer": f"Sample poor answer for {competency} question - lacks detail, specificity, or demonstrates weak competency", + "score": "1-2", + "issues": ["Vague response", "Limited evidence of competency", "Poor structure"] + } + elif quality == "good": + return { + "answer": f"Sample good answer for {competency} question - adequate detail, demonstrates competency clearly", + "score": "3", + "strengths": ["Clear structure", "Demonstrates competency", "Adequate detail"] + } + else: # great + return { + "answer": f"Sample excellent answer for {competency} question - exceptional detail, strong evidence, goes above and beyond", + "score": "4", + "strengths": ["Exceptional detail", "Strong evidence", "Strategic thinking", "Goes beyond requirements"] + } + + def _generate_scoring_rationale(self, question: Dict[str, Any]) -> Dict[str, str]: + """Generate rationale for scoring this question.""" + competency = 
question.get("competency", "") + return { + "key_indicators": f"Look for evidence of {competency.replace('_', ' ')} competency", + "red_flags": "Vague answers, lack of specifics, negative outcomes without learning", + "green_flags": "Specific examples, clear impact, demonstrates growth and learning" + } + + def _generate_usage_guidelines(self, role_key: str, level_key: str) -> Dict[str, Any]: + """Generate usage guidelines for the question bank.""" + return { + "interview_flow": { + "warm_up": "Start with 1-2 easier questions to build rapport", + "core_assessment": "Focus majority of time on core competency questions", + "closing": "End with questions about candidate's questions/interests" + }, + "time_management": { + "technical_questions": "Allow extra time for coding/design questions", + "behavioral_questions": "Keep to time limits but allow for follow-ups", + "total_recommendation": "45-75 minutes per interview round" + }, + "question_selection": { + "variety": "Mix question types within each competency area", + "difficulty": "Adjust based on candidate responses and energy", + "customization": "Adapt questions based on candidate's background" + }, + "common_mistakes": [ + "Don't ask all questions mechanically", + "Don't skip follow-up questions", + "Don't forget to assess cultural fit alongside competencies", + "Don't let one strong/weak area bias overall assessment" + ], + "calibration_reminders": [ + "Compare against role standard, not other candidates", + "Focus on evidence demonstrated, not potential", + "Consider level-appropriate expectations", + "Document specific examples in feedback" + ] + } + + +def format_human_readable(question_bank: Dict[str, Any]) -> str: + """Format question bank data in human-readable format.""" + output = [] + + # Header + output.append(f"Interview Question Bank: {question_bank['role']} ({question_bank['level'].title()} Level)") + output.append("=" * 70) + output.append(f"Generated: {question_bank['generated_at']}") + 
output.append(f"Total Questions: {question_bank['total_questions']}") + output.append(f"Question Types: {', '.join(question_bank['question_types'])}") + output.append(f"Target Competencies: {', '.join(question_bank['competencies'])}") + output.append("") + + # Questions + output.append("INTERVIEW QUESTIONS") + output.append("-" * 50) + + for i, question in enumerate(question_bank['questions'], 1): + output.append(f"\n{i}. {question['question']}") + output.append(f" Competency: {question['competency'].replace('_', ' ').title()}") + output.append(f" Type: {question.get('type', 'N/A').title()}") + if 'time_limit' in question: + output.append(f" Time Limit: {question['time_limit']} minutes") + if 'focus_areas' in question: + output.append(f" Focus Areas: {', '.join(question['focus_areas'])}") + + # Scoring Guidelines + output.append("\n\nSCORING RUBRICS") + output.append("-" * 50) + + # Show sample scoring criteria + if question_bank['scoring_rubrics']: + first_question = list(question_bank['scoring_rubrics'].keys())[0] + sample_rubric = question_bank['scoring_rubrics'][first_question] + + output.append(f"Sample Scoring Criteria ({sample_rubric['type']} questions):") + for criterion, scores in sample_rubric['scoring_criteria'].items(): + output.append(f"\n{criterion.replace('_', ' ').title()}:") + for score, description in scores.items(): + output.append(f" {score}: {description}") + + # Follow-up Probes + output.append("\n\nFOLLOW-UP PROBE EXAMPLES") + output.append("-" * 50) + + if question_bank['follow_up_probes']: + first_question = list(question_bank['follow_up_probes'].keys())[0] + sample_probes = question_bank['follow_up_probes'][first_question] + + output.append("Sample follow-up questions:") + for probe in sample_probes[:3]: # Show first 3 + output.append(f" • {probe}") + + # Usage Guidelines + output.append("\n\nUSAGE GUIDELINES") + output.append("-" * 50) + + guidelines = question_bank['usage_guidelines'] + + output.append("Interview Flow:") + for phase, 
description in guidelines['interview_flow'].items(): + output.append(f" • {phase.replace('_', ' ').title()}: {description}") + + output.append("\nTime Management:") + for aspect, recommendation in guidelines['time_management'].items(): + output.append(f" • {aspect.replace('_', ' ').title()}: {recommendation}") + + output.append("\nCommon Mistakes to Avoid:") + for mistake in guidelines['common_mistakes'][:3]: # Show first 3 + output.append(f" • {mistake}") + + # Calibration Examples (if available) + if question_bank['calibration_examples']: + output.append("\n\nCALIBRATION EXAMPLES") + output.append("-" * 50) + + first_example = list(question_bank['calibration_examples'].values())[0] + output.append(f"Question: {first_example['question']}") + + output.append("\nSample Answer Quality Levels:") + for quality, details in first_example['sample_answers'].items(): + output.append(f" {quality.replace('_', ' ').title()} (Score {details['score']}):") + if 'issues' in details: + output.append(f" Issues: {', '.join(details['issues'])}") + if 'strengths' in details: + output.append(f" Strengths: {', '.join(details['strengths'])}") + + return "\n".join(output) + + +def main(): + parser = argparse.ArgumentParser(description="Generate comprehensive interview question banks with scoring criteria") + parser.add_argument("--role", type=str, help="Job role title (e.g., 'Frontend Engineer')") + parser.add_argument("--level", type=str, default="senior", help="Experience level (junior, mid, senior, staff, principal)") + parser.add_argument("--competencies", type=str, help="Comma-separated list of competencies to focus on") + parser.add_argument("--question-types", type=str, help="Comma-separated list of question types (technical, behavioral, situational)") + parser.add_argument("--num-questions", type=int, default=20, help="Number of questions to generate") + parser.add_argument("--input", type=str, help="Input JSON file with role requirements") + parser.add_argument("--output", 
type=str, help="Output directory or file path") + parser.add_argument("--format", choices=["json", "text", "both"], default="both", help="Output format") + + args = parser.parse_args() + + generator = QuestionBankGenerator() + + # Handle input + if args.input: + try: + with open(args.input, 'r') as f: + role_data = json.load(f) + role = role_data.get('role') or role_data.get('title', '') + level = role_data.get('level', 'senior') + competencies = role_data.get('competencies') + question_types = role_data.get('question_types') + num_questions = role_data.get('num_questions', 20) + except Exception as e: + print(f"Error reading input file: {e}") + sys.exit(1) + else: + if not args.role: + print("Error: --role is required when not using --input") + sys.exit(1) + + role = args.role + level = args.level + competencies = args.competencies.split(',') if args.competencies else None + question_types = args.question_types.split(',') if args.question_types else None + num_questions = args.num_questions + + # Generate question bank + try: + question_bank = generator.generate_question_bank( + role=role, + level=level, + competencies=competencies, + question_types=question_types, + num_questions=num_questions + ) + + # Handle output + if args.output: + output_path = args.output + if os.path.isdir(output_path): + safe_role = "".join(c for c in role.lower() if c.isalnum() or c in (' ', '-', '_')).replace(' ', '_') + base_filename = f"{safe_role}_{level}_questions" + json_path = os.path.join(output_path, f"{base_filename}.json") + text_path = os.path.join(output_path, f"{base_filename}.txt") + else: + json_path = output_path if output_path.endswith('.json') else f"{output_path}.json" + text_path = output_path.replace('.json', '.txt') if output_path.endswith('.json') else f"{output_path}.txt" + else: + safe_role = "".join(c for c in role.lower() if c.isalnum() or c in (' ', '-', '_')).replace(' ', '_') + base_filename = f"{safe_role}_{level}_questions" + json_path = 
f"{base_filename}.json" + text_path = f"{base_filename}.txt" + + # Write outputs + if args.format in ["json", "both"]: + with open(json_path, 'w') as f: + json.dump(question_bank, f, indent=2, default=str) + print(f"JSON output written to: {json_path}") + + if args.format in ["text", "both"]: + with open(text_path, 'w') as f: + f.write(format_human_readable(question_bank)) + print(f"Text output written to: {text_path}") + + # Print summary + print(f"\nQuestion Bank Summary:") + print(f"Role: {question_bank['role']} ({question_bank['level'].title()})") + print(f"Total Questions: {question_bank['total_questions']}") + print(f"Competencies Covered: {len(question_bank['competencies'])}") + print(f"Question Types: {', '.join(question_bank['question_types'])}") + + except Exception as e: + print(f"Error generating question bank: {e}") + sys.exit(1) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/engineering/interview-system-designer/references/bias_mitigation_checklist.md b/engineering/interview-system-designer/references/bias_mitigation_checklist.md new file mode 100644 index 0000000..164abcc --- /dev/null +++ b/engineering/interview-system-designer/references/bias_mitigation_checklist.md @@ -0,0 +1,308 @@ +# Interview Bias Mitigation Checklist + +This comprehensive checklist helps identify, prevent, and mitigate various forms of bias in the interview process. Use this as a systematic guide to ensure fair and equitable hiring practices. + +## Pre-Interview Phase + +### Job Description & Requirements +- [ ] **Remove unnecessary requirements** that don't directly relate to job performance +- [ ] **Avoid gendered language** (competitive, aggressive vs. 
collaborative, detail-oriented) +- [ ] **Remove university prestige requirements** unless absolutely necessary for role +- [ ] **Focus on skills and outcomes** rather than years of experience in specific technologies +- [ ] **Use inclusive language** and avoid cultural assumptions +- [ ] **Specify only essential requirements** vs. nice-to-have qualifications +- [ ] **Remove location/commute assumptions** for remote-eligible positions +- [ ] **Review requirements for unconscious bias** (e.g., assuming continuous work history) + +### Sourcing & Pipeline +- [ ] **Diversify sourcing channels** beyond traditional networks +- [ ] **Partner with diverse professional organizations** and communities +- [ ] **Use bias-minimizing sourcing tools** and platforms +- [ ] **Track sourcing effectiveness** by demographic groups +- [ ] **Train recruiters on bias awareness** and inclusive outreach +- [ ] **Review referral patterns** for potential network bias +- [ ] **Expand university partnerships** beyond elite institutions +- [ ] **Use structured outreach messages** to reduce individual bias + +### Resume Screening +- [ ] **Implement blind resume review** (remove names, photos, university names initially) +- [ ] **Use standardized screening criteria** applied consistently +- [ ] **Multiple screeners for each resume** with independent scoring +- [ ] **Focus on relevant skills and achievements** over pedigree indicators +- [ ] **Avoid assumptions about career gaps** or non-traditional backgrounds +- [ ] **Consider alternative paths to skills** (bootcamps, self-taught, career changes) +- [ ] **Track screening pass rates** by demographic groups +- [ ] **Regular screener calibration sessions** on bias awareness + +## Interview Panel Composition + +### Diversity Requirements +- [ ] **Ensure diverse interview panels** (gender, ethnicity, seniority levels) +- [ ] **Include at least one underrepresented interviewer** when possible +- [ ] **Rotate panel assignments** to prevent bias patterns 
+- [ ] **Balance seniority levels** on panels (not all senior or all junior) +- [ ] **Include cross-functional perspectives** when relevant +- [ ] **Avoid panels of only one demographic group** when possible +- [ ] **Consider panel member unconscious bias training** status +- [ ] **Document panel composition rationale** for future review + +### Interviewer Selection +- [ ] **Choose interviewers based on relevant competency assessment ability** +- [ ] **Ensure interviewers have completed bias training** within last 12 months +- [ ] **Select interviewers with consistent calibration history** +- [ ] **Avoid interviewers with known bias patterns** (flagged in previous analyses) +- [ ] **Include at least one interviewer familiar with candidate's background type** +- [ ] **Balance perspectives** (technical depth, cultural fit, growth potential) +- [ ] **Consider interviewer availability for proper preparation time** +- [ ] **Ensure interviewers understand role requirements and standards** + +## Interview Process Design + +### Question Standardization +- [ ] **Use standardized question sets** for each competency area +- [ ] **Develop questions that assess skills, not culture fit stereotypes** +- [ ] **Avoid questions about personal background** unless directly job-relevant +- [ ] **Remove questions that could reveal protected characteristics** +- [ ] **Focus on behavioral examples** using STAR method +- [ ] **Include scenario-based questions** with clear evaluation criteria +- [ ] **Test questions for potential bias** with diverse interviewers +- [ ] **Regularly update question bank** based on effectiveness data + +### Structured Interview Protocol +- [ ] **Define clear time allocations** for each question/section +- [ ] **Establish consistent interview flow** across all candidates +- [ ] **Create standardized intro/outro** processes +- [ ] **Use identical technical setup and tools** for all candidates +- [ ] **Provide same background information** to all interviewers +- 
[ ] **Standardize note-taking format** and requirements +- [ ] **Define clear handoff procedures** between interviewers +- [ ] **Document any deviations** from standard protocol + +### Accommodation Preparation +- [ ] **Proactively offer accommodations** without requiring disclosure +- [ ] **Provide multiple interview format options** (phone, video, in-person) +- [ ] **Ensure accessibility of interview locations and tools** +- [ ] **Allow extended time** when requested or needed +- [ ] **Provide materials in advance** when helpful +- [ ] **Train interviewers on accommodation protocols** +- [ ] **Test all technology** for accessibility compliance +- [ ] **Have backup plans** for technical issues + +## During the Interview + +### Interviewer Behavior +- [ ] **Use welcoming, professional tone** with all candidates +- [ ] **Avoid assumptions based on appearance or background** +- [ ] **Give equal encouragement and support** to all candidates +- [ ] **Allow equal time for candidate questions** +- [ ] **Avoid leading questions** that suggest desired answers +- [ ] **Listen actively** without interrupting unnecessarily +- [ ] **Take detailed notes** focusing on responses, not impressions +- [ ] **Avoid small talk** that could reveal irrelevant personal information + +### Question Delivery +- [ ] **Ask questions as written** without improvisation that could introduce bias +- [ ] **Provide equal clarification** when candidates ask for it +- [ ] **Use consistent follow-up probing** across candidates +- [ ] **Allow reasonable thinking time** before expecting responses +- [ ] **Avoid rephrasing questions** in ways that give hints +- [ ] **Stay focused on defined competencies** being assessed +- [ ] **Give equal encouragement** for elaboration when needed +- [ ] **Maintain professional demeanor** regardless of candidate background + +### Real-time Bias Checking +- [ ] **Notice first impressions** but don't let them drive assessment +- [ ] **Question gut reactions** - are they 
based on competency evidence? +- [ ] **Focus on specific examples** and evidence provided +- [ ] **Avoid pattern matching** to existing successful employees +- [ ] **Notice cultural assumptions** in interpretation of responses +- [ ] **Check for confirmation bias** - seeking evidence to support initial impressions +- [ ] **Consider alternative explanations** for candidate responses +- [ ] **Stay aware of fatigue effects** on judgment throughout the day + +## Evaluation & Scoring + +### Scoring Consistency +- [ ] **Use defined rubrics consistently** across all candidates +- [ ] **Score immediately after interview** while details are fresh +- [ ] **Focus scoring on demonstrated competencies** not potential or personality +- [ ] **Provide specific evidence** for each score given +- [ ] **Avoid comparative scoring** (comparing candidates to each other) +- [ ] **Use calibrated examples** of each score level +- [ ] **Score independently** before discussing with other interviewers +- [ ] **Document reasoning** for all scores, especially extreme ones (1s and 4s) + +### Bias Check Questions +- [ ] **"Would I score this differently if the candidate looked different?"** +- [ ] **"Am I basing this on evidence or assumptions?"** +- [ ] **"Would this response get the same score from a different demographic?"** +- [ ] **"Am I penalizing non-traditional backgrounds or approaches?"** +- [ ] **"Is my scoring consistent with the defined rubric?"** +- [ ] **"Am I letting one strong/weak area bias overall assessment?"** +- [ ] **"Are my cultural assumptions affecting interpretation?"** +- [ ] **"Would I want to work with this person?" 
(Check if this is biasing assessment)** + +### Documentation Requirements +- [ ] **Record specific examples** supporting each competency score +- [ ] **Avoid subjective language** like "seems like," "appears to be" +- [ ] **Focus on observable behaviors** and concrete responses +- [ ] **Note exact quotes** when relevant to assessment +- [ ] **Distinguish between facts and interpretations** +- [ ] **Provide improvement suggestions** that are skill-based, not person-based +- [ ] **Avoid comparative language** to other candidates or employees +- [ ] **Use neutral language** free from cultural assumptions + +## Debrief Process + +### Structured Discussion +- [ ] **Start with independent score sharing** before discussion +- [ ] **Focus discussion on evidence** not impressions or feelings +- [ ] **Address significant score discrepancies** with evidence review +- [ ] **Challenge biased language** or assumptions in discussion +- [ ] **Ensure all voices are heard** in group decision making +- [ ] **Document reasons for final decision** with specific evidence +- [ ] **Avoid personality-based discussions** ("culture fit" should be evidence-based) +- [ ] **Consider multiple perspectives** on candidate responses + +### Decision-Making Process +- [ ] **Use weighted scoring system** based on role requirements +- [ ] **Require minimum scores** in critical competency areas +- [ ] **Avoid veto power** unless based on clear, documented evidence +- [ ] **Consider growth potential** fairly across all candidates +- [ ] **Document dissenting opinions** and reasoning +- [ ] **Use tie-breaking criteria** that are predetermined and fair +- [ ] **Consider additional data collection** if team is split +- [ ] **Make final decision based on role requirements**, not team preferences + +### Final Recommendations +- [ ] **Provide specific, actionable feedback** for development areas +- [ ] **Focus recommendations on skills and competencies** +- [ ] **Avoid language that could reflect bias** in 
written feedback +- [ ] **Consider onboarding needs** based on actual skill gaps, not assumptions +- [ ] **Provide coaching recommendations** that are evidence-based +- [ ] **Avoid personal judgments** about candidate character or personality +- [ ] **Make hiring recommendation** based solely on job-relevant criteria +- [ ] **Document any concerns** with specific, observable evidence + +## Post-Interview Monitoring + +### Data Collection +- [ ] **Track interviewer scoring patterns** for consistency analysis +- [ ] **Monitor pass rates** by demographic groups +- [ ] **Collect candidate experience feedback** on interview fairness +- [ ] **Analyze score distributions** for potential bias indicators +- [ ] **Track time-to-decision** across different candidate types +- [ ] **Monitor offer acceptance rates** by demographics +- [ ] **Collect new hire performance data** for process validation +- [ ] **Document any bias incidents** or concerns raised + +### Regular Analysis +- [ ] **Conduct quarterly bias audits** of interview data +- [ ] **Review interviewer calibration** and identify outliers +- [ ] **Analyze demographic trends** in hiring outcomes +- [ ] **Compare candidate experience surveys** across groups +- [ ] **Track correlation between interview scores and job performance** +- [ ] **Review and update bias mitigation strategies** based on data +- [ ] **Share findings with interview teams** for continuous improvement +- [ ] **Update training programs** based on identified bias patterns + +## Bias Types to Watch For + +### Affinity Bias +- **Definition**: Favoring candidates similar to yourself +- **Watch for**: Over-positive response to shared backgrounds, interests, or experiences +- **Mitigation**: Focus on job-relevant competencies, diversify interview panels + +### Halo/Horn Effect +- **Definition**: One positive/negative trait influencing overall assessment +- **Watch for**: Strong performance in one area affecting scores in unrelated areas +- **Mitigation**: 
Score each competency independently, use structured evaluation + +### Confirmation Bias +- **Definition**: Seeking information that confirms initial impressions +- **Watch for**: Asking follow-ups that lead the candidate toward expected responses +- **Mitigation**: Use standardized questions, consider alternative interpretations + +### Attribution Bias +- **Definition**: Attributing success/failure to different causes based on candidate demographics +- **Watch for**: Assuming women were "lucky" while men were "skilled" for the same achievements +- **Mitigation**: Focus on candidate's role in achievements, avoid assumptions + +### Cultural Bias +- **Definition**: Judging candidates based on cultural differences rather than job performance +- **Watch for**: Penalizing communication styles, work approaches, or values that differ from team norm +- **Mitigation**: Define job-relevant criteria clearly, consider diverse perspectives valuable + +### Educational Bias +- **Definition**: Over-weighting prestigious educational credentials +- **Watch for**: Assuming higher capability based on school rank rather than demonstrated skills +- **Mitigation**: Focus on skills demonstration, consider alternative learning paths + +### Experience Bias +- **Definition**: Requiring specific company or industry experience unnecessarily +- **Watch for**: Discounting transferable skills from different industries or company sizes +- **Mitigation**: Define core skills needed, assess adaptability and learning ability + +## Emergency Bias Response Protocol + +### During Interview +1. **Pause the interview** if significant bias is observed +2. **Privately address** bias with interviewer if possible +3. **Document the incident** for review +4. **Continue with fair assessment** of candidate +5. **Flag for debrief discussion** if interview continues + +### Post-Interview +1. **Report bias incidents** to hiring manager/HR immediately +2. **Document specific behaviors** observed +3. 
**Consider additional interviewer** for second opinion +4. **Review candidate assessment** for bias impact +5. **Implement corrective actions** for future interviews + +### Interviewer Coaching +1. **Provide immediate feedback** on bias observed +2. **Schedule bias training refresher** if needed +3. **Monitor future interviews** for improvement +4. **Consider removing from interview rotation** if bias persists +5. **Document coaching provided** for performance management + +## Legal Compliance Reminders + +### Protected Characteristics +- Age, race, color, religion, sex, national origin, disability status, veteran status +- Pregnancy, genetic information, sexual orientation, gender identity +- Any other characteristics protected by local/state/federal law + +### Prohibited Questions +- Questions about family planning, marital status, pregnancy +- Age-related questions (unless age is a bona fide occupational qualification, BFOQ) +- Religious or political affiliations +- Disability status (unless voluntary disclosure for accommodation) +- Arrest records (without conviction relevance) +- Financial status or credit (unless job-relevant) + +### Documentation Requirements +- Keep all interview materials for required retention period +- Ensure consistent documentation across all candidates +- Avoid documenting protected characteristic observations +- Focus documentation on job-relevant observations only + +## Training & Certification + +### Required Training Topics +- Unconscious bias awareness and mitigation +- Structured interviewing techniques +- Legal compliance in hiring +- Company-specific bias mitigation protocols +- Role-specific competency assessment +- Accommodation and accessibility requirements + +### Ongoing Development +- Annual bias training refresher +- Quarterly calibration sessions +- Regular updates on legal requirements +- Peer feedback and coaching +- Industry best practice updates +- Data-driven process improvements + +This checklist should be reviewed and updated regularly based on legal 
requirements, industry best practices, and internal bias analysis results. \ No newline at end of file diff --git a/engineering/interview-system-designer/references/competency_matrix_templates.md b/engineering/interview-system-designer/references/competency_matrix_templates.md new file mode 100644 index 0000000..6c68c51 --- /dev/null +++ b/engineering/interview-system-designer/references/competency_matrix_templates.md @@ -0,0 +1,171 @@ +# Competency Matrix Templates + +This document provides comprehensive competency matrix templates for different engineering roles and levels. Use these matrices to design role-specific interview loops and evaluation criteria. + +## Software Engineering Competency Matrix + +### Technical Competencies + +| Competency | Junior (L1-L2) | Mid (L3-L4) | Senior (L5-L6) | Staff+ (L7+) | +|------------|----------------|-------------|----------------|--------------| +| **Coding & Algorithms** | Basic data structures, simple algorithms, language syntax | Advanced algorithms, complexity analysis, optimization | Complex problem solving, algorithm design, performance tuning | Architecture-level algorithmic decisions, novel approach design | +| **System Design** | Component interactions, basic scalability concepts | Service design, database modeling, API design | Distributed systems, scalability patterns, trade-off analysis | Large-scale architecture, cross-system design, technology strategy | +| **Code Quality** | Readable code, basic testing, follows conventions | Maintainable code, comprehensive testing, design patterns | Code reviews, quality standards, refactoring leadership | Engineering standards, quality culture, technical debt management | +| **Debugging & Problem Solving** | Basic debugging, structured problem approach | Complex debugging, root cause analysis, performance issues | System-wide debugging, production issues, incident response | Cross-system troubleshooting, preventive measures, tooling design | +| **Domain Knowledge** | 
Learning role-specific technologies | Proficiency in domain tools/frameworks | Deep domain expertise, technology evaluation | Domain leadership, technology roadmap, innovation | + +### Behavioral Competencies + +| Competency | Junior (L1-L2) | Mid (L3-L4) | Senior (L5-L6) | Staff+ (L7+) | +|------------|----------------|-------------|----------------|--------------| +| **Communication** | Clear status updates, asks good questions | Technical explanations, stakeholder updates | Cross-functional communication, technical writing | Executive communication, external representation, thought leadership | +| **Collaboration** | Team participation, code reviews | Cross-team projects, knowledge sharing | Team leadership, conflict resolution | Cross-org collaboration, culture building, strategic partnerships | +| **Leadership & Influence** | Peer mentoring, positive attitude | Junior mentoring, project ownership | Team guidance, technical decisions, hiring | Org-wide influence, vision setting, culture change | +| **Growth & Learning** | Skill development, feedback receptivity | Proactive learning, teaching others | Continuous improvement, trend awareness | Learning culture, industry leadership, innovation adoption | +| **Ownership & Initiative** | Task completion, quality focus | Project ownership, process improvement | Feature/service ownership, strategic thinking | Product/platform ownership, business impact, market influence | + +## Product Management Competency Matrix + +### Product Competencies + +| Competency | Associate PM (L1-L2) | PM (L3-L4) | Senior PM (L5-L6) | Principal PM (L7+) | +|------------|---------------------|------------|-------------------|-------------------| +| **Product Strategy** | Feature requirements, user stories | Product roadmaps, market analysis | Business strategy, competitive positioning | Portfolio strategy, market creation, platform vision | +| **User Research & Analytics** | Basic user interviews, metrics tracking | Research design, data 
interpretation | Research strategy, advanced analytics | Research culture, measurement frameworks, insight generation | +| **Technical Understanding** | Basic tech concepts, API awareness | System architecture, technical trade-offs | Technical strategy, platform decisions | Technology vision, architectural influence, innovation leadership | +| **Execution & Process** | Feature delivery, stakeholder coordination | Project management, cross-functional leadership | Process optimization, team scaling | Operational excellence, org design, strategic execution | +| **Business Acumen** | Revenue awareness, customer understanding | P&L understanding, business case development | Business strategy, market dynamics | Corporate strategy, board communication, investor relations | + +### Leadership Competencies + +| Competency | Associate PM (L1-L2) | PM (L3-L4) | Senior PM (L5-L6) | Principal PM (L7+) | +|------------|---------------------|------------|-------------------|-------------------| +| **Stakeholder Management** | Team collaboration, clear communication | Cross-functional alignment, expectation management | Executive communication, influence without authority | Board interaction, external partnerships, industry influence | +| **Team Development** | Peer learning, feedback sharing | Junior mentoring, knowledge transfer | Team building, hiring, performance management | Talent development, culture building, org leadership | +| **Decision Making** | Data-driven decisions, priority setting | Complex trade-offs, strategic choices | Ambiguous situations, high-stakes decisions | Strategic vision, transformational decisions, risk management | +| **Innovation & Vision** | Creative problem solving, user empathy | Market opportunity identification, feature innovation | Product vision, market strategy | Industry vision, disruptive thinking, platform creation | + +## Design Competency Matrix + +### Design Competencies + +| Competency | Junior Designer (L1-L2) | Mid Designer (L3-L4) 
| Senior Designer (L5-L6) | Principal Designer (L7+) | +|------------|-------------------------|---------------------|-------------------------|-------------------------| +| **Visual Design** | UI components, typography, color theory | Design systems, visual hierarchy | Brand integration, advanced layouts | Visual strategy, brand evolution, design innovation | +| **User Experience** | User flows, wireframing, prototyping | Interaction design, usability testing | Experience strategy, journey mapping | UX vision, service design, behavioral insights | +| **Research & Validation** | User interviews, usability tests | Research planning, data synthesis | Research strategy, methodology design | Research culture, insight frameworks, market research | +| **Design Systems** | Component usage, style guides | System contribution, pattern creation | System architecture, governance | System strategy, scalable design, platform thinking | +| **Tools & Craft** | Design software proficiency, asset creation | Advanced techniques, workflow optimization | Tool evaluation, process design | Technology integration, future tooling, craft evolution | + +### Collaboration Competencies + +| Competency | Junior Designer (L1-L2) | Mid Designer (L3-L4) | Senior Designer (L5-L6) | Principal Designer (L7+) | +|------------|-------------------------|---------------------|-------------------------|-------------------------| +| **Cross-functional Partnership** | Engineering collaboration, handoff quality | Product partnership, stakeholder alignment | Leadership collaboration, strategic alignment | Executive partnership, business strategy integration | +| **Communication & Advocacy** | Design rationale, feedback integration | Design presentations, user advocacy | Executive communication, design thinking evangelism | Industry thought leadership, external representation | +| **Mentorship & Growth** | Peer learning, skill sharing | Junior mentoring, critique facilitation | Team development, hiring, 
career guidance | Design culture, talent strategy, industry leadership | +| **Business Impact** | User-centered thinking, design quality | Feature success, user satisfaction | Business metrics, strategic impact | Market influence, competitive advantage, innovation leadership | + +## Data Science Competency Matrix + +### Technical Competencies + +| Competency | Junior DS (L1-L2) | Mid DS (L3-L4) | Senior DS (L5-L6) | Principal DS (L7+) | +|------------|-------------------|----------------|-------------------|-------------------| +| **Statistical Analysis** | Descriptive stats, hypothesis testing | Advanced statistics, experimental design | Causal inference, advanced modeling | Statistical strategy, methodology innovation | +| **Machine Learning** | Basic ML algorithms, model training | Advanced ML, feature engineering | ML systems, model deployment | ML strategy, AI platform, research direction | +| **Data Engineering** | SQL, basic ETL, data cleaning | Pipeline design, data modeling | Platform architecture, scalable systems | Data strategy, infrastructure vision, governance | +| **Programming & Tools** | Python/R proficiency, visualization | Advanced programming, tool integration | Software engineering, system design | Technology strategy, platform development, innovation | +| **Domain Expertise** | Business understanding, metric interpretation | Domain modeling, insight generation | Strategic analysis, business integration | Market expertise, competitive intelligence, thought leadership | + +### Impact & Leadership Competencies + +| Competency | Junior DS (L1-L2) | Mid DS (L3-L4) | Senior DS (L5-L6) | Principal DS (L7+) | +|------------|-------------------|----------------|-------------------|-------------------| +| **Business Impact** | Metric improvement, insight delivery | Project leadership, business case development | Strategic initiatives, P&L impact | Business transformation, market advantage, innovation | +| **Communication** | Technical reporting, 
visualization | Stakeholder presentations, executive briefings | Board communication, external representation | Industry leadership, thought leadership, market influence | +| **Team Leadership** | Peer collaboration, knowledge sharing | Junior mentoring, project management | Team building, hiring, culture development | Organizational leadership, talent strategy, vision setting | +| **Innovation & Research** | Algorithm implementation, experimentation | Research projects, publication | Research strategy, academic partnerships | Research vision, industry influence, breakthrough innovation | + +## DevOps Engineering Competency Matrix + +### Technical Competencies + +| Competency | Junior DevOps (L1-L2) | Mid DevOps (L3-L4) | Senior DevOps (L5-L6) | Principal DevOps (L7+) | +|------------|----------------------|-------------------|----------------------|----------------------| +| **Infrastructure** | Basic cloud services, server management | Infrastructure automation, containerization | Platform architecture, multi-cloud strategy | Infrastructure vision, emerging technologies, industry standards | +| **CI/CD & Automation** | Pipeline basics, script writing | Advanced pipelines, deployment automation | Platform design, workflow optimization | Automation strategy, developer experience, productivity platforms | +| **Monitoring & Observability** | Basic monitoring, log analysis | Advanced monitoring, alerting systems | Observability strategy, SLA/SLI design | Monitoring vision, reliability engineering, performance culture | +| **Security & Compliance** | Security basics, access management | Security automation, compliance frameworks | Security architecture, risk management | Security strategy, governance, industry leadership | +| **Performance & Scalability** | Performance monitoring, basic optimization | Capacity planning, performance tuning | Scalability architecture, cost optimization | Performance strategy, efficiency platforms, innovation | + +### Leadership & Impact 
Competencies + +| Competency | Junior DevOps (L1-L2) | Mid DevOps (L3-L4) | Senior DevOps (L5-L6) | Principal DevOps (L7+) | +|------------|----------------------|-------------------|----------------------|----------------------| +| **Developer Experience** | Tool support, documentation | Platform development, self-service tools | Developer productivity, workflow design | Developer platform vision, industry best practices | +| **Incident Management** | Incident response, troubleshooting | Incident coordination, root cause analysis | Incident strategy, prevention systems | Reliability culture, organizational resilience | +| **Team Collaboration** | Cross-team support, knowledge sharing | Process improvement, training delivery | Culture building, practice evangelism | Organizational transformation, industry influence | +| **Strategic Impact** | Operational excellence, cost awareness | Efficiency improvements, platform adoption | Strategic initiatives, business enablement | Technology strategy, competitive advantage, market leadership | + +## Engineering Management Competency Matrix + +### People Leadership Competencies + +| Competency | Manager (L1-L2) | Senior Manager (L3-L4) | Director (L5-L6) | VP+ (L7+) | +|------------|-----------------|------------------------|------------------|----------| +| **Team Building** | Hiring, onboarding, 1:1s | Team culture, performance management | Multi-team coordination, org design | Organizational culture, talent strategy | +| **Performance Management** | Individual development, feedback | Performance systems, coaching | Calibration across teams, promotion standards | Talent development, succession planning | +| **Communication** | Team updates, stakeholder management | Executive communication, cross-functional alignment | Board updates, external communication | Industry representation, thought leadership | +| **Conflict Resolution** | Team conflicts, process improvements | Cross-team issues, organizational friction | Strategic 
alignment, cultural challenges | Corporate-level conflicts, crisis management | + +### Technical Leadership Competencies + +| Competency | Manager (L1-L2) | Senior Manager (L3-L4) | Director (L5-L6) | VP+ (L7+) | +|------------|-----------------|------------------------|------------------|----------| +| **Technical Vision** | Team technical decisions, architecture input | Platform strategy, technology choices | Technical roadmap, innovation strategy | Technology vision, industry standards | +| **System Ownership** | Feature/service ownership, quality standards | Platform ownership, scalability planning | System portfolio, technical debt management | Technology strategy, competitive advantage | +| **Process & Practice** | Team processes, development practices | Engineering standards, quality systems | Process innovation, best practices | Engineering culture, industry influence | +| **Technology Strategy** | Tool evaluation, team technology choices | Platform decisions, technical investments | Technology portfolio, strategic architecture | Corporate technology strategy, market leadership | + +## Usage Guidelines + +### Assessment Approach + +1. **Level Calibration**: Use these matrices to calibrate expectations for each level within your organization +2. **Interview Design**: Select competencies most relevant to the specific role and level being hired for +3. **Evaluation Consistency**: Ensure all interviewers understand and apply the same competency standards +4. **Growth Planning**: Use matrices for career development and promotion discussions + +### Customization Tips + +1. **Industry Adaptation**: Modify competencies based on your industry (fintech, healthcare, etc.) +2. **Company Stage**: Adjust expectations based on startup vs. enterprise environment +3. **Team Needs**: Emphasize competencies most critical for current team challenges +4. **Cultural Fit**: Add company-specific values and cultural competencies + +### Common Pitfalls + +1. 
**Unrealistic Expectations**: Don't expect senior-level competencies from junior candidates +2. **One-Size-Fits-All**: Customize competency emphasis based on role requirements +3. **Static Assessment**: Regularly update matrices based on changing business needs +4. **Bias Introduction**: Ensure competencies are measurable and don't introduce unconscious bias + +## Matrix Validation Process + +### Regular Review Cycle +- **Quarterly**: Review competency relevance and adjust weights +- **Semi-annually**: Update level expectations based on market standards +- **Annually**: Comprehensive review with stakeholder feedback + +### Stakeholder Input +- **Hiring Managers**: Validate role-specific competency requirements +- **Current Team Members**: Confirm level expectations match reality +- **Recent Hires**: Gather feedback on assessment accuracy +- **HR Partners**: Ensure legal compliance and bias mitigation + +### Continuous Improvement +- **Performance Correlation**: Track new hire performance against competency assessments +- **Market Benchmarking**: Compare standards with industry peers +- **Feedback Integration**: Incorporate interviewer and candidate feedback +- **Bias Monitoring**: Regular analysis of assessment patterns across demographics \ No newline at end of file diff --git a/engineering/interview-system-designer/references/debrief_facilitation_guide.md b/engineering/interview-system-designer/references/debrief_facilitation_guide.md new file mode 100644 index 0000000..a15ccae --- /dev/null +++ b/engineering/interview-system-designer/references/debrief_facilitation_guide.md @@ -0,0 +1,319 @@ +# Interview Debrief Facilitation Guide + +This guide provides a comprehensive framework for conducting effective, unbiased interview debriefs that lead to consistent hiring decisions. Use this to facilitate productive discussions that focus on evidence-based evaluation. 
+ +## Pre-Debrief Preparation + +### Facilitator Responsibilities +- [ ] **Review all interviewer feedback** before the meeting +- [ ] **Identify significant score discrepancies** that need discussion +- [ ] **Prepare discussion agenda** with time allocations +- [ ] **Gather role requirements** and competency framework +- [ ] **Review any flags or special considerations** noted during interviews +- [ ] **Ensure all required materials** are available (scorecards, rubrics, candidate resume) +- [ ] **Set up meeting logistics** (room, video conference, screen sharing) +- [ ] **Send agenda to participants** 30 minutes before meeting + +### Required Materials Checklist +- [ ] Candidate resume and application materials +- [ ] Job description and competency requirements +- [ ] Individual interviewer scorecards +- [ ] Scoring rubrics and competency definitions +- [ ] Interview notes and documentation +- [ ] Any technical assessments or work samples +- [ ] Company hiring standards and calibration examples +- [ ] Bias mitigation reminders and prompts + +### Participant Preparation Requirements +- [ ] All interviewers must **complete independent scoring** before debrief +- [ ] **Submit written feedback** with specific evidence for each competency +- [ ] **Review scoring rubrics** to ensure consistent interpretation +- [ ] **Prepare specific examples** to support scoring decisions +- [ ] **Flag any concerns or unusual circumstances** that affected assessment +- [ ] **Avoid discussing candidate** with other interviewers before debrief +- [ ] **Come prepared to defend scores** with concrete evidence +- [ ] **Be ready to adjust scores** based on additional evidence shared + +## Debrief Meeting Structure + +### Opening (5 minutes) +1. **State meeting purpose**: Make hiring decision based on evidence +2. **Review agenda and time limits**: Keep discussion focused and productive +3. **Remind participants of bias mitigation principles**: Focus on competencies, not personality +4. 
**Confirm confidentiality**: Discussion stays within hiring team +5. **Establish ground rules**: One person speaks at a time, evidence-based discussion + +### Individual Score Sharing (10-15 minutes) +- **Go around the room systematically** - each interviewer shares scores independently +- **No discussion or challenges yet** - just data collection +- **Record scores on shared document** visible to all participants +- **Note any abstentions** or "insufficient data" responses +- **Identify clear patterns** and discrepancies without commentary +- **Flag any scores requiring explanation** (1s or 4s typically need strong evidence) + +### Competency-by-Competency Discussion (30-40 minutes) + +#### For Each Core Competency: + +**1. Present Score Distribution (2 minutes)** +- Display all scores for this competency +- Note range and any outliers +- Identify if consensus exists or discussion needed + +**2. Evidence Sharing (5-8 minutes per competency)** +- Start with interviewers who assessed this competency directly +- Share specific examples and observations +- Focus on what candidate said/did, not interpretations +- Allow questions for clarification (not challenges yet) + +**3. Discussion and Calibration (3-5 minutes)** +- Address significant discrepancies (>1 point difference) +- Challenge vague or potentially biased language +- Seek additional evidence if needed +- Allow score adjustments based on new information +- Reach consensus or note dissenting views + +#### Structured Discussion Questions: +- **"What specific evidence supports this score?"** +- **"Can you provide the exact example or quote?"** +- **"How does this compare to our rubric definition?"** +- **"Would this response receive the same score regardless of who gave it?"** +- **"Are we evaluating the competency or making assumptions?"** +- **"What would need to change for this to be the next level up/down?"** + +### Overall Recommendation Discussion (10-15 minutes) + +#### Weighted Score Calculation +1. 
**Apply competency weights** based on role requirements +2. **Calculate overall weighted average** +3. **Check minimum threshold requirements** +4. **Consider any veto criteria** (critical competency failures) + +#### Final Recommendation Options +- **Strong Hire**: Exceeds requirements in most areas, clear value-add +- **Hire**: Meets requirements with growth potential +- **No Hire**: Doesn't meet minimum requirements for success +- **Strong No Hire**: Significant gaps that would impact team/company + +#### Decision Rationale Documentation +- **Summarize key strengths** with specific evidence +- **Identify development areas** with specific examples +- **Explain final recommendation** with competency-based reasoning +- **Note any dissenting opinions** and reasoning +- **Document onboarding considerations** if hiring + +### Closing and Next Steps (5 minutes) +- **Confirm final decision** and documentation +- **Assign follow-up actions** (feedback delivery, offer preparation, etc.) +- **Schedule any additional interviews** if needed +- **Review timeline** for candidate communication +- **Remind participants of the confidentiality** of the discussion and decision + +## Facilitation Best Practices + +### Creating Psychological Safety +- **Encourage honest feedback** without fear of judgment +- **Validate different perspectives** and assessment approaches +- **Address power dynamics** - ensure junior voices are heard +- **Model vulnerability** - admit when evidence changes your mind +- **Focus on learning** and calibration, not winning arguments +- **Thank participants** for thorough preparation and thoughtful input + +### Managing Difficult Conversations + +#### When Scores Vary Significantly +1. **Acknowledge the discrepancy** without judgment +2. **Ask for specific evidence** from each scorer +3. **Look for different interpretations** of the same data +4. **Consider if different questions** revealed different competency levels +5. **Check for bias patterns** in reasoning +6. 
**Allow time for reflection** and potential score adjustments + +#### When Someone Uses Biased Language +1. **Pause the conversation** gently but firmly +2. **Ask for specific evidence** behind the assessment +3. **Reframe in competency terms** - "What specific skills did this demonstrate?" +4. **Challenge assumptions** - "Help me understand how we know that" +5. **Redirect to rubric** - "How does this align with our scoring criteria?" +6. **Document and follow up** privately if bias persists + +#### When the Discussion Gets Off Track +- **Redirect to competencies**: "Let's focus on the technical skills demonstrated" +- **Ask for evidence**: "What specific example supports that assessment?" +- **Reference rubrics**: "How does this align with our level 3 definition?" +- **Manage time**: "We have 5 minutes left on this competency" +- **Table unrelated issues**: "That's important but separate from this hire decision" + +### Encouraging Evidence-Based Discussion + +#### Good Evidence Examples +- **Direct quotes**: "When asked about debugging, they said..." +- **Specific behaviors**: "They organized their approach by first..." 
+- **Observable outcomes**: "Their code compiled on first run and handled edge cases" +- **Process descriptions**: "They walked through their problem-solving step by step" +- **Measurable results**: "They identified 3 optimization opportunities" + +#### Poor Evidence Examples +- **Gut feelings**: "They just seemed off" +- **Comparisons**: "Not as strong as our last hire" +- **Assumptions**: "Probably wouldn't fit our culture" +- **Vague impressions**: "Didn't seem passionate" +- **Irrelevant factors**: "Their background is different from ours" + +### Managing Group Dynamics + +#### Ensuring Equal Participation +- **Direct questions** to quieter participants +- **Prevent interrupting** and ensure everyone finishes thoughts +- **Balance speaking time** across all interviewers +- **Validate minority opinions** even if not adopted +- **Check for unheard perspectives** before finalizing decisions + +#### Handling Strong Personalities +- **Set time limits** for individual speaking +- **Redirect monopolizers**: "Let's hear from others on this" +- **Challenge confidently stated opinions** that lack evidence +- **Support less assertive voices** in expressing dissenting views +- **Focus on data**, not personality or seniority in decision making + +## Bias Interruption Strategies + +### Affinity Bias Interruption +- **Notice pattern**: Positive assessment seems based on shared background/interests +- **Interrupt with**: "Let's focus on the job-relevant skills they demonstrated" +- **Redirect to**: Specific competency evidence and measurable outcomes +- **Document**: Note if personal connection affected professional assessment + +### Halo/Horn Effect Interruption +- **Notice pattern**: One area strongly influencing assessment of unrelated areas +- **Interrupt with**: "Let's score each competency independently" +- **Redirect to**: Specific evidence for each individual competency area +- **Recalibrate**: Ask for separate examples supporting each score + +### Confirmation Bias 
Interruption +- **Notice pattern**: Only seeking/discussing evidence that supports initial impression +- **Interrupt with**: "What evidence might suggest a different assessment?" +- **Redirect to**: Consider alternative interpretations of the same data +- **Challenge**: "How might we be wrong about this assessment?" + +### Attribution Bias Interruption +- **Notice pattern**: Attributing success to luck/help for some demographics, skill for others +- **Interrupt with**: "What role did the candidate play in achieving this outcome?" +- **Redirect to**: Candidate's specific contributions and decision-making +- **Standardize**: Apply same attribution standards across all candidates + +## Decision Documentation Framework + +### Required Documentation Elements +1. **Final scores** for each assessed competency +2. **Overall recommendation** with supporting rationale +3. **Key strengths** with specific evidence +4. **Development areas** with specific examples +5. **Dissenting opinions** if any, with reasoning +6. **Special considerations** or accommodation needs +7. **Next steps** and timeline for decision communication + +### Evidence Quality Standards +- **Specific and observable**: What exactly did the candidate do or say? +- **Job-relevant**: How does this relate to success in the role? +- **Measurable**: Can this be quantified or clearly described? +- **Unbiased**: Would this evidence be interpreted the same way regardless of candidate demographics? +- **Complete**: Does this represent the full picture of their performance in this area? 
+ +### Writing Guidelines +- **Use active voice** and specific language +- **Avoid assumptions** about motivations or personality +- **Focus on behaviors** demonstrated during the interview +- **Provide context** for any unusual circumstances +- **Be constructive** in describing development areas +- **Maintain professionalism** and respect for candidate + +## Common Debrief Challenges and Solutions + +### Challenge: "I just don't think they'd fit our culture" +**Solution**: +- Ask for specific, observable evidence +- Define what "culture fit" means in job-relevant terms +- Challenge assumptions about cultural requirements +- Focus on ability to collaborate and contribute effectively + +### Challenge: Scores vary widely with no clear explanation +**Solution**: +- Review if different interviewers assessed different competencies +- Look for question differences that might explain variance +- Consider if candidate performance varied across interviews +- May need additional data gathering or interview + +### Challenge: Everyone loved/hated the candidate but can't articulate why +**Solution**: +- Push for specific evidence supporting emotional reactions +- Review competency rubrics together +- Look for halo/horn effects influencing overall impression +- Consider unconscious bias training for team + +### Challenge: Technical vs. 
non-technical interviewers disagree +**Solution**: +- Clarify which competencies each interviewer was assessing +- Ensure technical assessments carry appropriate weight +- Look for different perspectives on same evidence +- Consider specialist input for technical decisions + +### Challenge: Senior interviewer dominates decision making +**Solution**: +- Structure discussion to hear from all levels first +- Ask direct questions to junior interviewers +- Challenge opinions that lack supporting evidence +- Remember that assessment ability doesn't correlate with seniority + +### Challenge: Team wants to hire but scores don't support it +**Solution**: +- Review if rubrics match actual job requirements +- Check for consistent application of scoring standards +- Consider if additional competencies need assessment +- May indicate need for rubric calibration or role requirement review + +## Post-Debrief Actions + +### Immediate Actions (Same Day) +- [ ] **Finalize decision documentation** with all evidence +- [ ] **Communicate decision** to recruiting team +- [ ] **Schedule candidate feedback** delivery if applicable +- [ ] **Update interview scheduling** based on decision +- [ ] **Note any process improvements** needed for future + +### Follow-up Actions (Within 1 Week) +- [ ] **Deliver candidate feedback** (internal or external) +- [ ] **Update interview feedback** in tracking system +- [ ] **Schedule any additional interviews** if needed +- [ ] **Begin offer process** if hiring +- [ ] **Document lessons learned** for process improvement + +### Long-term Actions (Monthly/Quarterly) +- [ ] **Analyze debrief effectiveness** and decision quality +- [ ] **Review interviewer calibration** based on decisions +- [ ] **Update rubrics** based on debrief insights +- [ ] **Provide additional training** if bias patterns identified +- [ ] **Share successful practices** with other hiring teams + +## Continuous Improvement Framework + +### Debrief Effectiveness Metrics +- **Decision 
consistency**: Are similar candidates receiving similar decisions? +- **Time to decision**: Are debriefs completing within planned time? +- **Participation quality**: Are all interviewers contributing evidence-based input? +- **Bias incidents**: How often are bias interruptions needed? +- **Decision satisfaction**: Do participants feel good about the process and outcome? + +### Regular Review Process +- **Monthly**: Review debrief facilitation effectiveness and interviewer feedback +- **Quarterly**: Analyze decision patterns and potential bias indicators +- **Semi-annually**: Update debrief processes based on hiring outcome data +- **Annually**: Comprehensive review of debrief framework and training needs + +### Training and Calibration +- **New facilitators**: Shadow 3-5 debriefs before leading independently +- **All facilitators**: Quarterly calibration sessions on bias interruption +- **Interviewer training**: Include debrief participation expectations +- **Leadership training**: Ensure hiring managers can facilitate effectively + +This guide should be adapted to your organization's specific needs while maintaining focus on evidence-based, unbiased decision making. 
\ No newline at end of file From e6cc0f4c6a4e54afa6a9f342f2152bcf96f727c7 Mon Sep 17 00:00:00 2001 From: Leo Date: Mon, 16 Feb 2026 13:48:47 +0000 Subject: [PATCH 4/4] feat: add migration-architect POWERFUL-tier skill --- engineering/migration-architect/README.md | 382 +++++ engineering/migration-architect/SKILL.md | 473 ++++++ .../assets/database_schema_after.json | 367 +++++ .../assets/database_schema_before.json | 243 +++ .../assets/sample_database_migration.json | 106 ++ .../assets/sample_service_migration.json | 175 +++ .../expected_outputs/rollback_runbook.json | 577 +++++++ .../expected_outputs/rollback_runbook.txt | 282 ++++ .../sample_database_migration_plan.json | 317 ++++ .../sample_database_migration_plan.txt | 161 ++ .../sample_service_migration_plan.json | 310 ++++ .../sample_service_migration_plan.txt | 154 ++ .../schema_compatibility_report.json | 192 +++ .../schema_compatibility_report.txt | 129 ++ .../data_reconciliation_strategies.md | 1329 +++++++++++++++++ .../references/migration_patterns_catalog.md | 705 +++++++++ .../references/zero_downtime_techniques.md | 1104 ++++++++++++++ .../scripts/compatibility_checker.py | 883 +++++++++++ .../scripts/migration_planner.py | 661 ++++++++ .../scripts/rollback_generator.py | 1109 ++++++++++++++ 20 files changed, 9659 insertions(+) create mode 100644 engineering/migration-architect/README.md create mode 100644 engineering/migration-architect/SKILL.md create mode 100644 engineering/migration-architect/assets/database_schema_after.json create mode 100644 engineering/migration-architect/assets/database_schema_before.json create mode 100644 engineering/migration-architect/assets/sample_database_migration.json create mode 100644 engineering/migration-architect/assets/sample_service_migration.json create mode 100644 engineering/migration-architect/expected_outputs/rollback_runbook.json create mode 100644 engineering/migration-architect/expected_outputs/rollback_runbook.txt create mode 100644 
engineering/migration-architect/expected_outputs/sample_database_migration_plan.json create mode 100644 engineering/migration-architect/expected_outputs/sample_database_migration_plan.txt create mode 100644 engineering/migration-architect/expected_outputs/sample_service_migration_plan.json create mode 100644 engineering/migration-architect/expected_outputs/sample_service_migration_plan.txt create mode 100644 engineering/migration-architect/expected_outputs/schema_compatibility_report.json create mode 100644 engineering/migration-architect/expected_outputs/schema_compatibility_report.txt create mode 100644 engineering/migration-architect/references/data_reconciliation_strategies.md create mode 100644 engineering/migration-architect/references/migration_patterns_catalog.md create mode 100644 engineering/migration-architect/references/zero_downtime_techniques.md create mode 100644 engineering/migration-architect/scripts/compatibility_checker.py create mode 100644 engineering/migration-architect/scripts/migration_planner.py create mode 100644 engineering/migration-architect/scripts/rollback_generator.py diff --git a/engineering/migration-architect/README.md b/engineering/migration-architect/README.md new file mode 100644 index 0000000..faf71a1 --- /dev/null +++ b/engineering/migration-architect/README.md @@ -0,0 +1,382 @@ +# Migration Architect + +**Tier:** POWERFUL +**Category:** Engineering - Migration Strategy +**Purpose:** Zero-downtime migration planning, compatibility validation, and rollback strategy generation + +## Overview + +The Migration Architect skill provides comprehensive tools and methodologies for planning, executing, and validating complex system migrations with minimal business impact. This skill combines proven migration patterns with automated planning tools to ensure successful transitions between systems, databases, and infrastructure. + +## Components + +### Core Scripts + +1. **migration_planner.py** - Automated migration plan generation +2. 
**compatibility_checker.py** - Schema and API compatibility analysis +3. **rollback_generator.py** - Comprehensive rollback procedure generation + +### Reference Documentation + +- **migration_patterns_catalog.md** - Detailed catalog of proven migration patterns +- **zero_downtime_techniques.md** - Comprehensive zero-downtime migration techniques +- **data_reconciliation_strategies.md** - Advanced data consistency and reconciliation strategies + +### Sample Assets + +- **sample_database_migration.json** - Example database migration specification +- **sample_service_migration.json** - Example service migration specification +- **database_schema_before.json** - Sample "before" database schema +- **database_schema_after.json** - Sample "after" database schema + +## Quick Start + +### 1. Generate a Migration Plan + +```bash +python3 scripts/migration_planner.py \ + --input assets/sample_database_migration.json \ + --output migration_plan.json \ + --format both +``` + +**Input:** Migration specification with source, target, constraints, and requirements +**Output:** Detailed phased migration plan with risk assessment, timeline, and validation gates + +### 2. Check Compatibility + +```bash +python3 scripts/compatibility_checker.py \ + --before assets/database_schema_before.json \ + --after assets/database_schema_after.json \ + --type database \ + --output compatibility_report.json \ + --format both +``` + +**Input:** Before and after schema definitions +**Output:** Compatibility report with breaking changes, migration scripts, and recommendations + +### 3. 
Generate Rollback Procedures + +```bash +python3 scripts/rollback_generator.py \ + --input migration_plan.json \ + --output rollback_runbook.json \ + --format both +``` + +**Input:** Migration plan from step 1 +**Output:** Comprehensive rollback runbook with procedures, triggers, and communication templates + +## Script Details + +### Migration Planner (`migration_planner.py`) + +Generates comprehensive migration plans with: + +- **Phased approach** with dependencies and validation gates +- **Risk assessment** with mitigation strategies +- **Timeline estimation** based on complexity and constraints +- **Rollback triggers** and success criteria +- **Stakeholder communication** templates + +**Usage:** +```bash +python3 scripts/migration_planner.py [OPTIONS] + +Options: + --input, -i Input migration specification file (JSON) [required] + --output, -o Output file for migration plan (JSON) + --format, -f Output format: json, text, both (default: both) + --validate Validate migration specification only +``` + +**Input Format:** +```json +{ + "type": "database|service|infrastructure", + "pattern": "schema_change|strangler_fig|blue_green", + "source": "Source system description", + "target": "Target system description", + "constraints": { + "max_downtime_minutes": 30, + "data_volume_gb": 2500, + "dependencies": ["service1", "service2"], + "compliance_requirements": ["GDPR", "SOX"] + } +} +``` + +### Compatibility Checker (`compatibility_checker.py`) + +Analyzes compatibility between schema versions: + +- **Breaking change detection** (removed fields, type changes, constraint additions) +- **Data migration requirements** identification +- **Suggested migration scripts** generation +- **Risk assessment** for each change + +**Usage:** +```bash +python3 scripts/compatibility_checker.py [OPTIONS] + +Options: + --before Before schema file (JSON) [required] + --after After schema file (JSON) [required] + --type Schema type: database, api (default: database) + --output, -o Output 
file for compatibility report (JSON) + --format, -f Output format: json, text, both (default: both) +``` + +**Exit Codes:** +- `0`: No compatibility issues +- `1`: Potentially breaking changes found +- `2`: Breaking changes found + +### Rollback Generator (`rollback_generator.py`) + +Creates comprehensive rollback procedures: + +- **Phase-by-phase rollback** steps +- **Automated trigger conditions** for rollback +- **Data recovery procedures** +- **Communication templates** for different audiences +- **Validation checklists** for rollback success + +**Usage:** +```bash +python3 scripts/rollback_generator.py [OPTIONS] + +Options: + --input, -i Input migration plan file (JSON) [required] + --output, -o Output file for rollback runbook (JSON) + --format, -f Output format: json, text, both (default: both) +``` + +## Migration Patterns Supported + +### Database Migrations + +- **Expand-Contract Pattern** - Zero-downtime schema evolution +- **Parallel Schema Pattern** - Side-by-side schema migration +- **Event Sourcing Migration** - Event-driven data migration + +### Service Migrations + +- **Strangler Fig Pattern** - Gradual legacy system replacement +- **Parallel Run Pattern** - Risk mitigation through dual execution +- **Blue-Green Deployment** - Zero-downtime service updates + +### Infrastructure Migrations + +- **Lift and Shift** - Quick cloud migration with minimal changes +- **Hybrid Cloud Migration** - Gradual cloud adoption +- **Multi-Cloud Migration** - Distribution across multiple providers + +## Sample Workflow + +### 1. 
Database Schema Migration + +```bash +# Generate migration plan +python3 scripts/migration_planner.py \ + --input assets/sample_database_migration.json \ + --output db_migration_plan.json + +# Check schema compatibility +python3 scripts/compatibility_checker.py \ + --before assets/database_schema_before.json \ + --after assets/database_schema_after.json \ + --type database \ + --output schema_compatibility.json + +# Generate rollback procedures +python3 scripts/rollback_generator.py \ + --input db_migration_plan.json \ + --output db_rollback_runbook.json +``` + +### 2. Service Migration + +```bash +# Generate service migration plan +python3 scripts/migration_planner.py \ + --input assets/sample_service_migration.json \ + --output service_migration_plan.json + +# Generate rollback procedures +python3 scripts/rollback_generator.py \ + --input service_migration_plan.json \ + --output service_rollback_runbook.json +``` + +## Output Examples + +### Migration Plan Structure + +```json +{ + "migration_id": "abc123def456", + "source_system": "Legacy User Service", + "target_system": "New User Service", + "migration_type": "service", + "complexity": "medium", + "estimated_duration_hours": 72, + "phases": [ + { + "name": "preparation", + "description": "Prepare systems and teams for migration", + "duration_hours": 8, + "validation_criteria": ["All backups completed successfully"], + "rollback_triggers": ["Critical system failure"], + "risk_level": "medium" + } + ], + "risks": [ + { + "category": "technical", + "description": "Service compatibility issues", + "severity": "high", + "mitigation": "Comprehensive integration testing" + } + ] +} +``` + +### Compatibility Report Structure + +```json +{ + "overall_compatibility": "potentially_incompatible", + "breaking_changes_count": 2, + "potentially_breaking_count": 3, + "issues": [ + { + "type": "required_column_added", + "severity": "breaking", + "description": "Required column 'email_verified_at' added", + 
"suggested_migration": "Add default value initially" + } + ], + "migration_scripts": [ + { + "script_type": "sql", + "description": "Add email verification columns", + "script_content": "ALTER TABLE users ADD COLUMN email_verified_at TIMESTAMP;", + "rollback_script": "ALTER TABLE users DROP COLUMN email_verified_at;" + } + ] +} +``` + +## Best Practices + +### Planning Phase +1. **Start with risk assessment** - Identify failure modes before planning +2. **Design for rollback** - Every step should have a tested rollback procedure +3. **Validate in staging** - Execute full migration in production-like environment +4. **Plan gradual rollout** - Use feature flags and traffic routing + +### Execution Phase +1. **Monitor continuously** - Track technical and business metrics +2. **Communicate proactively** - Keep stakeholders informed +3. **Document everything** - Maintain detailed logs for analysis +4. **Stay flexible** - Be prepared to adjust based on real-world performance + +### Validation Phase +1. **Automate validation** - Use automated consistency and performance checks +2. **Test business logic** - Validate critical business processes end-to-end +3. **Load test** - Verify performance under expected production load +4. 
**Security validation** - Ensure security controls function properly + +## Integration + +### CI/CD Pipeline Integration + +```yaml +# Example GitHub Actions workflow +name: Migration Validation +on: [push, pull_request] + +jobs: + validate-migration: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Validate Migration Plan + run: | + python3 scripts/migration_planner.py \ + --input migration_spec.json \ + --validate + - name: Check Compatibility + run: | + python3 scripts/compatibility_checker.py \ + --before schema_before.json \ + --after schema_after.json \ + --type database +``` + +### Monitoring Integration + +The tools generate metrics and alerts that can be integrated with: +- **Prometheus** - For metrics collection +- **Grafana** - For visualization and dashboards +- **PagerDuty** - For incident management +- **Slack** - For team notifications + +## Advanced Features + +### Machine Learning Integration +- Anomaly detection for data consistency issues +- Predictive analysis for migration success probability +- Automated pattern recognition for migration optimization + +### Performance Optimization +- Parallel processing for large-scale migrations +- Incremental reconciliation strategies +- Statistical sampling for validation + +### Compliance Support +- GDPR compliance tracking +- SOX audit trail generation +- HIPAA security validation + +## Troubleshooting + +### Common Issues + +**"Migration plan validation failed"** +- Check JSON syntax in migration specification +- Ensure all required fields are present +- Validate constraint values are realistic + +**"Compatibility checker reports false positives"** +- Review excluded fields configuration +- Check data type mapping compatibility +- Adjust tolerance settings for numerical comparisons + +**"Rollback procedures seem incomplete"** +- Ensure migration plan includes all phases +- Verify database backup locations are specified +- Check that all dependencies are documented + +### Getting 
Help + +1. **Review documentation** - Check reference docs for patterns and techniques +2. **Examine sample files** - Use provided assets as templates +3. **Check expected outputs** - Compare your results with sample outputs +4. **Validate inputs** - Ensure input files match expected format + +## Contributing + +To extend or modify the Migration Architect skill: + +1. **Add new patterns** - Extend pattern templates in migration_planner.py +2. **Enhance compatibility checks** - Add new validation rules in compatibility_checker.py +3. **Improve rollback procedures** - Add specialized rollback steps in rollback_generator.py +4. **Update documentation** - Keep reference docs current with new patterns + +## License + +This skill is part of the claude-skills repository and follows the same license terms. \ No newline at end of file diff --git a/engineering/migration-architect/SKILL.md b/engineering/migration-architect/SKILL.md new file mode 100644 index 0000000..6ad1232 --- /dev/null +++ b/engineering/migration-architect/SKILL.md @@ -0,0 +1,473 @@ +# Migration Architect + +**Tier:** POWERFUL +**Category:** Engineering - Migration Strategy +**Purpose:** Zero-downtime migration planning, compatibility validation, and rollback strategy generation + +## Overview + +The Migration Architect skill provides comprehensive tools and methodologies for planning, executing, and validating complex system migrations with minimal business impact. This skill combines proven migration patterns with automated planning tools to ensure successful transitions between systems, databases, and infrastructure. + +## Core Capabilities + +### 1. 
Migration Strategy Planning +- **Phased Migration Planning:** Break complex migrations into manageable phases with clear validation gates +- **Risk Assessment:** Identify potential failure points and mitigation strategies before execution +- **Timeline Estimation:** Generate realistic timelines based on migration complexity and resource constraints +- **Stakeholder Communication:** Create communication templates and progress dashboards + +### 2. Compatibility Analysis +- **Schema Evolution:** Analyze database schema changes for backward compatibility issues +- **API Versioning:** Detect breaking changes in REST/GraphQL APIs and microservice interfaces +- **Data Type Validation:** Identify data format mismatches and conversion requirements +- **Constraint Analysis:** Validate referential integrity and business rule changes + +### 3. Rollback Strategy Generation +- **Automated Rollback Plans:** Generate comprehensive rollback procedures for each migration phase +- **Data Recovery Scripts:** Create point-in-time data restoration procedures +- **Service Rollback:** Plan service version rollbacks with traffic management +- **Validation Checkpoints:** Define success criteria and rollback triggers + +## Migration Patterns + +### Database Migrations + +#### Schema Evolution Patterns +1. **Expand-Contract Pattern** + - **Expand:** Add new columns/tables alongside existing schema + - **Dual Write:** Application writes to both old and new schema + - **Migration:** Backfill historical data to new schema + - **Contract:** Remove old columns/tables after validation + +2. **Parallel Schema Pattern** + - Run new schema in parallel with existing schema + - Use feature flags to route traffic between schemas + - Validate data consistency between parallel systems + - Cutover when confidence is high + +3. 
**Event Sourcing Migration** + - Capture all changes as events during migration window + - Apply events to new schema for consistency + - Enable replay capability for rollback scenarios + +#### Data Migration Strategies +1. **Bulk Data Migration** + - **Snapshot Approach:** Full data copy during maintenance window + - **Incremental Sync:** Continuous data synchronization with change tracking + - **Stream Processing:** Real-time data transformation pipelines + +2. **Dual-Write Pattern** + - Write to both source and target systems during migration + - Implement compensation patterns for write failures + - Use distributed transactions where consistency is critical + +3. **Change Data Capture (CDC)** + - Stream database changes to target system + - Maintain eventual consistency during migration + - Enable zero-downtime migrations for large datasets + +### Service Migrations + +#### Strangler Fig Pattern +1. **Intercept Requests:** Route traffic through proxy/gateway +2. **Gradually Replace:** Implement new service functionality incrementally +3. **Legacy Retirement:** Remove old service components as new ones prove stable +4. **Monitoring:** Track performance and error rates throughout transition + +```mermaid +graph TD + A[Client Requests] --> B[API Gateway] + B --> C{Route Decision} + C -->|Legacy Path| D[Legacy Service] + C -->|New Path| E[New Service] + D --> F[Legacy Database] + E --> G[New Database] +``` + +#### Parallel Run Pattern +1. **Dual Execution:** Run both old and new services simultaneously +2. **Shadow Traffic:** Route production traffic to both systems +3. **Result Comparison:** Compare outputs to validate correctness +4. **Gradual Cutover:** Shift traffic percentage based on confidence + +#### Canary Deployment Pattern +1. **Limited Rollout:** Deploy new service to small percentage of users +2. **Monitoring:** Track key metrics (latency, errors, business KPIs) +3. **Gradual Increase:** Increase traffic percentage as confidence grows +4. 
**Full Rollout:** Complete migration once validation passes + +### Infrastructure Migrations + +#### Cloud-to-Cloud Migration +1. **Assessment Phase** + - Inventory existing resources and dependencies + - Map services to target cloud equivalents + - Identify vendor-specific features requiring refactoring + +2. **Pilot Migration** + - Migrate non-critical workloads first + - Validate performance and cost models + - Refine migration procedures + +3. **Production Migration** + - Use infrastructure as code for consistency + - Implement cross-cloud networking during transition + - Maintain disaster recovery capabilities + +#### On-Premises to Cloud Migration +1. **Lift and Shift** + - Minimal changes to existing applications + - Quick migration with optimization later + - Use cloud migration tools and services + +2. **Re-architecture** + - Redesign applications for cloud-native patterns + - Adopt microservices, containers, and serverless + - Implement cloud security and scaling practices + +3. 
**Hybrid Approach** + - Keep sensitive data on-premises + - Migrate compute workloads to cloud + - Implement secure connectivity between environments + +## Feature Flags for Migrations + +### Progressive Feature Rollout +```python +import zlib # Example feature flag implementation; crc32 keeps user bucketing stable across processes +class MigrationFeatureFlag: + def __init__(self, flag_name, rollout_percentage=0): + self.flag_name = flag_name + self.rollout_percentage = rollout_percentage + + def is_enabled_for_user(self, user_id): + hash_value = zlib.crc32(f"{self.flag_name}:{user_id}".encode("utf-8")) # built-in hash() is salted per process + return (hash_value % 100) < self.rollout_percentage + + def gradual_rollout(self, target_percentage, step_size=10): + while self.rollout_percentage < target_percentage: + self.rollout_percentage = min( + self.rollout_percentage + step_size, + target_percentage + ) + yield self.rollout_percentage +``` + +### Circuit Breaker Pattern +Implement automatic fallback to legacy systems when new systems show degraded performance: + +```python +class MigrationCircuitBreaker: + def __init__(self, failure_threshold=5, timeout=60): + self.failure_count = 0 + self.failure_threshold = failure_threshold + self.timeout = timeout + self.last_failure_time = None + self.state = 'CLOSED' # CLOSED, OPEN, HALF_OPEN + + def call_new_service(self, request): + if self.state == 'OPEN': + if self.should_attempt_reset(): + self.state = 'HALF_OPEN' + else: + return self.fallback_to_legacy(request) + + try: + response = self.new_service.process(request) + self.on_success() + return response + except Exception as e: + self.on_failure() + return self.fallback_to_legacy(request) +``` + +## Data Validation and Reconciliation + +### Validation Strategies +1. **Row Count Validation** + - Compare record counts between source and target + - Account for soft deletes and filtered records + - Implement threshold-based alerting + +2.
**Checksums and Hashing** + - Generate checksums for critical data subsets + - Compare hash values to detect data drift + - Use sampling for large datasets + +3. **Business Logic Validation** + - Run critical business queries on both systems + - Compare aggregate results (sums, counts, averages) + - Validate derived data and calculations + +### Reconciliation Patterns +1. **Delta Detection** + ```sql + -- Example delta query for reconciliation + SELECT 'missing_in_target' as issue_type, s.id as record_id + FROM source_table s + WHERE NOT EXISTS ( + SELECT 1 FROM target_table t + WHERE t.id = s.id + ) + UNION ALL + SELECT 'extra_in_target' as issue_type, t.id as record_id + FROM target_table t + WHERE NOT EXISTS ( + SELECT 1 FROM source_table s + WHERE s.id = t.id + ); + ``` + +2. **Automated Correction** + - Implement data repair scripts for common issues + - Use idempotent operations for safe re-execution + - Log all correction actions for audit trails + +## Rollback Strategies + +### Database Rollback +1. **Schema Rollback** + - Maintain schema version control + - Use backward-compatible migrations when possible + - Keep rollback scripts for each migration step + +2. **Data Rollback** + - Point-in-time recovery using database backups + - Transaction log replay for precise rollback points + - Maintain data snapshots at migration checkpoints + +### Service Rollback +1. **Blue-Green Deployment** + - Keep previous service version running during migration + - Switch traffic back to blue environment if issues arise + - Maintain parallel infrastructure during migration window + +2. **Rolling Rollback** + - Gradually shift traffic back to previous version + - Monitor system health during rollback process + - Implement automated rollback triggers + +### Infrastructure Rollback +1. **Infrastructure as Code** + - Version control all infrastructure definitions + - Maintain rollback Terraform/CloudFormation templates + - Test rollback procedures in staging environments + +2.
**Data Persistence** + - Preserve data in original location during migration + - Implement data sync back to original systems + - Maintain backup strategies across both environments + +## Risk Assessment Framework + +### Risk Categories +1. **Technical Risks** + - Data loss or corruption + - Service downtime or degraded performance + - Integration failures with dependent systems + - Scalability issues under production load + +2. **Business Risks** + - Revenue impact from service disruption + - Customer experience degradation + - Compliance and regulatory concerns + - Brand reputation impact + +3. **Operational Risks** + - Team knowledge gaps + - Insufficient testing coverage + - Inadequate monitoring and alerting + - Communication breakdowns + +### Risk Mitigation Strategies +1. **Technical Mitigations** + - Comprehensive testing (unit, integration, load, chaos) + - Gradual rollout with automated rollback triggers + - Data validation and reconciliation processes + - Performance monitoring and alerting + +2. **Business Mitigations** + - Stakeholder communication plans + - Business continuity procedures + - Customer notification strategies + - Revenue protection measures + +3. 
**Operational Mitigations** + - Team training and documentation + - Runbook creation and testing + - On-call rotation planning + - Post-migration review processes + +## Migration Runbooks + +### Pre-Migration Checklist +- [ ] Migration plan reviewed and approved +- [ ] Rollback procedures tested and validated +- [ ] Monitoring and alerting configured +- [ ] Team roles and responsibilities defined +- [ ] Stakeholder communication plan activated +- [ ] Backup and recovery procedures verified +- [ ] Test environment validation complete +- [ ] Performance benchmarks established +- [ ] Security review completed +- [ ] Compliance requirements verified + +### During Migration +- [ ] Execute migration phases in planned order +- [ ] Monitor key performance indicators continuously +- [ ] Validate data consistency at each checkpoint +- [ ] Communicate progress to stakeholders +- [ ] Document any deviations from plan +- [ ] Execute rollback if success criteria not met +- [ ] Coordinate with dependent teams +- [ ] Maintain detailed execution logs + +### Post-Migration +- [ ] Validate all success criteria met +- [ ] Perform comprehensive system health checks +- [ ] Execute data reconciliation procedures +- [ ] Monitor system performance over 72 hours +- [ ] Update documentation and runbooks +- [ ] Decommission legacy systems (if applicable) +- [ ] Conduct post-migration retrospective +- [ ] Archive migration artifacts +- [ ] Update disaster recovery procedures + +## Communication Templates + +### Executive Summary Template +``` +Migration Status: [IN_PROGRESS | COMPLETED | ROLLED_BACK] +Start Time: [YYYY-MM-DD HH:MM UTC] +Current Phase: [X of Y] +Overall Progress: [X%] + +Key Metrics: +- System Availability: [X.XX%] +- Data Migration Progress: [X.XX%] +- Performance Impact: [+/-X%] +- Issues Encountered: [X] + +Next Steps: +1. [Action item 1] +2. 
[Action item 2] + +Risk Assessment: [LOW | MEDIUM | HIGH] +Rollback Status: [AVAILABLE | NOT_AVAILABLE] +``` + +### Technical Team Update Template +``` +Phase: [Phase Name] - [Status] +Duration: [Started] - [Expected End] + +Completed Tasks: +✓ [Task 1] +✓ [Task 2] + +In Progress: +🔄 [Task 3] - [X% complete] + +Upcoming: +⏳ [Task 4] - [Expected start time] + +Issues: +⚠️ [Issue description] - [Severity] - [ETA resolution] + +Metrics: +- Migration Rate: [X records/minute] +- Error Rate: [X.XX%] +- System Load: [CPU/Memory/Disk] +``` + +## Success Metrics + +### Technical Metrics +- **Migration Completion Rate:** Percentage of data/services successfully migrated +- **Downtime Duration:** Total system unavailability during migration +- **Data Consistency Score:** Percentage of data validation checks passing +- **Performance Delta:** Performance change compared to baseline +- **Error Rate:** Percentage of failed operations during migration + +### Business Metrics +- **Customer Impact Score:** Measure of customer experience degradation +- **Revenue Protection:** Percentage of revenue maintained during migration +- **Time to Value:** Duration from migration start to business value realization +- **Stakeholder Satisfaction:** Post-migration stakeholder feedback scores + +### Operational Metrics +- **Plan Adherence:** Percentage of migration executed according to plan +- **Issue Resolution Time:** Average time to resolve migration issues +- **Team Efficiency:** Resource utilization and productivity metrics +- **Knowledge Transfer Score:** Team readiness for post-migration operations + +## Tools and Technologies + +### Migration Planning Tools +- **migration_planner.py:** Automated migration plan generation +- **compatibility_checker.py:** Schema and API compatibility analysis +- **rollback_generator.py:** Comprehensive rollback procedure generation + +### Validation Tools +- Database comparison utilities (schema and data) +- API contract testing frameworks +- Performance 
benchmarking tools +- Data quality validation pipelines + +### Monitoring and Alerting +- Real-time migration progress dashboards +- Automated rollback trigger systems +- Business metric monitoring +- Stakeholder notification systems + +## Best Practices + +### Planning Phase +1. **Start with Risk Assessment:** Identify all potential failure modes before planning +2. **Design for Rollback:** Every migration step should have a tested rollback procedure +3. **Validate in Staging:** Execute full migration process in production-like environment +4. **Plan for Gradual Rollout:** Use feature flags and traffic routing for controlled migration + +### Execution Phase +1. **Monitor Continuously:** Track both technical and business metrics throughout +2. **Communicate Proactively:** Keep all stakeholders informed of progress and issues +3. **Document Everything:** Maintain detailed logs for post-migration analysis +4. **Stay Flexible:** Be prepared to adjust timeline based on real-world performance + +### Validation Phase +1. **Automate Validation:** Use automated tools for data consistency and performance checks +2. **Business Logic Testing:** Validate critical business processes end-to-end +3. **Load Testing:** Verify system performance under expected production load +4. **Security Validation:** Ensure security controls function properly in new environment + +## Integration with Development Lifecycle + +### CI/CD Integration +```yaml +# Example migration pipeline stage +migration_validation: + stage: test + script: + - python scripts/compatibility_checker.py --before=old_schema.json --after=new_schema.json + - python scripts/migration_planner.py --input=migration_config.json --validate + artifacts: + reports: + - compatibility_report.json + - migration_plan.json +``` + +### Infrastructure as Code +```terraform +# Example Terraform for blue-green infrastructure +resource "aws_instance" "blue_environment" { + count = var.migration_phase == "preparation" ?
var.instance_count : 0 + # Blue environment configuration +} + +resource "aws_instance" "green_environment" { + count = var.migration_phase == "execution" ? var.instance_count : 0 + # Green environment configuration +} +``` + +This Migration Architect skill provides a comprehensive framework for planning, executing, and validating complex system migrations while minimizing business impact and technical risk. The combination of automated tools, proven patterns, and detailed procedures enables organizations to confidently undertake even the most complex migration projects. \ No newline at end of file diff --git a/engineering/migration-architect/assets/database_schema_after.json b/engineering/migration-architect/assets/database_schema_after.json new file mode 100644 index 0000000..634c50c --- /dev/null +++ b/engineering/migration-architect/assets/database_schema_after.json @@ -0,0 +1,367 @@ +{ + "schema_version": "2.0", + "database": "user_management_v2", + "tables": { + "users": { + "columns": { + "id": { + "type": "bigint", + "nullable": false, + "primary_key": true, + "auto_increment": true + }, + "username": { + "type": "varchar", + "length": 50, + "nullable": false, + "unique": true + }, + "email": { + "type": "varchar", + "length": 320, + "nullable": false, + "unique": true + }, + "password_hash": { + "type": "varchar", + "length": 255, + "nullable": false + }, + "first_name": { + "type": "varchar", + "length": 100, + "nullable": true + }, + "last_name": { + "type": "varchar", + "length": 100, + "nullable": true + }, + "created_at": { + "type": "timestamp", + "nullable": false, + "default": "CURRENT_TIMESTAMP" + }, + "updated_at": { + "type": "timestamp", + "nullable": false, + "default": "CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP" + }, + "is_active": { + "type": "boolean", + "nullable": false, + "default": true + }, + "phone": { + "type": "varchar", + "length": 20, + "nullable": true + }, + "email_verified_at": { + "type": "timestamp", + "nullable": true, + 
"comment": "When email was verified" + }, + "phone_verified_at": { + "type": "timestamp", + "nullable": true, + "comment": "When phone was verified" + }, + "two_factor_enabled": { + "type": "boolean", + "nullable": false, + "default": false + }, + "last_login_at": { + "type": "timestamp", + "nullable": true + } + }, + "constraints": { + "primary_key": ["id"], + "unique": [ + "username", + "email" + ], + "foreign_key": [], + "check": [ + "email LIKE '%@%'", + "LENGTH(password_hash) >= 60", + "phone IS NULL OR LENGTH(phone) >= 10" + ] + }, + "indexes": [ + { + "name": "idx_users_email", + "columns": ["email"], + "unique": true + }, + { + "name": "idx_users_username", + "columns": ["username"], + "unique": true + }, + { + "name": "idx_users_created_at", + "columns": ["created_at"] + }, + { + "name": "idx_users_email_verified", + "columns": ["email_verified_at"] + }, + { + "name": "idx_users_last_login", + "columns": ["last_login_at"] + } + ] + }, + "user_profiles": { + "columns": { + "id": { + "type": "bigint", + "nullable": false, + "primary_key": true, + "auto_increment": true + }, + "user_id": { + "type": "bigint", + "nullable": false + }, + "bio": { + "type": "text", + "nullable": true + }, + "avatar_url": { + "type": "varchar", + "length": 500, + "nullable": true + }, + "birth_date": { + "type": "date", + "nullable": true + }, + "location": { + "type": "varchar", + "length": 100, + "nullable": true + }, + "website": { + "type": "varchar", + "length": 255, + "nullable": true + }, + "privacy_level": { + "type": "varchar", + "length": 20, + "nullable": false, + "default": "public" + }, + "timezone": { + "type": "varchar", + "length": 50, + "nullable": true, + "default": "UTC" + }, + "language": { + "type": "varchar", + "length": 10, + "nullable": false, + "default": "en" + }, + "created_at": { + "type": "timestamp", + "nullable": false, + "default": "CURRENT_TIMESTAMP" + }, + "updated_at": { + "type": "timestamp", + "nullable": false, + "default": "CURRENT_TIMESTAMP 
ON UPDATE CURRENT_TIMESTAMP" + } + }, + "constraints": { + "primary_key": ["id"], + "unique": [], + "foreign_key": [ + { + "columns": ["user_id"], + "references": "users(id)", + "on_delete": "CASCADE" + } + ], + "check": [ + "privacy_level IN ('public', 'private', 'friends_only')", + "bio IS NULL OR LENGTH(bio) <= 2000", + "language IN ('en', 'es', 'fr', 'de', 'it', 'pt', 'ru', 'ja', 'ko', 'zh')" + ] + }, + "indexes": [ + { + "name": "idx_user_profiles_user_id", + "columns": ["user_id"], + "unique": true + }, + { + "name": "idx_user_profiles_privacy", + "columns": ["privacy_level"] + }, + { + "name": "idx_user_profiles_language", + "columns": ["language"] + } + ] + }, + "user_sessions": { + "columns": { + "id": { + "type": "varchar", + "length": 128, + "nullable": false, + "primary_key": true + }, + "user_id": { + "type": "bigint", + "nullable": false + }, + "ip_address": { + "type": "varchar", + "length": 45, + "nullable": true + }, + "user_agent": { + "type": "text", + "nullable": true + }, + "expires_at": { + "type": "timestamp", + "nullable": false + }, + "created_at": { + "type": "timestamp", + "nullable": false, + "default": "CURRENT_TIMESTAMP" + }, + "last_activity": { + "type": "timestamp", + "nullable": false, + "default": "CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP" + }, + "session_type": { + "type": "varchar", + "length": 20, + "nullable": false, + "default": "web" + }, + "is_mobile": { + "type": "boolean", + "nullable": false, + "default": false + } + }, + "constraints": { + "primary_key": ["id"], + "unique": [], + "foreign_key": [ + { + "columns": ["user_id"], + "references": "users(id)", + "on_delete": "CASCADE" + } + ], + "check": [ + "session_type IN ('web', 'mobile', 'api', 'admin')" + ] + }, + "indexes": [ + { + "name": "idx_user_sessions_user_id", + "columns": ["user_id"] + }, + { + "name": "idx_user_sessions_expires", + "columns": ["expires_at"] + }, + { + "name": "idx_user_sessions_type", + "columns": ["session_type"] + } + ] + }, + 
"user_preferences": { + "columns": { + "id": { + "type": "bigint", + "nullable": false, + "primary_key": true, + "auto_increment": true + }, + "user_id": { + "type": "bigint", + "nullable": false + }, + "preference_key": { + "type": "varchar", + "length": 100, + "nullable": false + }, + "preference_value": { + "type": "json", + "nullable": true + }, + "created_at": { + "type": "timestamp", + "nullable": false, + "default": "CURRENT_TIMESTAMP" + }, + "updated_at": { + "type": "timestamp", + "nullable": false, + "default": "CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP" + } + }, + "constraints": { + "primary_key": ["id"], + "unique": [ + ["user_id", "preference_key"] + ], + "foreign_key": [ + { + "columns": ["user_id"], + "references": "users(id)", + "on_delete": "CASCADE" + } + ], + "check": [] + }, + "indexes": [ + { + "name": "idx_user_preferences_user_key", + "columns": ["user_id", "preference_key"], + "unique": true + } + ] + } + }, + "views": { + "active_users": { + "definition": "SELECT u.id, u.username, u.email, u.first_name, u.last_name, u.email_verified_at, u.last_login_at FROM users u WHERE u.is_active = true", + "columns": ["id", "username", "email", "first_name", "last_name", "email_verified_at", "last_login_at"] + }, + "verified_users": { + "definition": "SELECT u.id, u.username, u.email FROM users u WHERE u.is_active = true AND u.email_verified_at IS NOT NULL", + "columns": ["id", "username", "email"] + } + }, + "procedures": [ + { + "name": "cleanup_expired_sessions", + "parameters": [], + "definition": "DELETE FROM user_sessions WHERE expires_at < NOW()" + }, + { + "name": "get_user_with_profile", + "parameters": ["user_id BIGINT"], + "definition": "SELECT u.*, p.bio, p.avatar_url, p.privacy_level FROM users u LEFT JOIN user_profiles p ON u.id = p.user_id WHERE u.id = user_id" + } + ] +} \ No newline at end of file diff --git a/engineering/migration-architect/assets/database_schema_before.json 
b/engineering/migration-architect/assets/database_schema_before.json new file mode 100644 index 0000000..597da91 --- /dev/null +++ b/engineering/migration-architect/assets/database_schema_before.json @@ -0,0 +1,243 @@ +{ + "schema_version": "1.0", + "database": "user_management", + "tables": { + "users": { + "columns": { + "id": { + "type": "bigint", + "nullable": false, + "primary_key": true, + "auto_increment": true + }, + "username": { + "type": "varchar", + "length": 50, + "nullable": false, + "unique": true + }, + "email": { + "type": "varchar", + "length": 255, + "nullable": false, + "unique": true + }, + "password_hash": { + "type": "varchar", + "length": 255, + "nullable": false + }, + "first_name": { + "type": "varchar", + "length": 100, + "nullable": true + }, + "last_name": { + "type": "varchar", + "length": 100, + "nullable": true + }, + "created_at": { + "type": "timestamp", + "nullable": false, + "default": "CURRENT_TIMESTAMP" + }, + "updated_at": { + "type": "timestamp", + "nullable": false, + "default": "CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP" + }, + "is_active": { + "type": "boolean", + "nullable": false, + "default": true + }, + "phone": { + "type": "varchar", + "length": 20, + "nullable": true + } + }, + "constraints": { + "primary_key": ["id"], + "unique": [ + "username", + "email" + ], + "foreign_key": [], + "check": [ + "email LIKE '%@%'", + "LENGTH(password_hash) >= 60" + ] + }, + "indexes": [ + { + "name": "idx_users_email", + "columns": ["email"], + "unique": true + }, + { + "name": "idx_users_username", + "columns": ["username"], + "unique": true + }, + { + "name": "idx_users_created_at", + "columns": ["created_at"] + } + ] + }, + "user_profiles": { + "columns": { + "id": { + "type": "bigint", + "nullable": false, + "primary_key": true, + "auto_increment": true + }, + "user_id": { + "type": "bigint", + "nullable": false + }, + "bio": { + "type": "varchar", + "length": 255, + "nullable": true + }, + "avatar_url": { + "type": 
"varchar", + "length": 500, + "nullable": true + }, + "birth_date": { + "type": "date", + "nullable": true + }, + "location": { + "type": "varchar", + "length": 100, + "nullable": true + }, + "website": { + "type": "varchar", + "length": 255, + "nullable": true + }, + "privacy_level": { + "type": "varchar", + "length": 20, + "nullable": false, + "default": "public" + }, + "created_at": { + "type": "timestamp", + "nullable": false, + "default": "CURRENT_TIMESTAMP" + }, + "updated_at": { + "type": "timestamp", + "nullable": false, + "default": "CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP" + } + }, + "constraints": { + "primary_key": ["id"], + "unique": [], + "foreign_key": [ + { + "columns": ["user_id"], + "references": "users(id)", + "on_delete": "CASCADE" + } + ], + "check": [ + "privacy_level IN ('public', 'private', 'friends_only')" + ] + }, + "indexes": [ + { + "name": "idx_user_profiles_user_id", + "columns": ["user_id"], + "unique": true + }, + { + "name": "idx_user_profiles_privacy", + "columns": ["privacy_level"] + } + ] + }, + "user_sessions": { + "columns": { + "id": { + "type": "varchar", + "length": 128, + "nullable": false, + "primary_key": true + }, + "user_id": { + "type": "bigint", + "nullable": false + }, + "ip_address": { + "type": "varchar", + "length": 45, + "nullable": true + }, + "user_agent": { + "type": "varchar", + "length": 500, + "nullable": true + }, + "expires_at": { + "type": "timestamp", + "nullable": false + }, + "created_at": { + "type": "timestamp", + "nullable": false, + "default": "CURRENT_TIMESTAMP" + }, + "last_activity": { + "type": "timestamp", + "nullable": false, + "default": "CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP" + } + }, + "constraints": { + "primary_key": ["id"], + "unique": [], + "foreign_key": [ + { + "columns": ["user_id"], + "references": "users(id)", + "on_delete": "CASCADE" + } + ], + "check": [] + }, + "indexes": [ + { + "name": "idx_user_sessions_user_id", + "columns": ["user_id"] + }, + { + "name": 
"idx_user_sessions_expires", + "columns": ["expires_at"] + } + ] + } + }, + "views": { + "active_users": { + "definition": "SELECT u.id, u.username, u.email, u.first_name, u.last_name FROM users u WHERE u.is_active = true", + "columns": ["id", "username", "email", "first_name", "last_name"] + } + }, + "procedures": [ + { + "name": "cleanup_expired_sessions", + "parameters": [], + "definition": "DELETE FROM user_sessions WHERE expires_at < NOW()" + } + ] +} \ No newline at end of file diff --git a/engineering/migration-architect/assets/sample_database_migration.json b/engineering/migration-architect/assets/sample_database_migration.json new file mode 100644 index 0000000..478ed38 --- /dev/null +++ b/engineering/migration-architect/assets/sample_database_migration.json @@ -0,0 +1,106 @@ +{ + "type": "database", + "pattern": "schema_change", + "source": "PostgreSQL 13 Production Database", + "target": "PostgreSQL 15 Cloud Database", + "description": "Migrate user management system from on-premises PostgreSQL to cloud with schema updates", + "constraints": { + "max_downtime_minutes": 30, + "data_volume_gb": 2500, + "dependencies": [ + "user_service_api", + "authentication_service", + "notification_service", + "analytics_pipeline", + "backup_service" + ], + "compliance_requirements": [ + "GDPR", + "SOX" + ], + "special_requirements": [ + "zero_data_loss", + "referential_integrity", + "performance_baseline_maintained" + ] + }, + "tables_to_migrate": [ + { + "name": "users", + "row_count": 1500000, + "size_mb": 450, + "critical": true + }, + { + "name": "user_profiles", + "row_count": 1500000, + "size_mb": 890, + "critical": true + }, + { + "name": "user_sessions", + "row_count": 25000000, + "size_mb": 1200, + "critical": false + }, + { + "name": "audit_logs", + "row_count": 50000000, + "size_mb": 2800, + "critical": false + } + ], + "schema_changes": [ + { + "table": "users", + "changes": [ + { + "type": "add_column", + "column": "email_verified_at", + "data_type": 
"timestamp", + "nullable": true + }, + { + "type": "add_column", + "column": "phone_verified_at", + "data_type": "timestamp", + "nullable": true + } + ] + }, + { + "table": "user_profiles", + "changes": [ + { + "type": "modify_column", + "column": "bio", + "old_type": "varchar(255)", + "new_type": "text" + }, + { + "type": "add_constraint", + "constraint_type": "check", + "constraint_name": "bio_length_check", + "definition": "LENGTH(bio) <= 2000" + } + ] + } + ], + "performance_requirements": { + "max_query_response_time_ms": 100, + "concurrent_connections": 500, + "transactions_per_second": 1000 + }, + "business_continuity": { + "critical_business_hours": { + "start": "08:00", + "end": "18:00", + "timezone": "UTC" + }, + "preferred_migration_window": { + "start": "02:00", + "end": "06:00", + "timezone": "UTC" + } + } +} \ No newline at end of file diff --git a/engineering/migration-architect/assets/sample_service_migration.json b/engineering/migration-architect/assets/sample_service_migration.json new file mode 100644 index 0000000..193a969 --- /dev/null +++ b/engineering/migration-architect/assets/sample_service_migration.json @@ -0,0 +1,175 @@ +{ + "type": "service", + "pattern": "strangler_fig", + "source": "Legacy User Service (Java Spring Boot 2.x)", + "target": "New User Service (Node.js + TypeScript)", + "description": "Migrate legacy user management service to modern microservices architecture", + "constraints": { + "max_downtime_minutes": 0, + "data_volume_gb": 50, + "dependencies": [ + "payment_service", + "order_service", + "notification_service", + "analytics_service", + "mobile_app_v1", + "mobile_app_v2", + "web_frontend", + "admin_dashboard" + ], + "compliance_requirements": [ + "PCI_DSS", + "GDPR" + ], + "special_requirements": [ + "api_backward_compatibility", + "session_continuity", + "rate_limit_preservation" + ] + }, + "service_details": { + "legacy_service": { + "endpoints": [ + "GET /api/v1/users/{id}", + "POST /api/v1/users", + "PUT 
/api/v1/users/{id}", + "DELETE /api/v1/users/{id}", + "GET /api/v1/users/{id}/profile", + "PUT /api/v1/users/{id}/profile", + "POST /api/v1/users/{id}/verify-email", + "POST /api/v1/users/login", + "POST /api/v1/users/logout" + ], + "current_load": { + "requests_per_second": 850, + "peak_requests_per_second": 2000, + "average_response_time_ms": 120, + "p95_response_time_ms": 300 + }, + "infrastructure": { + "instances": 4, + "cpu_cores_per_instance": 4, + "memory_gb_per_instance": 8, + "load_balancer": "AWS ELB Classic" + } + }, + "new_service": { + "endpoints": [ + "GET /api/v2/users/{id}", + "POST /api/v2/users", + "PUT /api/v2/users/{id}", + "DELETE /api/v2/users/{id}", + "GET /api/v2/users/{id}/profile", + "PUT /api/v2/users/{id}/profile", + "POST /api/v2/users/{id}/verify-email", + "POST /api/v2/users/{id}/verify-phone", + "POST /api/v2/auth/login", + "POST /api/v2/auth/logout", + "POST /api/v2/auth/refresh" + ], + "target_performance": { + "requests_per_second": 1500, + "peak_requests_per_second": 3000, + "average_response_time_ms": 80, + "p95_response_time_ms": 200 + }, + "infrastructure": { + "container_platform": "Kubernetes", + "initial_replicas": 3, + "max_replicas": 10, + "cpu_request_millicores": 500, + "cpu_limit_millicores": 1000, + "memory_request_mb": 512, + "memory_limit_mb": 1024, + "load_balancer": "AWS ALB" + } + } + }, + "migration_phases": [ + { + "phase": "preparation", + "description": "Deploy new service and configure routing", + "estimated_duration_hours": 8 + }, + { + "phase": "intercept", + "description": "Configure API gateway to route to new service", + "estimated_duration_hours": 2 + }, + { + "phase": "gradual_migration", + "description": "Gradually increase traffic to new service", + "estimated_duration_hours": 48 + }, + { + "phase": "validation", + "description": "Validate new service performance and functionality", + "estimated_duration_hours": 24 + }, + { + "phase": "decommission", + "description": "Remove legacy service after 
validation", + "estimated_duration_hours": 4 + } + ], + "feature_flags": [ + { + "name": "enable_new_user_service", + "description": "Route user service requests to new implementation", + "initial_percentage": 5, + "rollout_schedule": [ + {"percentage": 5, "duration_hours": 24}, + {"percentage": 25, "duration_hours": 24}, + {"percentage": 50, "duration_hours": 24}, + {"percentage": 100, "duration_hours": 0} + ] + }, + { + "name": "enable_new_auth_endpoints", + "description": "Enable new authentication endpoints", + "initial_percentage": 0, + "rollout_schedule": [ + {"percentage": 10, "duration_hours": 12}, + {"percentage": 50, "duration_hours": 12}, + {"percentage": 100, "duration_hours": 0} + ] + } + ], + "monitoring": { + "critical_metrics": [ + "request_rate", + "error_rate", + "response_time_p95", + "response_time_p99", + "cpu_utilization", + "memory_utilization", + "database_connection_pool" + ], + "alert_thresholds": { + "error_rate": 0.05, + "response_time_p95": 250, + "cpu_utilization": 0.80, + "memory_utilization": 0.85 + } + }, + "rollback_triggers": [ + { + "metric": "error_rate", + "threshold": 0.10, + "duration_minutes": 5, + "action": "automatic_rollback" + }, + { + "metric": "response_time_p95", + "threshold": 500, + "duration_minutes": 10, + "action": "alert_team" + }, + { + "metric": "cpu_utilization", + "threshold": 0.95, + "duration_minutes": 5, + "action": "scale_up" + } + ] +} \ No newline at end of file diff --git a/engineering/migration-architect/expected_outputs/rollback_runbook.json b/engineering/migration-architect/expected_outputs/rollback_runbook.json new file mode 100644 index 0000000..83e5edc --- /dev/null +++ b/engineering/migration-architect/expected_outputs/rollback_runbook.json @@ -0,0 +1,577 @@ +{ + "runbook_id": "rb_921c0bca", + "migration_id": "23a52ed1507f", + "created_at": "2026-02-16T13:47:31.108500", + "rollback_phases": [ + { + "phase_name": "rollback_cleanup", + "description": "Rollback changes made during cleanup phase", 
+ "urgency_level": "medium", + "estimated_duration_minutes": 570, + "prerequisites": [ + "Incident commander assigned and briefed", + "All team members notified of rollback initiation", + "Monitoring systems confirmed operational", + "Backup systems verified and accessible" + ], + "steps": [ + { + "step_id": "rb_validate_0_final", + "name": "Validate rollback completion", + "description": "Comprehensive validation that cleanup rollback completed successfully", + "script_type": "manual", + "script_content": "Execute validation checklist for this phase", + "estimated_duration_minutes": 10, + "dependencies": [], + "validation_commands": [ + "SELECT COUNT(*) FROM {table_name};", + "SELECT COUNT(*) FROM information_schema.tables WHERE table_name = '{table_name}';", + "SELECT COUNT(*) FROM information_schema.columns WHERE table_name = '{table_name}' AND column_name = '{column_name}';", + "SELECT COUNT(DISTINCT {primary_key}) FROM {table_name};", + "SELECT MAX({timestamp_column}) FROM {table_name};" + ], + "success_criteria": [ + "cleanup fully rolled back", + "All validation checks pass" + ], + "failure_escalation": "Investigate cleanup rollback failures", + "rollback_order": 99 + } + ], + "validation_checkpoints": [ + "cleanup rollback steps completed", + "System health checks passing", + "No critical errors in logs", + "Key metrics within acceptable ranges", + "Validation command passed: SELECT COUNT(*) FROM {table_name};...", + "Validation command passed: SELECT COUNT(*) FROM information_schema.tables WHE...", + "Validation command passed: SELECT COUNT(*) FROM information_schema.columns WH..." 
+ ], + "communication_requirements": [ + "Notify incident commander of phase start/completion", + "Update rollback status dashboard", + "Log all actions and decisions" + ], + "risk_level": "medium" + }, + { + "phase_name": "rollback_contract", + "description": "Rollback changes made during contract phase", + "urgency_level": "medium", + "estimated_duration_minutes": 570, + "prerequisites": [ + "Incident commander assigned and briefed", + "All team members notified of rollback initiation", + "Monitoring systems confirmed operational", + "Backup systems verified and accessible", + "Previous rollback phase completed successfully" + ], + "steps": [ + { + "step_id": "rb_validate_1_final", + "name": "Validate rollback completion", + "description": "Comprehensive validation that contract rollback completed successfully", + "script_type": "manual", + "script_content": "Execute validation checklist for this phase", + "estimated_duration_minutes": 10, + "dependencies": [], + "validation_commands": [ + "SELECT COUNT(*) FROM {table_name};", + "SELECT COUNT(*) FROM information_schema.tables WHERE table_name = '{table_name}';", + "SELECT COUNT(*) FROM information_schema.columns WHERE table_name = '{table_name}' AND column_name = '{column_name}';", + "SELECT COUNT(DISTINCT {primary_key}) FROM {table_name};", + "SELECT MAX({timestamp_column}) FROM {table_name};" + ], + "success_criteria": [ + "contract fully rolled back", + "All validation checks pass" + ], + "failure_escalation": "Investigate contract rollback failures", + "rollback_order": 99 + } + ], + "validation_checkpoints": [ + "contract rollback steps completed", + "System health checks passing", + "No critical errors in logs", + "Key metrics within acceptable ranges", + "Validation command passed: SELECT COUNT(*) FROM {table_name};...", + "Validation command passed: SELECT COUNT(*) FROM information_schema.tables WHE...", + "Validation command passed: SELECT COUNT(*) FROM information_schema.columns WH..." 
+ ], + "communication_requirements": [ + "Notify incident commander of phase start/completion", + "Update rollback status dashboard", + "Log all actions and decisions" + ], + "risk_level": "medium" + }, + { + "phase_name": "rollback_migrate", + "description": "Rollback changes made during migrate phase", + "urgency_level": "medium", + "estimated_duration_minutes": 570, + "prerequisites": [ + "Incident commander assigned and briefed", + "All team members notified of rollback initiation", + "Monitoring systems confirmed operational", + "Backup systems verified and accessible", + "Previous rollback phase completed successfully" + ], + "steps": [ + { + "step_id": "rb_validate_2_final", + "name": "Validate rollback completion", + "description": "Comprehensive validation that migrate rollback completed successfully", + "script_type": "manual", + "script_content": "Execute validation checklist for this phase", + "estimated_duration_minutes": 10, + "dependencies": [], + "validation_commands": [ + "SELECT COUNT(*) FROM {table_name};", + "SELECT COUNT(*) FROM information_schema.tables WHERE table_name = '{table_name}';", + "SELECT COUNT(*) FROM information_schema.columns WHERE table_name = '{table_name}' AND column_name = '{column_name}';", + "SELECT COUNT(DISTINCT {primary_key}) FROM {table_name};", + "SELECT MAX({timestamp_column}) FROM {table_name};" + ], + "success_criteria": [ + "migrate fully rolled back", + "All validation checks pass" + ], + "failure_escalation": "Investigate migrate rollback failures", + "rollback_order": 99 + } + ], + "validation_checkpoints": [ + "migrate rollback steps completed", + "System health checks passing", + "No critical errors in logs", + "Key metrics within acceptable ranges", + "Validation command passed: SELECT COUNT(*) FROM {table_name};...", + "Validation command passed: SELECT COUNT(*) FROM information_schema.tables WHE...", + "Validation command passed: SELECT COUNT(*) FROM information_schema.columns WH..." 
+ ], + "communication_requirements": [ + "Notify incident commander of phase start/completion", + "Update rollback status dashboard", + "Log all actions and decisions" + ], + "risk_level": "medium" + }, + { + "phase_name": "rollback_expand", + "description": "Rollback changes made during expand phase", + "urgency_level": "medium", + "estimated_duration_minutes": 570, + "prerequisites": [ + "Incident commander assigned and briefed", + "All team members notified of rollback initiation", + "Monitoring systems confirmed operational", + "Backup systems verified and accessible", + "Previous rollback phase completed successfully" + ], + "steps": [ + { + "step_id": "rb_validate_3_final", + "name": "Validate rollback completion", + "description": "Comprehensive validation that expand rollback completed successfully", + "script_type": "manual", + "script_content": "Execute validation checklist for this phase", + "estimated_duration_minutes": 10, + "dependencies": [], + "validation_commands": [ + "SELECT COUNT(*) FROM {table_name};", + "SELECT COUNT(*) FROM information_schema.tables WHERE table_name = '{table_name}';", + "SELECT COUNT(*) FROM information_schema.columns WHERE table_name = '{table_name}' AND column_name = '{column_name}';", + "SELECT COUNT(DISTINCT {primary_key}) FROM {table_name};", + "SELECT MAX({timestamp_column}) FROM {table_name};" + ], + "success_criteria": [ + "expand fully rolled back", + "All validation checks pass" + ], + "failure_escalation": "Investigate expand rollback failures", + "rollback_order": 99 + } + ], + "validation_checkpoints": [ + "expand rollback steps completed", + "System health checks passing", + "No critical errors in logs", + "Key metrics within acceptable ranges", + "Validation command passed: SELECT COUNT(*) FROM {table_name};...", + "Validation command passed: SELECT COUNT(*) FROM information_schema.tables WHE...", + "Validation command passed: SELECT COUNT(*) FROM information_schema.columns WH..." 
+ ], + "communication_requirements": [ + "Notify incident commander of phase start/completion", + "Update rollback status dashboard", + "Log all actions and decisions" + ], + "risk_level": "medium" + }, + { + "phase_name": "rollback_preparation", + "description": "Rollback changes made during preparation phase", + "urgency_level": "medium", + "estimated_duration_minutes": 570, + "prerequisites": [ + "Incident commander assigned and briefed", + "All team members notified of rollback initiation", + "Monitoring systems confirmed operational", + "Backup systems verified and accessible", + "Previous rollback phase completed successfully" + ], + "steps": [ + { + "step_id": "rb_schema_4_01", + "name": "Drop migration artifacts", + "description": "Remove temporary migration tables and procedures", + "script_type": "sql", + "script_content": "-- Drop migration artifacts\nDROP TABLE IF EXISTS migration_log;\nDROP PROCEDURE IF EXISTS migrate_data();", + "estimated_duration_minutes": 5, + "dependencies": [], + "validation_commands": [ + "SELECT COUNT(*) FROM information_schema.tables WHERE table_name LIKE '%migration%';" + ], + "success_criteria": [ + "No migration artifacts remain" + ], + "failure_escalation": "Manual cleanup required", + "rollback_order": 1 + }, + { + "step_id": "rb_validate_4_final", + "name": "Validate rollback completion", + "description": "Comprehensive validation that preparation rollback completed successfully", + "script_type": "manual", + "script_content": "Execute validation checklist for this phase", + "estimated_duration_minutes": 10, + "dependencies": [ + "rb_schema_4_01" + ], + "validation_commands": [ + "SELECT COUNT(*) FROM {table_name};", + "SELECT COUNT(*) FROM information_schema.tables WHERE table_name = '{table_name}';", + "SELECT COUNT(*) FROM information_schema.columns WHERE table_name = '{table_name}' AND column_name = '{column_name}';", + "SELECT COUNT(DISTINCT {primary_key}) FROM {table_name};", + "SELECT MAX({timestamp_column}) FROM 
{table_name};" + ], + "success_criteria": [ + "preparation fully rolled back", + "All validation checks pass" + ], + "failure_escalation": "Investigate preparation rollback failures", + "rollback_order": 99 + } + ], + "validation_checkpoints": [ + "preparation rollback steps completed", + "System health checks passing", + "No critical errors in logs", + "Key metrics within acceptable ranges", + "Validation command passed: SELECT COUNT(*) FROM {table_name};...", + "Validation command passed: SELECT COUNT(*) FROM information_schema.tables WHE...", + "Validation command passed: SELECT COUNT(*) FROM information_schema.columns WH..." + ], + "communication_requirements": [ + "Notify incident commander of phase start/completion", + "Update rollback status dashboard", + "Log all actions and decisions" + ], + "risk_level": "medium" + } + ], + "trigger_conditions": [ + { + "trigger_id": "error_rate_spike", + "name": "Error Rate Spike", + "condition": "error_rate > baseline * 5 for 5 minutes", + "metric_threshold": { + "metric": "error_rate", + "operator": "greater_than", + "value": "baseline_error_rate * 5", + "duration_minutes": 5 + }, + "evaluation_window_minutes": 5, + "auto_execute": true, + "escalation_contacts": [ + "on_call_engineer", + "migration_lead" + ] + }, + { + "trigger_id": "response_time_degradation", + "name": "Response Time Degradation", + "condition": "p95_response_time > baseline * 3 for 10 minutes", + "metric_threshold": { + "metric": "p95_response_time", + "operator": "greater_than", + "value": "baseline_p95 * 3", + "duration_minutes": 10 + }, + "evaluation_window_minutes": 10, + "auto_execute": false, + "escalation_contacts": [ + "performance_team", + "migration_lead" + ] + }, + { + "trigger_id": "availability_drop", + "name": "Service Availability Drop", + "condition": "availability < 95% for 2 minutes", + "metric_threshold": { + "metric": "availability", + "operator": "less_than", + "value": 0.95, + "duration_minutes": 2 + }, + 
"evaluation_window_minutes": 2, + "auto_execute": true, + "escalation_contacts": [ + "sre_team", + "incident_commander" + ] + }, + { + "trigger_id": "data_integrity_failure", + "name": "Data Integrity Check Failure", + "condition": "data_validation_failures > 0", + "metric_threshold": { + "metric": "data_validation_failures", + "operator": "greater_than", + "value": 0, + "duration_minutes": 1 + }, + "evaluation_window_minutes": 1, + "auto_execute": true, + "escalation_contacts": [ + "dba_team", + "data_team" + ] + }, + { + "trigger_id": "migration_progress_stalled", + "name": "Migration Progress Stalled", + "condition": "migration_progress unchanged for 30 minutes", + "metric_threshold": { + "metric": "migration_progress_rate", + "operator": "equals", + "value": 0, + "duration_minutes": 30 + }, + "evaluation_window_minutes": 30, + "auto_execute": false, + "escalation_contacts": [ + "migration_team", + "dba_team" + ] + } + ], + "data_recovery_plan": { + "recovery_method": "point_in_time", + "backup_location": "/backups/pre_migration_{migration_id}_{timestamp}.sql", + "recovery_scripts": [ + "pg_restore -d production -c /backups/pre_migration_backup.sql", + "SELECT pg_create_restore_point('rollback_point');", + "VACUUM ANALYZE; -- Refresh statistics after restore" + ], + "data_validation_queries": [ + "SELECT COUNT(*) FROM critical_business_table;", + "SELECT MAX(created_at) FROM audit_log;", + "SELECT COUNT(DISTINCT user_id) FROM user_sessions;", + "SELECT SUM(amount) FROM financial_transactions WHERE date = CURRENT_DATE;" + ], + "estimated_recovery_time_minutes": 45, + "recovery_dependencies": [ + "database_instance_running", + "backup_file_accessible" + ] + }, + "communication_templates": [ + { + "template_type": "rollback_start", + "audience": "technical", + "subject": "ROLLBACK INITIATED: {migration_name}", + "body": "Team,\n\nWe have initiated rollback for migration: {migration_name}\nRollback ID: {rollback_id}\nStart Time: {start_time}\nEstimated Duration: 
{estimated_duration}\n\nReason: {rollback_reason}\n\nCurrent Status: Rolling back phase {current_phase}\n\nNext Updates: Every 15 minutes or upon phase completion\n\nActions Required:\n- Monitor system health dashboards\n- Stand by for escalation if needed\n- Do not make manual changes during rollback\n\nIncident Commander: {incident_commander}\n", + "urgency": "medium", + "delivery_methods": [ + "email", + "slack" + ] + }, + { + "template_type": "rollback_start", + "audience": "business", + "subject": "System Rollback In Progress - {system_name}", + "body": "Business Stakeholders,\n\nWe are currently performing a planned rollback of the {system_name} migration due to {rollback_reason}.\n\nImpact: {business_impact}\nExpected Resolution: {estimated_completion_time}\nAffected Services: {affected_services}\n\nWe will provide updates every 30 minutes.\n\nContact: {business_contact}\n", + "urgency": "medium", + "delivery_methods": [ + "email" + ] + }, + { + "template_type": "rollback_start", + "audience": "executive", + "subject": "EXEC ALERT: Critical System Rollback - {system_name}", + "body": "Executive Team,\n\nA critical rollback is in progress for {system_name}.\n\nSummary:\n- Rollback Reason: {rollback_reason}\n- Business Impact: {business_impact}\n- Expected Resolution: {estimated_completion_time}\n- Customer Impact: {customer_impact}\n\nWe are following established procedures and will update hourly.\n\nEscalation: {escalation_contact}\n", + "urgency": "high", + "delivery_methods": [ + "email" + ] + }, + { + "template_type": "rollback_complete", + "audience": "technical", + "subject": "ROLLBACK COMPLETED: {migration_name}", + "body": "Team,\n\nRollback has been successfully completed for migration: {migration_name}\n\nSummary:\n- Start Time: {start_time}\n- End Time: {end_time}\n- Duration: {actual_duration}\n- Phases Completed: {completed_phases}\n\nValidation Results:\n{validation_results}\n\nSystem Status: {system_status}\n\nNext Steps:\n- Continue monitoring 
for 24 hours\n- Post-rollback review scheduled for {review_date}\n- Root cause analysis to begin\n\nAll clear to resume normal operations.\n\nIncident Commander: {incident_commander}\n", + "urgency": "medium", + "delivery_methods": [ + "email", + "slack" + ] + }, + { + "template_type": "emergency_escalation", + "audience": "executive", + "subject": "CRITICAL: Rollback Emergency - {migration_name}", + "body": "CRITICAL SITUATION - IMMEDIATE ATTENTION REQUIRED\n\nMigration: {migration_name}\nIssue: Rollback procedure has encountered critical failures\n\nCurrent Status: {current_status}\nFailed Components: {failed_components}\nBusiness Impact: {business_impact}\nCustomer Impact: {customer_impact}\n\nImmediate Actions:\n1. Emergency response team activated\n2. {emergency_action_1}\n3. {emergency_action_2}\n\nWar Room: {war_room_location}\nBridge Line: {conference_bridge}\n\nNext Update: {next_update_time}\n\nIncident Commander: {incident_commander}\nExecutive On-Call: {executive_on_call}\n", + "urgency": "emergency", + "delivery_methods": [ + "email", + "sms", + "phone_call" + ] + } + ], + "escalation_matrix": { + "level_1": { + "trigger": "Single component failure", + "response_time_minutes": 5, + "contacts": [ + "on_call_engineer", + "migration_lead" + ], + "actions": [ + "Investigate issue", + "Attempt automated remediation", + "Monitor closely" + ] + }, + "level_2": { + "trigger": "Multiple component failures or single critical failure", + "response_time_minutes": 2, + "contacts": [ + "senior_engineer", + "team_lead", + "devops_lead" + ], + "actions": [ + "Initiate rollback", + "Establish war room", + "Notify stakeholders" + ] + }, + "level_3": { + "trigger": "System-wide failure or data corruption", + "response_time_minutes": 1, + "contacts": [ + "engineering_manager", + "cto", + "incident_commander" + ], + "actions": [ + "Emergency rollback", + "All hands on deck", + "Executive notification" + ] + }, + "emergency": { + "trigger": "Business-critical failure with 
customer impact", + "response_time_minutes": 0, + "contacts": [ + "ceo", + "cto", + "head_of_operations" + ], + "actions": [ + "Emergency procedures", + "Customer communication", + "Media preparation if needed" + ] + } + }, + "validation_checklist": [ + "Verify system is responding to health checks", + "Confirm error rates are within normal parameters", + "Validate response times meet SLA requirements", + "Check all critical business processes are functioning", + "Verify monitoring and alerting systems are operational", + "Confirm no data corruption has occurred", + "Validate security controls are functioning properly", + "Check backup systems are working correctly", + "Verify integration points with downstream systems", + "Confirm user authentication and authorization working", + "Validate database schema matches expected state", + "Confirm referential integrity constraints", + "Check database performance metrics", + "Verify data consistency across related tables", + "Validate indexes and statistics are optimal", + "Confirm transaction logs are clean", + "Check database connections and connection pooling" + ], + "post_rollback_procedures": [ + "Monitor system stability for 24-48 hours post-rollback", + "Conduct thorough post-rollback testing of all critical paths", + "Review and analyze rollback metrics and timing", + "Document lessons learned and rollback procedure improvements", + "Schedule post-mortem meeting with all stakeholders", + "Update rollback procedures based on actual experience", + "Communicate rollback completion to all stakeholders", + "Archive rollback logs and artifacts for future reference", + "Review and update monitoring thresholds if needed", + "Plan for next migration attempt with improved procedures", + "Conduct security review to ensure no vulnerabilities introduced", + "Update disaster recovery procedures if affected by rollback", + "Review capacity planning based on rollback resource usage", + "Update documentation with rollback 
experience and timings" + ], + "emergency_contacts": [ + { + "role": "Incident Commander", + "name": "TBD - Assigned during migration", + "primary_phone": "+1-XXX-XXX-XXXX", + "email": "incident.commander@company.com", + "backup_contact": "backup.commander@company.com" + }, + { + "role": "Technical Lead", + "name": "TBD - Migration technical owner", + "primary_phone": "+1-XXX-XXX-XXXX", + "email": "tech.lead@company.com", + "backup_contact": "senior.engineer@company.com" + }, + { + "role": "Business Owner", + "name": "TBD - Business stakeholder", + "primary_phone": "+1-XXX-XXX-XXXX", + "email": "business.owner@company.com", + "backup_contact": "product.manager@company.com" + }, + { + "role": "On-Call Engineer", + "name": "Current on-call rotation", + "primary_phone": "+1-XXX-XXX-XXXX", + "email": "oncall@company.com", + "backup_contact": "backup.oncall@company.com" + }, + { + "role": "Executive Escalation", + "name": "CTO/VP Engineering", + "primary_phone": "+1-XXX-XXX-XXXX", + "email": "cto@company.com", + "backup_contact": "vp.engineering@company.com" + } + ] +} \ No newline at end of file diff --git a/engineering/migration-architect/expected_outputs/rollback_runbook.txt b/engineering/migration-architect/expected_outputs/rollback_runbook.txt new file mode 100644 index 0000000..2df436e --- /dev/null +++ b/engineering/migration-architect/expected_outputs/rollback_runbook.txt @@ -0,0 +1,282 @@ +================================================================================ +ROLLBACK RUNBOOK: rb_921c0bca +================================================================================ +Migration ID: 23a52ed1507f +Created: 2026-02-16T13:47:31.108500 + +EMERGENCY CONTACTS +---------------------------------------- +Incident Commander: TBD - Assigned during migration + Phone: +1-XXX-XXX-XXXX + Email: incident.commander@company.com + Backup: backup.commander@company.com + +Technical Lead: TBD - Migration technical owner + Phone: +1-XXX-XXX-XXXX + Email: 
tech.lead@company.com + Backup: senior.engineer@company.com + +Business Owner: TBD - Business stakeholder + Phone: +1-XXX-XXX-XXXX + Email: business.owner@company.com + Backup: product.manager@company.com + +On-Call Engineer: Current on-call rotation + Phone: +1-XXX-XXX-XXXX + Email: oncall@company.com + Backup: backup.oncall@company.com + +Executive Escalation: CTO/VP Engineering + Phone: +1-XXX-XXX-XXXX + Email: cto@company.com + Backup: vp.engineering@company.com + +ESCALATION MATRIX +---------------------------------------- +LEVEL_1: + Trigger: Single component failure + Response Time: 5 minutes + Contacts: on_call_engineer, migration_lead + Actions: Investigate issue, Attempt automated remediation, Monitor closely + +LEVEL_2: + Trigger: Multiple component failures or single critical failure + Response Time: 2 minutes + Contacts: senior_engineer, team_lead, devops_lead + Actions: Initiate rollback, Establish war room, Notify stakeholders + +LEVEL_3: + Trigger: System-wide failure or data corruption + Response Time: 1 minutes + Contacts: engineering_manager, cto, incident_commander + Actions: Emergency rollback, All hands on deck, Executive notification + +EMERGENCY: + Trigger: Business-critical failure with customer impact + Response Time: 0 minutes + Contacts: ceo, cto, head_of_operations + Actions: Emergency procedures, Customer communication, Media preparation if needed + +AUTOMATIC ROLLBACK TRIGGERS +---------------------------------------- +• Error Rate Spike + Condition: error_rate > baseline * 5 for 5 minutes + Auto-Execute: Yes + Evaluation Window: 5 minutes + Contacts: on_call_engineer, migration_lead + +• Response Time Degradation + Condition: p95_response_time > baseline * 3 for 10 minutes + Auto-Execute: No + Evaluation Window: 10 minutes + Contacts: performance_team, migration_lead + +• Service Availability Drop + Condition: availability < 95% for 2 minutes + Auto-Execute: Yes + Evaluation Window: 2 minutes + Contacts: sre_team, incident_commander 
+ +• Data Integrity Check Failure + Condition: data_validation_failures > 0 + Auto-Execute: Yes + Evaluation Window: 1 minutes + Contacts: dba_team, data_team + +• Migration Progress Stalled + Condition: migration_progress unchanged for 30 minutes + Auto-Execute: No + Evaluation Window: 30 minutes + Contacts: migration_team, dba_team + +ROLLBACK PHASES +---------------------------------------- +1. ROLLBACK_CLEANUP + Description: Rollback changes made during cleanup phase + Urgency: MEDIUM + Duration: 570 minutes + Risk Level: MEDIUM + Prerequisites: + ✓ Incident commander assigned and briefed + ✓ All team members notified of rollback initiation + ✓ Monitoring systems confirmed operational + ✓ Backup systems verified and accessible + Steps: + 99. Validate rollback completion + Duration: 10 min + Type: manual + Success Criteria: cleanup fully rolled back, All validation checks pass + + Validation Checkpoints: + ☐ cleanup rollback steps completed + ☐ System health checks passing + ☐ No critical errors in logs + ☐ Key metrics within acceptable ranges + ☐ Validation command passed: SELECT COUNT(*) FROM {table_name};... + ☐ Validation command passed: SELECT COUNT(*) FROM information_schema.tables WHE... + ☐ Validation command passed: SELECT COUNT(*) FROM information_schema.columns WH... + +2. ROLLBACK_CONTRACT + Description: Rollback changes made during contract phase + Urgency: MEDIUM + Duration: 570 minutes + Risk Level: MEDIUM + Prerequisites: + ✓ Incident commander assigned and briefed + ✓ All team members notified of rollback initiation + ✓ Monitoring systems confirmed operational + ✓ Backup systems verified and accessible + ✓ Previous rollback phase completed successfully + Steps: + 99. 
Validate rollback completion + Duration: 10 min + Type: manual + Success Criteria: contract fully rolled back, All validation checks pass + + Validation Checkpoints: + ☐ contract rollback steps completed + ☐ System health checks passing + ☐ No critical errors in logs + ☐ Key metrics within acceptable ranges + ☐ Validation command passed: SELECT COUNT(*) FROM {table_name};... + ☐ Validation command passed: SELECT COUNT(*) FROM information_schema.tables WHE... + ☐ Validation command passed: SELECT COUNT(*) FROM information_schema.columns WH... + +3. ROLLBACK_MIGRATE + Description: Rollback changes made during migrate phase + Urgency: MEDIUM + Duration: 570 minutes + Risk Level: MEDIUM + Prerequisites: + ✓ Incident commander assigned and briefed + ✓ All team members notified of rollback initiation + ✓ Monitoring systems confirmed operational + ✓ Backup systems verified and accessible + ✓ Previous rollback phase completed successfully + Steps: + 99. Validate rollback completion + Duration: 10 min + Type: manual + Success Criteria: migrate fully rolled back, All validation checks pass + + Validation Checkpoints: + ☐ migrate rollback steps completed + ☐ System health checks passing + ☐ No critical errors in logs + ☐ Key metrics within acceptable ranges + ☐ Validation command passed: SELECT COUNT(*) FROM {table_name};... + ☐ Validation command passed: SELECT COUNT(*) FROM information_schema.tables WHE... + ☐ Validation command passed: SELECT COUNT(*) FROM information_schema.columns WH... + +4. ROLLBACK_EXPAND + Description: Rollback changes made during expand phase + Urgency: MEDIUM + Duration: 570 minutes + Risk Level: MEDIUM + Prerequisites: + ✓ Incident commander assigned and briefed + ✓ All team members notified of rollback initiation + ✓ Monitoring systems confirmed operational + ✓ Backup systems verified and accessible + ✓ Previous rollback phase completed successfully + Steps: + 99. 
Validate rollback completion + Duration: 10 min + Type: manual + Success Criteria: expand fully rolled back, All validation checks pass + + Validation Checkpoints: + ☐ expand rollback steps completed + ☐ System health checks passing + ☐ No critical errors in logs + ☐ Key metrics within acceptable ranges + ☐ Validation command passed: SELECT COUNT(*) FROM {table_name};... + ☐ Validation command passed: SELECT COUNT(*) FROM information_schema.tables WHE... + ☐ Validation command passed: SELECT COUNT(*) FROM information_schema.columns WH... + +5. ROLLBACK_PREPARATION + Description: Rollback changes made during preparation phase + Urgency: MEDIUM + Duration: 570 minutes + Risk Level: MEDIUM + Prerequisites: + ✓ Incident commander assigned and briefed + ✓ All team members notified of rollback initiation + ✓ Monitoring systems confirmed operational + ✓ Backup systems verified and accessible + ✓ Previous rollback phase completed successfully + Steps: + 1. Drop migration artifacts + Duration: 5 min + Type: sql + Script: + -- Drop migration artifacts + DROP TABLE IF EXISTS migration_log; + DROP PROCEDURE IF EXISTS migrate_data(); + Success Criteria: No migration artifacts remain + + 99. Validate rollback completion + Duration: 10 min + Type: manual + Success Criteria: preparation fully rolled back, All validation checks pass + + Validation Checkpoints: + ☐ preparation rollback steps completed + ☐ System health checks passing + ☐ No critical errors in logs + ☐ Key metrics within acceptable ranges + ☐ Validation command passed: SELECT COUNT(*) FROM {table_name};... + ☐ Validation command passed: SELECT COUNT(*) FROM information_schema.tables WHE... + ☐ Validation command passed: SELECT COUNT(*) FROM information_schema.columns WH... 
+ +DATA RECOVERY PLAN +---------------------------------------- +Recovery Method: point_in_time +Backup Location: /backups/pre_migration_{migration_id}_{timestamp}.sql +Estimated Recovery Time: 45 minutes +Recovery Scripts: + • pg_restore -d production -c /backups/pre_migration_backup.sql + • SELECT pg_create_restore_point('rollback_point'); + • VACUUM ANALYZE; -- Refresh statistics after restore +Validation Queries: + • SELECT COUNT(*) FROM critical_business_table; + • SELECT MAX(created_at) FROM audit_log; + • SELECT COUNT(DISTINCT user_id) FROM user_sessions; + • SELECT SUM(amount) FROM financial_transactions WHERE date = CURRENT_DATE; + +POST-ROLLBACK VALIDATION CHECKLIST +---------------------------------------- + 1. ☐ Verify system is responding to health checks + 2. ☐ Confirm error rates are within normal parameters + 3. ☐ Validate response times meet SLA requirements + 4. ☐ Check all critical business processes are functioning + 5. ☐ Verify monitoring and alerting systems are operational + 6. ☐ Confirm no data corruption has occurred + 7. ☐ Validate security controls are functioning properly + 8. ☐ Check backup systems are working correctly + 9. ☐ Verify integration points with downstream systems +10. ☐ Confirm user authentication and authorization working +11. ☐ Validate database schema matches expected state +12. ☐ Confirm referential integrity constraints +13. ☐ Check database performance metrics +14. ☐ Verify data consistency across related tables +15. ☐ Validate indexes and statistics are optimal +16. ☐ Confirm transaction logs are clean +17. ☐ Check database connections and connection pooling + +POST-ROLLBACK PROCEDURES +---------------------------------------- + 1. Monitor system stability for 24-48 hours post-rollback + 2. Conduct thorough post-rollback testing of all critical paths + 3. Review and analyze rollback metrics and timing + 4. Document lessons learned and rollback procedure improvements + 5. 
Schedule post-mortem meeting with all stakeholders + 6. Update rollback procedures based on actual experience + 7. Communicate rollback completion to all stakeholders + 8. Archive rollback logs and artifacts for future reference + 9. Review and update monitoring thresholds if needed +10. Plan for next migration attempt with improved procedures +11. Conduct security review to ensure no vulnerabilities introduced +12. Update disaster recovery procedures if affected by rollback +13. Review capacity planning based on rollback resource usage +14. Update documentation with rollback experience and timings diff --git a/engineering/migration-architect/expected_outputs/sample_database_migration_plan.json b/engineering/migration-architect/expected_outputs/sample_database_migration_plan.json new file mode 100644 index 0000000..872e37f --- /dev/null +++ b/engineering/migration-architect/expected_outputs/sample_database_migration_plan.json @@ -0,0 +1,317 @@ +{ + "migration_id": "23a52ed1507f", + "source_system": "PostgreSQL 13 Production Database", + "target_system": "PostgreSQL 15 Cloud Database", + "migration_type": "database", + "complexity": "critical", + "estimated_duration_hours": 95, + "phases": [ + { + "name": "preparation", + "description": "Prepare systems and teams for migration", + "duration_hours": 19, + "dependencies": [], + "validation_criteria": [ + "All backups completed successfully", + "Monitoring systems operational", + "Team members briefed and ready", + "Rollback procedures tested" + ], + "rollback_triggers": [ + "Critical system failure", + "Data corruption detected", + "Performance degradation > 50%", + "Business process failure" + ], + "tasks": [ + "Backup source system", + "Set up monitoring and alerting", + "Prepare rollback procedures", + "Communicate migration timeline", + "Validate prerequisites" + ], + "risk_level": "medium", + "resources_required": [ + "Technical team availability", + "System access and permissions", + "Monitoring and alerting 
systems", + "Communication channels" + ] + }, + { + "name": "expand", + "description": "Execute expand phase", + "duration_hours": 19, + "dependencies": [ + "preparation" + ], + "validation_criteria": [ + "Expand phase completed successfully" + ], + "rollback_triggers": [ + "Critical system failure", + "Data corruption detected", + "Performance degradation > 50%", + "Business process failure" + ], + "tasks": [ + "Complete expand activities" + ], + "risk_level": "medium", + "resources_required": [ + "Technical team availability", + "System access and permissions", + "Monitoring and alerting systems", + "Communication channels" + ] + }, + { + "name": "migrate", + "description": "Execute migrate phase", + "duration_hours": 19, + "dependencies": [ + "expand" + ], + "validation_criteria": [ + "Migrate phase completed successfully" + ], + "rollback_triggers": [ + "Critical system failure", + "Data corruption detected", + "Performance degradation > 50%", + "Business process failure" + ], + "tasks": [ + "Complete migrate activities" + ], + "risk_level": "medium", + "resources_required": [ + "Technical team availability", + "System access and permissions", + "Monitoring and alerting systems", + "Communication channels" + ] + }, + { + "name": "contract", + "description": "Execute contract phase", + "duration_hours": 19, + "dependencies": [ + "migrate" + ], + "validation_criteria": [ + "Contract phase completed successfully" + ], + "rollback_triggers": [ + "Critical system failure", + "Data corruption detected", + "Performance degradation > 50%", + "Business process failure" + ], + "tasks": [ + "Complete contract activities" + ], + "risk_level": "medium", + "resources_required": [ + "Technical team availability", + "System access and permissions", + "Monitoring and alerting systems", + "Communication channels" + ] + }, + { + "name": "cleanup", + "description": "Execute cleanup phase", + "duration_hours": 19, + "dependencies": [ + "contract" + ], + "validation_criteria": [ + 
"Cleanup phase completed successfully" + ], + "rollback_triggers": [ + "Critical system failure", + "Data corruption detected", + "Performance degradation > 50%", + "Business process failure" + ], + "tasks": [ + "Complete cleanup activities" + ], + "risk_level": "medium", + "resources_required": [ + "Technical team availability", + "System access and permissions", + "Monitoring and alerting systems", + "Communication channels" + ] + } + ], + "risks": [ + { + "category": "technical", + "description": "Data corruption during migration", + "probability": "low", + "impact": "critical", + "severity": "high", + "mitigation": "Implement comprehensive backup and validation procedures", + "owner": "DBA Team" + }, + { + "category": "technical", + "description": "Extended downtime due to migration complexity", + "probability": "medium", + "impact": "high", + "severity": "high", + "mitigation": "Use blue-green deployment and phased migration approach", + "owner": "DevOps Team" + }, + { + "category": "business", + "description": "Business process disruption", + "probability": "medium", + "impact": "high", + "severity": "high", + "mitigation": "Communicate timeline and provide alternate workflows", + "owner": "Business Owner" + }, + { + "category": "operational", + "description": "Insufficient rollback testing", + "probability": "high", + "impact": "critical", + "severity": "critical", + "mitigation": "Execute full rollback procedures in staging environment", + "owner": "QA Team" + }, + { + "category": "business", + "description": "Zero-downtime requirement increases complexity", + "probability": "high", + "impact": "medium", + "severity": "high", + "mitigation": "Implement blue-green deployment or rolling update strategy", + "owner": "DevOps Team" + }, + { + "category": "compliance", + "description": "Regulatory compliance requirements", + "probability": "medium", + "impact": "high", + "severity": "high", + "mitigation": "Ensure all compliance checks are integrated into 
migration process", + "owner": "Compliance Team" + } + ], + "success_criteria": [ + "All data successfully migrated with 100% integrity", + "System performance meets or exceeds baseline", + "All business processes functioning normally", + "No critical security vulnerabilities introduced", + "Stakeholder acceptance criteria met", + "Documentation and runbooks updated" + ], + "rollback_plan": { + "rollback_phases": [ + { + "phase": "cleanup", + "rollback_actions": [ + "Revert cleanup changes", + "Restore pre-cleanup state", + "Validate cleanup rollback success" + ], + "validation_criteria": [ + "System restored to pre-cleanup state", + "All cleanup changes successfully reverted", + "System functionality confirmed" + ], + "estimated_time_minutes": 285 + }, + { + "phase": "contract", + "rollback_actions": [ + "Revert contract changes", + "Restore pre-contract state", + "Validate contract rollback success" + ], + "validation_criteria": [ + "System restored to pre-contract state", + "All contract changes successfully reverted", + "System functionality confirmed" + ], + "estimated_time_minutes": 285 + }, + { + "phase": "migrate", + "rollback_actions": [ + "Revert migrate changes", + "Restore pre-migrate state", + "Validate migrate rollback success" + ], + "validation_criteria": [ + "System restored to pre-migrate state", + "All migrate changes successfully reverted", + "System functionality confirmed" + ], + "estimated_time_minutes": 285 + }, + { + "phase": "expand", + "rollback_actions": [ + "Revert expand changes", + "Restore pre-expand state", + "Validate expand rollback success" + ], + "validation_criteria": [ + "System restored to pre-expand state", + "All expand changes successfully reverted", + "System functionality confirmed" + ], + "estimated_time_minutes": 285 + }, + { + "phase": "preparation", + "rollback_actions": [ + "Revert preparation changes", + "Restore pre-preparation state", + "Validate preparation rollback success" + ], + "validation_criteria": [ + 
"System restored to pre-preparation state", + "All preparation changes successfully reverted", + "System functionality confirmed" + ], + "estimated_time_minutes": 285 + } + ], + "rollback_triggers": [ + "Critical system failure", + "Data corruption detected", + "Migration timeline exceeded by > 50%", + "Business-critical functionality unavailable", + "Security breach detected", + "Stakeholder decision to abort" + ], + "rollback_decision_matrix": { + "low_severity": "Continue with monitoring", + "medium_severity": "Assess and decide within 15 minutes", + "high_severity": "Immediate rollback initiation", + "critical_severity": "Emergency rollback - all hands" + }, + "rollback_contacts": [ + "Migration Lead", + "Technical Lead", + "Business Owner", + "On-call Engineer" + ] + }, + "stakeholders": [ + "Business Owner", + "Technical Lead", + "DevOps Team", + "QA Team", + "Security Team", + "End Users" + ], + "created_at": "2026-02-16T13:47:23.704502" +} \ No newline at end of file diff --git a/engineering/migration-architect/expected_outputs/sample_database_migration_plan.txt b/engineering/migration-architect/expected_outputs/sample_database_migration_plan.txt new file mode 100644 index 0000000..a243ecc --- /dev/null +++ b/engineering/migration-architect/expected_outputs/sample_database_migration_plan.txt @@ -0,0 +1,161 @@ +================================================================================ +MIGRATION PLAN: 23a52ed1507f +================================================================================ +Source System: PostgreSQL 13 Production Database +Target System: PostgreSQL 15 Cloud Database +Migration Type: DATABASE +Complexity Level: CRITICAL +Estimated Duration: 95 hours (4.0 days) +Created: 2026-02-16T13:47:23.704502 + +MIGRATION PHASES +---------------------------------------- +1. 
PREPARATION (19h) + Description: Prepare systems and teams for migration + Risk Level: MEDIUM + Tasks: + • Backup source system + • Set up monitoring and alerting + • Prepare rollback procedures + • Communicate migration timeline + • Validate prerequisites + Success Criteria: + ✓ All backups completed successfully + ✓ Monitoring systems operational + ✓ Team members briefed and ready + ✓ Rollback procedures tested + +2. EXPAND (19h) + Description: Execute expand phase + Risk Level: MEDIUM + Dependencies: preparation + Tasks: + • Complete expand activities + Success Criteria: + ✓ Expand phase completed successfully + +3. MIGRATE (19h) + Description: Execute migrate phase + Risk Level: MEDIUM + Dependencies: expand + Tasks: + • Complete migrate activities + Success Criteria: + ✓ Migrate phase completed successfully + +4. CONTRACT (19h) + Description: Execute contract phase + Risk Level: MEDIUM + Dependencies: migrate + Tasks: + • Complete contract activities + Success Criteria: + ✓ Contract phase completed successfully + +5. 
CLEANUP (19h) + Description: Execute cleanup phase + Risk Level: MEDIUM + Dependencies: contract + Tasks: + • Complete cleanup activities + Success Criteria: + ✓ Cleanup phase completed successfully + +RISK ASSESSMENT +---------------------------------------- +CRITICAL SEVERITY RISKS: + • Insufficient rollback testing + Category: operational + Probability: high | Impact: critical + Mitigation: Execute full rollback procedures in staging environment + Owner: QA Team + +HIGH SEVERITY RISKS: + • Data corruption during migration + Category: technical + Probability: low | Impact: critical + Mitigation: Implement comprehensive backup and validation procedures + Owner: DBA Team + + • Extended downtime due to migration complexity + Category: technical + Probability: medium | Impact: high + Mitigation: Use blue-green deployment and phased migration approach + Owner: DevOps Team + + • Business process disruption + Category: business + Probability: medium | Impact: high + Mitigation: Communicate timeline and provide alternate workflows + Owner: Business Owner + + • Zero-downtime requirement increases complexity + Category: business + Probability: high | Impact: medium + Mitigation: Implement blue-green deployment or rolling update strategy + Owner: DevOps Team + + • Regulatory compliance requirements + Category: compliance + Probability: medium | Impact: high + Mitigation: Ensure all compliance checks are integrated into migration process + Owner: Compliance Team + +ROLLBACK STRATEGY +---------------------------------------- +Rollback Triggers: + • Critical system failure + • Data corruption detected + • Migration timeline exceeded by > 50% + • Business-critical functionality unavailable + • Security breach detected + • Stakeholder decision to abort + +Rollback Phases: + CLEANUP: + - Revert cleanup changes + - Restore pre-cleanup state + - Validate cleanup rollback success + Estimated Time: 285 minutes + + CONTRACT: + - Revert contract changes + - Restore pre-contract state + 
- Validate contract rollback success + Estimated Time: 285 minutes + + MIGRATE: + - Revert migrate changes + - Restore pre-migrate state + - Validate migrate rollback success + Estimated Time: 285 minutes + + EXPAND: + - Revert expand changes + - Restore pre-expand state + - Validate expand rollback success + Estimated Time: 285 minutes + + PREPARATION: + - Revert preparation changes + - Restore pre-preparation state + - Validate preparation rollback success + Estimated Time: 285 minutes + +SUCCESS CRITERIA +---------------------------------------- +✓ All data successfully migrated with 100% integrity +✓ System performance meets or exceeds baseline +✓ All business processes functioning normally +✓ No critical security vulnerabilities introduced +✓ Stakeholder acceptance criteria met +✓ Documentation and runbooks updated + +STAKEHOLDERS +---------------------------------------- +• Business Owner +• Technical Lead +• DevOps Team +• QA Team +• Security Team +• End Users diff --git a/engineering/migration-architect/expected_outputs/sample_service_migration_plan.json b/engineering/migration-architect/expected_outputs/sample_service_migration_plan.json new file mode 100644 index 0000000..15899f2 --- /dev/null +++ b/engineering/migration-architect/expected_outputs/sample_service_migration_plan.json @@ -0,0 +1,310 @@ +{ + "migration_id": "21031930da18", + "source_system": "Legacy User Service (Java Spring Boot 2.x)", + "target_system": "New User Service (Node.js + TypeScript)", + "migration_type": "service", + "complexity": "critical", + "estimated_duration_hours": 500, + "phases": [ + { + "name": "intercept", + "description": "Execute intercept phase", + "duration_hours": 100, + "dependencies": [], + "validation_criteria": [ + "Intercept phase completed successfully" + ], + "rollback_triggers": [ + "Critical system failure", + "Data corruption detected", + "Performance degradation > 50%", + "Business process failure" + ], + "tasks": [ + "Complete intercept activities" + 
], + "risk_level": "medium", + "resources_required": [ + "Technical team availability", + "System access and permissions", + "Monitoring and alerting systems", + "Communication channels" + ] + }, + { + "name": "implement", + "description": "Execute implement phase", + "duration_hours": 100, + "dependencies": [ + "intercept" + ], + "validation_criteria": [ + "Implement phase completed successfully" + ], + "rollback_triggers": [ + "Critical system failure", + "Data corruption detected", + "Performance degradation > 50%", + "Business process failure" + ], + "tasks": [ + "Complete implement activities" + ], + "risk_level": "medium", + "resources_required": [ + "Technical team availability", + "System access and permissions", + "Monitoring and alerting systems", + "Communication channels" + ] + }, + { + "name": "redirect", + "description": "Execute redirect phase", + "duration_hours": 100, + "dependencies": [ + "implement" + ], + "validation_criteria": [ + "Redirect phase completed successfully" + ], + "rollback_triggers": [ + "Critical system failure", + "Data corruption detected", + "Performance degradation > 50%", + "Business process failure" + ], + "tasks": [ + "Complete redirect activities" + ], + "risk_level": "medium", + "resources_required": [ + "Technical team availability", + "System access and permissions", + "Monitoring and alerting systems", + "Communication channels" + ] + }, + { + "name": "validate", + "description": "Execute validate phase", + "duration_hours": 100, + "dependencies": [ + "redirect" + ], + "validation_criteria": [ + "Validate phase completed successfully" + ], + "rollback_triggers": [ + "Critical system failure", + "Data corruption detected", + "Performance degradation > 50%", + "Business process failure" + ], + "tasks": [ + "Complete validate activities" + ], + "risk_level": "medium", + "resources_required": [ + "Technical team availability", + "System access and permissions", + "Monitoring and alerting systems", + "Communication 
channels" + ] + }, + { + "name": "retire", + "description": "Execute retire phase", + "duration_hours": 100, + "dependencies": [ + "validate" + ], + "validation_criteria": [ + "Retire phase completed successfully" + ], + "rollback_triggers": [ + "Critical system failure", + "Data corruption detected", + "Performance degradation > 50%", + "Business process failure" + ], + "tasks": [ + "Complete retire activities" + ], + "risk_level": "medium", + "resources_required": [ + "Technical team availability", + "System access and permissions", + "Monitoring and alerting systems", + "Communication channels" + ] + } + ], + "risks": [ + { + "category": "technical", + "description": "Service compatibility issues", + "probability": "medium", + "impact": "high", + "severity": "high", + "mitigation": "Implement comprehensive integration testing", + "owner": "Development Team" + }, + { + "category": "technical", + "description": "Performance degradation", + "probability": "medium", + "impact": "medium", + "severity": "medium", + "mitigation": "Conduct load testing and performance benchmarking", + "owner": "DevOps Team" + }, + { + "category": "business", + "description": "Feature parity gaps", + "probability": "high", + "impact": "high", + "severity": "high", + "mitigation": "Document feature mapping and acceptance criteria", + "owner": "Product Owner" + }, + { + "category": "operational", + "description": "Monitoring gap during transition", + "probability": "medium", + "impact": "medium", + "severity": "medium", + "mitigation": "Set up dual monitoring and alerting systems", + "owner": "SRE Team" + }, + { + "category": "business", + "description": "Zero-downtime requirement increases complexity", + "probability": "high", + "impact": "medium", + "severity": "high", + "mitigation": "Implement blue-green deployment or rolling update strategy", + "owner": "DevOps Team" + }, + { + "category": "compliance", + "description": "Regulatory compliance requirements", + "probability": "medium", 
+ "impact": "high", + "severity": "high", + "mitigation": "Ensure all compliance checks are integrated into migration process", + "owner": "Compliance Team" + } + ], + "success_criteria": [ + "All data successfully migrated with 100% integrity", + "System performance meets or exceeds baseline", + "All business processes functioning normally", + "No critical security vulnerabilities introduced", + "Stakeholder acceptance criteria met", + "Documentation and runbooks updated" + ], + "rollback_plan": { + "rollback_phases": [ + { + "phase": "retire", + "rollback_actions": [ + "Revert retire changes", + "Restore pre-retire state", + "Validate retire rollback success" + ], + "validation_criteria": [ + "System restored to pre-retire state", + "All retire changes successfully reverted", + "System functionality confirmed" + ], + "estimated_time_minutes": 1500 + }, + { + "phase": "validate", + "rollback_actions": [ + "Revert validate changes", + "Restore pre-validate state", + "Validate validate rollback success" + ], + "validation_criteria": [ + "System restored to pre-validate state", + "All validate changes successfully reverted", + "System functionality confirmed" + ], + "estimated_time_minutes": 1500 + }, + { + "phase": "redirect", + "rollback_actions": [ + "Revert redirect changes", + "Restore pre-redirect state", + "Validate redirect rollback success" + ], + "validation_criteria": [ + "System restored to pre-redirect state", + "All redirect changes successfully reverted", + "System functionality confirmed" + ], + "estimated_time_minutes": 1500 + }, + { + "phase": "implement", + "rollback_actions": [ + "Revert implement changes", + "Restore pre-implement state", + "Validate implement rollback success" + ], + "validation_criteria": [ + "System restored to pre-implement state", + "All implement changes successfully reverted", + "System functionality confirmed" + ], + "estimated_time_minutes": 1500 + }, + { + "phase": "intercept", + "rollback_actions": [ + "Revert 
intercept changes", + "Restore pre-intercept state", + "Validate intercept rollback success" + ], + "validation_criteria": [ + "System restored to pre-intercept state", + "All intercept changes successfully reverted", + "System functionality confirmed" + ], + "estimated_time_minutes": 1500 + } + ], + "rollback_triggers": [ + "Critical system failure", + "Data corruption detected", + "Migration timeline exceeded by > 50%", + "Business-critical functionality unavailable", + "Security breach detected", + "Stakeholder decision to abort" + ], + "rollback_decision_matrix": { + "low_severity": "Continue with monitoring", + "medium_severity": "Assess and decide within 15 minutes", + "high_severity": "Immediate rollback initiation", + "critical_severity": "Emergency rollback - all hands" + }, + "rollback_contacts": [ + "Migration Lead", + "Technical Lead", + "Business Owner", + "On-call Engineer" + ] + }, + "stakeholders": [ + "Business Owner", + "Technical Lead", + "DevOps Team", + "QA Team", + "Security Team", + "End Users" + ], + "created_at": "2026-02-16T13:47:34.565896" +} \ No newline at end of file diff --git a/engineering/migration-architect/expected_outputs/sample_service_migration_plan.txt b/engineering/migration-architect/expected_outputs/sample_service_migration_plan.txt new file mode 100644 index 0000000..ac08e33 --- /dev/null +++ b/engineering/migration-architect/expected_outputs/sample_service_migration_plan.txt @@ -0,0 +1,154 @@ +================================================================================ +MIGRATION PLAN: 21031930da18 +================================================================================ +Source System: Legacy User Service (Java Spring Boot 2.x) +Target System: New User Service (Node.js + TypeScript) +Migration Type: SERVICE +Complexity Level: CRITICAL +Estimated Duration: 500 hours (20.8 days) +Created: 2026-02-16T13:47:34.565896 + +MIGRATION PHASES +---------------------------------------- +1. 
INTERCEPT (100h) + Description: Execute intercept phase + Risk Level: MEDIUM + Tasks: + • Complete intercept activities + Success Criteria: + ✓ Intercept phase completed successfully + +2. IMPLEMENT (100h) + Description: Execute implement phase + Risk Level: MEDIUM + Dependencies: intercept + Tasks: + • Complete implement activities + Success Criteria: + ✓ Implement phase completed successfully + +3. REDIRECT (100h) + Description: Execute redirect phase + Risk Level: MEDIUM + Dependencies: implement + Tasks: + • Complete redirect activities + Success Criteria: + ✓ Redirect phase completed successfully + +4. VALIDATE (100h) + Description: Execute validate phase + Risk Level: MEDIUM + Dependencies: redirect + Tasks: + • Complete validate activities + Success Criteria: + ✓ Validate phase completed successfully + +5. RETIRE (100h) + Description: Execute retire phase + Risk Level: MEDIUM + Dependencies: validate + Tasks: + • Complete retire activities + Success Criteria: + ✓ Retire phase completed successfully + +RISK ASSESSMENT +---------------------------------------- +HIGH SEVERITY RISKS: + • Service compatibility issues + Category: technical + Probability: medium | Impact: high + Mitigation: Implement comprehensive integration testing + Owner: Development Team + + • Feature parity gaps + Category: business + Probability: high | Impact: high + Mitigation: Document feature mapping and acceptance criteria + Owner: Product Owner + + • Zero-downtime requirement increases complexity + Category: business + Probability: high | Impact: medium + Mitigation: Implement blue-green deployment or rolling update strategy + Owner: DevOps Team + + • Regulatory compliance requirements + Category: compliance + Probability: medium | Impact: high + Mitigation: Ensure all compliance checks are integrated into migration process + Owner: Compliance Team + +MEDIUM SEVERITY RISKS: + • Performance degradation + Category: technical + Probability: medium | Impact: medium + Mitigation: Conduct 
load testing and performance benchmarking + Owner: DevOps Team + + • Monitoring gap during transition + Category: operational + Probability: medium | Impact: medium + Mitigation: Set up dual monitoring and alerting systems + Owner: SRE Team + +ROLLBACK STRATEGY +---------------------------------------- +Rollback Triggers: + • Critical system failure + • Data corruption detected + • Migration timeline exceeded by > 50% + • Business-critical functionality unavailable + • Security breach detected + • Stakeholder decision to abort + +Rollback Phases: + RETIRE: + - Revert retire changes + - Restore pre-retire state + - Validate retire rollback success + Estimated Time: 1500 minutes + + VALIDATE: + - Revert validate changes + - Restore pre-validate state + - Validate validate rollback success + Estimated Time: 1500 minutes + + REDIRECT: + - Revert redirect changes + - Restore pre-redirect state + - Validate redirect rollback success + Estimated Time: 1500 minutes + + IMPLEMENT: + - Revert implement changes + - Restore pre-implement state + - Validate implement rollback success + Estimated Time: 1500 minutes + + INTERCEPT: + - Revert intercept changes + - Restore pre-intercept state + - Validate intercept rollback success + Estimated Time: 1500 minutes + +SUCCESS CRITERIA +---------------------------------------- +✓ All data successfully migrated with 100% integrity +✓ System performance meets or exceeds baseline +✓ All business processes functioning normally +✓ No critical security vulnerabilities introduced +✓ Stakeholder acceptance criteria met +✓ Documentation and runbooks updated + +STAKEHOLDERS +---------------------------------------- +• Business Owner +• Technical Lead +• DevOps Team +• QA Team +• Security Team +• End Users diff --git a/engineering/migration-architect/expected_outputs/schema_compatibility_report.json b/engineering/migration-architect/expected_outputs/schema_compatibility_report.json new file mode 100644 index 0000000..d41983b --- /dev/null +++ 
b/engineering/migration-architect/expected_outputs/schema_compatibility_report.json @@ -0,0 +1,192 @@ +{ + "schema_before": "{\n \"schema_version\": \"1.0\",\n \"database\": \"user_management\",\n \"tables\": {\n \"users\": {\n \"columns\": {\n \"id\": {\n \"type\": \"bigint\",\n \"nullable\": false,\n \"primary_key\": true,\n \"auto_increment\": true\n },\n \"username\": {\n \"type\": \"varchar\",\n \"length\": 50,\n \"nullable\": false,\n \"unique\": true\n },\n \"email\": {\n \"type\": \"varchar\",\n \"length\": 255,\n \"nullable\": false,\n...", + "schema_after": "{\n \"schema_version\": \"2.0\",\n \"database\": \"user_management_v2\",\n \"tables\": {\n \"users\": {\n \"columns\": {\n \"id\": {\n \"type\": \"bigint\",\n \"nullable\": false,\n \"primary_key\": true,\n \"auto_increment\": true\n },\n \"username\": {\n \"type\": \"varchar\",\n \"length\": 50,\n \"nullable\": false,\n \"unique\": true\n },\n \"email\": {\n \"type\": \"varchar\",\n \"length\": 320,\n \"nullable\": fals...", + "analysis_date": "2026-02-16T13:47:27.050459", + "overall_compatibility": "potentially_incompatible", + "breaking_changes_count": 0, + "potentially_breaking_count": 4, + "non_breaking_changes_count": 0, + "additive_changes_count": 0, + "issues": [ + { + "type": "check_added", + "severity": "potentially_breaking", + "description": "New check constraint 'phone IS NULL OR LENGTH(phone) >= 10' added to table 'users'", + "field_path": "tables.users.constraints.check", + "old_value": null, + "new_value": "phone IS NULL OR LENGTH(phone) >= 10", + "impact": "New check constraint may reject existing data", + "suggested_migration": "Validate existing data complies with new constraint", + "affected_operations": [ + "INSERT", + "UPDATE" + ] + }, + { + "type": "check_added", + "severity": "potentially_breaking", + "description": "New check constraint 'bio IS NULL OR LENGTH(bio) <= 2000' added to table 'user_profiles'", + "field_path": "tables.user_profiles.constraints.check", + "old_value": 
null, + "new_value": "bio IS NULL OR LENGTH(bio) <= 2000", + "impact": "New check constraint may reject existing data", + "suggested_migration": "Validate existing data complies with new constraint", + "affected_operations": [ + "INSERT", + "UPDATE" + ] + }, + { + "type": "check_added", + "severity": "potentially_breaking", + "description": "New check constraint 'language IN ('en', 'es', 'fr', 'de', 'it', 'pt', 'ru', 'ja', 'ko', 'zh')' added to table 'user_profiles'", + "field_path": "tables.user_profiles.constraints.check", + "old_value": null, + "new_value": "language IN ('en', 'es', 'fr', 'de', 'it', 'pt', 'ru', 'ja', 'ko', 'zh')", + "impact": "New check constraint may reject existing data", + "suggested_migration": "Validate existing data complies with new constraint", + "affected_operations": [ + "INSERT", + "UPDATE" + ] + }, + { + "type": "check_added", + "severity": "potentially_breaking", + "description": "New check constraint 'session_type IN ('web', 'mobile', 'api', 'admin')' added to table 'user_sessions'", + "field_path": "tables.user_sessions.constraints.check", + "old_value": null, + "new_value": "session_type IN ('web', 'mobile', 'api', 'admin')", + "impact": "New check constraint may reject existing data", + "suggested_migration": "Validate existing data complies with new constraint", + "affected_operations": [ + "INSERT", + "UPDATE" + ] + } + ], + "migration_scripts": [ + { + "script_type": "sql", + "description": "Create new table user_preferences", + "script_content": "CREATE TABLE user_preferences (\n id bigint NOT NULL,\n user_id bigint NOT NULL,\n preference_key varchar NOT NULL,\n preference_value json,\n created_at timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,\n updated_at timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP\n);", + "rollback_script": "DROP TABLE IF EXISTS user_preferences;", + "dependencies": [], + "validation_query": "SELECT COUNT(*) FROM information_schema.tables WHERE table_name = 'user_preferences';" 
+ }, + { + "script_type": "sql", + "description": "Add column email_verified_at to table users", + "script_content": "ALTER TABLE users ADD COLUMN email_verified_at timestamp;", + "rollback_script": "ALTER TABLE users DROP COLUMN email_verified_at;", + "dependencies": [], + "validation_query": "SELECT COUNT(*) FROM information_schema.columns WHERE table_name = 'users' AND column_name = 'email_verified_at';" + }, + { + "script_type": "sql", + "description": "Add column phone_verified_at to table users", + "script_content": "ALTER TABLE users ADD COLUMN phone_verified_at timestamp;", + "rollback_script": "ALTER TABLE users DROP COLUMN phone_verified_at;", + "dependencies": [], + "validation_query": "SELECT COUNT(*) FROM information_schema.columns WHERE table_name = 'users' AND column_name = 'phone_verified_at';" + }, + { + "script_type": "sql", + "description": "Add column two_factor_enabled to table users", + "script_content": "ALTER TABLE users ADD COLUMN two_factor_enabled boolean NOT NULL DEFAULT False;", + "rollback_script": "ALTER TABLE users DROP COLUMN two_factor_enabled;", + "dependencies": [], + "validation_query": "SELECT COUNT(*) FROM information_schema.columns WHERE table_name = 'users' AND column_name = 'two_factor_enabled';" + }, + { + "script_type": "sql", + "description": "Add column last_login_at to table users", + "script_content": "ALTER TABLE users ADD COLUMN last_login_at timestamp;", + "rollback_script": "ALTER TABLE users DROP COLUMN last_login_at;", + "dependencies": [], + "validation_query": "SELECT COUNT(*) FROM information_schema.columns WHERE table_name = 'users' AND column_name = 'last_login_at';" + }, + { + "script_type": "sql", + "description": "Add check constraint to users", + "script_content": "ALTER TABLE users ADD CONSTRAINT check_users CHECK (phone IS NULL OR LENGTH(phone) >= 10);", + "rollback_script": "ALTER TABLE users DROP CONSTRAINT check_users;", + "dependencies": [], + "validation_query": "SELECT COUNT(*) FROM 
information_schema.table_constraints WHERE table_name = 'users' AND constraint_type = 'CHECK';" + }, + { + "script_type": "sql", + "description": "Add column timezone to table user_profiles", + "script_content": "ALTER TABLE user_profiles ADD COLUMN timezone varchar DEFAULT UTC;", + "rollback_script": "ALTER TABLE user_profiles DROP COLUMN timezone;", + "dependencies": [], + "validation_query": "SELECT COUNT(*) FROM information_schema.columns WHERE table_name = 'user_profiles' AND column_name = 'timezone';" + }, + { + "script_type": "sql", + "description": "Add column language to table user_profiles", + "script_content": "ALTER TABLE user_profiles ADD COLUMN language varchar NOT NULL DEFAULT en;", + "rollback_script": "ALTER TABLE user_profiles DROP COLUMN language;", + "dependencies": [], + "validation_query": "SELECT COUNT(*) FROM information_schema.columns WHERE table_name = 'user_profiles' AND column_name = 'language';" + }, + { + "script_type": "sql", + "description": "Add check constraint to user_profiles", + "script_content": "ALTER TABLE user_profiles ADD CONSTRAINT check_user_profiles CHECK (bio IS NULL OR LENGTH(bio) <= 2000);", + "rollback_script": "ALTER TABLE user_profiles DROP CONSTRAINT check_user_profiles;", + "dependencies": [], + "validation_query": "SELECT COUNT(*) FROM information_schema.table_constraints WHERE table_name = 'user_profiles' AND constraint_type = 'CHECK';" + }, + { + "script_type": "sql", + "description": "Add check constraint to user_profiles", + "script_content": "ALTER TABLE user_profiles ADD CONSTRAINT check_user_profiles CHECK (language IN ('en', 'es', 'fr', 'de', 'it', 'pt', 'ru', 'ja', 'ko', 'zh'));", + "rollback_script": "ALTER TABLE user_profiles DROP CONSTRAINT check_user_profiles;", + "dependencies": [], + "validation_query": "SELECT COUNT(*) FROM information_schema.table_constraints WHERE table_name = 'user_profiles' AND constraint_type = 'CHECK';" + }, + { + "script_type": "sql", + "description": "Add column 
session_type to table user_sessions", + "script_content": "ALTER TABLE user_sessions ADD COLUMN session_type varchar NOT NULL DEFAULT web;", + "rollback_script": "ALTER TABLE user_sessions DROP COLUMN session_type;", + "dependencies": [], + "validation_query": "SELECT COUNT(*) FROM information_schema.columns WHERE table_name = 'user_sessions' AND column_name = 'session_type';" + }, + { + "script_type": "sql", + "description": "Add column is_mobile to table user_sessions", + "script_content": "ALTER TABLE user_sessions ADD COLUMN is_mobile boolean NOT NULL DEFAULT False;", + "rollback_script": "ALTER TABLE user_sessions DROP COLUMN is_mobile;", + "dependencies": [], + "validation_query": "SELECT COUNT(*) FROM information_schema.columns WHERE table_name = 'user_sessions' AND column_name = 'is_mobile';" + }, + { + "script_type": "sql", + "description": "Add check constraint to user_sessions", + "script_content": "ALTER TABLE user_sessions ADD CONSTRAINT check_user_sessions CHECK (session_type IN ('web', 'mobile', 'api', 'admin'));", + "rollback_script": "ALTER TABLE user_sessions DROP CONSTRAINT check_user_sessions;", + "dependencies": [], + "validation_query": "SELECT COUNT(*) FROM information_schema.table_constraints WHERE table_name = 'user_sessions' AND constraint_type = 'CHECK';" + } + ], + "risk_assessment": { + "overall_risk": "medium", + "deployment_risk": "safe_independent_deployment", + "rollback_complexity": "low", + "testing_requirements": [ + "integration_testing", + "regression_testing", + "data_migration_testing" + ] + }, + "recommendations": [ + "Conduct thorough testing with realistic data volumes", + "Implement monitoring for migration success metrics", + "Test all migration scripts in staging environment", + "Implement migration progress monitoring", + "Create detailed communication plan for stakeholders", + "Implement feature flags for gradual rollout" + ] +} \ No newline at end of file diff --git 
a/engineering/migration-architect/expected_outputs/schema_compatibility_report.txt b/engineering/migration-architect/expected_outputs/schema_compatibility_report.txt new file mode 100644 index 0000000..fdcbd33 --- /dev/null +++ b/engineering/migration-architect/expected_outputs/schema_compatibility_report.txt @@ -0,0 +1,129 @@ +================================================================================ +COMPATIBILITY ANALYSIS REPORT +================================================================================ +Analysis Date: 2026-02-16T13:47:27.050459 +Overall Compatibility: POTENTIALLY_INCOMPATIBLE + +SUMMARY +---------------------------------------- +Breaking Changes: 0 +Potentially Breaking: 4 +Non-Breaking Changes: 0 +Additive Changes: 0 +Total Issues Found: 4 + +RISK ASSESSMENT +---------------------------------------- +Overall Risk: medium +Deployment Risk: safe_independent_deployment +Rollback Complexity: low +Testing Requirements: ['integration_testing', 'regression_testing', 'data_migration_testing'] + +POTENTIALLY BREAKING ISSUES +---------------------------------------- +• New check constraint 'phone IS NULL OR LENGTH(phone) >= 10' added to table 'users' + Field: tables.users.constraints.check + Impact: New check constraint may reject existing data + Migration: Validate existing data complies with new constraint + Affected Operations: INSERT, UPDATE + +• New check constraint 'bio IS NULL OR LENGTH(bio) <= 2000' added to table 'user_profiles' + Field: tables.user_profiles.constraints.check + Impact: New check constraint may reject existing data + Migration: Validate existing data complies with new constraint + Affected Operations: INSERT, UPDATE + +• New check constraint 'language IN ('en', 'es', 'fr', 'de', 'it', 'pt', 'ru', 'ja', 'ko', 'zh')' added to table 'user_profiles' + Field: tables.user_profiles.constraints.check + Impact: New check constraint may reject existing data + Migration: Validate existing data complies with new constraint + 
Affected Operations: INSERT, UPDATE + +• New check constraint 'session_type IN ('web', 'mobile', 'api', 'admin')' added to table 'user_sessions' + Field: tables.user_sessions.constraints.check + Impact: New check constraint may reject existing data + Migration: Validate existing data complies with new constraint + Affected Operations: INSERT, UPDATE + +SUGGESTED MIGRATION SCRIPTS +---------------------------------------- +1. Create new table user_preferences + Type: sql + Script: + CREATE TABLE user_preferences ( + id bigint NOT NULL, + user_id bigint NOT NULL, + preference_key varchar NOT NULL, + preference_value json, + created_at timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, + updated_at timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP + ); + +2. Add column email_verified_at to table users + Type: sql + Script: + ALTER TABLE users ADD COLUMN email_verified_at timestamp; + +3. Add column phone_verified_at to table users + Type: sql + Script: + ALTER TABLE users ADD COLUMN phone_verified_at timestamp; + +4. Add column two_factor_enabled to table users + Type: sql + Script: + ALTER TABLE users ADD COLUMN two_factor_enabled boolean NOT NULL DEFAULT False; + +5. Add column last_login_at to table users + Type: sql + Script: + ALTER TABLE users ADD COLUMN last_login_at timestamp; + +6. Add check constraint to users + Type: sql + Script: + ALTER TABLE users ADD CONSTRAINT check_users CHECK (phone IS NULL OR LENGTH(phone) >= 10); + +7. Add column timezone to table user_profiles + Type: sql + Script: + ALTER TABLE user_profiles ADD COLUMN timezone varchar DEFAULT UTC; + +8. Add column language to table user_profiles + Type: sql + Script: + ALTER TABLE user_profiles ADD COLUMN language varchar NOT NULL DEFAULT en; + +9. Add check constraint to user_profiles + Type: sql + Script: + ALTER TABLE user_profiles ADD CONSTRAINT check_user_profiles CHECK (bio IS NULL OR LENGTH(bio) <= 2000); + +10. 
Add check constraint to user_profiles + Type: sql + Script: + ALTER TABLE user_profiles ADD CONSTRAINT check_user_profiles CHECK (language IN ('en', 'es', 'fr', 'de', 'it', 'pt', 'ru', 'ja', 'ko', 'zh')); + +11. Add column session_type to table user_sessions + Type: sql + Script: + ALTER TABLE user_sessions ADD COLUMN session_type varchar NOT NULL DEFAULT web; + +12. Add column is_mobile to table user_sessions + Type: sql + Script: + ALTER TABLE user_sessions ADD COLUMN is_mobile boolean NOT NULL DEFAULT False; + +13. Add check constraint to user_sessions + Type: sql + Script: + ALTER TABLE user_sessions ADD CONSTRAINT check_user_sessions CHECK (session_type IN ('web', 'mobile', 'api', 'admin')); + +RECOMMENDATIONS +---------------------------------------- +1. Conduct thorough testing with realistic data volumes +2. Implement monitoring for migration success metrics +3. Test all migration scripts in staging environment +4. Implement migration progress monitoring +5. Create detailed communication plan for stakeholders +6. Implement feature flags for gradual rollout diff --git a/engineering/migration-architect/references/data_reconciliation_strategies.md b/engineering/migration-architect/references/data_reconciliation_strategies.md new file mode 100644 index 0000000..9f506e7 --- /dev/null +++ b/engineering/migration-architect/references/data_reconciliation_strategies.md @@ -0,0 +1,1329 @@ +# Data Reconciliation Strategies + +## Overview + +Data reconciliation is the process of ensuring data consistency and integrity across systems during and after migrations. This document provides comprehensive strategies, tools, and implementation patterns for detecting, measuring, and correcting data discrepancies in migration scenarios. + +## Core Principles + +### 1. Eventually Consistent +Accept that perfect real-time consistency may not be achievable during migrations, but ensure eventual consistency through reconciliation processes. + +### 2. 
Idempotent Operations +All reconciliation operations must be safe to run multiple times without causing additional issues. + +### 3. Audit Trail +Maintain detailed logs of all reconciliation actions for compliance and debugging. + +### 4. Non-Destructive +Reconciliation should prefer addition over deletion, and always maintain backups before corrections. + +## Types of Data Inconsistencies + +### 1. Missing Records +Records that exist in source but not in target system. + +### 2. Extra Records +Records that exist in target but not in source system. + +### 3. Field Mismatches +Records exist in both systems but with different field values. + +### 4. Referential Integrity Violations +Foreign key relationships that are broken during migration. + +### 5. Temporal Inconsistencies +Data with incorrect timestamps or ordering. + +### 6. Schema Drift +Structural differences between source and target schemas. + +## Detection Strategies + +### 1. Row Count Validation + +#### Simple Count Comparison +```sql +-- Compare total row counts +SELECT + 'source' as system, + COUNT(*) as row_count +FROM source_table +UNION ALL +SELECT + 'target' as system, + COUNT(*) as row_count +FROM target_table; +``` + +#### Filtered Count Comparison +```sql +-- Compare counts with business logic filters +WITH source_counts AS ( + SELECT + status, + created_date::date as date, + COUNT(*) as count + FROM source_orders + WHERE created_date >= '2024-01-01' + GROUP BY status, created_date::date +), +target_counts AS ( + SELECT + status, + created_date::date as date, + COUNT(*) as count + FROM target_orders + WHERE created_date >= '2024-01-01' + GROUP BY status, created_date::date +) +SELECT + COALESCE(s.status, t.status) as status, + COALESCE(s.date, t.date) as date, + COALESCE(s.count, 0) as source_count, + COALESCE(t.count, 0) as target_count, + COALESCE(s.count, 0) - COALESCE(t.count, 0) as difference +FROM source_counts s +FULL OUTER JOIN target_counts t + ON s.status = t.status AND s.date = t.date 
class RecordChecksum:
    """Compute and compare content checksums of record dictionaries.

    A record is hashed after dropping the fields named in
    ``exclude_fields`` and serialising the remainder as JSON with sorted
    keys, so two records with the same remaining key/value pairs produce
    the same checksum regardless of key order.
    """

    # Used when the caller does not pass ``exclude_fields`` at all.
    DEFAULT_EXCLUDE_FIELDS = ('updated_at', 'version')

    def __init__(self, exclude_fields=None):
        """Create a checksum calculator.

        Args:
            exclude_fields: Iterable of field names to leave out of the
                checksum. ``None`` selects ``DEFAULT_EXCLUDE_FIELDS``.
                An empty iterable means "exclude nothing".
        """
        # Compare against None explicitly: ``exclude_fields or defaults``
        # would replace a deliberate empty list with the defaults, making
        # it impossible to hash every field.
        if exclude_fields is None:
            exclude_fields = self.DEFAULT_EXCLUDE_FIELDS
        # Copy so that later mutation of the caller's list cannot change
        # the checksums this instance produces.
        self.exclude_fields = list(exclude_fields)

    def calculate_checksum(self, record):
        """Return the MD5 hex digest of ``record`` without excluded fields.

        Args:
            record: Mapping of field name to value.

        Returns:
            A 32-character hexadecimal string.
        """
        filtered_record = {
            field: value
            for field, value in record.items()
            if field not in self.exclude_fields
        }

        # sort_keys makes the serialisation independent of key order.
        # default=str stringifies any value json cannot encode natively,
        # so such values compare by their str() form.
        normalized = json.dumps(filtered_record, sort_keys=True, default=str)

        return hashlib.md5(normalized.encode('utf-8')).hexdigest()

    def compare_records(self, source_record, target_record):
        """Compare two records by checksum.

        Args:
            source_record: Mapping read from the source system.
            target_record: Mapping read from the target system.

        Returns:
            Dict with ``match`` (bool), ``source_checksum`` and
            ``target_checksum`` (hex strings).
        """
        source_checksum = self.calculate_checksum(source_record)
        target_checksum = self.calculate_checksum(target_record)

        return {
            'match': source_checksum == target_checksum,
            'source_checksum': source_checksum,
            'target_checksum': target_checksum,
        }
VARCHAR), '|' ORDER BY id)) as checksum + FROM source_transactions + GROUP BY DATE_TRUNC('day', created_at), status +), +target_aggregates AS ( + SELECT + DATE_TRUNC('day', created_at) as day, + status, + COUNT(*) as record_count, + SUM(amount) as total_amount, + MD5(STRING_AGG(CAST(id AS VARCHAR) || ':' || CAST(amount AS VARCHAR), '|' ORDER BY id)) as checksum + FROM target_transactions + GROUP BY DATE_TRUNC('day', created_at), status +) +SELECT + COALESCE(s.day, t.day) as day, + COALESCE(s.status, t.status) as status, + COALESCE(s.record_count, 0) as source_count, + COALESCE(t.record_count, 0) as target_count, + COALESCE(s.total_amount, 0) as source_amount, + COALESCE(t.total_amount, 0) as target_amount, + s.checksum as source_checksum, + t.checksum as target_checksum, + CASE WHEN s.checksum = t.checksum THEN 'MATCH' ELSE 'MISMATCH' END as status +FROM source_aggregates s +FULL OUTER JOIN target_aggregates t + ON s.day = t.day AND s.status = t.status +WHERE s.checksum != t.checksum OR s.checksum IS NULL OR t.checksum IS NULL; +``` + +### 3. 
class CDCReconciler:
    """Apply change-data-capture (CDC) events and backfill missed ones.

    Every change is identified by ``"<table>:<key>:<timestamp>"``. IDs of
    changes that were applied successfully are kept in
    ``processed_changes`` so the same change is not applied twice.
    """

    def __init__(self, kafka_client, database_client):
        """Store the clients and start with an empty processed-change set.

        Args:
            kafka_client: Object exposing ``consumer(topic_name)``.
            database_client: Object exposing ``insert``, ``update``,
                ``delete`` and ``get_changes_in_window``.
        """
        self.kafka = kafka_client
        self.db = database_client
        # Only ever added to in this class, so it grows for the lifetime
        # of the instance.
        self.processed_changes = set()

    @staticmethod
    def _change_id(change):
        """Build the de-duplication key for a change event."""
        return f"{change['table']}:{change['key']}:{change['timestamp']}"

    def process_cdc_stream(self, topic_name):
        """Consume CDC events from ``topic_name`` and apply each one once.

        A failure while applying or committing an event is logged and the
        loop moves on; the event is left for ``reconcile_missed_changes``.

        Args:
            topic_name: Name passed to ``kafka_client.consumer``.
        """
        consumer = self.kafka.consumer(topic_name)

        for message in consumer:
            change_event = json.loads(message.value)
            change_id = self._change_id(change_event)

            if change_id in self.processed_changes:
                continue  # Skip duplicate events

            try:
                self.apply_change(change_event)
                self.processed_changes.add(change_id)

                # Commit offset only after successful processing
                consumer.commit()

            except Exception as e:
                # Log failure and continue - will be caught by reconciliation
                self.log_processing_failure(change_id, str(e))

    def apply_change(self, change_event):
        """Apply one CDC change to the database client.

        Args:
            change_event: Mapping with ``table``, ``operation``
                (``INSERT``/``UPDATE``/``DELETE``), ``key`` and an
                optional ``data`` mapping.

        Raises:
            ValueError: If ``operation`` is not one of the three supported
                values. Previously such events were ignored and then
                recorded as processed, hiding the lost change.
        """
        table = change_event['table']
        operation = change_event['operation']
        key = change_event['key']
        data = change_event.get('data', {})

        if operation == 'INSERT':
            self.db.insert(table, data)
        elif operation == 'UPDATE':
            self.db.update(table, key, data)
        elif operation == 'DELETE':
            self.db.delete(table, key)
        else:
            raise ValueError(f"Unsupported CDC operation: {operation!r}")

    def log_processing_failure(self, change_id, error):
        """Log a change that could not be processed.

        ``process_cdc_stream`` calls this from its ``except`` branch; without
        a definition that branch itself failed with ``AttributeError``.

        Args:
            change_id: De-duplication key of the failed change.
            error: Human-readable failure description.
        """
        import logging  # local import: the snippet has no logging import

        logging.getLogger(__name__).error(
            "Failed to process CDC change %s: %s", change_id, error
        )

    def reconcile_missed_changes(self, start_timestamp, end_timestamp):
        """Apply changes in a time window that were not processed yet.

        Args:
            start_timestamp: Start of the window, passed through to
                ``database_client.get_changes_in_window``.
            end_timestamp: End of the window, passed through likewise.
        """
        # NOTE(review): the change log is read through self.db, the same
        # client apply_change writes to - confirm it can see source changes.
        source_changes = self.db.get_changes_in_window(
            start_timestamp, end_timestamp
        )

        for change in source_changes:
            change_id = self._change_id(change)

            if change_id in self.processed_changes:
                continue

            try:
                self.apply_change(change)
            except Exception as e:
                print(f"Failed to apply missed change: {e}")
            else:
                # Record the change so the next reconciliation pass does not
                # apply it again (e.g. a second INSERT of the same row).
                self.processed_changes.add(change_id)
                print(f"Applied missed change: {change['table']}:{change['key']}")


class BusinessLogicValidator:
    """Compare business-level aggregates between source and target."""

    # Differences are rounded to this many decimals before the tolerance
    # check, which removes binary floating-point representation noise.
    _COMPARISON_DECIMALS = 9

    def __init__(self, source_db, target_db):
        """Store the two database clients (each exposing ``execute_query``)."""
        self.source_db = source_db
        self.target_db = target_db

    def validate_financial_consistency(self):
        """Run the built-in financial rules against both databases.

        Returns:
            One dict per rule with ``rule_name``, ``differences_found``,
            ``differences`` (at most the first 10) and ``status``
            (``PASS`` or ``FAIL``).
        """
        validation_rules = [
            {
                'name': 'daily_transaction_totals',
                'source_query': """
                    SELECT DATE(created_at) as date, SUM(amount) as total
                    FROM source_transactions
                    WHERE created_at >= CURRENT_DATE - INTERVAL '30 days'
                    GROUP BY DATE(created_at)
                """,
                'target_query': """
                    SELECT DATE(created_at) as date, SUM(amount) as total
                    FROM target_transactions
                    WHERE created_at >= CURRENT_DATE - INTERVAL '30 days'
                    GROUP BY DATE(created_at)
                """,
                'tolerance': 0.01  # Allow $0.01 difference for rounding
            },
            {
                'name': 'customer_balance_totals',
                'source_query': """
                    SELECT customer_id, SUM(balance) as total_balance
                    FROM source_accounts
                    GROUP BY customer_id
                    HAVING SUM(balance) > 0
                """,
                'target_query': """
                    SELECT customer_id, SUM(balance) as total_balance
                    FROM target_accounts
                    GROUP BY customer_id
                    HAVING SUM(balance) > 0
                """,
                'tolerance': 0.01
            }
        ]

        validation_results = []

        for rule in validation_rules:
            source_data = self.source_db.execute_query(rule['source_query'])
            target_data = self.target_db.execute_query(rule['target_query'])

            differences = self.compare_financial_data(
                source_data, target_data, rule['tolerance']
            )

            validation_results.append({
                'rule_name': rule['name'],
                'differences_found': len(differences),
                'differences': differences[:10],  # First 10 differences
                'status': 'PASS' if not differences else 'FAIL'
            })

        return validation_results

    def compare_financial_data(self, source_data, target_data, tolerance):
        """Compare keyed amounts from two result sets.

        Each row is a sequence whose last element is the amount and whose
        preceding elements form the key.

        Args:
            source_data: Rows from the source system.
            target_data: Rows from the target system.
            tolerance: Largest absolute difference still treated as equal.

        Returns:
            List of dicts describing ``MISSING_IN_TARGET``,
            ``VALUE_MISMATCH`` and ``EXTRA_IN_TARGET`` findings, source
            keys first and target-only keys last.
        """
        source_dict = {tuple(row[:-1]): row[-1] for row in source_data}
        target_dict = {tuple(row[:-1]): row[-1] for row in target_data}

        differences = []

        # Check for missing records and value differences
        for key, source_value in source_dict.items():
            if key not in target_dict:
                differences.append({
                    'key': key,
                    'source_value': source_value,
                    'target_value': None,
                    'difference_type': 'MISSING_IN_TARGET'
                })
                continue

            target_value = target_dict[key]

            if source_value is None or target_value is None:
                # An amount can be None (e.g. an aggregate over no rows);
                # float(None) would raise. Two Nones are equal, one None
                # against a number is a mismatch with no numeric difference.
                if source_value is not None or target_value is not None:
                    differences.append({
                        'key': key,
                        'source_value': source_value,
                        'target_value': target_value,
                        'difference': None,
                        'difference_type': 'VALUE_MISMATCH'
                    })
                continue

            difference = float(source_value) - float(target_value)
            # 100.01 - 100.00 evaluates to 0.010000000000005 in binary
            # floating point; without rounding, a difference of exactly one
            # cent would exceed a 0.01 tolerance.
            if round(abs(difference), self._COMPARISON_DECIMALS) > tolerance:
                differences.append({
                    'key': key,
                    'source_value': source_value,
                    'target_value': target_value,
                    'difference': difference,
                    'difference_type': 'VALUE_MISMATCH'
                })

        # Check for extra records in target
        for key, target_value in target_dict.items():
            if key not in source_dict:
                differences.append({
                    'key': key,
                    'source_value': None,
                    'target_value': target_value,
                    'difference_type': 'EXTRA_IN_TARGET'
                })

        return differences
def _utc_now():
    """Return the current time as a timezone-aware UTC datetime.

    Replaces ``datetime.utcnow()``, which is deprecated and returns a naive
    value that is easily mistaken for local time.
    """
    from datetime import datetime, timezone  # local: snippet has no import

    return datetime.now(timezone.utc)


class AutoCorrector:
    """Copy missing rows and differing field values from source to target.

    Every attempted correction is appended to ``correction_log`` with a
    ``status`` of ``DRY_RUN``, ``SUCCESS`` or ``FAILED``.
    """

    def __init__(self, source_db, target_db, dry_run=True):
        """Store the clients; with ``dry_run`` True nothing is written."""
        self.source_db = source_db
        self.target_db = target_db
        self.dry_run = dry_run
        self.correction_log = []

    def _apply_correction(self, correction, apply):
        """Run ``apply`` unless in dry-run mode and log the outcome.

        A failure is recorded on the log entry instead of being raised so
        that one bad row does not stop the remaining corrections.
        """
        if self.dry_run:
            correction['status'] = 'DRY_RUN'
        else:
            try:
                apply()
                correction['status'] = 'SUCCESS'
            except Exception as e:
                correction['status'] = 'FAILED'
                correction['error'] = str(e)

        self.correction_log.append(correction)

    def correct_missing_records(self, table_name, key_field):
        """Insert rows that exist in the source table but not the target.

        Args:
            table_name: Base table name; the query reads
                ``source_<table_name>`` and ``target_<table_name>``.
            key_field: Column used to match rows between the two tables.

        Returns:
            Number of missing rows found (not the number inserted).
        """
        # NOTE: table_name and key_field are interpolated into the SQL text
        # and must come from trusted configuration, never from user input.
        missing_query = f"""
            SELECT s.*
            FROM source_{table_name} s
            LEFT JOIN target_{table_name} t ON s.{key_field} = t.{key_field}
            WHERE t.{key_field} IS NULL
        """

        missing_records = self.source_db.execute_query(missing_query)

        for record in missing_records:
            correction = {
                'table': table_name,
                'operation': 'INSERT',
                'key': record[key_field],
                'data': record,
                'timestamp': _utc_now()
            }
            self._apply_correction(
                correction,
                lambda record=record: self.target_db.insert(table_name, record),
            )

        return len(missing_records)

    def correct_field_mismatches(self, table_name, key_field, fields_to_correct):
        """Overwrite differing target field values with the source values.

        Args:
            table_name: Base table name; the query reads
                ``source_<table_name>`` and ``target_<table_name>``.
            key_field: Column used to match rows between the two tables.
            fields_to_correct: Column names to compare and correct.

        Returns:
            Number of rows reported as mismatched by the query; 0 when
            ``fields_to_correct`` is empty.
        """
        fields = list(fields_to_correct)
        if not fields:
            # An empty list would render "WHERE" with no predicate.
            return 0

        select_columns = ', '.join(
            f's.{f} as source_{f}, t.{f} as target_{f}' for f in fields
        )
        # IS DISTINCT FROM treats NULL as a comparable value. With "!=" a
        # row whose value is NULL on exactly one side is never returned.
        # NOTE(review): PostgreSQL syntax, matching the document's other
        # queries - confirm for the database actually in use.
        mismatch_filter = ' OR '.join(
            f's.{f} IS DISTINCT FROM t.{f}' for f in fields
        )

        # NOTE: identifiers are interpolated; they must be trusted values.
        mismatch_query = f"""
            SELECT s.{key_field}, {select_columns}
            FROM source_{table_name} s
            JOIN target_{table_name} t ON s.{key_field} = t.{key_field}
            WHERE {mismatch_filter}
        """

        mismatched_records = self.source_db.execute_query(mismatch_query)

        for record in mismatched_records:
            key_value = record[key_field]
            updates = {
                field: record[f'source_{field}']
                for field in fields
                if record[f'source_{field}'] != record[f'target_{field}']
            }

            if not updates:
                continue

            correction = {
                'table': table_name,
                'operation': 'UPDATE',
                'key': key_value,
                'updates': updates,
                'timestamp': _utc_now()
            }
            self._apply_correction(
                correction,
                lambda key_value=key_value, updates=updates: self.target_db.update(
                    table_name, {key_field: key_value}, updates
                ),
            )

        return len(mismatched_records)


class ManualReviewSystem:
    """Queue discrepancies for a human decision and record the outcome."""

    # Actions accepted by process_review.
    VALID_ACTIONS = ('APPLY_SOURCE', 'KEEP_TARGET', 'CUSTOM_FIX')

    def __init__(self, database_client):
        """Store the database client and start with an empty queue."""
        self.db = database_client
        self.review_queue = []

    def queue_for_review(self, discrepancy):
        """Add a discrepancy to the review queue and persist it.

        Args:
            discrepancy: Mapping with required ``type``, ``table``, ``key``
                and ``description``; optional ``source_data``,
                ``target_data`` and ``severity`` (default ``'medium'``).

        Returns:
            The generated review id (a UUID4 string).
        """
        review_item = {
            'id': str(uuid.uuid4()),
            'discrepancy_type': discrepancy['type'],
            'table': discrepancy['table'],
            'record_key': discrepancy['key'],
            'source_data': discrepancy.get('source_data'),
            'target_data': discrepancy.get('target_data'),
            'description': discrepancy['description'],
            'severity': discrepancy.get('severity', 'medium'),
            'status': 'PENDING',
            'created_at': _utc_now(),
            'reviewed_by': None,
            'reviewed_at': None,
            'resolution': None
        }

        self.review_queue.append(review_item)

        # Persist to review database
        self.db.insert('manual_review_queue', review_item)

        return review_item['id']

    def get_review_item(self, review_id):
        """Return the queued item with ``review_id``, or ``None``.

        Looks only at the in-memory ``review_queue`` of this instance.
        """
        for review_item in self.review_queue:
            if review_item['id'] == review_id:
                return review_item
        return None

    def apply_source_data(self, review_item):
        """Write the item's source data to the target system.

        How a record is addressed in the target is not defined here, so
        this base implementation refuses rather than guessing; override it
        in a subclass.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError(
            "apply_source_data must be implemented to use the APPLY_SOURCE action"
        )

    def process_review(self, review_id, reviewer, action, notes=None):
        """Record a reviewer's decision and apply it.

        Args:
            review_id: Id returned by ``queue_for_review``.
            reviewer: Identifier of the person who decided.
            action: One of ``VALID_ACTIONS``.
            notes: Optional free-text explanation.

        Returns:
            The updated review item.

        Raises:
            ValueError: If the item is unknown or ``action`` is not valid.
        """
        review_item = self.get_review_item(review_id)

        if not review_item:
            raise ValueError(f"Review item {review_id} not found")

        if action not in self.VALID_ACTIONS:
            raise ValueError(f"Unknown review action: {action!r}")

        # Keep the previous state so a failed resolution does not leave the
        # item marked REVIEWED in memory while the database was not updated.
        previous_state = dict(review_item)

        review_item.update({
            'status': 'REVIEWED',
            'reviewed_by': reviewer,
            'reviewed_at': _utc_now(),
            'resolution': {
                'action': action,  # 'APPLY_SOURCE', 'KEEP_TARGET', 'CUSTOM_FIX'
                'notes': notes
            }
        })

        try:
            # KEEP_TARGET needs no action; CUSTOM_FIX is applied separately.
            if action == 'APPLY_SOURCE':
                self.apply_source_data(review_item)

            # Update review record
            self.db.update('manual_review_queue',
                           {'id': review_id},
                           review_item)
        except Exception:
            review_item.clear()
            review_item.update(previous_state)
            raise

        return review_item

    def generate_review_report(self):
        """Return review counts grouped by type, severity and status."""
        reviews = self.db.query("""
            SELECT
                discrepancy_type,
                severity,
                status,
                COUNT(*) as count,
                MIN(created_at) as oldest_review,
                MAX(created_at) as newest_review
            FROM manual_review_queue
            GROUP BY discrepancy_type, severity, status
            ORDER BY severity DESC, discrepancy_type
        """)

        return reviews
datetime.utcnow(), + 'status': 'completed', + 'issues_found': len(results), + 'details': results + } + + # Alert if significant issues found + if any(r['severity'] == 'high' for r in results): + self.send_alert(job_result) + + except Exception as e: + job_result = { + 'job_type': 'quick_reconciliation', + 'start_time': job_start, + 'end_time': datetime.utcnow(), + 'status': 'failed', + 'error': str(e) + } + + self.job_history.append(job_result) + + def comprehensive_reconciliation(self): + """Comprehensive checksum-based reconciliation""" + + job_start = datetime.utcnow() + + try: + tables_to_check = self.get_migration_tables() + issues = [] + + for table in tables_to_check: + # Sample-based checksum validation + sample_issues = self.reconciler.validate_sample_checksums( + table, sample_size=1000 + ) + issues.extend(sample_issues) + + # Auto-correct simple issues + auto_corrections = 0 + for issue in issues: + if issue['auto_correctable']: + self.reconciler.auto_correct_issue(issue) + auto_corrections += 1 + else: + # Queue for manual review + self.reconciler.queue_for_manual_review(issue) + + job_result = { + 'job_type': 'comprehensive_reconciliation', + 'start_time': job_start, + 'end_time': datetime.utcnow(), + 'status': 'completed', + 'total_issues': len(issues), + 'auto_corrections': auto_corrections, + 'manual_reviews_queued': len(issues) - auto_corrections + } + + except Exception as e: + job_result = { + 'job_type': 'comprehensive_reconciliation', + 'start_time': job_start, + 'end_time': datetime.utcnow(), + 'status': 'failed', + 'error': str(e) + } + + self.job_history.append(job_result) + + def run_scheduler(self): + """Run the reconciliation scheduler""" + + print("Starting reconciliation scheduler...") + + while True: + schedule.run_pending() + time.sleep(60) # Check every minute +``` + +## Monitoring and Reporting + +### 1. 
Reconciliation Metrics + +```python +class ReconciliationMetrics: + def __init__(self, prometheus_client): + self.prometheus = prometheus_client + + # Define metrics + self.inconsistencies_found = Counter( + 'reconciliation_inconsistencies_total', + 'Number of inconsistencies found', + ['table', 'type', 'severity'] + ) + + self.reconciliation_duration = Histogram( + 'reconciliation_duration_seconds', + 'Time spent on reconciliation jobs', + ['job_type'] + ) + + self.auto_corrections = Counter( + 'reconciliation_auto_corrections_total', + 'Number of automatically corrected inconsistencies', + ['table', 'correction_type'] + ) + + self.data_drift_gauge = Gauge( + 'data_drift_percentage', + 'Percentage of records with inconsistencies', + ['table'] + ) + + def record_inconsistency(self, table, inconsistency_type, severity): + """Record a found inconsistency""" + self.inconsistencies_found.labels( + table=table, + type=inconsistency_type, + severity=severity + ).inc() + + def record_auto_correction(self, table, correction_type): + """Record an automatic correction""" + self.auto_corrections.labels( + table=table, + correction_type=correction_type + ).inc() + + def update_data_drift(self, table, drift_percentage): + """Update data drift gauge""" + self.data_drift_gauge.labels(table=table).set(drift_percentage) + + def record_job_duration(self, job_type, duration_seconds): + """Record reconciliation job duration""" + self.reconciliation_duration.labels(job_type=job_type).observe(duration_seconds) +``` + +### 2. 
Alerting Rules + +```yaml +# Prometheus alerting rules for data reconciliation +groups: + - name: data_reconciliation + rules: + - alert: HighDataInconsistency + expr: increase(reconciliation_inconsistencies_total[5m]) > 100 + for: 5m + labels: + severity: critical + annotations: + summary: "High number of data inconsistencies detected" + description: "{{ $value }} inconsistencies found in the last 5 minutes" + + - alert: DataDriftHigh + expr: data_drift_percentage > 5 + for: 10m + labels: + severity: warning + annotations: + summary: "Data drift percentage is high" + description: "{{ $labels.table }} has {{ $value }}% data drift" + + - alert: ReconciliationJobFailed + expr: up{job="reconciliation"} == 0 + for: 2m + labels: + severity: critical + annotations: + summary: "Reconciliation job is down" + description: "The data reconciliation service is not responding" + + - alert: AutoCorrectionRateHigh + expr: rate(reconciliation_auto_corrections_total[10m]) > 10 + for: 5m + labels: + severity: warning + annotations: + summary: "High rate of automatic corrections" + description: "Auto-correction rate is {{ $value }} per second" +``` + +### 3.
Dashboard and Reporting + +```python +class ReconciliationDashboard: + def __init__(self, database_client, metrics_client): + self.db = database_client + self.metrics = metrics_client + + def generate_daily_report(self, date=None): + """Generate daily reconciliation report""" + + if not date: + date = datetime.utcnow().date() + + # Query reconciliation results for the day + daily_stats = self.db.query(""" + SELECT + table_name, + inconsistency_type, + COUNT(*) as count, + AVG(CASE WHEN resolution = 'AUTO_CORRECTED' THEN 1 ELSE 0 END) as auto_correction_rate + FROM reconciliation_log + WHERE DATE(created_at) = %s + GROUP BY table_name, inconsistency_type + """, (date,)) + + # Generate summary + summary = { + 'date': date.isoformat(), + 'total_inconsistencies': sum(row['count'] for row in daily_stats), + 'auto_correction_rate': sum(row['auto_correction_rate'] * row['count'] for row in daily_stats) / max(sum(row['count'] for row in daily_stats), 1), + 'tables_affected': len(set(row['table_name'] for row in daily_stats)), + 'details_by_table': {} + } + + # Group by table + for row in daily_stats: + table = row['table_name'] + if table not in summary['details_by_table']: + summary['details_by_table'][table] = [] + + summary['details_by_table'][table].append({ + 'inconsistency_type': row['inconsistency_type'], + 'count': row['count'], + 'auto_correction_rate': row['auto_correction_rate'] + }) + + return summary + + def generate_trend_analysis(self, days=7): + """Generate trend analysis for reconciliation metrics""" + + end_date = datetime.utcnow().date() + start_date = end_date - timedelta(days=days) + + trends = self.db.query(""" + SELECT + DATE(created_at) as date, + table_name, + COUNT(*) as inconsistencies, + AVG(CASE WHEN resolution = 'AUTO_CORRECTED' THEN 1 ELSE 0 END) as auto_correction_rate + FROM reconciliation_log + WHERE DATE(created_at) BETWEEN %s AND %s + GROUP BY DATE(created_at), table_name + ORDER BY date, table_name + """, (start_date, end_date)) + + # 
Calculate trends + trend_analysis = { + 'period': f"{start_date} to {end_date}", + 'trends': {}, + 'overall_trend': 'stable' + } + + for table in set(row['table_name'] for row in trends): + table_data = [row for row in trends if row['table_name'] == table] + + if len(table_data) >= 2: + first_count = table_data[0]['inconsistencies'] + last_count = table_data[-1]['inconsistencies'] + + if last_count > first_count * 1.2: + trend = 'increasing' + elif last_count < first_count * 0.8: + trend = 'decreasing' + else: + trend = 'stable' + + trend_analysis['trends'][table] = { + 'direction': trend, + 'first_day_count': first_count, + 'last_day_count': last_count, + 'change_percentage': ((last_count - first_count) / max(first_count, 1)) * 100 + } + + return trend_analysis +``` + +## Advanced Reconciliation Techniques + +### 1. Machine Learning-Based Anomaly Detection + +```python +from sklearn.ensemble import IsolationForest +from sklearn.preprocessing import StandardScaler +import numpy as np + +class MLAnomalyDetector: + def __init__(self): + self.models = {} + self.scalers = {} + + def train_anomaly_detector(self, table_name, training_data): + """Train anomaly detection model for a specific table""" + + # Prepare features (convert records to numerical features) + features = self.extract_features(training_data) + + # Scale features + scaler = StandardScaler() + scaled_features = scaler.fit_transform(features) + + # Train isolation forest + model = IsolationForest(contamination=0.05, random_state=42) + model.fit(scaled_features) + + # Store model and scaler + self.models[table_name] = model + self.scalers[table_name] = scaler + + def detect_anomalies(self, table_name, data): + """Detect anomalous records that may indicate reconciliation issues""" + + if table_name not in self.models: + raise ValueError(f"No trained model for table {table_name}") + + # Extract features + features = self.extract_features(data) + + # Scale features + scaled_features =
self.scalers[table_name].transform(features) + + # Predict anomalies + anomaly_scores = self.models[table_name].decision_function(scaled_features) + anomaly_predictions = self.models[table_name].predict(scaled_features) + + # Return anomalous records with scores + anomalies = [] + for i, (record, score, is_anomaly) in enumerate(zip(data, anomaly_scores, anomaly_predictions)): + if is_anomaly == -1: # Isolation forest returns -1 for anomalies + anomalies.append({ + 'record_index': i, + 'record': record, + 'anomaly_score': score, + 'severity': 'high' if score < -0.5 else 'medium' + }) + + return anomalies + + def extract_features(self, data): + """Extract numerical features from database records""" + + features = [] + + for record in data: + record_features = [] + + for key, value in record.items(): + if isinstance(value, (int, float)): + record_features.append(value) + elif isinstance(value, str): + # Convert string to hash-based feature + record_features.append(hash(value) % 10000) + elif isinstance(value, datetime): + # Convert datetime to timestamp + record_features.append(value.timestamp()) + else: + # Default value for other types + record_features.append(0) + + features.append(record_features) + + return np.array(features) +``` + +### 2. 
Probabilistic Reconciliation + +```python +import random +from typing import List, Dict, Tuple + +class ProbabilisticReconciler: + def __init__(self, confidence_threshold=0.95): + self.confidence_threshold = confidence_threshold + + def statistical_sampling_validation(self, table_name: str, population_size: int) -> Dict: + """Use statistical sampling to validate large datasets""" + + # Calculate sample size for 95% confidence, 5% margin of error + confidence_level = 0.95 + margin_of_error = 0.05 + + z_score = 1.96 # for 95% confidence + p = 0.5 # assume 50% error rate for maximum sample size + + sample_size = (z_score ** 2 * p * (1 - p)) / (margin_of_error ** 2) + + if population_size < 10000: + # Finite population correction + sample_size = sample_size / (1 + (sample_size - 1) / population_size) + + sample_size = min(int(sample_size), population_size) + + # Generate random sample + sample_ids = self.generate_random_sample(table_name, sample_size) + + # Validate sample + sample_results = self.validate_sample_records(table_name, sample_ids) + + # Calculate population estimates + error_rate = sample_results['errors'] / sample_size + estimated_errors = int(population_size * error_rate) + + # Calculate confidence interval + standard_error = (error_rate * (1 - error_rate) / sample_size) ** 0.5 + margin_of_error_actual = z_score * standard_error + + confidence_interval = ( + max(0, error_rate - margin_of_error_actual), + min(1, error_rate + margin_of_error_actual) + ) + + return { + 'table_name': table_name, + 'population_size': population_size, + 'sample_size': sample_size, + 'sample_error_rate': error_rate, + 'estimated_total_errors': estimated_errors, + 'confidence_interval': confidence_interval, + 'confidence_level': confidence_level, + 'recommendation': self.generate_recommendation(error_rate, confidence_interval) + } + + def generate_random_sample(self, table_name: str, sample_size: int) -> List[int]: + """Generate random sample of record IDs""" + + # Get total 
record count and ID range + id_range = self.db.query(f"SELECT MIN(id), MAX(id) FROM {table_name}")[0] + min_id, max_id = id_range + + # Generate random IDs + sample_ids = [] + attempts = 0 + max_attempts = sample_size * 10 # Avoid infinite loop + + while len(sample_ids) < sample_size and attempts < max_attempts: + candidate_id = random.randint(min_id, max_id) + + # Check if ID exists + exists = self.db.query(f"SELECT 1 FROM {table_name} WHERE id = %s", (candidate_id,)) + + if exists and candidate_id not in sample_ids: + sample_ids.append(candidate_id) + + attempts += 1 + + return sample_ids + + def validate_sample_records(self, table_name: str, sample_ids: List[int]) -> Dict: + """Validate a sample of records""" + + validation_results = { + 'total_checked': len(sample_ids), + 'errors': 0, + 'error_details': [] + } + + for record_id in sample_ids: + # Get record from both source and target + source_record = self.source_db.get_record(table_name, record_id) + target_record = self.target_db.get_record(table_name, record_id) + + if not target_record: + validation_results['errors'] += 1 + validation_results['error_details'].append({ + 'id': record_id, + 'error_type': 'MISSING_IN_TARGET' + }) + elif not self.records_match(source_record, target_record): + validation_results['errors'] += 1 + validation_results['error_details'].append({ + 'id': record_id, + 'error_type': 'DATA_MISMATCH', + 'differences': self.find_differences(source_record, target_record) + }) + + return validation_results + + def generate_recommendation(self, error_rate: float, confidence_interval: Tuple[float, float]) -> str: + """Generate recommendation based on error rate and confidence""" + + if confidence_interval[1] < 0.01: # Less than 1% error rate with confidence + return "Data quality is excellent. Continue with normal reconciliation schedule." + elif confidence_interval[1] < 0.05: # Less than 5% error rate with confidence + return "Data quality is acceptable. 
Monitor closely and investigate sample errors." + elif confidence_interval[0] > 0.1: # More than 10% error rate with confidence + return "Data quality is poor. Immediate comprehensive reconciliation required." + else: + return "Data quality is uncertain. Increase sample size for better estimates." +``` + +## Performance Optimization + +### 1. Parallel Processing + +```python +import asyncio +import multiprocessing as mp +from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor + +class ParallelReconciler: + def __init__(self, max_workers=None): + self.max_workers = max_workers or mp.cpu_count() + + async def parallel_table_reconciliation(self, tables: List[str]): + """Reconcile multiple tables in parallel""" + + async with asyncio.Semaphore(self.max_workers): + tasks = [ + self.reconcile_table_async(table) + for table in tables + ] + + results = await asyncio.gather(*tasks, return_exceptions=True) + + # Process results + summary = { + 'total_tables': len(tables), + 'successful': 0, + 'failed': 0, + 'results': {} + } + + for table, result in zip(tables, results): + if isinstance(result, Exception): + summary['failed'] += 1 + summary['results'][table] = { + 'status': 'failed', + 'error': str(result) + } + else: + summary['successful'] += 1 + summary['results'][table] = result + + return summary + + def parallel_chunk_processing(self, table_name: str, chunk_size: int = 10000): + """Process table reconciliation in parallel chunks""" + + # Get total record count + total_records = self.db.get_record_count(table_name) + num_chunks = (total_records + chunk_size - 1) // chunk_size + + # Create chunk specifications + chunks = [] + for i in range(num_chunks): + start_id = i * chunk_size + end_id = min((i + 1) * chunk_size - 1, total_records - 1) + chunks.append({ + 'table': table_name, + 'start_id': start_id, + 'end_id': end_id, + 'chunk_number': i + 1 + }) + + # Process chunks in parallel + with ProcessPoolExecutor(max_workers=self.max_workers) as executor: + 
chunk_results = list(executor.map(self.process_chunk, chunks)) + + # Aggregate results + total_inconsistencies = sum(r['inconsistencies'] for r in chunk_results) + total_corrections = sum(r['corrections'] for r in chunk_results) + + return { + 'table': table_name, + 'total_records': total_records, + 'chunks_processed': len(chunks), + 'total_inconsistencies': total_inconsistencies, + 'total_corrections': total_corrections, + 'chunk_details': chunk_results + } + + def process_chunk(self, chunk_spec: Dict) -> Dict: + """Process a single chunk of records""" + + # This runs in a separate process + table = chunk_spec['table'] + start_id = chunk_spec['start_id'] + end_id = chunk_spec['end_id'] + + # Initialize database connections for this process + local_source_db = SourceDatabase() + local_target_db = TargetDatabase() + + # Get records in chunk + source_records = local_source_db.get_records_range(table, start_id, end_id) + target_records = local_target_db.get_records_range(table, start_id, end_id) + + # Reconcile chunk + inconsistencies = 0 + corrections = 0 + + for source_record in source_records: + target_record = target_records.get(source_record['id']) + + if not target_record: + inconsistencies += 1 + # Auto-correct if possible + try: + local_target_db.insert(table, source_record) + corrections += 1 + except Exception: + pass # Log error in production + elif not self.records_match(source_record, target_record): + inconsistencies += 1 + # Auto-correct field mismatches + try: + updates = self.calculate_updates(source_record, target_record) + local_target_db.update(table, source_record['id'], updates) + corrections += 1 + except Exception: + pass # Log error in production + + return { + 'chunk_number': chunk_spec['chunk_number'], + 'start_id': start_id, + 'end_id': end_id, + 'records_processed': len(source_records), + 'inconsistencies': inconsistencies, + 'corrections': corrections + } +``` + +### 2. 
Incremental Reconciliation + +```python +class IncrementalReconciler: + def __init__(self, source_db, target_db): + self.source_db = source_db + self.target_db = target_db + self.last_reconciliation_times = {} + + def incremental_reconciliation(self, table_name: str): + """Reconcile only records changed since last reconciliation""" + + last_reconciled = self.get_last_reconciliation_time(table_name) + + # Get records modified since last reconciliation + modified_source = self.source_db.get_records_modified_since( + table_name, last_reconciled + ) + + modified_target = self.target_db.get_records_modified_since( + table_name, last_reconciled + ) + + # Create lookup dictionaries + source_dict = {r['id']: r for r in modified_source} + target_dict = {r['id']: r for r in modified_target} + + # Find all record IDs to check + all_ids = set(source_dict.keys()) | set(target_dict.keys()) + + inconsistencies = [] + + for record_id in all_ids: + source_record = source_dict.get(record_id) + target_record = target_dict.get(record_id) + + if source_record and not target_record: + inconsistencies.append({ + 'type': 'missing_in_target', + 'table': table_name, + 'id': record_id, + 'source_record': source_record + }) + elif not source_record and target_record: + inconsistencies.append({ + 'type': 'extra_in_target', + 'table': table_name, + 'id': record_id, + 'target_record': target_record + }) + elif source_record and target_record: + if not self.records_match(source_record, target_record): + inconsistencies.append({ + 'type': 'data_mismatch', + 'table': table_name, + 'id': record_id, + 'source_record': source_record, + 'target_record': target_record, + 'differences': self.find_differences(source_record, target_record) + }) + + # Update last reconciliation time + self.update_last_reconciliation_time(table_name, datetime.utcnow()) + + return { + 'table': table_name, + 'reconciliation_time': datetime.utcnow(), + 'records_checked': len(all_ids), + 'inconsistencies_found': 
len(inconsistencies), + 'inconsistencies': inconsistencies + } + + def get_last_reconciliation_time(self, table_name: str) -> datetime: + """Get the last reconciliation timestamp for a table""" + + result = self.source_db.query(""" + SELECT last_reconciled_at + FROM reconciliation_metadata + WHERE table_name = %s + """, (table_name,)) + + if result: + return result[0]['last_reconciled_at'] + else: + # First time reconciliation - start from beginning of migration + return self.get_migration_start_time() + + def update_last_reconciliation_time(self, table_name: str, timestamp: datetime): + """Update the last reconciliation timestamp""" + + self.source_db.execute(""" + INSERT INTO reconciliation_metadata (table_name, last_reconciled_at) + VALUES (%s, %s) + ON CONFLICT (table_name) + DO UPDATE SET last_reconciled_at = %s + """, (table_name, timestamp, timestamp)) +``` + +This comprehensive guide provides the framework and tools necessary for implementing robust data reconciliation strategies during migrations, ensuring data integrity and consistency while minimizing business disruption. \ No newline at end of file diff --git a/engineering/migration-architect/references/migration_patterns_catalog.md b/engineering/migration-architect/references/migration_patterns_catalog.md new file mode 100644 index 0000000..f353074 --- /dev/null +++ b/engineering/migration-architect/references/migration_patterns_catalog.md @@ -0,0 +1,705 @@ +# Migration Patterns Catalog + +## Overview + +This catalog provides detailed descriptions of proven migration patterns, their use cases, implementation guidelines, and best practices. Each pattern includes code examples, diagrams, and lessons learned from real-world implementations. + +## Database Migration Patterns + +### 1. 
Expand-Contract Pattern + +**Use Case:** Schema evolution with zero downtime +**Complexity:** Medium +**Risk Level:** Low-Medium + +#### Description +The Expand-Contract pattern allows for schema changes without downtime by following a three-phase approach: + +1. **Expand:** Add new schema elements alongside existing ones +2. **Migrate:** Dual-write to both old and new schema during transition +3. **Contract:** Remove old schema elements after validation + +#### Implementation Steps + +```sql +-- Phase 1: Expand +ALTER TABLE users ADD COLUMN email_new VARCHAR(255); +CREATE INDEX CONCURRENTLY idx_users_email_new ON users(email_new); + +-- Phase 2: Migrate (Application Code) +-- Write to both columns during transition period +INSERT INTO users (name, email, email_new) VALUES (?, ?, ?); + +-- Backfill existing data +UPDATE users SET email_new = email WHERE email_new IS NULL; + +-- Phase 3: Contract (after validation) +ALTER TABLE users DROP COLUMN email; +ALTER TABLE users RENAME COLUMN email_new TO email; +``` + +#### Pros and Cons +**Pros:** +- Zero-downtime deployments +- Safe rollback at any point before the Contract phase +- Gradual transition with validation + +**Cons:** +- Increased storage during transition +- More complex application logic +- Extended migration timeline + +### 2. Parallel Schema Pattern + +**Use Case:** Major database restructuring +**Complexity:** High +**Risk Level:** Medium + +#### Description +Run new and old schemas in parallel, using feature flags to gradually route traffic to the new schema while maintaining the ability to roll back quickly.
+ +#### Implementation Example + +```python +class DatabaseRouter: + def __init__(self, feature_flag_service): + self.feature_flags = feature_flag_service + self.old_db = OldDatabaseConnection() + self.new_db = NewDatabaseConnection() + + def route_query(self, user_id, query_type): + if self.feature_flags.is_enabled("new_schema", user_id): + return self.new_db.execute(query_type) + else: + return self.old_db.execute(query_type) + + def dual_write(self, data): + # Write to both databases for consistency + success_old = self.old_db.write(data) + success_new = self.new_db.write(transform_data(data)) + + if not (success_old and success_new): + # Handle partial failures + self.handle_dual_write_failure(data, success_old, success_new) +``` + +#### Best Practices +- Implement data consistency checks between schemas +- Use circuit breakers for automatic failover +- Monitor performance impact of dual writes +- Plan for data reconciliation processes + +### 3. Event Sourcing Migration + +**Use Case:** Migrating systems with complex business logic +**Complexity:** High +**Risk Level:** Medium-High + +#### Description +Capture all changes as events during migration, enabling replay and reconciliation capabilities. 
+ +#### Event Store Schema +```sql +CREATE TABLE migration_events ( + event_id UUID PRIMARY KEY, + aggregate_id UUID NOT NULL, + event_type VARCHAR(100) NOT NULL, + event_data JSONB NOT NULL, + event_version INTEGER NOT NULL, + occurred_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(), + processed_at TIMESTAMP WITH TIME ZONE +); +``` + +#### Migration Event Handler +```python +class MigrationEventHandler: + def __init__(self, old_store, new_store): + self.old_store = old_store + self.new_store = new_store + self.event_log = [] + + def handle_update(self, entity_id, old_data, new_data): + # Log the change as an event + event = MigrationEvent( + entity_id=entity_id, + event_type="entity_migrated", + old_data=old_data, + new_data=new_data, + timestamp=datetime.now() + ) + + self.event_log.append(event) + + # Apply to new store + success = self.new_store.update(entity_id, new_data) + + if not success: + # Mark for retry + event.status = "failed" + self.schedule_retry(event) + + return success + + def replay_events(self, from_timestamp=None): + """Replay events for reconciliation""" + events = self.get_events_since(from_timestamp) + for event in events: + self.apply_event(event) +``` + +## Service Migration Patterns + +### 1. Strangler Fig Pattern + +**Use Case:** Legacy system replacement +**Complexity:** Medium-High +**Risk Level:** Medium + +#### Description +Gradually replace legacy functionality by intercepting calls and routing them to new services, eventually "strangling" the legacy system. 
+ +#### Implementation Architecture + +```yaml +# API Gateway Configuration +apiVersion: networking.istio.io/v1beta1 +kind: VirtualService +metadata: + name: user-service-migration +spec: + http: + - match: + - headers: + migration-flag: + exact: "new" + route: + - destination: + host: user-service-v2 + - route: + - destination: + host: user-service-v1 +``` + +#### Strangler Proxy Implementation + +```python +class StranglerProxy: + def __init__(self): + self.legacy_service = LegacyUserService() + self.new_service = NewUserService() + self.feature_flags = FeatureFlagService() + + def handle_request(self, request): + route = self.determine_route(request) + + if route == "new": + return self.handle_with_new_service(request) + elif route == "both": + return self.handle_with_both_services(request) + else: + return self.handle_with_legacy_service(request) + + def determine_route(self, request): + user_id = request.get('user_id') + + if self.feature_flags.is_enabled("new_user_service", user_id): + if self.feature_flags.is_enabled("dual_write", user_id): + return "both" + else: + return "new" + else: + return "legacy" +``` + +### 2. Parallel Run Pattern + +**Use Case:** Risk mitigation for critical services +**Complexity:** Medium +**Risk Level:** Low-Medium + +#### Description +Run both old and new services simultaneously, comparing outputs to validate correctness before switching traffic. 
+ +#### Implementation + +```python +class ParallelRunManager: + def __init__(self): + self.primary_service = PrimaryService() + self.candidate_service = CandidateService() + self.comparator = ResponseComparator() + self.metrics = MetricsCollector() + + async def parallel_execute(self, request): + # Execute both services concurrently + primary_task = asyncio.create_task( + self.primary_service.process(request) + ) + candidate_task = asyncio.create_task( + self.candidate_service.process(request) + ) + + # Always wait for primary + primary_result = await primary_task + + try: + # Wait for candidate with timeout + candidate_result = await asyncio.wait_for( + candidate_task, timeout=5.0 + ) + + # Compare results + comparison = self.comparator.compare( + primary_result, candidate_result + ) + + # Record metrics + self.metrics.record_comparison(comparison) + + except asyncio.TimeoutError: + self.metrics.record_timeout("candidate") + except Exception as e: + self.metrics.record_error("candidate", str(e)) + + # Always return primary result + return primary_result +``` + +### 3. Blue-Green Deployment Pattern + +**Use Case:** Zero-downtime service updates +**Complexity:** Low-Medium +**Risk Level:** Low + +#### Description +Maintain two identical production environments (blue and green), switching traffic between them for deployments. 
+ +#### Kubernetes Implementation + +```yaml +# Blue Deployment +apiVersion: apps/v1 +kind: Deployment +metadata: + name: app-blue + labels: + version: blue +spec: + replicas: 3 + selector: + matchLabels: + app: myapp + version: blue + template: + metadata: + labels: + app: myapp + version: blue + spec: + containers: + - name: app + image: myapp:v1.0.0 + +--- +# Green Deployment +apiVersion: apps/v1 +kind: Deployment +metadata: + name: app-green + labels: + version: green +spec: + replicas: 3 + selector: + matchLabels: + app: myapp + version: green + template: + metadata: + labels: + app: myapp + version: green + spec: + containers: + - name: app + image: myapp:v2.0.0 + +--- +# Service (switches between blue and green) +apiVersion: v1 +kind: Service +metadata: + name: app-service +spec: + selector: + app: myapp + version: blue # Change to green for deployment + ports: + - port: 80 + targetPort: 8080 +``` + +## Infrastructure Migration Patterns + +### 1. Lift and Shift Pattern + +**Use Case:** Quick cloud migration with minimal changes +**Complexity:** Low-Medium +**Risk Level:** Low + +#### Description +Migrate applications to cloud infrastructure with minimal or no code changes, focusing on infrastructure compatibility. 
+ +#### Migration Checklist + +```yaml +Pre-Migration Assessment: + - inventory_current_infrastructure: + - servers_and_specifications + - network_configuration + - storage_requirements + - security_configurations + - identify_dependencies: + - database_connections + - external_service_integrations + - file_system_dependencies + - assess_compatibility: + - operating_system_versions + - runtime_dependencies + - license_requirements + +Migration Execution: + - provision_target_infrastructure: + - compute_instances + - storage_volumes + - network_configuration + - security_groups + - migrate_data: + - database_backup_restore + - file_system_replication + - configuration_files + - update_configurations: + - connection_strings + - environment_variables + - dns_records + - validate_functionality: + - application_health_checks + - end_to_end_testing + - performance_validation +``` + +### 2. Hybrid Cloud Migration + +**Use Case:** Gradual cloud adoption with on-premises integration +**Complexity:** High +**Risk Level:** Medium-High + +#### Description +Maintain some components on-premises while migrating others to cloud, requiring secure connectivity and data synchronization. 
+ +#### Network Architecture + +```hcl +# Terraform configuration for hybrid connectivity +resource "aws_vpc" "main" { + cidr_block = "10.0.0.0/16" + enable_dns_hostnames = true + enable_dns_support = true +} + +resource "aws_vpn_gateway" "main" { + vpc_id = aws_vpc.main.id + + tags = { + Name = "hybrid-vpn-gateway" + } +} + +resource "aws_customer_gateway" "main" { + bgp_asn = 65000 + ip_address = var.on_premises_public_ip + type = "ipsec.1" + + tags = { + Name = "on-premises-gateway" + } +} + +resource "aws_vpn_connection" "main" { + vpn_gateway_id = aws_vpn_gateway.main.id + customer_gateway_id = aws_customer_gateway.main.id + type = "ipsec.1" + static_routes_only = true +} +``` + +#### Data Synchronization Pattern + +```python +class HybridDataSync: + def __init__(self): + self.on_prem_db = OnPremiseDatabase() + self.cloud_db = CloudDatabase() + self.sync_log = SyncLogManager() + + async def bidirectional_sync(self): + """Synchronize data between on-premises and cloud""" + + # Get last sync timestamp + last_sync = self.sync_log.get_last_sync_time() + + # Sync on-prem changes to cloud + on_prem_changes = self.on_prem_db.get_changes_since(last_sync) + for change in on_prem_changes: + await self.apply_change_to_cloud(change) + + # Sync cloud changes to on-prem + cloud_changes = self.cloud_db.get_changes_since(last_sync) + for change in cloud_changes: + await self.apply_change_to_on_prem(change) + + # Handle conflicts + conflicts = self.detect_conflicts(on_prem_changes, cloud_changes) + for conflict in conflicts: + await self.resolve_conflict(conflict) + + # Update sync timestamp + self.sync_log.record_sync_completion() + + async def apply_change_to_cloud(self, change): + """Apply on-premises change to cloud database""" + try: + if change.operation == "INSERT": + await self.cloud_db.insert(change.table, change.data) + elif change.operation == "UPDATE": + await self.cloud_db.update(change.table, change.key, change.data) + elif change.operation == "DELETE": + await 
self.cloud_db.delete(change.table, change.key) + + self.sync_log.record_success(change.id, "cloud") + + except Exception as e: + self.sync_log.record_failure(change.id, "cloud", str(e)) + raise +``` + +### 3. Multi-Cloud Migration + +**Use Case:** Avoiding vendor lock-in or regulatory requirements +**Complexity:** Very High +**Risk Level:** High + +#### Description +Distribute workloads across multiple cloud providers for resilience, compliance, or cost optimization. + +#### Service Mesh Configuration + +```yaml +# Istio configuration for multi-cloud service mesh +apiVersion: networking.istio.io/v1beta1 +kind: ServiceEntry +metadata: + name: aws-service +spec: + hosts: + - aws-service.company.com + ports: + - number: 443 + name: https + protocol: HTTPS + location: MESH_EXTERNAL + resolution: DNS + +--- +apiVersion: networking.istio.io/v1beta1 +kind: VirtualService +metadata: + name: multi-cloud-routing +spec: + hosts: + - user-service + http: + - match: + - headers: + region: + exact: "us-east" + route: + - destination: + host: aws-service.company.com + weight: 100 + - match: + - headers: + region: + exact: "eu-west" + route: + - destination: + host: gcp-service.company.com + weight: 100 + - route: # Default routing + - destination: + host: user-service + subset: local + weight: 80 + - destination: + host: aws-service.company.com + weight: 20 +``` + +## Feature Flag Patterns + +### 1. 
Progressive Rollout Pattern + +**Use Case:** Gradual feature deployment with risk mitigation +**Implementation:** + +```python +class ProgressiveRollout: + def __init__(self, feature_name): + self.feature_name = feature_name + self.rollout_percentage = 0 + self.user_buckets = {} + + def is_enabled_for_user(self, user_id): + # Consistent user bucketing + user_hash = hashlib.md5(f"{self.feature_name}:{user_id}".encode()).hexdigest() + bucket = int(user_hash, 16) % 100 + + return bucket < self.rollout_percentage + + def increase_rollout(self, target_percentage, step_size=10): + """Gradually increase rollout percentage""" + while self.rollout_percentage < target_percentage: + self.rollout_percentage = min( + self.rollout_percentage + step_size, + target_percentage + ) + + # Monitor metrics before next increase + yield self.rollout_percentage + time.sleep(300) # Wait 5 minutes between increases +``` + +### 2. Circuit Breaker Pattern + +**Use Case:** Automatic fallback during migration issues + +```python +class MigrationCircuitBreaker: + def __init__(self, failure_threshold=5, timeout=60): + self.failure_count = 0 + self.failure_threshold = failure_threshold + self.timeout = timeout + self.last_failure_time = None + self.state = 'CLOSED' # CLOSED, OPEN, HALF_OPEN + + def call_new_service(self, request): + if self.state == 'OPEN': + if self.should_attempt_reset(): + self.state = 'HALF_OPEN' + else: + return self.fallback_to_legacy(request) + + try: + response = self.new_service.process(request) + self.on_success() + return response + except Exception as e: + self.on_failure() + return self.fallback_to_legacy(request) + + def on_success(self): + self.failure_count = 0 + self.state = 'CLOSED' + + def on_failure(self): + self.failure_count += 1 + self.last_failure_time = time.time() + + if self.failure_count >= self.failure_threshold: + self.state = 'OPEN' + + def should_attempt_reset(self): + return (time.time() - self.last_failure_time) >= self.timeout +``` + +## 
Migration Anti-Patterns + +### 1. Big Bang Migration (Anti-Pattern) + +**Why to Avoid:** +- High risk of complete system failure +- Difficult to roll back +- Extended downtime +- All-or-nothing deployment + +**Better Alternative:** Use incremental migration patterns like Strangler Fig or Parallel Run. + +### 2. No Rollback Plan (Anti-Pattern) + +**Why to Avoid:** +- Cannot recover from failures +- Increases business risk +- Panic-driven decisions during issues + +**Better Alternative:** Always implement comprehensive rollback procedures before migration. + +### 3. Insufficient Testing (Anti-Pattern) + +**Why to Avoid:** +- Unknown compatibility issues +- Performance degradation +- Data corruption risks + +**Better Alternative:** Implement comprehensive testing at each migration phase. + +## Pattern Selection Matrix + +| Migration Type | Complexity | Downtime Tolerance | Recommended Pattern | +|---------------|------------|-------------------|-------------------| +| Schema Change | Low | Zero | Expand-Contract | +| Schema Change | High | Zero | Parallel Schema | +| Service Replace | Medium | Zero | Strangler Fig | +| Service Update | Low | Zero | Blue-Green | +| Data Migration | High | Some | Event Sourcing | +| Infrastructure | Low | Some | Lift and Shift | +| Infrastructure | High | Zero | Hybrid Cloud | + +## Success Metrics + +### Technical Metrics +- Migration completion rate +- System availability during migration +- Performance impact (response time, throughput) +- Error rate changes +- Rollback execution time + +### Business Metrics +- Customer impact score +- Revenue protection +- Time to value realization +- Stakeholder satisfaction + +### Operational Metrics +- Team efficiency +- Knowledge transfer effectiveness +- Post-migration support requirements +- Documentation completeness + +## Lessons Learned + +### Common Pitfalls +1. **Underestimating data dependencies** - Always map all data relationships +2. 
**Insufficient monitoring** - Implement comprehensive observability before migration +3. **Poor communication** - Keep all stakeholders informed throughout the process +4. **Rushed timelines** - Allow adequate time for testing and validation +5. **Ignoring performance impact** - Benchmark before and after migration + +### Best Practices +1. **Start with low-risk migrations** - Build confidence and experience +2. **Automate everything possible** - Reduce human error and increase repeatability +3. **Test rollback procedures** - Ensure you can recover from any failure +4. **Monitor continuously** - Use real-time dashboards and alerting +5. **Document everything** - Create comprehensive runbooks and documentation + +This catalog serves as a reference for selecting appropriate migration patterns based on specific requirements, risk tolerance, and technical constraints. \ No newline at end of file diff --git a/engineering/migration-architect/references/zero_downtime_techniques.md b/engineering/migration-architect/references/zero_downtime_techniques.md new file mode 100644 index 0000000..b2144ad --- /dev/null +++ b/engineering/migration-architect/references/zero_downtime_techniques.md @@ -0,0 +1,1104 @@ +# Zero-Downtime Migration Techniques + +## Overview + +Zero-downtime migrations are critical for maintaining business continuity and user experience during system changes. This guide provides comprehensive techniques, patterns, and implementation strategies for achieving true zero-downtime migrations across different system components. + +## Core Principles + +### 1. Backward Compatibility +Every change must be backward compatible until all clients have migrated to the new version. + +### 2. Incremental Changes +Break large changes into smaller, independent increments that can be deployed and validated separately. + +### 3. Feature Flags +Use feature toggles to control the rollout of new functionality without code deployments. + +### 4. 
Graceful Degradation +Ensure systems continue to function even when some components are unavailable or degraded. + +## Database Zero-Downtime Techniques + +### Schema Evolution Without Downtime + +#### 1. Additive Changes Only +**Principle:** Only add new elements; never remove or modify existing ones directly. + +```sql +-- ✅ Good: Additive change +ALTER TABLE users ADD COLUMN middle_name VARCHAR(50); + +-- ❌ Bad: Breaking change +ALTER TABLE users DROP COLUMN email; +``` + +#### 2. Multi-Phase Schema Evolution + +**Phase 1: Expand** +```sql +-- Add new column alongside existing one +ALTER TABLE users ADD COLUMN email_address VARCHAR(255); + +-- Add index concurrently (PostgreSQL) +CREATE INDEX CONCURRENTLY idx_users_email_address ON users(email_address); +``` + +**Phase 2: Dual Write (Application Code)** +```python +class UserService: + def create_user(self, name, email): + # Write to both old and new columns + user = User( + name=name, + email=email, # Old column + email_address=email # New column + ) + return user.save() + + def update_email(self, user_id, new_email): + # Update both columns + user = User.objects.get(id=user_id) + user.email = new_email + user.email_address = new_email + user.save() + return user +``` + +**Phase 3: Backfill Data** +```sql +-- Backfill existing data (in batches) +UPDATE users +SET email_address = email +WHERE email_address IS NULL + AND id BETWEEN ? AND ?; +``` + +**Phase 4: Switch Reads** +```python +class UserService: + def get_user_email(self, user_id): + user = User.objects.get(id=user_id) + # Switch to reading from new column + return user.email_address or user.email +``` + +**Phase 5: Contract** +```sql +-- After validation, remove old column +ALTER TABLE users DROP COLUMN email; +-- Rename new column if needed +ALTER TABLE users RENAME COLUMN email_address TO email; +``` + +### 3. 
Online Schema Changes + +#### PostgreSQL Techniques + +```sql +-- Safe column addition +ALTER TABLE orders ADD COLUMN status_new VARCHAR(20) DEFAULT 'pending'; + +-- Safe index creation +CREATE INDEX CONCURRENTLY idx_orders_status_new ON orders(status_new); + +-- Safe constraint addition (after data validation) +ALTER TABLE orders ADD CONSTRAINT check_status_new +CHECK (status_new IN ('pending', 'processing', 'completed', 'cancelled')); +``` + +#### MySQL Techniques + +```sql +-- Use pt-online-schema-change for large tables +pt-online-schema-change \ + --alter "ADD COLUMN status VARCHAR(20) DEFAULT 'pending'" \ + --execute \ + D=mydb,t=orders + +-- Online DDL (MySQL 5.6+) +ALTER TABLE orders +ADD COLUMN priority INT DEFAULT 1, +ALGORITHM=INPLACE, +LOCK=NONE; +``` + +### 4. Data Migration Strategies + +#### Chunked Data Migration + +```python +class DataMigrator: + def __init__(self, source_table, target_table, chunk_size=1000): + self.source_table = source_table + self.target_table = target_table + self.chunk_size = chunk_size + + def migrate_data(self): + last_id = 0 + total_migrated = 0 + + while True: + # Get next chunk + chunk = self.get_chunk(last_id, self.chunk_size) + + if not chunk: + break + + # Transform and migrate chunk + for record in chunk: + transformed = self.transform_record(record) + self.insert_or_update(transformed) + + last_id = chunk[-1]['id'] + total_migrated += len(chunk) + + # Brief pause to avoid overwhelming the database + time.sleep(0.1) + + self.log_progress(total_migrated) + + return total_migrated + + def get_chunk(self, last_id, limit): + return db.execute(f""" + SELECT * FROM {self.source_table} + WHERE id > %s + ORDER BY id + LIMIT %s + """, (last_id, limit)) +``` + +#### Change Data Capture (CDC) + +```python +class CDCProcessor: + def __init__(self): + self.kafka_consumer = KafkaConsumer('db_changes') + self.target_db = TargetDatabase() + + def process_changes(self): + for message in self.kafka_consumer: + change = 
json.loads(message.value) + + if change['operation'] == 'INSERT': + self.handle_insert(change) + elif change['operation'] == 'UPDATE': + self.handle_update(change) + elif change['operation'] == 'DELETE': + self.handle_delete(change) + + def handle_insert(self, change): + transformed_data = self.transform_data(change['after']) + self.target_db.insert(change['table'], transformed_data) + + def handle_update(self, change): + key = change['key'] + transformed_data = self.transform_data(change['after']) + self.target_db.update(change['table'], key, transformed_data) +``` + +## Application Zero-Downtime Techniques + +### 1. Blue-Green Deployments + +#### Infrastructure Setup + +```yaml +# Blue Environment (Current Production) +apiVersion: apps/v1 +kind: Deployment +metadata: + name: app-blue + labels: + version: blue + app: myapp +spec: + replicas: 3 + selector: + matchLabels: + app: myapp + version: blue + template: + metadata: + labels: + app: myapp + version: blue + spec: + containers: + - name: app + image: myapp:1.0.0 + ports: + - containerPort: 8080 + readinessProbe: + httpGet: + path: /health + port: 8080 + initialDelaySeconds: 10 + periodSeconds: 5 + livenessProbe: + httpGet: + path: /health + port: 8080 + initialDelaySeconds: 15 + periodSeconds: 10 + +--- +# Green Environment (New Version) +apiVersion: apps/v1 +kind: Deployment +metadata: + name: app-green + labels: + version: green + app: myapp +spec: + replicas: 3 + selector: + matchLabels: + app: myapp + version: green + template: + metadata: + labels: + app: myapp + version: green + spec: + containers: + - name: app + image: myapp:2.0.0 + ports: + - containerPort: 8080 + readinessProbe: + httpGet: + path: /health + port: 8080 + initialDelaySeconds: 10 + periodSeconds: 5 +``` + +#### Service Switching + +```yaml +# Service (switches between blue and green) +apiVersion: v1 +kind: Service +metadata: + name: app-service +spec: + selector: + app: myapp + version: blue # Switch to 'green' for deployment + ports: + 
- port: 80 + targetPort: 8080 + type: LoadBalancer +``` + +#### Automated Deployment Script + +```bash +#!/bin/bash + +# Blue-Green Deployment Script +NAMESPACE="production" +APP_NAME="myapp" +NEW_IMAGE="myapp:2.0.0" + +# Determine current and target environments +CURRENT_VERSION=$(kubectl get service $APP_NAME-service -o jsonpath='{.spec.selector.version}') + +if [ "$CURRENT_VERSION" = "blue" ]; then + TARGET_VERSION="green" +else + TARGET_VERSION="blue" +fi + +echo "Current version: $CURRENT_VERSION" +echo "Target version: $TARGET_VERSION" + +# Update target environment with new image +kubectl set image deployment/$APP_NAME-$TARGET_VERSION app=$NEW_IMAGE + +# Wait for rollout to complete +kubectl rollout status deployment/$APP_NAME-$TARGET_VERSION --timeout=300s + +# Run health checks +echo "Running health checks..." +TARGET_IP=$(kubectl get service $APP_NAME-$TARGET_VERSION -o jsonpath='{.status.loadBalancer.ingress[0].ip}') + +for i in {1..30}; do + if curl -f http://$TARGET_IP/health; then + echo "Health check passed" + break + fi + + if [ $i -eq 30 ]; then + echo "Health check failed after 30 attempts" + exit 1 + fi + + sleep 2 +done + +# Switch traffic to new version +kubectl patch service $APP_NAME-service -p '{"spec":{"selector":{"version":"'$TARGET_VERSION'"}}}' + +echo "Traffic switched to $TARGET_VERSION" + +# Monitor for 5 minutes +echo "Monitoring new version..." +sleep 300 + +# Check if rollback is needed +ERROR_RATE=$(curl -s "http://monitoring.company.com/api/error_rate?service=$APP_NAME" | jq '.error_rate') + +if (( $(echo "$ERROR_RATE > 0.05" | bc -l) )); then + echo "Error rate too high ($ERROR_RATE), rolling back..." + kubectl patch service $APP_NAME-service -p '{"spec":{"selector":{"version":"'$CURRENT_VERSION'"}}}' + exit 1 +fi + +echo "Deployment successful!" +``` + +### 2. 
Canary Deployments + +#### Progressive Canary with Istio + +```yaml +# Destination Rule +apiVersion: networking.istio.io/v1beta1 +kind: DestinationRule +metadata: + name: myapp-destination +spec: + host: myapp + subsets: + - name: v1 + labels: + version: v1 + - name: v2 + labels: + version: v2 + +--- +# Virtual Service for Canary +apiVersion: networking.istio.io/v1beta1 +kind: VirtualService +metadata: + name: myapp-canary +spec: + hosts: + - myapp + http: + - match: + - headers: + canary: + exact: "true" + route: + - destination: + host: myapp + subset: v2 + - route: + - destination: + host: myapp + subset: v1 + weight: 95 + - destination: + host: myapp + subset: v2 + weight: 5 +``` + +#### Automated Canary Controller + +```python +class CanaryController: + def __init__(self, istio_client, prometheus_client): + self.istio = istio_client + self.prometheus = prometheus_client + self.canary_weight = 5 + self.max_weight = 100 + self.weight_increment = 5 + self.validation_window = 300 # 5 minutes + + async def deploy_canary(self, app_name, new_version): + """Deploy new version using canary strategy""" + + # Start with small percentage + await self.update_traffic_split(app_name, self.canary_weight) + + while self.canary_weight < self.max_weight: + # Monitor metrics for validation window + await asyncio.sleep(self.validation_window) + + # Check canary health + if not await self.is_canary_healthy(app_name, new_version): + await self.rollback_canary(app_name) + raise Exception("Canary deployment failed health checks") + + # Increase traffic to canary + self.canary_weight = min( + self.canary_weight + self.weight_increment, + self.max_weight + ) + + await self.update_traffic_split(app_name, self.canary_weight) + + print(f"Canary traffic increased to {self.canary_weight}%") + + print("Canary deployment completed successfully") + + async def is_canary_healthy(self, app_name, version): + """Check if canary version is healthy""" + + # Check error rate + error_rate = await 
self.prometheus.query( + f'rate(http_requests_total{{app="{app_name}", version="{version}", status=~"5.."}}' + f'[5m]) / rate(http_requests_total{{app="{app_name}", version="{version}"}}[5m])' + ) + + if error_rate > 0.05: # 5% error rate threshold + return False + + # Check response time + p95_latency = await self.prometheus.query( + f'histogram_quantile(0.95, rate(http_request_duration_seconds_bucket' + f'{{app="{app_name}", version="{version}"}}[5m]))' + ) + + if p95_latency > 2.0: # 2 second p95 threshold + return False + + return True + + async def update_traffic_split(self, app_name, canary_weight): + """Update Istio virtual service with new traffic split""" + + stable_weight = 100 - canary_weight + + virtual_service = { + "apiVersion": "networking.istio.io/v1beta1", + "kind": "VirtualService", + "metadata": {"name": f"{app_name}-canary"}, + "spec": { + "hosts": [app_name], + "http": [{ + "route": [ + { + "destination": {"host": app_name, "subset": "v1"}, # stable subset + "weight": stable_weight + }, + { + "destination": {"host": app_name, "subset": "v2"}, # canary subset + "weight": canary_weight + } + ] + }] + } + } + + await self.istio.apply_virtual_service(virtual_service) +``` + +### 3. 
Rolling Updates + +#### Kubernetes Rolling Update Strategy + +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: rolling-update-app +spec: + replicas: 10 + strategy: + type: RollingUpdate + rollingUpdate: + maxSurge: 2 # Can have 2 extra pods during update + maxUnavailable: 1 # At most 1 pod can be unavailable + selector: + matchLabels: + app: rolling-update-app + template: + metadata: + labels: + app: rolling-update-app + spec: + containers: + - name: app + image: myapp:2.0.0 + ports: + - containerPort: 8080 + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 5 + periodSeconds: 2 + timeoutSeconds: 1 + successThreshold: 1 + failureThreshold: 3 + livenessProbe: + httpGet: + path: /live + port: 8080 + initialDelaySeconds: 10 + periodSeconds: 10 +``` + +#### Custom Rolling Update Controller + +```python +class RollingUpdateController: + def __init__(self, k8s_client): + self.k8s = k8s_client + self.max_surge = 2 + self.max_unavailable = 1 + + async def rolling_update(self, deployment_name, new_image): + """Perform rolling update with custom logic""" + + deployment = await self.k8s.get_deployment(deployment_name) + total_replicas = deployment.spec.replicas + + # Calculate batch size + batch_size = max(1, min(self.max_surge, total_replicas // 5)) # Up to 20% at a time, capped at max_surge, never below 1 pod + + updated_pods = [] + + for i in range(0, total_replicas, batch_size): + batch_end = min(i + batch_size, total_replicas) + + # Update batch of pods + for pod_index in range(i, batch_end): + old_pod = await self.get_pod_by_index(deployment_name, pod_index) + + # Create new pod with new image + new_pod = await self.create_updated_pod(old_pod, new_image) + + # Wait for new pod to be ready + await self.wait_for_pod_ready(new_pod.metadata.name) + + # Remove old pod + await self.k8s.delete_pod(old_pod.metadata.name) + + updated_pods.append(new_pod) + + # Brief pause between pod updates + await asyncio.sleep(2) + + # Validate batch health before continuing + if not await 
self.validate_batch_health(updated_pods[-batch_size:]): + # Rollback batch + await self.rollback_batch(updated_pods[-batch_size:]) + raise Exception("Rolling update failed validation") + + print(f"Updated {batch_end}/{total_replicas} pods") + + print("Rolling update completed successfully") +``` + +## Load Balancer and Traffic Management + +### 1. Weighted Routing + +#### NGINX Configuration + +```nginx +upstream backend { + # Old version - 80% traffic + server old-app-1:8080 weight=4; + server old-app-2:8080 weight=4; + + # New version - 20% traffic + server new-app-1:8080 weight=1; + server new-app-2:8080 weight=1; +} + +server { + listen 80; + location / { + proxy_pass http://backend; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + + # Health check headers + proxy_set_header X-Health-Check-Timeout 5s; + } +} +``` + +#### HAProxy Configuration + +```haproxy +backend app_servers + balance roundrobin + option httpchk GET /health + + # Old version servers + server old-app-1 old-app-1:8080 check weight 80 + server old-app-2 old-app-2:8080 check weight 80 + + # New version servers + server new-app-1 new-app-1:8080 check weight 20 + server new-app-2 new-app-2:8080 check weight 20 + +frontend app_frontend + bind *:80 + default_backend app_servers + + # Custom health check endpoint + acl health_check path_beg /health + http-request return status 200 content-type text/plain string "OK" if health_check +``` + +### 2. 
Circuit Breaker Implementation + +```python +class CircuitBreaker: + def __init__(self, failure_threshold=5, recovery_timeout=60, expected_exception=Exception): + self.failure_threshold = failure_threshold + self.recovery_timeout = recovery_timeout + self.expected_exception = expected_exception + self.failure_count = 0 + self.last_failure_time = None + self.state = 'CLOSED' # CLOSED, OPEN, HALF_OPEN + + def call(self, func, *args, **kwargs): + """Execute function with circuit breaker protection""" + + if self.state == 'OPEN': + if self._should_attempt_reset(): + self.state = 'HALF_OPEN' + else: + raise CircuitBreakerOpenException("Circuit breaker is OPEN") + + try: + result = func(*args, **kwargs) + self._on_success() + return result + except self.expected_exception as e: + self._on_failure() + raise + + def _should_attempt_reset(self): + return ( + self.last_failure_time and + time.time() - self.last_failure_time >= self.recovery_timeout + ) + + def _on_success(self): + self.failure_count = 0 + self.state = 'CLOSED' + + def _on_failure(self): + self.failure_count += 1 + self.last_failure_time = time.time() + + if self.failure_count >= self.failure_threshold: + self.state = 'OPEN' + +# Usage with service migration (route calls through the breaker's call() method) +new_service_breaker = CircuitBreaker(failure_threshold=3, recovery_timeout=30) +def call_new_service(request): + return new_service_breaker.call(new_service.process, request) + +def handle_request(request): + try: + return call_new_service(request) + except CircuitBreakerOpenException: + # Fallback to old service + return old_service.process(request) +``` + +## Monitoring and Validation + +### 1. 
Health Check Implementation + +```python +class HealthChecker: + def __init__(self): + self.checks = [] + + def add_check(self, name, check_func, timeout=5): + self.checks.append({ + 'name': name, + 'func': check_func, + 'timeout': timeout + }) + + async def run_checks(self): + """Run all health checks and return status""" + results = {} + overall_status = 'healthy' + + for check in self.checks: + try: + result = await asyncio.wait_for( + check['func'](), + timeout=check['timeout'] + ) + results[check['name']] = { + 'status': 'healthy', + 'result': result + } + except asyncio.TimeoutError: + results[check['name']] = { + 'status': 'unhealthy', + 'error': 'timeout' + } + overall_status = 'unhealthy' + except Exception as e: + results[check['name']] = { + 'status': 'unhealthy', + 'error': str(e) + } + overall_status = 'unhealthy' + + return { + 'status': overall_status, + 'checks': results, + 'timestamp': datetime.utcnow().isoformat() + } + +# Example health checks +health_checker = HealthChecker() + +async def database_check(): + """Check database connectivity""" + result = await db.execute("SELECT 1") + return result is not None + +async def external_api_check(): + """Check external API availability""" + response = await http_client.get("https://api.example.com/health") + return response.status_code == 200 + +async def memory_check(): + """Check memory usage""" + memory_usage = psutil.virtual_memory().percent + if memory_usage > 90: + raise Exception(f"Memory usage too high: {memory_usage}%") + return f"Memory usage: {memory_usage}%" + +health_checker.add_check("database", database_check) +health_checker.add_check("external_api", external_api_check) +health_checker.add_check("memory", memory_check) +``` + +### 2. 
class MigrationMetrics:
    """Labelled metrics describing the progress and health of a migration.

    ``Counter``, ``Histogram`` and ``Gauge`` must already be in scope where
    this class is defined (presumably imported from the ``prometheus_client``
    package -- confirm against the host module).
    """

    def __init__(self, prometheus_client):
        """Register the four migration metrics.

        Args:
            prometheus_client: Client handle, kept on ``self.prometheus``.
        """
        self.prometheus = prometheus_client

        # Define custom metrics
        self.migration_progress = Counter(
            'migration_progress_total',
            'Total migration operations completed',
            ['operation', 'status']
        )

        self.migration_duration = Histogram(
            'migration_operation_duration_seconds',
            'Time spent on migration operations',
            ['operation']
        )

        self.system_health = Gauge(
            'system_health_score',
            'Overall system health score (0-1)',
            ['component']
        )

        self.traffic_split = Gauge(
            'traffic_split_percentage',
            'Percentage of traffic going to each version',
            ['version']
        )

    def record_migration_step(self, operation, status, duration=None):
        """Record completion of a migration step.

        Args:
            operation: Label value identifying the migration operation.
            status: Label value for the outcome (e.g. ``'success'``).
            duration: Elapsed seconds, or ``None`` to skip the histogram.
        """
        self.migration_progress.labels(operation=operation, status=status).inc()

        # Test against None, not truthiness: a step that took 0.0 seconds is
        # still a real observation and must reach the histogram.
        if duration is not None:
            self.migration_duration.labels(operation=operation).observe(duration)

    def update_health_score(self, component, score):
        """Set the health gauge for ``component`` to ``score``."""
        self.system_health.labels(component=component).set(score)

    def update_traffic_split(self, version_weights):
        """Set the traffic-split gauge for every ``version -> weight`` pair."""
        for version, weight in version_weights.items():
            self.traffic_split.labels(version=version).set(weight)


class AutoRollbackSystem:
    """Poll deployment metrics and roll back when a trigger fires.

    The class calls ``self.alert_for_manual_rollback`` and
    ``self.notify_rollback_executed`` but does not define them here; they are
    expected to be supplied by a subclass or attached to the instance.
    """

    def __init__(self, metrics_client, deployment_client):
        """Store the clients and the built-in trigger table.

        Args:
            metrics_client: Object with awaitable ``get_current_value(name)``
                and ``get_baseline_value(name)``.
            deployment_client: Object with awaitable
                ``get_previous_revision(name)`` and
                ``rollback_to_revision(name, revision)``.
        """
        self.metrics = metrics_client
        self.deployment = deployment_client
        self.rollback_triggers = {
            'error_rate_spike': {
                'threshold': 0.05,  # 5% error rate
                'window': 300,  # 5 minutes
                'auto_rollback': True
            },
            'latency_increase': {
                'threshold': 2.0,  # 2x baseline latency
                'window': 600,  # 10 minutes
                'auto_rollback': False  # Manual confirmation required
            },
            'availability_drop': {
                'threshold': 0.95,  # Below 95% availability
                'window': 120,  # 2 minutes
                'auto_rollback': True
            }
        }

    async def monitor_and_rollback(self, deployment_name, check_interval=30):
        """Monitor a deployment until an automatic rollback has been executed.

        Args:
            deployment_name: Deployment to watch and, if needed, roll back.
            check_interval: Seconds to sleep between polling rounds.

        Returns:
            The name of the trigger that caused the automatic rollback.
        """
        while True:
            for trigger_name, config in self.rollback_triggers.items():
                if not await self.check_trigger(trigger_name, config):
                    continue

                if config['auto_rollback']:
                    await self.execute_rollback(deployment_name, trigger_name)
                    # Stop here: the metrics still describe the revision that
                    # was just replaced, so polling on would fire the same
                    # trigger again and roll back a second time.
                    return trigger_name

                await self.alert_for_manual_rollback(deployment_name, trigger_name)

            await asyncio.sleep(check_interval)

    async def check_trigger(self, trigger_name, config):
        """Return True when the rollback condition for ``trigger_name`` is met.

        Args:
            trigger_name: One of the keys of ``self.rollback_triggers``.
            config: That trigger's settings; only ``'threshold'`` is read.

        Returns:
            ``True`` if the trigger fired, ``False`` otherwise (including for
            unknown trigger names).
        """
        current_value = await self.metrics.get_current_value(trigger_name)
        threshold = config['threshold']

        if trigger_name == 'error_rate_spike':
            return current_value > threshold
        if trigger_name == 'latency_increase':
            # Only this trigger is relative to a baseline, so only this
            # branch pays for the extra metrics query.
            baseline_value = await self.metrics.get_baseline_value(trigger_name)
            return current_value > baseline_value * threshold
        if trigger_name == 'availability_drop':
            return current_value < threshold

        return False

    async def execute_rollback(self, deployment_name, reason):
        """Roll ``deployment_name`` back to its previous revision and notify.

        Args:
            deployment_name: Deployment to roll back.
            reason: Trigger name, passed on to the notification.
        """
        print(f"Executing automatic rollback for {deployment_name}. Reason: {reason}")

        # Get previous revision
        previous_revision = await self.deployment.get_previous_revision(deployment_name)

        # Perform rollback
        await self.deployment.rollback_to_revision(deployment_name, previous_revision)

        # Notify stakeholders
        await self.notify_rollback_executed(deployment_name, reason)
class DataRollbackManager:
    """Create pre-migration rollback points and restore from them.

    The helper coroutines used below (``capture_schema_snapshot``,
    ``store_rollback_metadata``, ``get_rollback_metadata``,
    ``stop_application_traffic``, ``validate_data_integrity``,
    ``update_application_config``, ``resume_application_traffic`` and
    ``escalate_rollback_failure``) are not defined in this class body; they
    are expected to be supplied by a subclass or attached to the instance.
    """

    def __init__(self, database_client, backup_service):
        """Keep the database client and the backup service.

        Args:
            database_client: Stored on ``self.db``.
            backup_service: Object with awaitable ``create_backup(label)`` and
                ``restore_from_backup(location)``.
        """
        self.db = database_client
        self.backup = backup_service

    async def create_rollback_point(self, migration_id):
        """Create a rollback point before migration.

        Args:
            migration_id: Identifier embedded in the backup label and stored
                in the metadata.

        Returns:
            Dict with ``migration_id``, ``timestamp`` (timezone-aware UTC),
            ``backup_location`` and ``schema_snapshot``.
        """
        # Local import keeps the block self-contained; ``datetime.utcnow()``
        # is deprecated and yields a naive value that is easy to misread as
        # local time.
        from datetime import datetime, timezone

        created_at = datetime.now(timezone.utc)

        rollback_point = {
            'migration_id': migration_id,
            'timestamp': created_at,
            'backup_location': None,
            'schema_snapshot': None
        }

        # Create database backup. The label's epoch suffix is derived from
        # the same instant as ``timestamp`` so the two can never disagree.
        backup_path = await self.backup.create_backup(
            f"pre_migration_{migration_id}_{int(created_at.timestamp())}"
        )
        rollback_point['backup_location'] = backup_path

        # Capture schema snapshot
        schema_snapshot = await self.capture_schema_snapshot()
        rollback_point['schema_snapshot'] = schema_snapshot

        # Store rollback point metadata
        await self.store_rollback_metadata(rollback_point)

        return rollback_point

    async def execute_rollback(self, migration_id):
        """Execute data rollback to the point recorded for ``migration_id``.

        Args:
            migration_id: Migration whose rollback point should be restored.

        Raises:
            LookupError: No rollback point is stored for ``migration_id``.
            Exception: Any failure during restore/validation is escalated via
                ``escalate_rollback_failure`` and then re-raised.
        """
        rollback_point = await self.get_rollback_metadata(migration_id)

        if not rollback_point:
            # LookupError is still an Exception, so existing broad handlers
            # keep working while new callers can catch this case precisely.
            raise LookupError(f"No rollback point found for migration {migration_id}")

        # Stop application traffic
        await self.stop_application_traffic()

        try:
            # Restore from backup
            await self.backup.restore_from_backup(
                rollback_point['backup_location']
            )

            # Validate data integrity
            await self.validate_data_integrity(
                rollback_point['schema_snapshot']
            )

            # Update application configuration
            await self.update_application_config(rollback_point)

            # Resume application traffic
            await self.resume_application_traffic()

            print(f"Data rollback completed successfully for migration {migration_id}")

        except Exception as e:
            # If rollback fails, we have a serious problem. Traffic is left
            # stopped on purpose: the data is in an unknown state.
            await self.escalate_rollback_failure(migration_id, str(e))
            raise
\ No newline at end of file diff --git a/engineering/migration-architect/scripts/compatibility_checker.py b/engineering/migration-architect/scripts/compatibility_checker.py new file mode 100644 index 0000000..acd1297 --- /dev/null +++ b/engineering/migration-architect/scripts/compatibility_checker.py @@ -0,0 +1,883 @@ +#!/usr/bin/env python3 +""" +Compatibility Checker - Analyze schema and API compatibility between versions + +This tool analyzes schema and API changes between versions and identifies backward +compatibility issues including breaking changes, data type mismatches, missing fields, +constraint violations, and generates migration scripts suggestions. + +Author: Migration Architect Skill +Version: 1.0.0 +License: MIT +""" + +import json +import argparse +import sys +import re +import datetime +from typing import Dict, List, Any, Optional, Tuple, Set +from dataclasses import dataclass, asdict +from enum import Enum + + +class ChangeType(Enum): + """Types of changes detected""" + BREAKING = "breaking" + POTENTIALLY_BREAKING = "potentially_breaking" + NON_BREAKING = "non_breaking" + ADDITIVE = "additive" + + +class CompatibilityLevel(Enum): + """Compatibility assessment levels""" + FULLY_COMPATIBLE = "fully_compatible" + BACKWARD_COMPATIBLE = "backward_compatible" + POTENTIALLY_INCOMPATIBLE = "potentially_incompatible" + BREAKING_CHANGES = "breaking_changes" + + +@dataclass +class CompatibilityIssue: + """Individual compatibility issue""" + type: str + severity: str + description: str + field_path: str + old_value: Any + new_value: Any + impact: str + suggested_migration: str + affected_operations: List[str] + + +@dataclass +class MigrationScript: + """Migration script suggestion""" + script_type: str # sql, api, config + description: str + script_content: str + rollback_script: str + dependencies: List[str] + validation_query: str + + +@dataclass +class CompatibilityReport: + """Complete compatibility analysis report""" + schema_before: str + schema_after: 
def _build_type_compatibility_matrix(self) -> Dict[str, Dict[str, str]]:
    """Build data type compatibility matrix.

    Returns:
        ``{source_type: {target_type: verdict}}`` where verdict is one of
        ``"compatible"``, ``"potentially_breaking"`` or ``"breaking"``.
        Pairs that are absent are left for the caller to default.
    """
    # SQL data types compatibility
    sql_types = {
        "varchar": {
            "text": "compatible",
            "char": "potentially_breaking",  # length might be different
            "nvarchar": "compatible",
            "int": "breaking",
            "bigint": "breaking",
            "decimal": "breaking",
            "datetime": "breaking",
            "boolean": "breaking",
        },
        "int": {
            "bigint": "compatible",
            "smallint": "potentially_breaking",  # range reduction
            "decimal": "compatible",
            "float": "potentially_breaking",  # precision loss
            "varchar": "breaking",
            "boolean": "breaking",
        },
        "bigint": {
            "int": "potentially_breaking",  # range reduction
            "decimal": "compatible",
            "varchar": "breaking",
            "boolean": "breaking",
        },
        "decimal": {
            "float": "potentially_breaking",  # precision loss
            "int": "potentially_breaking",  # precision loss
            "bigint": "potentially_breaking",  # precision loss
            "varchar": "breaking",
            "boolean": "breaking",
        },
        "datetime": {
            "timestamp": "compatible",
            "date": "potentially_breaking",  # time component lost
            "varchar": "breaking",
            "int": "breaking",
        },
        "boolean": {
            "tinyint": "compatible",
            "varchar": "breaking",
            "int": "breaking",
        },
    }

    # JSON/API field types: switching between any two of them is breaking,
    # and becoming null is potentially breaking.
    json_names = ["string", "number", "boolean", "array", "object"]
    json_types = {}
    for source in json_names:
        row = {target: "breaking" for target in json_names if target != source}
        row["null"] = "potentially_breaking"
        json_types[source] = row

    # "boolean" exists in both vocabularies. Writing both rows into a single
    # dict literal lets the later one replace the earlier one (dropping
    # boolean -> tinyint), so the rows are merged key by key instead.
    matrix = {source: dict(targets) for source, targets in sql_types.items()}
    for source, targets in json_types.items():
        matrix.setdefault(source, {}).update(targets)
    return matrix
def analyze_database_schema(self, before_schema: Dict[str, Any],
                            after_schema: Dict[str, Any]) -> "CompatibilityReport":
    """Analyze database schema compatibility.

    Args:
        before_schema: Old schema; tables are read from its ``"tables"`` key.
        after_schema: New schema, same layout.

    Returns:
        The report produced by ``self._build_compatibility_report`` from the
        collected issues and migration scripts.
    """
    issues = []
    migration_scripts = []

    # ``or {}`` also covers an explicit ``"tables": null``.
    before_tables = before_schema.get("tables") or {}
    after_tables = after_schema.get("tables") or {}

    # Check for removed tables
    for table_name in before_tables:
        if table_name not in after_tables:
            issues.append(CompatibilityIssue(
                type="table_removed",
                severity="breaking",
                description=f"Table '{table_name}' has been removed",
                field_path=f"tables.{table_name}",
                old_value=before_tables[table_name],
                new_value=None,
                impact="All operations on this table will fail",
                suggested_migration=f"CREATE VIEW {table_name} AS SELECT * FROM replacement_table;",
                affected_operations=["SELECT", "INSERT", "UPDATE", "DELETE"]
            ))

    # Check for added tables
    for table_name in after_tables:
        if table_name not in before_tables:
            migration_scripts.append(MigrationScript(
                script_type="sql",
                description=f"Create new table {table_name}",
                script_content=self._generate_create_table_sql(table_name, after_tables[table_name]),
                rollback_script=f"DROP TABLE IF EXISTS {table_name};",
                dependencies=[],
                validation_query=f"SELECT COUNT(*) FROM information_schema.tables WHERE table_name = '{table_name}';"
            ))

    # Check for modified tables. Walk them in the order the old schema
    # declares them: iterating a set intersection of strings yields a
    # different order from run to run, which reshuffles the report.
    for table_name in before_tables:
        if table_name not in after_tables:
            continue
        table_issues, table_scripts = self._analyze_table_changes(
            table_name, before_tables[table_name], after_tables[table_name]
        )
        issues.extend(table_issues)
        migration_scripts.extend(table_scripts)

    return self._build_compatibility_report(
        before_schema, after_schema, issues, migration_scripts
    )
def _analyze_table_changes(self, table_name: str, before_table: Dict[str, Any],
                           after_table: Dict[str, Any]) -> "Tuple[List[CompatibilityIssue], List[MigrationScript]]":
    """Analyze changes to a specific table.

    Args:
        table_name: Name used in messages, field paths and generated SQL.
        before_table: Old definition; reads ``"columns"`` and ``"constraints"``.
        after_table: New definition, same layout.

    Returns:
        ``(issues, scripts)`` for removed, added and modified columns plus
        constraint changes.
    """
    issues = []
    scripts = []

    before_columns = before_table.get("columns") or {}
    after_columns = after_table.get("columns") or {}

    # Check for removed columns
    for col_name in before_columns:
        if col_name not in after_columns:
            issues.append(CompatibilityIssue(
                type="column_removed",
                severity="breaking",
                description=f"Column '{col_name}' removed from table '{table_name}'",
                field_path=f"tables.{table_name}.columns.{col_name}",
                old_value=before_columns[col_name],
                new_value=None,
                impact="SELECT statements including this column will fail",
                suggested_migration=f"ALTER TABLE {table_name} ADD COLUMN {col_name}_deprecated AS computed_value;",
                affected_operations=["SELECT", "INSERT", "UPDATE"]
            ))

    # Check for added columns
    for col_name in after_columns:
        if col_name in before_columns:
            continue
        col_def = after_columns[col_name]
        # Plain truthiness, matching how _generate_column_definition decides
        # whether to emit NOT NULL, so analysis and generated DDL agree.
        is_required = not col_def.get("nullable", True) and col_def.get("default") is None

        if is_required:
            issues.append(CompatibilityIssue(
                type="required_column_added",
                severity="breaking",
                description=f"Required column '{col_name}' added to table '{table_name}'",
                field_path=f"tables.{table_name}.columns.{col_name}",
                old_value=None,
                new_value=col_def,
                impact="INSERT statements without this column will fail",
                suggested_migration="Add default value or make column nullable initially",
                affected_operations=["INSERT"]
            ))

        scripts.append(MigrationScript(
            script_type="sql",
            description=f"Add column {col_name} to table {table_name}",
            script_content=f"ALTER TABLE {table_name} ADD COLUMN {self._generate_column_definition(col_name, col_def)};",
            rollback_script=f"ALTER TABLE {table_name} DROP COLUMN {col_name};",
            dependencies=[],
            validation_query=f"SELECT COUNT(*) FROM information_schema.columns WHERE table_name = '{table_name}' AND column_name = '{col_name}';"
        ))

    # Check for modified columns, in declaration order. A set intersection
    # of strings iterates in a different order on every run, which would
    # reshuffle the issues in the report.
    for col_name in before_columns:
        if col_name not in after_columns:
            continue
        col_issues, col_scripts = self._analyze_column_changes(
            table_name, col_name, before_columns[col_name], after_columns[col_name]
        )
        issues.extend(col_issues)
        scripts.extend(col_scripts)

    # Check constraint changes
    before_constraints = before_table.get("constraints", {})
    after_constraints = after_table.get("constraints", {})

    constraint_issues, constraint_scripts = self._analyze_constraint_changes(
        table_name, before_constraints, after_constraints
    )
    issues.extend(constraint_issues)
    scripts.extend(constraint_scripts)

    return issues, scripts
type="risky_type_change", + severity="potentially_breaking", + description=f"Column '{col_name}' type changed from {before_type} to {after_type} - may lose data", + field_path=f"tables.{table_name}.columns.{col_name}.type", + old_value=before_type, + new_value=after_type, + impact="Potential data loss or precision reduction", + suggested_migration=f"Validate all existing data can be converted safely", + affected_operations=["Data integrity"] + )) + + # Check nullability changes + before_nullable = before_col.get("nullable", True) + after_nullable = after_col.get("nullable", True) + + if before_nullable != after_nullable: + if before_nullable and not after_nullable: # null -> not null + issues.append(CompatibilityIssue( + type="nullability_restriction", + severity="breaking", + description=f"Column '{col_name}' changed from nullable to NOT NULL", + field_path=f"tables.{table_name}.columns.{col_name}.nullable", + old_value=before_nullable, + new_value=after_nullable, + impact="Existing NULL values will cause constraint violations", + suggested_migration=f"Update NULL values to valid defaults before applying NOT NULL constraint", + affected_operations=["INSERT", "UPDATE"] + )) + + scripts.append(MigrationScript( + script_type="sql", + description=f"Make column {col_name} NOT NULL", + script_content=f""" + -- Update NULL values first + UPDATE {table_name} SET {col_name} = 'DEFAULT_VALUE' WHERE {col_name} IS NULL; + -- Add NOT NULL constraint + ALTER TABLE {table_name} ALTER COLUMN {col_name} SET NOT NULL; + """, + rollback_script=f"ALTER TABLE {table_name} ALTER COLUMN {col_name} DROP NOT NULL;", + dependencies=[], + validation_query=f"SELECT COUNT(*) FROM {table_name} WHERE {col_name} IS NULL;" + )) + + # Check length/precision changes + before_length = before_col.get("length") + after_length = after_col.get("length") + + if before_length and after_length and before_length != after_length: + if after_length < before_length: + issues.append(CompatibilityIssue( + 
def _analyze_constraint_changes(self, table_name: str, before_constraints: Dict[str, Any],
                                after_constraints: Dict[str, Any]) -> "Tuple[List[CompatibilityIssue], List[MigrationScript]]":
    """Analyze constraint changes.

    Args:
        table_name: Table the constraints belong to.
        before_constraints: Old constraints keyed by ``primary_key``,
            ``foreign_key``, ``unique`` and ``check``; each value may be a
            list or a single item.
        after_constraints: New constraints, same layout.

    Returns:
        ``(issues, scripts)``; one issue per removed or added constraint and
        one ADD CONSTRAINT script per added constraint.
    """
    def _as_strings(value):
        # A list yields one entry per item; any other truthy value is a
        # single constraint; falsy means "no constraint of this type".
        if isinstance(value, list):
            return {str(item) for item in value}
        return {str(value)} if value else set()

    issues = []
    scripts = []

    for constraint_type in ["primary_key", "foreign_key", "unique", "check"]:
        before_set = _as_strings(before_constraints.get(constraint_type, []))
        after_set = _as_strings(after_constraints.get(constraint_type, []))

        # SQL spells these keywords with a space ("PRIMARY KEY"); the raw
        # key upper-cased ("PRIMARY_KEY") is not valid DDL and never matches
        # information_schema.table_constraints.constraint_type.
        sql_keyword = constraint_type.replace('_', ' ').upper()

        # Check for removed constraints (sorted so the report is stable)
        for constraint in sorted(before_set - after_set):
            implication = self.constraint_implications.get(constraint_type, {}).get("removed", "non_breaking")
            issues.append(CompatibilityIssue(
                type=f"{constraint_type}_removed",
                severity=implication,
                description=f"{constraint_type.replace('_', ' ').title()} constraint '{constraint}' removed from table '{table_name}'",
                field_path=f"tables.{table_name}.constraints.{constraint_type}",
                old_value=constraint,
                new_value=None,
                impact=f"No longer enforcing {constraint_type} constraint",
                suggested_migration="Consider application-level validation for removed constraint",
                affected_operations=["INSERT", "UPDATE", "DELETE"]
            ))

        # Check for added constraints (sorted so the report is stable)
        for constraint in sorted(after_set - before_set):
            implication = self.constraint_implications.get(constraint_type, {}).get("added", "potentially_breaking")
            issues.append(CompatibilityIssue(
                type=f"{constraint_type}_added",
                severity=implication,
                description=f"New {constraint_type.replace('_', ' ')} constraint '{constraint}' added to table '{table_name}'",
                field_path=f"tables.{table_name}.constraints.{constraint_type}",
                old_value=None,
                new_value=constraint,
                impact=f"New {constraint_type} constraint may reject existing data",
                suggested_migration="Validate existing data complies with new constraint",
                affected_operations=["INSERT", "UPDATE"]
            ))

            scripts.append(MigrationScript(
                script_type="sql",
                description=f"Add {constraint_type} constraint to {table_name}",
                script_content=f"ALTER TABLE {table_name} ADD CONSTRAINT {constraint_type}_{table_name} {sql_keyword} ({constraint});",
                rollback_script=f"ALTER TABLE {table_name} DROP CONSTRAINT {constraint_type}_{table_name};",
                dependencies=[],
                validation_query=f"SELECT COUNT(*) FROM information_schema.table_constraints WHERE table_name = '{table_name}' AND constraint_type = '{sql_keyword}';"
            ))

    return issues, scripts


def _analyze_endpoint_changes(self, path: str, before_endpoint: Dict[str, Any],
                              after_endpoint: Dict[str, Any]) -> "Tuple[List[CompatibilityIssue], List[MigrationScript]]":
    """Analyze changes to an API endpoint.

    Args:
        path: URL path, used in messages and field paths.
        before_endpoint: Old path item, keyed by HTTP method.
        after_endpoint: New path item, same layout.

    Returns:
        ``(issues, scripts)``; ``scripts`` is always empty here.
    """
    def _by_name(params):
        # Index parameters by name, keeping the first occurrence. Entries
        # without a "name" (e.g. {"$ref": ...}) cannot be compared by name
        # and are skipped instead of raising KeyError.
        index = {}
        for param in params or []:
            if isinstance(param, dict) and "name" in param:
                index.setdefault(param["name"], param)
        return index

    def _json_schema(response):
        # Schema of the application/json body, {} when anything is missing.
        if not isinstance(response, dict):
            return {}
        return response.get("content", {}).get("application/json", {}).get("schema", {})

    issues = []
    scripts = []

    # Declaration order keeps the report stable between runs.
    for method in before_endpoint:
        if method not in after_endpoint:
            continue
        before_method = before_endpoint[method]
        after_method = after_endpoint[method]
        # A path item also carries non-operation keys such as "summary" or
        # a path-level "parameters" list; only mapping values are operations.
        if not isinstance(before_method, dict) or not isinstance(after_method, dict):
            continue

        # Check parameter changes
        before_params = _by_name(before_method.get("parameters", []))
        after_params = _by_name(after_method.get("parameters", []))

        # Check for removed required parameters
        for param_name, param in before_params.items():
            if param_name in after_params or not param.get("required", False):
                continue
            issues.append(CompatibilityIssue(
                type="required_parameter_removed",
                severity="breaking",
                description=f"Required parameter '{param_name}' removed from {method.upper()} {path}",
                field_path=f"paths.{path}.{method}.parameters",
                old_value=param,
                new_value=None,
                impact="Client requests with this parameter will fail",
                suggested_migration="Implement parameter validation with backward compatibility",
                affected_operations=[f"{method.upper()} {path}"]
            ))

        # Check for added required parameters
        for param_name, param in after_params.items():
            if param_name in before_params or not param.get("required", False):
                continue
            issues.append(CompatibilityIssue(
                type="required_parameter_added",
                severity="breaking",
                description=f"New required parameter '{param_name}' added to {method.upper()} {path}",
                field_path=f"paths.{path}.{method}.parameters",
                old_value=None,
                new_value=param,
                impact="Client requests without this parameter will fail",
                suggested_migration="Provide default value or make parameter optional initially",
                affected_operations=[f"{method.upper()} {path}"]
            ))

        # Check response schema changes
        before_responses = before_method.get("responses") or {}
        after_responses = after_method.get("responses") or {}

        for status_code in before_responses:
            if status_code not in after_responses:
                continue
            before_schema = _json_schema(before_responses[status_code])
            after_schema = _json_schema(after_responses[status_code])

            if before_schema != after_schema:
                issues.append(CompatibilityIssue(
                    type="response_schema_changed",
                    severity="potentially_breaking",
                    description=f"Response schema changed for {method.upper()} {path} (status {status_code})",
                    field_path=f"paths.{path}.{method}.responses.{status_code}",
                    old_value=before_schema,
                    new_value=after_schema,
                    impact="Client response parsing may fail",
                    suggested_migration="Implement versioned API responses",
                    affected_operations=[f"{method.upper()} {path}"]
                ))

    return issues, scripts
def _build_compatibility_report(self, before_schema: Dict[str, Any], after_schema: Dict[str, Any],
                                issues: "List[CompatibilityIssue]",
                                migration_scripts: "List[MigrationScript]") -> "CompatibilityReport":
    """Build the final compatibility report.

    Args:
        before_schema: Old schema; only a JSON preview of it is stored.
        after_schema: New schema; only a JSON preview of it is stored.
        issues: Issues collected by the analysers.
        migration_scripts: Suggested scripts collected by the analysers.

    Returns:
        A ``CompatibilityReport`` with per-severity counts, the overall
        verdict, a risk assessment and recommendations.
    """
    def _preview(schema, limit=500):
        # Serialise once and measure the same text that gets sliced, so the
        # stored preview never exceeds ``limit`` characters plus "...".
        text = json.dumps(schema, indent=2)
        return text[:limit] + "..." if len(text) > limit else text

    # Count issues by severity in a single pass. "additive" is tallied on
    # ``severity`` like the others; ``issue.type`` holds names such as
    # "table_removed" and never equals "additive".
    counts = {"breaking": 0, "potentially_breaking": 0, "non_breaking": 0, "additive": 0}
    for issue in issues:
        if issue.severity in counts:
            counts[issue.severity] += 1

    breaking_count = counts["breaking"]
    potentially_breaking_count = counts["potentially_breaking"]
    non_breaking_count = counts["non_breaking"]
    additive_count = counts["additive"]

    # Determine overall compatibility
    if breaking_count > 0:
        overall_compatibility = "breaking_changes"
    elif potentially_breaking_count > 0:
        overall_compatibility = "potentially_incompatible"
    elif non_breaking_count > 0:
        overall_compatibility = "backward_compatible"
    else:
        overall_compatibility = "fully_compatible"

    # Generate risk assessment
    testing_requirements = ["integration_testing", "regression_testing"]
    if any(script.script_type == "sql" for script in migration_scripts):
        testing_requirements.append("data_migration_testing")

    risk_assessment = {
        "overall_risk": "high" if breaking_count > 0 else "medium" if potentially_breaking_count > 0 else "low",
        "deployment_risk": "requires_coordinated_deployment" if breaking_count > 0 else "safe_independent_deployment",
        "rollback_complexity": "high" if breaking_count > 3 else "medium" if breaking_count > 0 else "low",
        "testing_requirements": testing_requirements
    }

    # Generate recommendations
    recommendations = []
    if breaking_count > 0:
        recommendations.append("Implement API versioning to maintain backward compatibility")
        recommendations.append("Plan for coordinated deployment with all clients")
        recommendations.append("Implement comprehensive rollback procedures")

    if potentially_breaking_count > 0:
        recommendations.append("Conduct thorough testing with realistic data volumes")
        recommendations.append("Implement monitoring for migration success metrics")

    if migration_scripts:
        recommendations.append("Test all migration scripts in staging environment")
        recommendations.append("Implement migration progress monitoring")

    recommendations.append("Create detailed communication plan for stakeholders")
    recommendations.append("Implement feature flags for gradual rollout")

    return CompatibilityReport(
        schema_before=_preview(before_schema),
        schema_after=_preview(after_schema),
        analysis_date=datetime.datetime.now().isoformat(),
        overall_compatibility=overall_compatibility,
        breaking_changes_count=breaking_count,
        potentially_breaking_count=potentially_breaking_count,
        non_breaking_changes_count=non_breaking_count,
        additive_changes_count=additive_count,
        issues=issues,
        migration_scripts=migration_scripts,
        risk_assessment=risk_assessment,
        recommendations=recommendations
    )
{report.additive_changes_count}") + output.append(f"Total Issues Found: {len(report.issues)}") + output.append("") + + # Risk Assessment + output.append("RISK ASSESSMENT") + output.append("-" * 40) + for key, value in report.risk_assessment.items(): + output.append(f"{key.replace('_', ' ').title()}: {value}") + output.append("") + + # Issues by Severity + issues_by_severity = {} + for issue in report.issues: + if issue.severity not in issues_by_severity: + issues_by_severity[issue.severity] = [] + issues_by_severity[issue.severity].append(issue) + + for severity in ["breaking", "potentially_breaking", "non_breaking"]: + if severity in issues_by_severity: + output.append(f"{severity.upper().replace('_', ' ')} ISSUES") + output.append("-" * 40) + for issue in issues_by_severity[severity]: + output.append(f"• {issue.description}") + output.append(f" Field: {issue.field_path}") + output.append(f" Impact: {issue.impact}") + output.append(f" Migration: {issue.suggested_migration}") + if issue.affected_operations: + output.append(f" Affected Operations: {', '.join(issue.affected_operations)}") + output.append("") + + # Migration Scripts + if report.migration_scripts: + output.append("SUGGESTED MIGRATION SCRIPTS") + output.append("-" * 40) + for i, script in enumerate(report.migration_scripts, 1): + output.append(f"{i}. {script.description}") + output.append(f" Type: {script.script_type}") + output.append(" Script:") + for line in script.script_content.split('\n'): + output.append(f" {line}") + output.append("") + + # Recommendations + output.append("RECOMMENDATIONS") + output.append("-" * 40) + for i, rec in enumerate(report.recommendations, 1): + output.append(f"{i}. 
def _text_report_path(output_path, json_written):
    """Return the path for the text report, or None to print to stdout.

    Only a trailing ``.json`` suffix is swapped for ``.txt``. When the JSON
    report was already written to ``output_path`` and no suffix could be
    swapped, ``.txt`` is appended so the text report never overwrites it.
    """
    if not output_path:
        return None
    if output_path.endswith('.json'):
        return output_path[:-len('.json')] + '.txt'
    if json_written:
        return output_path + '.txt'
    return output_path


def main():
    """Run the schema compatibility checker from the command line.

    Returns:
        Process exit code: 2 when breaking changes were found, 1 when only
        potentially breaking changes were found, 0 when neither was found.
        Failures (missing file, invalid JSON, any other error) also return 1,
        so callers cannot tell an error from "potentially breaking" by exit
        code alone.
    """
    parser = argparse.ArgumentParser(description="Analyze schema and API compatibility between versions")
    parser.add_argument("--before", required=True, help="Before schema file (JSON)")
    parser.add_argument("--after", required=True, help="After schema file (JSON)")
    parser.add_argument("--type", choices=["database", "api"], default="database", help="Schema type to analyze")
    parser.add_argument("--output", "-o", help="Output file for compatibility report (JSON)")
    parser.add_argument("--format", "-f", choices=["json", "text", "both"], default="both", help="Output format")

    args = parser.parse_args()

    try:
        # Load schemas (JSON interchange is UTF-8; do not rely on the locale)
        with open(args.before, 'r', encoding='utf-8') as f:
            before_schema = json.load(f)

        with open(args.after, 'r', encoding='utf-8') as f:
            after_schema = json.load(f)

        # Analyze compatibility
        checker = SchemaCompatibilityChecker()

        if args.type == "database":
            report = checker.analyze_database_schema(before_schema, after_schema)
        else:  # api
            report = checker.analyze_api_schema(before_schema, after_schema)

        # Output results
        json_written = False
        if args.format in ["json", "both"]:
            report_dict = asdict(report)
            if args.output:
                with open(args.output, 'w', encoding='utf-8') as f:
                    json.dump(report_dict, f, indent=2)
                json_written = True
                print(f"Compatibility report saved to {args.output}")
            else:
                print(json.dumps(report_dict, indent=2))

        if args.format in ["text", "both"]:
            human_report = checker.generate_human_readable_report(report)
            text_output = _text_report_path(args.output, json_written)
            if text_output:
                with open(text_output, 'w', encoding='utf-8') as f:
                    f.write(human_report)
                print(f"Human-readable report saved to {text_output}")
            else:
                print("\n" + "="*80)
                print("HUMAN-READABLE COMPATIBILITY REPORT")
                print("="*80)
                print(human_report)

        # Return exit code based on compatibility
        if report.breaking_changes_count > 0:
            return 2  # Breaking changes found
        elif report.potentially_breaking_count > 0:
            return 1  # Potentially breaking changes found
        else:
            return 0  # No compatibility issues

    except FileNotFoundError as e:
        print(f"Error: File not found: {e}", file=sys.stderr)
        return 1
    except json.JSONDecodeError as e:
        print(f"Error: Invalid JSON: {e}", file=sys.stderr)
        return 1
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1
class MigrationType(Enum):
    """Migration type enumeration"""
    DATABASE = "database"
    SERVICE = "service"
    INFRASTRUCTURE = "infrastructure"
    DATA = "data"
    API = "api"


class MigrationComplexity(Enum):
    """Migration complexity levels"""
    LOW = "low"
    MEDIUM = "medium"
    HIGH = "high"
    CRITICAL = "critical"


class RiskLevel(Enum):
    """Risk assessment levels"""
    LOW = "low"
    MEDIUM = "medium"
    HIGH = "high"
    CRITICAL = "critical"


@dataclass
class MigrationConstraint:
    """Migration constraint definition"""
    type: str
    description: str
    impact: str
    mitigation: str


@dataclass
class MigrationPhase:
    """Individual migration phase"""
    name: str
    description: str
    duration_hours: int
    dependencies: List[str]  # names of phases that must finish first
    validation_criteria: List[str]
    rollback_triggers: List[str]
    tasks: List[str]
    risk_level: str
    resources_required: List[str]


@dataclass
class RiskItem:
    """Individual risk assessment item"""
    category: str
    description: str
    probability: str  # low, medium, high
    impact: str  # low, medium, high
    severity: str  # low, medium, high, critical
    mitigation: str
    owner: str


@dataclass
class MigrationPlan:
    """Complete migration plan structure"""
    migration_id: str
    source_system: str
    target_system: str
    migration_type: str
    complexity: str
    estimated_duration_hours: int  # sum of the phase durations
    phases: List[MigrationPhase]
    risks: List[RiskItem]
    success_criteria: List[str]
    rollback_plan: Dict[str, Any]
    stakeholders: List[str]
    created_at: str


class MigrationPlanner:
    """Build phased migration plans, risk lists and rollback outlines from a spec dict."""

    def __init__(self):
        self.migration_patterns = self._load_migration_patterns()
        self.risk_templates = self._load_risk_templates()

    def _load_migration_patterns(self) -> Dict[str, Any]:
        """Load predefined migration patterns.

        Returns:
            Mapping ``type -> pattern -> {"phases", "base_duration",
            "complexity_multiplier"}`` where ``base_duration`` is in hours.
        """
        return {
            "database": {
                "schema_change": {
                    "phases": ["preparation", "expand", "migrate", "contract", "cleanup"],
                    "base_duration": 24,
                    "complexity_multiplier": {"low": 1.0, "medium": 1.5, "high": 2.5, "critical": 4.0}
                },
                "data_migration": {
                    "phases": ["assessment", "setup", "bulk_copy", "delta_sync", "validation", "cutover"],
                    "base_duration": 48,
                    "complexity_multiplier": {"low": 1.2, "medium": 2.0, "high": 3.0, "critical": 5.0}
                }
            },
            "service": {
                "strangler_fig": {
                    "phases": ["intercept", "implement", "redirect", "validate", "retire"],
                    "base_duration": 168,  # 1 week
                    "complexity_multiplier": {"low": 0.8, "medium": 1.0, "high": 1.8, "critical": 3.0}
                },
                "parallel_run": {
                    "phases": ["setup", "deploy", "shadow", "compare", "cutover", "cleanup"],
                    "base_duration": 72,
                    "complexity_multiplier": {"low": 1.0, "medium": 1.3, "high": 2.0, "critical": 3.5}
                }
            },
            "infrastructure": {
                "cloud_migration": {
                    "phases": ["assessment", "design", "pilot", "migration", "optimization", "decommission"],
                    "base_duration": 720,  # 30 days
                    "complexity_multiplier": {"low": 0.6, "medium": 1.0, "high": 1.5, "critical": 2.5}
                },
                "on_prem_to_cloud": {
                    "phases": ["discovery", "planning", "pilot", "migration", "validation", "cutover"],
                    "base_duration": 480,  # 20 days
                    "complexity_multiplier": {"low": 0.8, "medium": 1.2, "high": 2.0, "critical": 3.0}
                }
            }
        }

    def _load_risk_templates(self) -> Dict[str, List[RiskItem]]:
        """Load the baseline risks for each migration type."""
        return {
            "database": [
                RiskItem("technical", "Data corruption during migration", "low", "critical", "high",
                         "Implement comprehensive backup and validation procedures", "DBA Team"),
                RiskItem("technical", "Extended downtime due to migration complexity", "medium", "high", "high",
                         "Use blue-green deployment and phased migration approach", "DevOps Team"),
                RiskItem("business", "Business process disruption", "medium", "high", "high",
                         "Communicate timeline and provide alternate workflows", "Business Owner"),
                RiskItem("operational", "Insufficient rollback testing", "high", "critical", "critical",
                         "Execute full rollback procedures in staging environment", "QA Team")
            ],
            "service": [
                RiskItem("technical", "Service compatibility issues", "medium", "high", "high",
                         "Implement comprehensive integration testing", "Development Team"),
                RiskItem("technical", "Performance degradation", "medium", "medium", "medium",
                         "Conduct load testing and performance benchmarking", "DevOps Team"),
                RiskItem("business", "Feature parity gaps", "high", "high", "high",
                         "Document feature mapping and acceptance criteria", "Product Owner"),
                RiskItem("operational", "Monitoring gap during transition", "medium", "medium", "medium",
                         "Set up dual monitoring and alerting systems", "SRE Team")
            ],
            "infrastructure": [
                RiskItem("technical", "Network connectivity issues", "medium", "critical", "high",
                         "Implement redundant network paths and monitoring", "Network Team"),
                RiskItem("technical", "Security configuration drift", "high", "high", "high",
                         "Automated security scanning and compliance checks", "Security Team"),
                RiskItem("business", "Cost overrun during transition", "high", "medium", "medium",
                         "Implement cost monitoring and budget alerts", "Finance Team"),
                RiskItem("operational", "Team knowledge gaps", "high", "medium", "medium",
                         "Provide training and create detailed documentation", "Platform Team")
            ]
        }

    @staticmethod
    def _constraint(constraints: Dict[str, Any], key: str, default: Any) -> Any:
        """Read a constraint, treating an explicit JSON ``null`` like a missing key."""
        value = constraints.get(key)
        return default if value is None else value

    def _calculate_complexity(self, spec: Dict[str, Any]) -> str:
        """Score the specification and map the score to a complexity level.

        Data volume, dependency count and the downtime budget each add up to
        3 points; every special requirement adds 1 point.

        Returns:
            "critical" (score >= 8), "high" (>= 5), "medium" (>= 3) or "low".
        """
        # `"constraints": null` is valid JSON; fall back to an empty mapping.
        constraints = spec.get("constraints") or {}
        complexity_score = 0

        # Data volume complexity
        data_volume = self._constraint(constraints, "data_volume_gb", 0)
        if data_volume > 10000:
            complexity_score += 3
        elif data_volume > 1000:
            complexity_score += 2
        elif data_volume > 100:
            complexity_score += 1

        # System dependencies
        dependencies = len(self._constraint(constraints, "dependencies", []))
        if dependencies > 10:
            complexity_score += 3
        elif dependencies > 5:
            complexity_score += 2
        elif dependencies > 2:
            complexity_score += 1

        # Downtime constraints (a tighter window is harder)
        max_downtime = self._constraint(constraints, "max_downtime_minutes", 480)
        if max_downtime < 60:
            complexity_score += 3
        elif max_downtime < 240:
            complexity_score += 2
        elif max_downtime < 480:
            complexity_score += 1

        # Special requirements
        complexity_score += len(self._constraint(constraints, "special_requirements", []))

        if complexity_score >= 8:
            return "critical"
        elif complexity_score >= 5:
            return "high"
        elif complexity_score >= 3:
            return "medium"
        else:
            return "low"

    def _estimate_duration(self, migration_type: str, migration_pattern: str, complexity: str) -> int:
        """Estimate the total duration in hours.

        Unknown type/pattern combinations fall back to a 48 hour base, and an
        unknown complexity falls back to a 1.5x multiplier.
        """
        pattern_info = self.migration_patterns.get(migration_type, {}).get(migration_pattern, {})
        base_duration = pattern_info.get("base_duration", 48)
        multiplier = pattern_info.get("complexity_multiplier", {}).get(complexity, 1.5)

        return int(base_duration * multiplier)

    def _generate_phases(self, spec: Dict[str, Any], complexity: Optional[str] = None) -> List[MigrationPhase]:
        """Generate the ordered migration phases for a specification.

        Args:
            spec: Migration specification.
            complexity: Complexity level already computed by the caller; it
                is derived from ``spec`` when omitted.

        Returns:
            One MigrationPhase per phase name. The phase durations sum to the
            value returned by ``_estimate_duration``.
        """
        migration_type = spec.get("type")
        migration_pattern = spec.get("pattern", "")
        if complexity is None:
            complexity = self._calculate_complexity(spec)

        pattern_info = self.migration_patterns.get(migration_type, {})
        if migration_pattern in pattern_info:
            phase_names = pattern_info[migration_pattern]["phases"]
        else:
            # Default phases based on migration type
            phase_names = {
                "database": ["preparation", "migration", "validation", "cutover"],
                "service": ["preparation", "deployment", "testing", "cutover"],
                "infrastructure": ["assessment", "preparation", "migration", "validation"]
            }.get(migration_type, ["preparation", "execution", "validation", "cleanup"])

        total_duration = self._estimate_duration(migration_type, migration_pattern, complexity)
        # Floor division alone discards the remainder (57 h over 6 phases
        # would add up to 54 h). Give the leftover hours to the earliest
        # phases so the plan total equals the estimate.
        base_hours, extra_hours = divmod(total_duration, len(phase_names))

        phases = []
        for index, phase_name in enumerate(phase_names):
            hours = base_hours + (1 if index < extra_hours else 0)
            phases.append(self._create_phase(phase_name, hours, complexity, index, phase_names))

        return phases

    def _create_phase(self, phase_name: str, duration: int, complexity: str,
                      phase_index: int, all_phases: List[str]) -> MigrationPhase:
        """Create one detailed phase.

        Known phase names get a specific template; any other name gets a
        generic description. Each phase depends on the phase directly before
        it. ``complexity`` is accepted for interface stability and is not
        used here.
        """
        phase_templates = {
            "preparation": {
                "description": "Prepare systems and teams for migration",
                "tasks": [
                    "Backup source system",
                    "Set up monitoring and alerting",
                    "Prepare rollback procedures",
                    "Communicate migration timeline",
                    "Validate prerequisites"
                ],
                "validation_criteria": [
                    "All backups completed successfully",
                    "Monitoring systems operational",
                    "Team members briefed and ready",
                    "Rollback procedures tested"
                ],
                "risk_level": "medium"
            },
            "assessment": {
                "description": "Assess current state and migration requirements",
                "tasks": [
                    "Inventory existing systems and dependencies",
                    "Analyze data volumes and complexity",
                    "Identify integration points",
                    "Document current architecture",
                    "Create migration mapping"
                ],
                "validation_criteria": [
                    "Complete system inventory documented",
                    "Dependencies mapped and validated",
                    "Migration scope clearly defined",
                    "Resource requirements identified"
                ],
                "risk_level": "low"
            },
            "migration": {
                "description": "Execute core migration processes",
                "tasks": [
                    "Begin data/service migration",
                    "Monitor migration progress",
                    "Validate data consistency",
                    "Handle migration errors",
                    "Update configuration"
                ],
                "validation_criteria": [
                    "Migration progress within expected parameters",
                    "Data consistency checks passing",
                    "Error rates within acceptable limits",
                    "Performance metrics stable"
                ],
                "risk_level": "high"
            },
            "validation": {
                "description": "Validate migration success and system health",
                "tasks": [
                    "Execute comprehensive testing",
                    "Validate business processes",
                    "Check system performance",
                    "Verify data integrity",
                    "Confirm security controls"
                ],
                "validation_criteria": [
                    "All critical tests passing",
                    "Performance within acceptable range",
                    "Security controls functioning",
                    "Business processes operational"
                ],
                "risk_level": "medium"
            },
            "cutover": {
                "description": "Switch production traffic to new system",
                "tasks": [
                    "Update DNS/load balancer configuration",
                    "Redirect production traffic",
                    "Monitor system performance",
                    "Validate end-user experience",
                    "Confirm business operations"
                ],
                "validation_criteria": [
                    "Traffic successfully redirected",
                    "System performance stable",
                    "User experience satisfactory",
                    "Business operations normal"
                ],
                "risk_level": "critical"
            }
        }

        template = phase_templates.get(phase_name, {
            "description": f"Execute {phase_name} phase",
            "tasks": [f"Complete {phase_name} activities"],
            "validation_criteria": [f"{phase_name.title()} phase completed successfully"],
            "risk_level": "medium"
        })

        dependencies = []
        if phase_index > 0:
            dependencies.append(all_phases[phase_index - 1])

        rollback_triggers = [
            "Critical system failure",
            "Data corruption detected",
            "Performance degradation > 50%",
            "Business process failure"
        ]

        resources_required = [
            "Technical team availability",
            "System access and permissions",
            "Monitoring and alerting systems",
            "Communication channels"
        ]

        return MigrationPhase(
            name=phase_name,
            description=template["description"],
            duration_hours=duration,
            dependencies=dependencies,
            validation_criteria=template["validation_criteria"],
            rollback_triggers=rollback_triggers,
            tasks=template["tasks"],
            risk_level=template["risk_level"],
            resources_required=resources_required
        )

    def _assess_risks(self, spec: Dict[str, Any]) -> List[RiskItem]:
        """Generate the risk list for a migration.

        Returns:
            Copies of the baseline risks for ``spec["type"]`` followed by
            risks derived from the constraints (tight downtime window, large
            data volume, compliance requirements).
        """
        migration_type = spec.get("type")
        # Copy the templates: they are shared by every plan this planner
        # builds, so handing out the originals would let one plan's edits
        # leak into the next.
        base_risks = [RiskItem(**asdict(risk)) for risk in self.risk_templates.get(migration_type, [])]

        # Add specification-specific risks
        additional_risks = []
        constraints = spec.get("constraints") or {}

        if self._constraint(constraints, "max_downtime_minutes", 480) < 60:
            additional_risks.append(
                RiskItem("business", "Zero-downtime requirement increases complexity", "high", "medium", "high",
                         "Implement blue-green deployment or rolling update strategy", "DevOps Team")
            )

        if self._constraint(constraints, "data_volume_gb", 0) > 5000:
            additional_risks.append(
                RiskItem("technical", "Large data volumes may cause extended migration time", "high", "medium", "medium",
                         "Implement parallel processing and progress monitoring", "Data Team")
            )

        compliance_reqs = constraints.get("compliance_requirements", [])
        if compliance_reqs:
            additional_risks.append(
                RiskItem("compliance", "Regulatory compliance requirements", "medium", "high", "high",
                         "Ensure all compliance checks are integrated into migration process", "Compliance Team")
            )

        return base_risks + additional_risks

    def _generate_rollback_plan(self, phases: List[MigrationPhase]) -> Dict[str, Any]:
        """Generate the rollback plan; phases are undone in reverse order."""
        rollback_phases = []

        for phase in reversed(phases):
            rollback_phase = {
                "phase": phase.name,
                "rollback_actions": [
                    f"Revert {phase.name} changes",
                    f"Restore pre-{phase.name} state",
                    f"Validate {phase.name} rollback success"
                ],
                "validation_criteria": [
                    f"System restored to pre-{phase.name} state",
                    f"All {phase.name} changes successfully reverted",
                    "System functionality confirmed"
                ],
                "estimated_time_minutes": phase.duration_hours * 15  # 25% of original phase time
            }
            rollback_phases.append(rollback_phase)

        return {
            "rollback_phases": rollback_phases,
            "rollback_triggers": [
                "Critical system failure",
                "Data corruption detected",
                "Migration timeline exceeded by > 50%",
                "Business-critical functionality unavailable",
                "Security breach detected",
                "Stakeholder decision to abort"
            ],
            "rollback_decision_matrix": {
                "low_severity": "Continue with monitoring",
                "medium_severity": "Assess and decide within 15 minutes",
                "high_severity": "Immediate rollback initiation",
                "critical_severity": "Emergency rollback - all hands"
            },
            "rollback_contacts": [
                "Migration Lead",
                "Technical Lead",
                "Business Owner",
                "On-call Engineer"
            ]
        }

    def generate_plan(self, spec: Dict[str, Any]) -> MigrationPlan:
        """Generate a complete migration plan from a specification.

        The migration id is the first 12 hex digits of the MD5 of the
        canonical (key-sorted) JSON form of ``spec``, so the same spec always
        gets the same id. MD5 serves only as a fingerprint here.
        """
        migration_id = hashlib.md5(json.dumps(spec, sort_keys=True).encode()).hexdigest()[:12]
        complexity = self._calculate_complexity(spec)
        # Pass the complexity along so it is not derived a second time.
        phases = self._generate_phases(spec, complexity)
        risks = self._assess_risks(spec)
        total_duration = sum(phase.duration_hours for phase in phases)
        rollback_plan = self._generate_rollback_plan(phases)

        success_criteria = [
            "All data successfully migrated with 100% integrity",
            "System performance meets or exceeds baseline",
            "All business processes functioning normally",
            "No critical security vulnerabilities introduced",
            "Stakeholder acceptance criteria met",
            "Documentation and runbooks updated"
        ]

        stakeholders = [
            "Business Owner",
            "Technical Lead",
            "DevOps Team",
            "QA Team",
            "Security Team",
            "End Users"
        ]

        return MigrationPlan(
            migration_id=migration_id,
            source_system=spec.get("source", "Unknown"),
            target_system=spec.get("target", "Unknown"),
            migration_type=spec.get("type", "Unknown"),
            complexity=complexity,
            estimated_duration_hours=total_duration,
            phases=phases,
            risks=risks,
            success_criteria=success_criteria,
            rollback_plan=rollback_plan,
            stakeholders=stakeholders,
            created_at=datetime.datetime.now().isoformat()
        )

    def generate_human_readable_plan(self, plan: MigrationPlan) -> str:
        """Render a migration plan as a plain-text report.

        Risks are grouped under the severities critical, high, medium and
        low, in that order; a risk with any other severity is not listed.
        """
        output = []
        output.append("=" * 80)
        output.append(f"MIGRATION PLAN: {plan.migration_id}")
        output.append("=" * 80)
        output.append(f"Source System: {plan.source_system}")
        output.append(f"Target System: {plan.target_system}")
        output.append(f"Migration Type: {plan.migration_type.upper()}")
        output.append(f"Complexity Level: {plan.complexity.upper()}")
        output.append(f"Estimated Duration: {plan.estimated_duration_hours} hours ({plan.estimated_duration_hours/24:.1f} days)")
        output.append(f"Created: {plan.created_at}")
        output.append("")

        # Phases
        output.append("MIGRATION PHASES")
        output.append("-" * 40)
        for i, phase in enumerate(plan.phases, 1):
            output.append(f"{i}. {phase.name.upper()} ({phase.duration_hours}h)")
            output.append(f"   Description: {phase.description}")
            output.append(f"   Risk Level: {phase.risk_level.upper()}")
            if phase.dependencies:
                output.append(f"   Dependencies: {', '.join(phase.dependencies)}")
            output.append("   Tasks:")
            for task in phase.tasks:
                output.append(f"     • {task}")
            output.append("   Success Criteria:")
            for criteria in phase.validation_criteria:
                output.append(f"     ✓ {criteria}")
            output.append("")

        # Risk Assessment
        output.append("RISK ASSESSMENT")
        output.append("-" * 40)
        risk_by_severity = {}
        for risk in plan.risks:
            risk_by_severity.setdefault(risk.severity, []).append(risk)

        for severity in ["critical", "high", "medium", "low"]:
            if severity in risk_by_severity:
                output.append(f"{severity.upper()} SEVERITY RISKS:")
                for risk in risk_by_severity[severity]:
                    output.append(f"  • {risk.description}")
                    output.append(f"    Category: {risk.category}")
                    output.append(f"    Probability: {risk.probability} | Impact: {risk.impact}")
                    output.append(f"    Mitigation: {risk.mitigation}")
                    output.append(f"    Owner: {risk.owner}")
                    output.append("")

        # Rollback Plan
        output.append("ROLLBACK STRATEGY")
        output.append("-" * 40)
        output.append("Rollback Triggers:")
        for trigger in plan.rollback_plan["rollback_triggers"]:
            output.append(f"  • {trigger}")
        output.append("")

        output.append("Rollback Phases:")
        for rb_phase in plan.rollback_plan["rollback_phases"]:
            output.append(f"  {rb_phase['phase'].upper()}:")
            for action in rb_phase["rollback_actions"]:
                output.append(f"    - {action}")
            output.append(f"    Estimated Time: {rb_phase['estimated_time_minutes']} minutes")
            output.append("")

        # Success Criteria
        output.append("SUCCESS CRITERIA")
        output.append("-" * 40)
        for criteria in plan.success_criteria:
            output.append(f"✓ {criteria}")
        output.append("")

        # Stakeholders
        output.append("STAKEHOLDERS")
        output.append("-" * 40)
        for stakeholder in plan.stakeholders:
            output.append(f"• {stakeholder}")
        output.append("")

        return "\n".join(output)
["json", "both"]: + plan_dict = asdict(plan) + if args.output: + with open(args.output, 'w') as f: + json.dump(plan_dict, f, indent=2) + print(f"Migration plan saved to {args.output}") + else: + print(json.dumps(plan_dict, indent=2)) + + if args.format in ["text", "both"]: + human_plan = planner.generate_human_readable_plan(plan) + text_output = args.output.replace('.json', '.txt') if args.output else None + if text_output: + with open(text_output, 'w') as f: + f.write(human_plan) + print(f"Human-readable plan saved to {text_output}") + else: + print("\n" + "="*80) + print("HUMAN-READABLE MIGRATION PLAN") + print("="*80) + print(human_plan) + + except FileNotFoundError: + print(f"Error: Input file '{args.input}' not found", file=sys.stderr) + return 1 + except json.JSONDecodeError as e: + print(f"Error: Invalid JSON in input file: {e}", file=sys.stderr) + return 1 + except Exception as e: + print(f"Error: {e}", file=sys.stderr) + return 1 + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) \ No newline at end of file diff --git a/engineering/migration-architect/scripts/rollback_generator.py b/engineering/migration-architect/scripts/rollback_generator.py new file mode 100644 index 0000000..1bf78b3 --- /dev/null +++ b/engineering/migration-architect/scripts/rollback_generator.py @@ -0,0 +1,1109 @@ +#!/usr/bin/env python3 +""" +Rollback Generator - Generate comprehensive rollback procedures for migrations + +This tool takes a migration plan and generates detailed rollback procedures for each phase, +including data rollback scripts, service rollback steps, validation checks, and communication +templates to ensure safe and reliable migration reversals. 
class RollbackTrigger(Enum):
    """Types of rollback triggers"""
    MANUAL = "manual"
    AUTOMATED = "automated"
    THRESHOLD_BASED = "threshold_based"
    TIME_BASED = "time_based"


class RollbackUrgency(Enum):
    """Rollback urgency levels"""
    LOW = "low"
    MEDIUM = "medium"
    HIGH = "high"
    EMERGENCY = "emergency"


@dataclass
class RollbackStep:
    """Individual rollback step"""
    step_id: str
    name: str
    description: str
    script_type: str  # sql, bash, api, manual
    script_content: str
    estimated_duration_minutes: int
    dependencies: List[str]
    validation_commands: List[str]
    success_criteria: List[str]
    failure_escalation: str
    rollback_order: int


@dataclass
class RollbackPhase:
    """Rollback phase containing multiple steps"""
    phase_name: str
    description: str
    # NOTE(review): presumably one of the RollbackUrgency values, stored as a
    # plain string - confirm against the code that builds phases.
    urgency_level: str
    estimated_duration_minutes: int
    prerequisites: List[str]
    steps: List[RollbackStep]
    validation_checkpoints: List[str]
    communication_requirements: List[str]
    risk_level: str


@dataclass
class RollbackTriggerCondition:
    """Conditions that trigger automatic rollback"""
    trigger_id: str
    name: str
    condition: str
    metric_threshold: Optional[Dict[str, Any]]  # may be None; the dict layout is not defined here
    evaluation_window_minutes: int
    auto_execute: bool
    escalation_contacts: List[str]


@dataclass
class DataRecoveryPlan:
    """Data recovery and restoration plan"""
    recovery_method: str  # backup_restore, point_in_time, event_replay
    backup_location: str
    recovery_scripts: List[str]
    data_validation_queries: List[str]
    estimated_recovery_time_minutes: int
    recovery_dependencies: List[str]


@dataclass
class CommunicationTemplate:
    """Communication template for rollback scenarios"""
    template_type: str  # start, progress, completion, escalation
    audience: str  # technical, business, executive, customers
    subject: str
    body: str
    urgency: str
    delivery_methods: List[str]


@dataclass
class RollbackRunbook:
    """Complete rollback runbook"""
    runbook_id: str
    migration_id: str
    created_at: str
    rollback_phases: List[RollbackPhase]
    trigger_conditions: List[RollbackTriggerCondition]
    data_recovery_plan: DataRecoveryPlan
    communication_templates: List[CommunicationTemplate]
    escalation_matrix: Dict[str, Any]
    validation_checklist: List[str]
    post_rollback_procedures: List[str]
    emergency_contacts: List[Dict[str, str]]
} + }, + "service": { + "deployment_rollback": { + "rollback_blue_green": "kubectl patch service {service_name} -p '{\"spec\":{\"selector\":{\"version\":\"blue\"}}}'", + "rollback_canary": "kubectl scale deployment {service_name}-canary --replicas=0", + "restore_previous_version": "kubectl rollout undo deployment/{service_name} --to-revision={revision_number}", + "update_load_balancer": "aws elbv2 modify-rule --rule-arn {rule_arn} --actions Type=forward,TargetGroupArn={original_target_group}" + }, + "configuration_rollback": { + "restore_config_map": "kubectl apply -f {original_config_file}", + "revert_feature_flags": "curl -X PUT {feature_flag_api}/flags/{flag_name} -d '{\"enabled\": false}'", + "restore_environment_vars": "kubectl set env deployment/{deployment_name} {env_var_name}={original_value}" + } + }, + "infrastructure": { + "cloud_rollback": { + "revert_terraform": "terraform apply -target={resource_name} {rollback_plan_file}", + "restore_dns": "aws route53 change-resource-record-sets --hosted-zone-id {zone_id} --change-batch file://{rollback_dns_changes}", + "rollback_security_groups": "aws ec2 authorize-security-group-ingress --group-id {group_id} --protocol {protocol} --port {port} --cidr {cidr}", + "restore_iam_policies": "aws iam put-role-policy --role-name {role_name} --policy-name {policy_name} --policy-document file://{original_policy}" + }, + "network_rollback": { + "restore_routing": "aws ec2 replace-route --route-table-id {route_table_id} --destination-cidr-block {cidr} --gateway-id {original_gateway}", + "revert_load_balancer": "aws elbv2 modify-load-balancer --load-balancer-arn {lb_arn} --scheme {original_scheme}", + "restore_firewall_rules": "aws ec2 revoke-security-group-ingress --group-id {group_id} --protocol {protocol} --port {port} --source-group {source_group}" + } + } + } + + def _load_validation_templates(self) -> Dict[str, List[str]]: + """Load validation command templates""" + return { + "database": [ + "SELECT COUNT(*) FROM 
{table_name};", + "SELECT COUNT(*) FROM information_schema.tables WHERE table_name = '{table_name}';", + "SELECT COUNT(*) FROM information_schema.columns WHERE table_name = '{table_name}' AND column_name = '{column_name}';", + "SELECT COUNT(DISTINCT {primary_key}) FROM {table_name};", + "SELECT MAX({timestamp_column}) FROM {table_name};" + ], + "service": [ + "curl -f {health_check_url}", + "kubectl get pods -l app={service_name} --field-selector=status.phase=Running", + "kubectl logs deployment/{service_name} --tail=100 | grep -i error", + "curl -f {service_endpoint}/api/v1/status" + ], + "infrastructure": [ + "aws ec2 describe-instances --instance-ids {instance_id} --query 'Reservations[*].Instances[*].State.Name'", + "nslookup {domain_name}", + "curl -I {load_balancer_url}", + "aws elbv2 describe-target-health --target-group-arn {target_group_arn}" + ] + } + + def _load_communication_templates(self) -> Dict[str, Dict[str, str]]: + """Load communication templates""" + return { + "rollback_start": { + "technical": { + "subject": "ROLLBACK INITIATED: {migration_name}", + "body": """Team, + +We have initiated rollback for migration: {migration_name} +Rollback ID: {rollback_id} +Start Time: {start_time} +Estimated Duration: {estimated_duration} + +Reason: {rollback_reason} + +Current Status: Rolling back phase {current_phase} + +Next Updates: Every 15 minutes or upon phase completion + +Actions Required: +- Monitor system health dashboards +- Stand by for escalation if needed +- Do not make manual changes during rollback + +Incident Commander: {incident_commander} +""" + }, + "business": { + "subject": "System Rollback In Progress - {system_name}", + "body": """Business Stakeholders, + +We are currently performing a planned rollback of the {system_name} migration due to {rollback_reason}. + +Impact: {business_impact} +Expected Resolution: {estimated_completion_time} +Affected Services: {affected_services} + +We will provide updates every 30 minutes. 
+ +Contact: {business_contact} +""" + }, + "executive": { + "subject": "EXEC ALERT: Critical System Rollback - {system_name}", + "body": """Executive Team, + +A critical rollback is in progress for {system_name}. + +Summary: +- Rollback Reason: {rollback_reason} +- Business Impact: {business_impact} +- Expected Resolution: {estimated_completion_time} +- Customer Impact: {customer_impact} + +We are following established procedures and will update hourly. + +Escalation: {escalation_contact} +""" + } + }, + "rollback_complete": { + "technical": { + "subject": "ROLLBACK COMPLETED: {migration_name}", + "body": """Team, + +Rollback has been successfully completed for migration: {migration_name} + +Summary: +- Start Time: {start_time} +- End Time: {end_time} +- Duration: {actual_duration} +- Phases Completed: {completed_phases} + +Validation Results: +{validation_results} + +System Status: {system_status} + +Next Steps: +- Continue monitoring for 24 hours +- Post-rollback review scheduled for {review_date} +- Root cause analysis to begin + +All clear to resume normal operations. 
def generate_rollback_runbook(self, migration_plan: Dict[str, Any]) -> RollbackRunbook:
    """Generate a comprehensive rollback runbook from a migration plan.

    Args:
        migration_plan: Parsed migration plan. Only ``migration_id`` is read
            here (defaulting to ``"unknown"``); the plan is otherwise passed
            unchanged to the ``_generate_*`` helpers.

    Returns:
        A ``RollbackRunbook`` whose sections are produced by the
        ``_generate_*`` helpers and whose ``created_at`` is the local time at
        which the runbook object is assembled.
    """
    # MD5 is used purely as a short fingerprint, not for security.
    # NOTE: the digest is taken over str(dict), so two plans with the same
    # content but different key order get different runbook ids.
    runbook_id = f"rb_{hashlib.md5(str(migration_plan).encode()).hexdigest()[:8]}"
    migration_id = migration_plan.get("migration_id", "unknown")

    # Sections are generated in the order they appear in the runbook.
    rollback_phases = self._generate_rollback_phases(migration_plan)
    trigger_conditions = self._generate_trigger_conditions(migration_plan)
    data_recovery_plan = self._generate_data_recovery_plan(migration_plan)
    communication_templates = self._generate_communication_templates(migration_plan)
    escalation_matrix = self._generate_escalation_matrix(migration_plan)
    validation_checklist = self._generate_validation_checklist(migration_plan)
    post_rollback_procedures = self._generate_post_rollback_procedures(migration_plan)
    emergency_contacts = self._generate_emergency_contacts(migration_plan)

    return RollbackRunbook(
        runbook_id=runbook_id,
        migration_id=migration_id,
        created_at=datetime.datetime.now().isoformat(),
        rollback_phases=rollback_phases,
        trigger_conditions=trigger_conditions,
        data_recovery_plan=data_recovery_plan,
        communication_templates=communication_templates,
        escalation_matrix=escalation_matrix,
        validation_checklist=validation_checklist,
        post_rollback_procedures=post_rollback_procedures,
        emergency_contacts=emergency_contacts,
    )
def _generate_rollback_steps(self, phase_name: str, migration_type: str, phase_index: int) -> List[RollbackStep]:
    """Generate the rollback steps for one migration phase.

    Args:
        phase_name: Name of the migration phase being rolled back. Matched
            case-insensitively against the keywords "migration", "cutover"
            and "preparation" to decide which steps apply.
        migration_type: "database", "service" or "infrastructure"; any other
            value yields only the final validation step.
        phase_index: Position of the phase in the rollback sequence, embedded
            in every ``step_id`` so ids stay unique across phases.

    Returns:
        The type-specific steps followed by a final manual validation step
        (``rollback_order=99``) that depends on every preceding step.
    """
    steps = []
    templates = self.rollback_templates.get(migration_type, {})
    phase_key = phase_name.lower()

    if migration_type == "database":
        if "migration" in phase_key or "cutover" in phase_key:
            # Data rollback steps
            steps.extend([
                RollbackStep(
                    step_id=f"rb_data_{phase_index}_01",
                    name="Stop data migration processes",
                    description="Halt all ongoing data migration processes",
                    script_type="sql",
                    # The statement's own text contains "migration", so the
                    # issuing backend must be excluded or it cancels itself.
                    script_content="-- Stop migration processes\nSELECT pg_cancel_backend(pid) FROM pg_stat_activity WHERE query LIKE '%migration%' AND pid <> pg_backend_pid();",
                    estimated_duration_minutes=5,
                    dependencies=[],
                    # pg_stat_activity.query keeps the last statement of idle
                    # sessions, so only active backends other than the
                    # checking session are counted; otherwise the count
                    # could never reach zero.
                    validation_commands=["SELECT COUNT(*) FROM pg_stat_activity WHERE query LIKE '%migration%' AND state = 'active' AND pid <> pg_backend_pid();"],
                    success_criteria=["No active migration processes"],
                    failure_escalation="Contact DBA immediately",
                    rollback_order=1
                ),
                RollbackStep(
                    step_id=f"rb_data_{phase_index}_02",
                    name="Restore from backup",
                    description="Restore database from pre-migration backup",
                    script_type="bash",
                    script_content=templates.get("data_rollback", {}).get("restore_backup", "pg_restore -d {database_name} -c {backup_file}"),
                    estimated_duration_minutes=30,
                    dependencies=[f"rb_data_{phase_index}_01"],
                    validation_commands=["SELECT COUNT(*) FROM information_schema.tables;"],
                    success_criteria=["Database restored successfully", "All expected tables present"],
                    failure_escalation="Escalate to senior DBA and infrastructure team",
                    rollback_order=2
                )
            ])

        if "preparation" in phase_key:
            # Schema rollback steps
            steps.append(
                RollbackStep(
                    step_id=f"rb_schema_{phase_index}_01",
                    name="Drop migration artifacts",
                    description="Remove temporary migration tables and procedures",
                    script_type="sql",
                    script_content="-- Drop migration artifacts\nDROP TABLE IF EXISTS migration_log;\nDROP PROCEDURE IF EXISTS migrate_data();",
                    estimated_duration_minutes=5,
                    dependencies=[],
                    validation_commands=["SELECT COUNT(*) FROM information_schema.tables WHERE table_name LIKE '%migration%';"],
                    success_criteria=["No migration artifacts remain"],
                    failure_escalation="Manual cleanup required",
                    rollback_order=1
                )
            )

    elif migration_type == "service":
        if "cutover" in phase_key:
            # Service rollback steps: move traffic first, then revert the deployment
            steps.extend([
                RollbackStep(
                    step_id=f"rb_service_{phase_index}_01",
                    name="Redirect traffic back to old service",
                    description="Update load balancer to route traffic back to previous service version",
                    script_type="bash",
                    script_content=templates.get("deployment_rollback", {}).get("update_load_balancer", "aws elbv2 modify-rule --rule-arn {rule_arn} --actions Type=forward,TargetGroupArn={original_target_group}"),
                    estimated_duration_minutes=2,
                    dependencies=[],
                    validation_commands=["curl -f {health_check_url}"],
                    success_criteria=["Traffic routing to original service", "Health checks passing"],
                    failure_escalation="Emergency procedure - manual traffic routing",
                    rollback_order=1
                ),
                RollbackStep(
                    step_id=f"rb_service_{phase_index}_02",
                    name="Rollback service deployment",
                    description="Revert to previous service deployment version",
                    script_type="bash",
                    script_content=templates.get("deployment_rollback", {}).get("restore_previous_version", "kubectl rollout undo deployment/{service_name} --to-revision={revision_number}"),
                    estimated_duration_minutes=10,
                    dependencies=[f"rb_service_{phase_index}_01"],
                    validation_commands=["kubectl get pods -l app={service_name} --field-selector=status.phase=Running"],
                    success_criteria=["Previous version deployed", "All pods running"],
                    failure_escalation="Manual pod management required",
                    rollback_order=2
                )
            ])

    elif migration_type == "infrastructure":
        # Infrastructure steps apply to every phase, regardless of its name
        steps.extend([
            RollbackStep(
                step_id=f"rb_infra_{phase_index}_01",
                name="Revert infrastructure changes",
                description="Apply terraform plan to revert infrastructure to previous state",
                script_type="bash",
                script_content=templates.get("cloud_rollback", {}).get("revert_terraform", "terraform apply -target={resource_name} {rollback_plan_file}"),
                estimated_duration_minutes=15,
                dependencies=[],
                validation_commands=["terraform plan -detailed-exitcode"],
                success_criteria=["Infrastructure matches previous state", "No planned changes"],
                failure_escalation="Manual infrastructure review required",
                rollback_order=1
            ),
            RollbackStep(
                step_id=f"rb_infra_{phase_index}_02",
                name="Restore DNS configuration",
                description="Revert DNS changes to point back to original infrastructure",
                script_type="bash",
                script_content=templates.get("cloud_rollback", {}).get("restore_dns", "aws route53 change-resource-record-sets --hosted-zone-id {zone_id} --change-batch file://{rollback_dns_changes}"),
                estimated_duration_minutes=10,
                dependencies=[f"rb_infra_{phase_index}_01"],
                validation_commands=["nslookup {domain_name}"],
                success_criteria=["DNS resolves to original endpoints"],
                failure_escalation="Contact DNS administrator",
                rollback_order=2
            )
        ])

    # Add generic validation step for all migration types; it depends on
    # every step generated above so it always runs last.
    steps.append(
        RollbackStep(
            step_id=f"rb_validate_{phase_index}_final",
            name="Validate rollback completion",
            description=f"Comprehensive validation that {phase_name} rollback completed successfully",
            script_type="manual",
            script_content="Execute validation checklist for this phase",
            estimated_duration_minutes=10,
            dependencies=[step.step_id for step in steps],
            validation_commands=self.validation_templates.get(migration_type, []),
            success_criteria=[f"{phase_name} fully rolled back", "All validation checks pass"],
            failure_escalation=f"Investigate {phase_name} rollback failures",
            rollback_order=99
        )
    )

    return steps
minutes", + metric_threshold={ + "metric": "p95_response_time", + "operator": "greater_than", + "value": "baseline_p95 * 3", + "duration_minutes": 10 + }, + evaluation_window_minutes=10, + auto_execute=False, + escalation_contacts=["performance_team", "migration_lead"] + ), + RollbackTriggerCondition( + trigger_id="availability_drop", + name="Service Availability Drop", + condition="availability < 95% for 2 minutes", + metric_threshold={ + "metric": "availability", + "operator": "less_than", + "value": 0.95, + "duration_minutes": 2 + }, + evaluation_window_minutes=2, + auto_execute=True, + escalation_contacts=["sre_team", "incident_commander"] + ) + ]) + + # Migration-type specific triggers + if migration_type == "database": + triggers.extend([ + RollbackTriggerCondition( + trigger_id="data_integrity_failure", + name="Data Integrity Check Failure", + condition="data_validation_failures > 0", + metric_threshold={ + "metric": "data_validation_failures", + "operator": "greater_than", + "value": 0, + "duration_minutes": 1 + }, + evaluation_window_minutes=1, + auto_execute=True, + escalation_contacts=["dba_team", "data_team"] + ), + RollbackTriggerCondition( + trigger_id="migration_progress_stalled", + name="Migration Progress Stalled", + condition="migration_progress unchanged for 30 minutes", + metric_threshold={ + "metric": "migration_progress_rate", + "operator": "equals", + "value": 0, + "duration_minutes": 30 + }, + evaluation_window_minutes=30, + auto_execute=False, + escalation_contacts=["migration_team", "dba_team"] + ) + ]) + + elif migration_type == "service": + triggers.extend([ + RollbackTriggerCondition( + trigger_id="cpu_utilization_spike", + name="CPU Utilization Spike", + condition="cpu_utilization > 90% for 15 minutes", + metric_threshold={ + "metric": "cpu_utilization", + "operator": "greater_than", + "value": 0.90, + "duration_minutes": 15 + }, + evaluation_window_minutes=15, + auto_execute=False, + escalation_contacts=["devops_team", 
def _generate_data_recovery_plan(self, migration_plan: Dict[str, Any]) -> DataRecoveryPlan:
    """Build the data recovery plan for the given migration plan.

    A plan whose ``migration_type`` is ``"database"`` gets a pg_restore-based
    plan; every other (or missing) type gets a configuration backup/restore
    plan. All values are fixed templates; nothing from the plan is
    substituted into them.
    """
    plan_kind = migration_plan.get("migration_type", "unknown")

    if plan_kind != "database":
        # Non-database migrations: restore configuration and restart.
        generic_fields = {
            "recovery_method": "backup_restore",
            "backup_location": "/backups/pre_migration_state",
            "recovery_scripts": [
                "# Restore configuration files from backup",
                "cp -r /backups/pre_migration_state/config/* /app/config/",
                "# Restart services with previous configuration",
                "systemctl restart application_service",
            ],
            "data_validation_queries": [
                "curl -f http://localhost:8080/health",
                "curl -f http://localhost:8080/api/status",
            ],
            "estimated_recovery_time_minutes": 20,
            "recovery_dependencies": ["service_stopped", "backup_accessible"],
        }
        return DataRecoveryPlan(**generic_fields)

    database_fields = {
        "recovery_method": "point_in_time",
        "backup_location": "/backups/pre_migration_{migration_id}_{timestamp}.sql",
        "recovery_scripts": [
            "pg_restore -d production -c /backups/pre_migration_backup.sql",
            "SELECT pg_create_restore_point('rollback_point');",
            "VACUUM ANALYZE; -- Refresh statistics after restore",
        ],
        "data_validation_queries": [
            "SELECT COUNT(*) FROM critical_business_table;",
            "SELECT MAX(created_at) FROM audit_log;",
            "SELECT COUNT(DISTINCT user_id) FROM user_sessions;",
            "SELECT SUM(amount) FROM financial_transactions WHERE date = CURRENT_DATE;",
        ],
        "estimated_recovery_time_minutes": 45,
        "recovery_dependencies": ["database_instance_running", "backup_file_accessible"],
    }
    return DataRecoveryPlan(**database_fields)
{emergency_action_2} + +War Room: {war_room_location} +Bridge Line: {conference_bridge} + +Next Update: {next_update_time} + +Incident Commander: {incident_commander} +Executive On-Call: {executive_on_call} +""", + urgency="emergency", + delivery_methods=["email", "sms", "phone_call"] + )) + + return templates + + def _generate_escalation_matrix(self, migration_plan: Dict[str, Any]) -> Dict[str, Any]: + """Generate escalation matrix for different failure scenarios""" + return { + "level_1": { + "trigger": "Single component failure", + "response_time_minutes": 5, + "contacts": ["on_call_engineer", "migration_lead"], + "actions": ["Investigate issue", "Attempt automated remediation", "Monitor closely"] + }, + "level_2": { + "trigger": "Multiple component failures or single critical failure", + "response_time_minutes": 2, + "contacts": ["senior_engineer", "team_lead", "devops_lead"], + "actions": ["Initiate rollback", "Establish war room", "Notify stakeholders"] + }, + "level_3": { + "trigger": "System-wide failure or data corruption", + "response_time_minutes": 1, + "contacts": ["engineering_manager", "cto", "incident_commander"], + "actions": ["Emergency rollback", "All hands on deck", "Executive notification"] + }, + "emergency": { + "trigger": "Business-critical failure with customer impact", + "response_time_minutes": 0, + "contacts": ["ceo", "cto", "head_of_operations"], + "actions": ["Emergency procedures", "Customer communication", "Media preparation if needed"] + } + } + + def _generate_validation_checklist(self, migration_plan: Dict[str, Any]) -> List[str]: + """Generate comprehensive validation checklist""" + migration_type = migration_plan.get("migration_type", "unknown") + + base_checklist = [ + "Verify system is responding to health checks", + "Confirm error rates are within normal parameters", + "Validate response times meet SLA requirements", + "Check all critical business processes are functioning", + "Verify monitoring and alerting systems are 
operational", + "Confirm no data corruption has occurred", + "Validate security controls are functioning properly", + "Check backup systems are working correctly", + "Verify integration points with downstream systems", + "Confirm user authentication and authorization working" + ] + + if migration_type == "database": + base_checklist.extend([ + "Validate database schema matches expected state", + "Confirm referential integrity constraints", + "Check database performance metrics", + "Verify data consistency across related tables", + "Validate indexes and statistics are optimal", + "Confirm transaction logs are clean", + "Check database connections and connection pooling" + ]) + + elif migration_type == "service": + base_checklist.extend([ + "Verify service discovery is working correctly", + "Confirm load balancing is distributing traffic properly", + "Check service-to-service communication", + "Validate API endpoints are responding correctly", + "Confirm feature flags are in correct state", + "Check resource utilization (CPU, memory, disk)", + "Verify container orchestration is healthy" + ]) + + elif migration_type == "infrastructure": + base_checklist.extend([ + "Verify network connectivity between components", + "Confirm DNS resolution is working correctly", + "Check firewall rules and security groups", + "Validate load balancer configuration", + "Confirm SSL/TLS certificates are valid", + "Check storage systems are accessible", + "Verify backup and disaster recovery systems" + ]) + + return base_checklist + + def _generate_post_rollback_procedures(self, migration_plan: Dict[str, Any]) -> List[str]: + """Generate post-rollback procedures""" + return [ + "Monitor system stability for 24-48 hours post-rollback", + "Conduct thorough post-rollback testing of all critical paths", + "Review and analyze rollback metrics and timing", + "Document lessons learned and rollback procedure improvements", + "Schedule post-mortem meeting with all stakeholders", + "Update rollback 
procedures based on actual experience", + "Communicate rollback completion to all stakeholders", + "Archive rollback logs and artifacts for future reference", + "Review and update monitoring thresholds if needed", + "Plan for next migration attempt with improved procedures", + "Conduct security review to ensure no vulnerabilities introduced", + "Update disaster recovery procedures if affected by rollback", + "Review capacity planning based on rollback resource usage", + "Update documentation with rollback experience and timings" + ] + + def _generate_emergency_contacts(self, migration_plan: Dict[str, Any]) -> List[Dict[str, str]]: + """Generate emergency contact list""" + return [ + { + "role": "Incident Commander", + "name": "TBD - Assigned during migration", + "primary_phone": "+1-XXX-XXX-XXXX", + "email": "incident.commander@company.com", + "backup_contact": "backup.commander@company.com" + }, + { + "role": "Technical Lead", + "name": "TBD - Migration technical owner", + "primary_phone": "+1-XXX-XXX-XXXX", + "email": "tech.lead@company.com", + "backup_contact": "senior.engineer@company.com" + }, + { + "role": "Business Owner", + "name": "TBD - Business stakeholder", + "primary_phone": "+1-XXX-XXX-XXXX", + "email": "business.owner@company.com", + "backup_contact": "product.manager@company.com" + }, + { + "role": "On-Call Engineer", + "name": "Current on-call rotation", + "primary_phone": "+1-XXX-XXX-XXXX", + "email": "oncall@company.com", + "backup_contact": "backup.oncall@company.com" + }, + { + "role": "Executive Escalation", + "name": "CTO/VP Engineering", + "primary_phone": "+1-XXX-XXX-XXXX", + "email": "cto@company.com", + "backup_contact": "vp.engineering@company.com" + } + ] + + def _calculate_urgency(self, risk_level: str) -> str: + """Calculate rollback urgency based on risk level""" + risk_to_urgency = { + "low": "low", + "medium": "medium", + "high": "high", + "critical": "emergency" + } + return risk_to_urgency.get(risk_level, "medium") + + def 
_get_rollback_prerequisites(self, phase_name: str, phase_index: int) -> List[str]: + """Get prerequisites for rollback phase""" + prerequisites = [ + "Incident commander assigned and briefed", + "All team members notified of rollback initiation", + "Monitoring systems confirmed operational", + "Backup systems verified and accessible" + ] + + if phase_index > 0: + prerequisites.append("Previous rollback phase completed successfully") + + if "cutover" in phase_name.lower(): + prerequisites.extend([ + "Traffic redirection capabilities confirmed", + "Load balancer configuration backed up", + "DNS changes prepared for quick execution" + ]) + + if "data" in phase_name.lower() or "migration" in phase_name.lower(): + prerequisites.extend([ + "Database backup verified and accessible", + "Data validation queries prepared", + "Database administrator on standby" + ]) + + return prerequisites + + def _get_validation_checkpoints(self, phase_name: str, migration_type: str) -> List[str]: + """Get validation checkpoints for rollback phase""" + checkpoints = [ + f"{phase_name} rollback steps completed", + "System health checks passing", + "No critical errors in logs", + "Key metrics within acceptable ranges" + ] + + validation_commands = self.validation_templates.get(migration_type, []) + checkpoints.extend([f"Validation command passed: {cmd[:50]}..." 
for cmd in validation_commands[:3]]) + + return checkpoints + + def _get_communication_requirements(self, phase_name: str, risk_level: str) -> List[str]: + """Get communication requirements for rollback phase""" + base_requirements = [ + "Notify incident commander of phase start/completion", + "Update rollback status dashboard", + "Log all actions and decisions" + ] + + if risk_level in ["high", "critical"]: + base_requirements.extend([ + "Notify all stakeholders of phase progress", + "Update executive team if rollback extends beyond expected time", + "Prepare customer communication if needed" + ]) + + if "cutover" in phase_name.lower(): + base_requirements.append("Immediate notification when traffic is redirected") + + return base_requirements + + def generate_human_readable_runbook(self, runbook: RollbackRunbook) -> str: + """Generate human-readable rollback runbook""" + output = [] + output.append("=" * 80) + output.append(f"ROLLBACK RUNBOOK: {runbook.runbook_id}") + output.append("=" * 80) + output.append(f"Migration ID: {runbook.migration_id}") + output.append(f"Created: {runbook.created_at}") + output.append("") + + # Emergency Contacts + output.append("EMERGENCY CONTACTS") + output.append("-" * 40) + for contact in runbook.emergency_contacts: + output.append(f"{contact['role']}: {contact['name']}") + output.append(f" Phone: {contact['primary_phone']}") + output.append(f" Email: {contact['email']}") + output.append(f" Backup: {contact['backup_contact']}") + output.append("") + + # Escalation Matrix + output.append("ESCALATION MATRIX") + output.append("-" * 40) + for level, details in runbook.escalation_matrix.items(): + output.append(f"{level.upper()}:") + output.append(f" Trigger: {details['trigger']}") + output.append(f" Response Time: {details['response_time_minutes']} minutes") + output.append(f" Contacts: {', '.join(details['contacts'])}") + output.append(f" Actions: {', '.join(details['actions'])}") + output.append("") + + # Rollback Trigger Conditions 
+ output.append("AUTOMATIC ROLLBACK TRIGGERS") + output.append("-" * 40) + for trigger in runbook.trigger_conditions: + output.append(f"• {trigger.name}") + output.append(f" Condition: {trigger.condition}") + output.append(f" Auto-Execute: {'Yes' if trigger.auto_execute else 'No'}") + output.append(f" Evaluation Window: {trigger.evaluation_window_minutes} minutes") + output.append(f" Contacts: {', '.join(trigger.escalation_contacts)}") + output.append("") + + # Rollback Phases + output.append("ROLLBACK PHASES") + output.append("-" * 40) + for i, phase in enumerate(runbook.rollback_phases, 1): + output.append(f"{i}. {phase.phase_name.upper()}") + output.append(f" Description: {phase.description}") + output.append(f" Urgency: {phase.urgency_level.upper()}") + output.append(f" Duration: {phase.estimated_duration_minutes} minutes") + output.append(f" Risk Level: {phase.risk_level.upper()}") + + if phase.prerequisites: + output.append(" Prerequisites:") + for prereq in phase.prerequisites: + output.append(f" ✓ {prereq}") + + output.append(" Steps:") + for step in sorted(phase.steps, key=lambda x: x.rollback_order): + output.append(f" {step.rollback_order}. 
{step.name}") + output.append(f" Duration: {step.estimated_duration_minutes} min") + output.append(f" Type: {step.script_type}") + if step.script_content and step.script_type != "manual": + output.append(" Script:") + for line in step.script_content.split('\n')[:3]: # Show first 3 lines + output.append(f" {line}") + if len(step.script_content.split('\n')) > 3: + output.append(" ...") + output.append(f" Success Criteria: {', '.join(step.success_criteria)}") + output.append("") + + if phase.validation_checkpoints: + output.append(" Validation Checkpoints:") + for checkpoint in phase.validation_checkpoints: + output.append(f" ☐ {checkpoint}") + output.append("") + + # Data Recovery Plan + output.append("DATA RECOVERY PLAN") + output.append("-" * 40) + drp = runbook.data_recovery_plan + output.append(f"Recovery Method: {drp.recovery_method}") + output.append(f"Backup Location: {drp.backup_location}") + output.append(f"Estimated Recovery Time: {drp.estimated_recovery_time_minutes} minutes") + output.append("Recovery Scripts:") + for script in drp.recovery_scripts: + output.append(f" • {script}") + output.append("Validation Queries:") + for query in drp.data_validation_queries: + output.append(f" • {query}") + output.append("") + + # Validation Checklist + output.append("POST-ROLLBACK VALIDATION CHECKLIST") + output.append("-" * 40) + for i, item in enumerate(runbook.validation_checklist, 1): + output.append(f"{i:2d}. ☐ {item}") + output.append("") + + # Post-Rollback Procedures + output.append("POST-ROLLBACK PROCEDURES") + output.append("-" * 40) + for i, procedure in enumerate(runbook.post_rollback_procedures, 1): + output.append(f"{i:2d}. 
def _derive_text_output_path(output_path, output_format):
    """Return the file path for the human-readable runbook, or None for stdout.

    Args:
        output_path: Value of ``--output`` (may be None or empty).
        output_format: Value of ``--format`` ("json", "text" or "both").

    Returns:
        None when no output path was given. Otherwise a path that never
        collides with the JSON file written for the same invocation.
    """
    if not output_path:
        return None

    json_suffix = ".json"
    # Only swap a trailing ".json" extension; a blanket str.replace would also
    # rewrite ".json" occurring in directory names or in the middle of the name.
    if output_path.lower().endswith(json_suffix):
        return output_path[:-len(json_suffix)] + ".txt"

    # With "both", the JSON has already been written to output_path itself, so
    # the text runbook must go to a sibling file instead of overwriting it.
    if output_format == "both":
        return output_path + ".txt"

    return output_path


def main(argv=None):
    """Command line entry point: build a rollback runbook from a migration plan.

    Args:
        argv: Optional list of command line arguments (without the program
            name). When None, argparse falls back to ``sys.argv[1:]``, which
            preserves the behaviour of calling ``main()`` with no arguments.

    Returns:
        Process exit code: 0 on success, 1 on any reported error.
    """
    parser = argparse.ArgumentParser(description="Generate comprehensive rollback runbooks from migration plans")
    parser.add_argument("--input", "-i", required=True, help="Input migration plan file (JSON)")
    parser.add_argument("--output", "-o", help="Output file for rollback runbook (JSON)")
    parser.add_argument("--format", "-f", choices=["json", "text", "both"], default="both", help="Output format")

    args = parser.parse_args(argv)

    # Load the migration plan. This try is kept narrow so that a
    # FileNotFoundError raised later while writing outputs is not
    # misreported as a missing input file.
    try:
        with open(args.input, "r", encoding="utf-8") as f:
            migration_plan = json.load(f)
    except FileNotFoundError:
        print(f"Error: Input file '{args.input}' not found", file=sys.stderr)
        return 1
    except json.JSONDecodeError as e:
        print(f"Error: Invalid JSON in input file: {e}", file=sys.stderr)
        return 1
    except (OSError, UnicodeDecodeError) as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1

    # Validate required fields. The plan must be a JSON object; a list or a
    # scalar would otherwise slip through (or crash) the membership tests.
    if not isinstance(migration_plan, dict):
        print("Error: Migration plan must be a JSON object", file=sys.stderr)
        return 1
    if "migration_id" not in migration_plan and "source" not in migration_plan:
        print("Error: Migration plan must contain migration_id or source field", file=sys.stderr)
        return 1

    try:
        # Generate rollback runbook
        generator = RollbackGenerator()
        runbook = generator.generate_rollback_runbook(migration_plan)

        # Output results
        if args.format in ("json", "both"):
            runbook_dict = asdict(runbook)
            if args.output:
                with open(args.output, "w", encoding="utf-8") as f:
                    json.dump(runbook_dict, f, indent=2)
                print(f"Rollback runbook saved to {args.output}")
            else:
                print(json.dumps(runbook_dict, indent=2))

        if args.format in ("text", "both"):
            human_runbook = generator.generate_human_readable_runbook(runbook)
            text_output = _derive_text_output_path(args.output, args.format)
            if text_output:
                # The runbook text contains non-ASCII glyphs (bullets, check
                # marks), so the encoding must not depend on the locale.
                with open(text_output, "w", encoding="utf-8") as f:
                    f.write(human_runbook)
                print(f"Human-readable runbook saved to {text_output}")
            else:
                print("\n" + "=" * 80)
                print("HUMAN-READABLE ROLLBACK RUNBOOK")
                print("=" * 80)
                print(human_runbook)
    except Exception as e:
        # Top-level CLI boundary: report the failure and exit non-zero rather
        # than dumping a traceback on the user.
        print(f"Error: {e}", file=sys.stderr)
        return 1

    return 0


if __name__ == "__main__":
    sys.exit(main())