From f6f50f528244fc36b3a91e08b1c3a23463b0648d Mon Sep 17 00:00:00 2001 From: Leo Date: Mon, 16 Feb 2026 11:30:18 +0000 Subject: [PATCH] Fix CI workflows and installation documentation - Replace non-existent anthropics/claude-code-action@v1 with direct bash steps in smart-sync.yml and pr-issue-auto-close.yml - Add missing checkout steps to both workflows for WORKFLOW_KILLSWITCH access - Fix Issue #189: Replace broken 'npx ai-agent-skills install' with working 'npx agent-skills-cli add' command - Update README.md and INSTALLATION.md with correct Agent Skills CLI commands and repository links - Verified: agent-skills-cli detects all 53 skills and works with 42+ AI agents Fixes: Two GitHub Actions workflows that broke on PR #191 merge Closes: #189 --- .github/workflows/pr-issue-auto-close.yml | 3 + .github/workflows/smart-sync.yml | 386 +++--- AUDIT_REPORT.md | 316 +++++ INSTALLATION.md | 145 +- README.md | 20 +- engineering-team/incident-commander/SKILL.md | 693 ++++++++++ .../assets/incident_report_template.md | 171 +++ .../assets/runbook_template.md | 289 ++++ .../assets/sample_incident_data.json | 276 ++++ .../references/incident-response-framework.md | 372 +++++ .../references/sla-management-guide.md | 566 ++++++++ .../scripts/incident_timeline_builder.py | 742 ++++++++++ .../scripts/postmortem_generator.py | 804 +++++++++++ .../scripts/severity_classifier.py | 1228 +++++++++++++++++ 14 files changed, 5716 insertions(+), 295 deletions(-) create mode 100644 AUDIT_REPORT.md create mode 100644 engineering-team/incident-commander/SKILL.md create mode 100644 engineering-team/incident-commander/assets/incident_report_template.md create mode 100644 engineering-team/incident-commander/assets/runbook_template.md create mode 100644 engineering-team/incident-commander/assets/sample_incident_data.json create mode 100644 engineering-team/incident-commander/references/incident-response-framework.md create mode 100644 
engineering-team/incident-commander/references/sla-management-guide.md create mode 100644 engineering-team/incident-commander/scripts/incident_timeline_builder.py create mode 100644 engineering-team/incident-commander/scripts/postmortem_generator.py create mode 100644 engineering-team/incident-commander/scripts/severity_classifier.py diff --git a/.github/workflows/pr-issue-auto-close.yml b/.github/workflows/pr-issue-auto-close.yml index c0d9d9e..da6dbe0 100644 --- a/.github/workflows/pr-issue-auto-close.yml +++ b/.github/workflows/pr-issue-auto-close.yml @@ -17,6 +17,9 @@ jobs: runs-on: ubuntu-latest steps: + - name: Checkout repository + uses: actions/checkout@v4 + - name: Check Workflow Kill Switch run: | if [ -f ".github/WORKFLOW_KILLSWITCH" ]; then diff --git a/.github/workflows/smart-sync.yml b/.github/workflows/smart-sync.yml index 090d00a..3998b50 100644 --- a/.github/workflows/smart-sync.yml +++ b/.github/workflows/smart-sync.yml @@ -27,6 +27,9 @@ jobs: issue_number: ${{ steps.check.outputs.issue_number }} steps: + - name: Checkout repository + uses: actions/checkout@v4 + - name: Check Workflow Kill Switch run: | if [ -f ".github/WORKFLOW_KILLSWITCH" ]; then @@ -142,23 +145,35 @@ jobs: fetch-depth: 1 - name: Sync Issue to Project Board - uses: anthropics/claude-code-action@v1 env: GH_TOKEN: ${{ secrets.PROJECTS_TOKEN }} - with: - claude_code_oauth_token: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }} + run: | + echo "# Issue → Project Board Sync" + echo "**Issue**: #${{ github.event.issue.number }} \"${{ github.event.issue.title }}\"" + echo "**State**: ${{ github.event.issue.state }}" + echo "**Action**: ${{ github.event.action }}" - prompt: | - # Issue → Project Board Sync + # Step 1: Check if in Project + PROJECT_ITEM=$(gh api graphql -f query=' + query { + repository(owner: "alirezarezvani", name: "claude-skills") { + issue(number: ${{ github.event.issue.number }}) { + projectItems(first: 10) { + nodes { + id + project { number } + } + } + } + } + } + ' --jq 
'.data.repository.issue.projectItems.nodes[] | select(.project.number == 9) | .id') - **Issue**: #${{ github.event.issue.number }} "${{ github.event.issue.title }}" - **State**: ${{ github.event.issue.state }} - **Action**: ${{ github.event.action }} + if [ -z "$PROJECT_ITEM" ]; then + echo "Adding to project..." + gh project item-add 9 --owner alirezarezvani --url ${{ github.event.issue.html_url }} + sleep 2 - ## Task: Sync issue status to project board - - ### Step 1: Check if in Project - ```bash PROJECT_ITEM=$(gh api graphql -f query=' query { repository(owner: "alirezarezvani", name: "claude-skills") { @@ -173,118 +188,84 @@ jobs: } } ' --jq '.data.repository.issue.projectItems.nodes[] | select(.project.number == 9) | .id') + fi - if [ -z "$PROJECT_ITEM" ]; then - echo "Adding to project..." - gh project item-add 9 --owner alirezarezvani --url ${{ github.event.issue.html_url }} - sleep 2 + echo "Project Item ID: $PROJECT_ITEM" - PROJECT_ITEM=$(gh api graphql -f query=' - query { - repository(owner: "alirezarezvani", name: "claude-skills") { - issue(number: ${{ github.event.issue.number }}) { - projectItems(first: 10) { - nodes { - id - project { number } - } - } - } - } - } - ' --jq '.data.repository.issue.projectItems.nodes[] | select(.project.number == 9) | .id') - fi + # Step 2: Determine Target Status + LABELS=$(gh issue view ${{ github.event.issue.number }} --json labels --jq '[.labels[].name] | join(",")') + ISSUE_STATE="${{ github.event.issue.state }}" - echo "Project Item ID: $PROJECT_ITEM" - ``` + # Priority order: closed state > status labels > default + if [ "$ISSUE_STATE" = "closed" ]; then + TARGET_STATUS="Done" + elif echo "$LABELS" | grep -q "status: done"; then + TARGET_STATUS="Done" + elif echo "$LABELS" | grep -q "status: in-review"; then + TARGET_STATUS="In Review" + elif echo "$LABELS" | grep -q "status: in-progress"; then + TARGET_STATUS="In Progress" + elif echo "$LABELS" | grep -q "status: ready"; then + TARGET_STATUS="Ready" + elif echo 
"$LABELS" | grep -q "status: backlog"; then + TARGET_STATUS="Backlog" + elif echo "$LABELS" | grep -q "status: triage"; then + TARGET_STATUS="To triage" + else + TARGET_STATUS=$([ "$ISSUE_STATE" = "open" ] && echo "To triage" || echo "Done") + fi - ### Step 2: Determine Target Status - ```bash - LABELS=$(gh issue view ${{ github.event.issue.number }} --json labels --jq '[.labels[].name] | join(",")') - ISSUE_STATE="${{ github.event.issue.state }}" + echo "Target Status: $TARGET_STATUS" - # Priority order: closed state > status labels > default - if [ "$ISSUE_STATE" = "closed" ]; then - TARGET_STATUS="Done" - elif echo "$LABELS" | grep -q "status: done"; then - TARGET_STATUS="Done" - elif echo "$LABELS" | grep -q "status: in-review"; then - TARGET_STATUS="In Review" - elif echo "$LABELS" | grep -q "status: in-progress"; then - TARGET_STATUS="In Progress" - elif echo "$LABELS" | grep -q "status: ready"; then - TARGET_STATUS="Ready" - elif echo "$LABELS" | grep -q "status: backlog"; then - TARGET_STATUS="Backlog" - elif echo "$LABELS" | grep -q "status: triage"; then - TARGET_STATUS="To triage" - else - TARGET_STATUS=$([ "$ISSUE_STATE" = "open" ] && echo "To triage" || echo "Done") - fi - - echo "Target Status: $TARGET_STATUS" - ``` - - ### Step 3: Get Project IDs - ```bash - PROJECT_DATA=$(gh api graphql -f query=' - query { - user(login: "alirezarezvani") { - projectV2(number: 9) { - id - fields(first: 20) { - nodes { - ... on ProjectV2SingleSelectField { + # Step 3: Get Project IDs + PROJECT_DATA=$(gh api graphql -f query=' + query { + user(login: "alirezarezvani") { + projectV2(number: 9) { + id + fields(first: 20) { + nodes { + ... 
on ProjectV2SingleSelectField { + id + name + options { id name - options { - id - name - } } } } } } } - ') + } + ') - PROJECT_ID=$(echo "$PROJECT_DATA" | jq -r '.data.user.projectV2.id') - STATUS_FIELD_ID=$(echo "$PROJECT_DATA" | \ - jq -r '.data.user.projectV2.fields.nodes[] | select(.name == "Status") | .id') - STATUS_OPTION_ID=$(echo "$PROJECT_DATA" | jq -r --arg status "$TARGET_STATUS" \ - '.data.user.projectV2.fields.nodes[] | select(.name == "Status") | .options[] | select(.name == $status) | .id') - ``` + PROJECT_ID=$(echo "$PROJECT_DATA" | jq -r '.data.user.projectV2.id') + STATUS_FIELD_ID=$(echo "$PROJECT_DATA" | \ + jq -r '.data.user.projectV2.fields.nodes[] | select(.name == "Status") | .id') + STATUS_OPTION_ID=$(echo "$PROJECT_DATA" | jq -r --arg status "$TARGET_STATUS" \ + '.data.user.projectV2.fields.nodes[] | select(.name == "Status") | .options[] | select(.name == $status) | .id') - ### Step 4: Update Project Board - ```bash - if [ -n "$PROJECT_ITEM" ] && [ -n "$STATUS_OPTION_ID" ]; then - gh api graphql -f query=' - mutation { - updateProjectV2ItemFieldValue( - input: { - projectId: "'"$PROJECT_ID"'" - itemId: "'"$PROJECT_ITEM"'" - fieldId: "'"$STATUS_FIELD_ID"'" - value: { singleSelectOptionId: "'"$STATUS_OPTION_ID"'" } - } - ) { - projectV2Item { id } + # Step 4: Update Project Board + if [ -n "$PROJECT_ITEM" ] && [ -n "$STATUS_OPTION_ID" ]; then + gh api graphql -f query=' + mutation { + updateProjectV2ItemFieldValue( + input: { + projectId: "'"$PROJECT_ID"'" + itemId: "'"$PROJECT_ITEM"'" + fieldId: "'"$STATUS_FIELD_ID"'" + value: { singleSelectOptionId: "'"$STATUS_OPTION_ID"'" } } + ) { + projectV2Item { id } } - ' - echo "✅ Project board updated to: $TARGET_STATUS" - else - echo "⚠️ Could not update (missing IDs)" - fi - ``` - - ## Rules - - DO NOT comment on issue (prevents notification spam) - - DO NOT modify issue labels (prevents sync loop) - - Only update project board status - - claude_args: '--allowed-tools "Bash(gh issue:*),Bash(gh 
api:*),Bash(gh project:*),Bash(echo:*),Bash(sleep:*)"' + } + ' + echo "✅ Project board updated to: $TARGET_STATUS" + else + echo "⚠️ Could not update (missing IDs)" + fi sync-project-to-issue: needs: [determine-direction, rate-limit-check, debounce] @@ -305,66 +286,55 @@ jobs: fetch-depth: 1 - name: Sync Project Board to Issue - uses: anthropics/claude-code-action@v1 env: GH_TOKEN: ${{ secrets.PROJECTS_TOKEN }} - with: - claude_code_oauth_token: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }} + run: | + echo "# Project Board → Issue Sync" + echo "**Project Item**: ${{ github.event.projects_v2_item.node_id }}" + echo "**Content**: ${{ github.event.projects_v2_item.content_node_id }}" + echo "**Changed By**: @${{ github.event.sender.login }}" - prompt: | - # Project Board → Issue Sync + # Step 1: Get Issue Number + CONTENT_ID="${{ github.event.projects_v2_item.content_node_id }}" - **Project Item**: ${{ github.event.projects_v2_item.node_id }} - **Content**: ${{ github.event.projects_v2_item.content_node_id }} - **Changed By**: @${{ github.event.sender.login }} - - ## Task: Sync project board status to issue - - ### Step 1: Get Issue Number - ```bash - CONTENT_ID="${{ github.event.projects_v2_item.content_node_id }}" - - ISSUE_DATA=$(gh api graphql -f query=' - query { - node(id: "${{ github.event.projects_v2_item.node_id }}") { - ... on ProjectV2Item { - content { - ... on Issue { - number - url - state - title - } + ISSUE_DATA=$(gh api graphql -f query=' + query { + node(id: "${{ github.event.projects_v2_item.node_id }}") { + ... on ProjectV2Item { + content { + ... 
on Issue { + number + url + state + title } } } } - ') + } + ') - ISSUE_NUMBER=$(echo "$ISSUE_DATA" | jq -r '.data.node.content.number') + ISSUE_NUMBER=$(echo "$ISSUE_DATA" | jq -r '.data.node.content.number') - if [ -z "$ISSUE_NUMBER" ] || [ "$ISSUE_NUMBER" = "null" ]; then - echo "⏭️ Not an issue (might be PR or other content)" - exit 0 - fi + if [ -z "$ISSUE_NUMBER" ] || [ "$ISSUE_NUMBER" = "null" ]; then + echo "⏭️ Not an issue (might be PR or other content)" + exit 0 + fi - echo "Issue Number: $ISSUE_NUMBER" - ``` + echo "Issue Number: $ISSUE_NUMBER" - ### Step 2: Get Project Status - ```bash - STATUS=$(gh api graphql -f query=' - query { - node(id: "${{ github.event.projects_v2_item.node_id }}") { - ... on ProjectV2Item { - fieldValues(first: 20) { - nodes { - ... on ProjectV2ItemFieldSingleSelectValue { - name - field { - ... on ProjectV2SingleSelectField { - name - } + # Step 2: Get Project Status + STATUS=$(gh api graphql -f query=' + query { + node(id: "${{ github.event.projects_v2_item.node_id }}") { + ... on ProjectV2Item { + fieldValues(first: 20) { + nodes { + ... on ProjectV2ItemFieldSingleSelectValue { + name + field { + ... 
on ProjectV2SingleSelectField { + name } } } @@ -372,71 +342,55 @@ jobs: } } } - ' --jq '.data.node.fieldValues.nodes[] | select(.field.name == "Status") | .name') + } + ' --jq '.data.node.fieldValues.nodes[] | select(.field.name == "Status") | .name') - if [ -z "$STATUS" ]; then - echo "⏭️ No status field found" + if [ -z "$STATUS" ]; then + echo "⏭️ No status field found" + exit 0 + fi + + echo "Project Status: $STATUS" + + # Step 3: Map Status to Label + case "$STATUS" in + "To triage") NEW_LABEL="status: triage" ;; + "Backlog") NEW_LABEL="status: backlog" ;; + "Ready") NEW_LABEL="status: ready" ;; + "In Progress") NEW_LABEL="status: in-progress" ;; + "In Review") NEW_LABEL="status: in-review" ;; + "Done") NEW_LABEL="status: done" ;; + *) + echo "⏭️ Unknown status: $STATUS" exit 0 + ;; + esac + + echo "Target Label: $NEW_LABEL" + + # Step 4: Update Issue Labels + CURRENT_LABELS=$(gh issue view $ISSUE_NUMBER --json labels --jq '[.labels[].name] | join(",")') + + # Remove all status: labels + for label in "status: triage" "status: backlog" "status: ready" "status: in-progress" "status: in-review" "status: done"; do + if echo "$CURRENT_LABELS" | grep -q "$label"; then + gh issue edit $ISSUE_NUMBER --remove-label "$label" 2>/dev/null || true fi + done - echo "Project Status: $STATUS" - ``` + # Add new status label + gh issue edit $ISSUE_NUMBER --add-label "$NEW_LABEL" + echo "✅ Label updated to: $NEW_LABEL" - ### Step 3: Map Status to Label - ```bash - case "$STATUS" in - "To triage") NEW_LABEL="status: triage" ;; - "Backlog") NEW_LABEL="status: backlog" ;; - "Ready") NEW_LABEL="status: ready" ;; - "In Progress") NEW_LABEL="status: in-progress" ;; - "In Review") NEW_LABEL="status: in-review" ;; - "Done") NEW_LABEL="status: done" ;; - *) - echo "⏭️ Unknown status: $STATUS" - exit 0 - ;; - esac + # Step 5: Handle Issue State + CURRENT_STATE=$(gh issue view $ISSUE_NUMBER --json state --jq '.state') - echo "Target Label: $NEW_LABEL" - ``` + if [ "$STATUS" = "Done" ] && 
[ "$CURRENT_STATE" = "OPEN" ]; then + gh issue close $ISSUE_NUMBER --reason completed + echo "✅ Issue closed (moved to Done)" + elif [ "$STATUS" != "Done" ] && [ "$CURRENT_STATE" = "CLOSED" ]; then + gh issue reopen $ISSUE_NUMBER + echo "✅ Issue reopened (moved from Done)" + fi - ### Step 4: Update Issue Labels - ```bash - CURRENT_LABELS=$(gh issue view $ISSUE_NUMBER --json labels --jq '[.labels[].name] | join(",")') - - # Remove all status: labels - for label in "status: triage" "status: backlog" "status: ready" "status: in-progress" "status: in-review" "status: done"; do - if echo "$CURRENT_LABELS" | grep -q "$label"; then - gh issue edit $ISSUE_NUMBER --remove-label "$label" 2>/dev/null || true - fi - done - - # Add new status label - gh issue edit $ISSUE_NUMBER --add-label "$NEW_LABEL" - echo "✅ Label updated to: $NEW_LABEL" - ``` - - ### Step 5: Handle Issue State - ```bash - CURRENT_STATE=$(gh issue view $ISSUE_NUMBER --json state --jq '.state') - - if [ "$STATUS" = "Done" ] && [ "$CURRENT_STATE" = "OPEN" ]; then - gh issue close $ISSUE_NUMBER --reason completed - echo "✅ Issue closed (moved to Done)" - elif [ "$STATUS" != "Done" ] && [ "$CURRENT_STATE" = "CLOSED" ]; then - gh issue reopen $ISSUE_NUMBER - echo "✅ Issue reopened (moved from Done)" - fi - ``` - - ### Step 6: Silent Completion - ```bash - echo "✅ Sync complete: Issue #$ISSUE_NUMBER updated to $STATUS" - ``` - - ## Rules - - DO NOT comment on issue (prevents notification spam) - - DO NOT modify project board (prevents sync loop) - - Only update issue labels and state - - claude_args: '--allowed-tools "Bash(gh issue:*),Bash(gh api:*),Bash(echo:*)"' + echo "✅ Sync complete: Issue #$ISSUE_NUMBER updated to $STATUS" diff --git a/AUDIT_REPORT.md b/AUDIT_REPORT.md new file mode 100644 index 0000000..fa5ac21 --- /dev/null +++ b/AUDIT_REPORT.md @@ -0,0 +1,316 @@ +# Skills Audit Report + +**Date:** 2026-02-15 +**Auditor:** Automated Skill Quality Audit +**Scope:** Recently added skills in 
business-growth/, finance/, marketing-skill/campaign-analytics/, project-management/ +--- + +## Executive Summary + +The recently added skills fall into two distinct tiers: + +1. **Business-growth, Finance, and Campaign Analytics skills** — Genuinely impressive. Production-ready Python tooling, deep domain frameworks, real structured outputs. These would make a domain practitioner say "this actually knows what it's doing." + +2. **Project Management skills** — A mixed bag. The Atlassian-specific skills (jira-expert, confluence-expert, atlassian-admin, atlassian-templates) have strong knowledge-base content. The scrum-master and senior-pm skills are thin and generic. None of the PM skills have scripts or assets — they're pure prompt-engineering skills, which is a fundamentally different (and weaker) category. + +**Overall: 4 POWERFUL, 5 SOLID, 1 GENERIC, 1 WEAK** + +--- + +## Detailed Skill Audits + +--- + +### 1. business-growth/customer-success-manager + +**Code Quality: EXCELLENT** +- 3 Python scripts (438 + 487 + 414 = 1,339 lines total) +- Well-structured: proper typing, argparse CLI, JSON/text dual output, error handling +- Zero external dependencies (stdlib only) — deliberate, documented design choice +- `health_score_calculator.py`: Multi-dimensional weighted scoring with segment-aware benchmarks (Enterprise/Mid-Market/SMB).
Not placeholder math — real configurable thresholds, normalization logic, trend analysis +- `churn_risk_analyzer.py`: Behavioral signal detection with renewal urgency multipliers +- `expansion_opportunity_scorer.py`: Whitespace mapping and effort-vs-impact prioritization +- All scripts actually runnable with provided sample data + +**Problem-Solving Quality: EXCELLENT** +- Health scoring framework reference (80+ lines) explains *why* each dimension is weighted as it is — genuinely pedagogical +- Real CS playbooks: not "be proactive" platitudes but specific intervention triggers (e.g., "if health score drops below yellow for 2 consecutive periods, escalate") +- QBR template is production-ready — has ROI summary tables, value-delivered sections, next-quarter planning +- Success plan template, onboarding checklist, executive business review — all structured with fill-in tables +- Uses real industry frameworks: DAU/MAU ratio, NPS scoring methodology, multi-threading depth + +**Structure: STRONG** +- SKILL.md has proper frontmatter, TOC, input/output schemas, limitations section +- References are actually used by the scripts (health-scoring-framework.md maps directly to score calculation logic) +- Assets include sample data AND expected output JSON for validation + +**Verdict: POWERFUL** ⭐ +*Evidence: A CS leader could hand this to a team and they'd have a working health scoring system same day. The weighted scoring model with segment-aware thresholds is exactly how real CS platforms (Gainsight, Totango) work. The scripts produce structured JSON that could feed a dashboard.* + +--- + +### 2. 
business-growth/revenue-operations + +**Code Quality: EXCELLENT** +- 3 scripts (496 + 531 + 658 = 1,685 lines total) — the largest script set +- `pipeline_analyzer.py`: Coverage ratios, stage conversion rates, sales velocity formula (Opportunities × Avg Deal × Win Rate / Cycle), deal aging detection, concentration risk warnings +- `forecast_accuracy_tracker.py`: MAPE calculation, period-over-period accuracy trending +- `gtm_efficiency_calculator.py`: CAC, LTV, CAC payback period, magic number, burn multiple — these are real SaaS metrics, not made up +- Proper CLI args, dual output format, input validation + +**Problem-Solving Quality: EXCELLENT** +- RevOps metrics guide references real benchmarks (3-4x pipeline coverage, magic number >0.75) +- Pipeline management framework covers qualification methodology +- GTM efficiency benchmarks are industry-standard (Bessemer, OpenView style) +- Templates: pipeline review, forecast report, GTM dashboard — all structured with metric tables + +**Structure: STRONG** +- Consistent with customer-success-manager pattern +- Sample data files for all three scripts +- Expected output JSON for validation + +**Verdict: POWERFUL** ⭐ +*Evidence: The pipeline analyzer alone replaces basic Salesforce reporting. The GTM efficiency calculator uses the exact metrics VCs and board members ask for (magic number, burn multiple, CAC payback). A RevOps manager would find real utility here.* + +--- + +### 3. 
business-growth/sales-engineer + +**Code Quality: EXCELLENT** +- 3 scripts (557 + 525 + 765 = 1,847 lines total) — largest individual script set +- `rfp_response_analyzer.py`: Weighted coverage scoring (Full/Partial/Planned/Gap × Must-have/Should-have/Nice-to-have), automated bid/no-bid recommendation with configurable thresholds +- `competitive_matrix_builder.py`: Feature-by-feature comparison with differentiator/vulnerability identification +- `poc_planner.py`: Timeline generation, resource planning, success criteria definition, evaluation scorecards +- 765-line POC planner is genuinely comprehensive + +**Problem-Solving Quality: EXCELLENT** +- 5-phase workflow (Discovery → Solution Design → Demo → POC → Close) maps to real SE methodology +- RFP analyzer produces structured gap analysis with mitigation strategies — not just "you have gaps" +- Competitive positioning framework includes feature-level comparison, not just "we're better" +- Demo script template and POC scorecard are practitioner-level artifacts +- Technical proposal template has architecture section + +**Structure: STRONG** +- Same consistent pattern as other business-growth skills +- Rich asset set: demo script template, POC scorecard, technical proposal template, sample RFP data +- References cover competitive positioning, POC best practices, RFP response methodology + +**Verdict: POWERFUL** ⭐ +*Evidence: The RFP analyzer with weighted coverage scoring and bid/no-bid recommendation is something SEs actually need and usually do in spreadsheets. The POC planner at 765 lines is the most substantive single script in this batch. A pre-sales team could adopt this immediately.* + +--- + +### 4. 
finance/financial-analyst + +**Code Quality: EXCELLENT** +- 4 scripts (432 + 449 + 406 + 494 = 1,781 lines total) +- `ratio_calculator.py`: 20+ ratios across 5 categories (profitability, liquidity, leverage, efficiency, valuation) — ROE, ROA, DSCR, DSO, EV/EBITDA, PEG ratio +- `dcf_valuation.py`: Full DCF model with WACC via CAPM, 5-year projections, terminal value (perpetuity growth AND exit multiple methods), two-way sensitivity analysis, equity bridge +- `budget_variance_analyzer.py`: Favorable/unfavorable classification by department and category +- `forecast_builder.py`: Driver-based forecasting with scenario modeling (base/bull/bear) +- All stdlib only, handles edge cases (inf values in JSON serialization) + +**Problem-Solving Quality: EXCELLENT** +- DCF model implements real finance: CAPM cost of equity, after-tax cost of debt, terminal value via both methods, sensitivity matrix — this is textbook corporate finance done correctly +- Ratio guide includes interpretation context (not just "here's the number" but "here's what it means") +- Valuation methodology reference explains when to use DCF vs. comparables vs. precedent transactions +- Forecasting best practices cover driver-based vs. trend-based approaches +- Variance report template is exactly what FP&A teams produce monthly + +**Structure: STRONG** +- Consistent format with other skills +- 4 scripts (most of any skill) — comprehensive coverage of analyst workflow +- Sample data, expected output, 3 templates (DCF, forecast, variance) + +**Verdict: POWERFUL** ⭐ +*Evidence: The DCF valuation model alone is genuinely useful — it implements WACC calculation, cash flow projection, terminal value via two methods, and sensitivity analysis. A junior analyst could use this as a learning tool; a senior analyst could use it for quick-and-dirty valuations. The sensitivity table output is exactly what you'd see in an investment banking pitch book.* + +--- + +### 5. 
marketing-skill/campaign-analytics + +**Code Quality: VERY GOOD** +- 3 scripts (347 + 459 + 305 = 1,111 lines total) — smallest script set but still substantive +- `attribution_analyzer.py`: 5 attribution models (first-touch, last-touch, linear, time-decay, position-based) — these are the real standard models used in marketing analytics +- `campaign_roi_calculator.py`: ROI, ROAS, CPA, CPL, CAC with industry benchmarking +- `funnel_analyzer.py`: Stage-by-stage conversion rates, drop-off identification, bottleneck detection +- Time-decay model uses configurable half-life parameter — not just a label + +**Problem-Solving Quality: VERY GOOD** +- Attribution models guide explains when to use each model (rare — most resources just list them) +- Funnel optimization framework covers real concepts (stage-specific interventions, not just "improve conversion") +- Campaign metrics benchmarks provide industry reference points +- A/B test template and channel comparison template are useful artifacts + +**Structure: STRONG** +- Consistent with business-growth pattern +- References tied to script functionality +- Sample data with customer journeys for attribution testing + +**Verdict: SOLID** (borderline POWERFUL) +*Evidence: The 5 attribution models are correctly implemented and genuinely useful for any marketing team not yet using a dedicated attribution platform. However, the funnel analyzer (305 lines) is thinner than the equivalent scripts in other skills, and the overall scope is narrower than the business-growth skills.* + +--- + +### 6. 
project-management/jira-expert + +**Code Quality: N/A** — No scripts + +**Problem-Solving Quality: GOOD** +- JQL examples reference is genuinely useful — covers sprint queries, team workload, SLA tracking, change management queries +- Automation examples reference covers real Jira automation rules +- SKILL.md has comprehensive workflow descriptions for project creation, workflow design, JQL building +- Actually teaches JQL syntax with practical examples, not just theory + +**Structure: ADEQUATE** +- No scripts, no assets, no sample data +- But the references are substantive (415 + 423 = 838 lines of reference material) +- Workflows reference other PM skills (Scrum Master, Confluence Expert) — good cross-linking + +**Verdict: SOLID** +*Evidence: The JQL examples alone are a legitimate reference resource. The automation examples cover real-world rules. But without scripts or structured output tooling, this is fundamentally a knowledge-base skill, not a tool skill. It makes Claude better at Jira advice but doesn't produce artifacts.* + +--- + +### 7. project-management/confluence-expert + +**Code Quality: N/A** — No scripts + +**Problem-Solving Quality: GOOD** +- Templates reference (725 lines) contains 10+ ready-to-use Confluence page templates: meeting notes, decision log, project status, runbook, postmortem, ADR, onboarding guide +- Space architecture guidance is practical and specific (max 3 levels deep, naming conventions) +- Macro usage examples are helpful for Confluence power users + +**Structure: ADEQUATE** +- Strong reference content compensates for lack of scripts +- Templates are the actual artifact output — when Claude uses this skill, it produces Confluence pages + +**Verdict: SOLID** +*Evidence: The templates reference is the real value here — it's a curated library of production-quality Confluence page templates. A team setting up Confluence from scratch would find this genuinely useful. 
The space architecture guidance reflects real organizational experience.* + +--- + +### 8. project-management/atlassian-admin + +**Code Quality: N/A** — No scripts + +**Problem-Solving Quality: GOOD** +- SKILL.md is comprehensive at 414 lines covering user provisioning, deprovisioning, group management, permission schemes, security configuration +- Workflows are procedural and actionable (step-by-step with handoffs to other skills) +- Permission scheme design section is practical — distinguishes public/team/restricted/confidential project types +- SSO/SAML and security policy coverage is relevant + +**Structure: ADEQUATE** +- No references, no assets — all content in SKILL.md +- Good cross-references to other PM skills (Jira Expert, Confluence Expert) + +**Verdict: SOLID** +*Evidence: The user provisioning/deprovisioning workflows with audit steps reflect real admin concerns (content reassignment before account deletion). Permission scheme design is specific enough to be useful. But without reference docs or scripts, it's a well-written playbook rather than a tool.* + +--- + +### 9. project-management/atlassian-templates + +**Code Quality: N/A** — No scripts + +**Problem-Solving Quality: GOOD** +- SKILL.md at 751 lines is the longest PM skill — contains actual template content inline +- Template creation process (10-step) and modification process (8-step) are well-structured +- Contains ready-to-use templates: meeting notes, decision log, sprint planning, retrospective, project charter +- Blueprint development workflow is practical + +**Structure: ADEQUATE** +- All content in SKILL.md — no separate references or assets +- Templates are embedded directly rather than in a templates/ directory + +**Verdict: SOLID** +*Evidence: The templates themselves are the deliverable, and they're decent. The template governance process (versioning, deprecation, migration) shows organizational maturity. 
However, significant overlap with confluence-expert/references/templates.md raises questions about redundancy.* + +--- + +### 10. project-management/scrum-master + +**Code Quality: N/A** — No scripts + +**Problem-Solving Quality: MEDIOCRE** +- SKILL.md at 189 lines is thin — covers basic Scrum ceremonies at a surface level +- Nothing here goes beyond what's in the Scrum Guide +- No velocity tracking formulas, no capacity planning models, no sprint health metrics +- Retro formats reference (336 lines) is the saving grace — covers Start/Stop/Continue, Glad/Sad/Mad, 4Ls, Sailboat, DAKI formats with actual process steps + +**Structure: WEAK** +- No assets, no sample data +- Single reference file +- Cross-references to Jira Expert and Confluence Expert add some value + +**Verdict: GENERIC** +*Evidence: A certified Scrum Master would find nothing new here. The retro formats reference is genuinely useful but is the only substantive content. The SKILL.md reads like a job description, not a methodology. No metrics, no anti-patterns, no "when sprints go wrong" playbooks. Missing: burndown analysis tools, velocity prediction, capacity planning scripts.* + +--- + +### 11. project-management/senior-pm + +**Code Quality: N/A** — No scripts + +**Problem-Solving Quality: WEAK** +- SKILL.md at 146 lines is the thinnest skill in the entire batch +- `references/api_reference.md` is literally a placeholder: "This is a placeholder for detailed reference documentation. Replace with actual reference content or delete if not needed." 
+- Content is generic PM advice: "develop product roadmaps aligned with business objectives," "identify and mitigate project risks" +- No frameworks, no decision models, no risk quantification methods +- No RACI template, no project charter template despite mentioning them + +**Structure: WEAK** +- Placeholder reference file is a red flag +- No assets, no templates, no sample data +- Mentions creating artifacts (RACI matrix, project charter) but provides no templates + +**Verdict: WEAK** +*Evidence: The placeholder reference file tells the whole story — this skill was scaffolded but never completed. A senior PM would find nothing actionable. Compare to the financial-analyst skill (1,781 lines of working code + templates) vs. this (146 lines of generic advice + a placeholder). This is "act as a Senior PM" prompting dressed up as a skill.* + +--- + +## Comparative Analysis + +| Skill | Scripts (LOC) | References | Assets/Templates | Verdict | +|-------|--------------|------------|-------------------|---------| +| customer-success-manager | 3 (1,339) | 3 deep | 5 templates + sample data | **POWERFUL** | +| revenue-operations | 3 (1,685) | 3 deep | 7 templates + sample data | **POWERFUL** | +| sales-engineer | 3 (1,847) | 3 deep | 5 templates + sample data | **POWERFUL** | +| financial-analyst | 4 (1,781) | 3 deep | 4 templates + sample data | **POWERFUL** | +| campaign-analytics | 3 (1,111) | 3 deep | 5 templates + sample data | **SOLID** | +| jira-expert | 0 | 2 substantive | 0 | **SOLID** | +| confluence-expert | 0 | 1 (725 lines) | 0 | **SOLID** | +| atlassian-admin | 0 | 0 | 0 | **SOLID** | +| atlassian-templates | 0 | 0 | 0 | **SOLID** | +| scrum-master | 0 | 1 (336 lines) | 0 | **GENERIC** | +| senior-pm | 0 | 1 (placeholder!) | 0 | **WEAK** | + +## Key Observations + +### What Works (business-growth, finance, campaign-analytics) +1. **Scripts that actually compute things** — Not wrappers, not boilerplate. 
Real algorithms with real business logic (DCF valuation, attribution modeling, health scoring) +2. **Zero external dependencies** — Deliberate stdlib-only design means they run anywhere, immediately +3. **Dual output format** — JSON for automation, text for humans. This is good engineering +4. **Sample data + expected output** — Enables validation and serves as documentation +5. **References that explain *why*** — The health scoring framework doesn't just list metrics; it explains why each dimension is weighted as it is +6. **Templates that are fill-in-ready** — QBR template, variance report, demo script — these save real time + +### What Doesn't Work (parts of project-management) +1. **Senior PM is unfinished** — Placeholder reference file, no templates despite claiming to produce them +2. **Scrum Master is generic** — Doesn't exceed the Scrum Guide in depth or utility +3. **No scripts in any PM skill** — The business-growth skills prove that scripts add massive value. The PM skills could have had: sprint velocity calculator, capacity planner, risk matrix scorer, RACI generator +4. **Two-tier quality** — The gap between POWERFUL and WEAK skills in the same repo is jarring + +### Recommendations +1. **Senior PM needs a complete rewrite or removal** — The placeholder reference is unacceptable. Either build it to the standard of financial-analyst (scripts + real frameworks) or don't ship it +2. **Scrum Master needs depth** — Add velocity tracking scripts, burndown analysis, capacity planning calculator, sprint health scorer +3. **PM skills should get scripts** — Even simple ones: RACI matrix generator, risk register scorer, project status report formatter +4. **Deduplicate PM templates** — atlassian-templates and confluence-expert overlap significantly +5. **Add expected_output.json to PM skills** — If they can't have scripts, at least define what "good output" looks like + +--- + +*Report generated 2026-02-15. 
Skills assessed against the bar: "Would this make someone say 'holy shit, this actually knows what it's doing?'"* + +*Business-growth and finance skills clear that bar. Campaign-analytics nearly does. PM skills mostly don't.* diff --git a/INSTALLATION.md b/INSTALLATION.md index 36f41f2..1c37b34 100644 --- a/INSTALLATION.md +++ b/INSTALLATION.md @@ -35,7 +35,7 @@ Native integration with automatic updates and version management. ```bash # Option 1: Universal installer -npx ai-agent-skills install alirezarezvani/claude-skills --agent codex +npx agent-skills-cli add alirezarezvani/claude-skills --agent codex # Option 2: Direct installation script git clone https://github.com/alirezarezvani/claude-skills.git @@ -48,11 +48,18 @@ Skills install to `~/.codex/skills/`. See [OpenAI Codex Installation](#openai-co ### For All Other Agents (Cursor, VS Code, Goose, etc.) ```bash -npx ai-agent-skills install alirezarezvani/claude-skills +npx agent-skills-cli add alirezarezvani/claude-skills ``` This single command installs all skills to all supported agents automatically. +**What this does:** +- ✅ Detects all 53 skills automatically +- ✅ Installs to Claude, Cursor, Copilot, Windsurf, Cline, and 37+ other AI agents +- ✅ Works across all skill formats + +Learn more: https://www.agentskills.in + --- ## Claude Code Native Marketplace (New!) @@ -129,13 +136,13 @@ This adds the skills library to your available marketplaces. ## Universal Installer -The universal installer uses the [ai-agent-skills](https://github.com/skillcreatorai/Ai-Agent-Skills) package to install skills across multiple agents simultaneously. +The universal installer uses the [Agent Skills CLI](https://github.com/Karanjot786/agent-skills-cli) package to install skills across multiple agents simultaneously. 
### Install All Skills ```bash # Install to all supported agents -npx ai-agent-skills install alirezarezvani/claude-skills +npx agent-skills-cli add alirezarezvani/claude-skills ``` **This installs to:** @@ -152,26 +159,26 @@ npx ai-agent-skills install alirezarezvani/claude-skills ```bash # Claude Code only -npx ai-agent-skills install alirezarezvani/claude-skills --agent claude +npx agent-skills-cli add alirezarezvani/claude-skills --agent claude # Cursor only -npx ai-agent-skills install alirezarezvani/claude-skills --agent cursor +npx agent-skills-cli add alirezarezvani/claude-skills --agent cursor # VS Code/Copilot only -npx ai-agent-skills install alirezarezvani/claude-skills --agent vscode +npx agent-skills-cli add alirezarezvani/claude-skills --agent vscode # Goose only -npx ai-agent-skills install alirezarezvani/claude-skills --agent goose +npx agent-skills-cli add alirezarezvani/claude-skills --agent goose # Project-specific installation (portable) -npx ai-agent-skills install alirezarezvani/claude-skills --agent project +npx agent-skills-cli add alirezarezvani/claude-skills --agent project ``` ### Preview Before Installing ```bash # Dry run to see what will be installed -npx ai-agent-skills install alirezarezvani/claude-skills --dry-run +npx agent-skills-cli add alirezarezvani/claude-skills --dry-run ``` --- @@ -184,126 +191,126 @@ Install individual skills instead of the entire library: ```bash # Content Creator -npx ai-agent-skills install alirezarezvani/claude-skills/marketing-skill/content-creator +npx agent-skills-cli add alirezarezvani/claude-skills/marketing-skill/content-creator # Demand Generation & Acquisition -npx ai-agent-skills install alirezarezvani/claude-skills/marketing-skill/marketing-demand-acquisition +npx agent-skills-cli add alirezarezvani/claude-skills/marketing-skill/marketing-demand-acquisition # Product Marketing Strategy -npx ai-agent-skills install alirezarezvani/claude-skills/marketing-skill/marketing-strategy-pmm +npx 
agent-skills-cli add alirezarezvani/claude-skills/marketing-skill/marketing-strategy-pmm # App Store Optimization -npx ai-agent-skills install alirezarezvani/claude-skills/marketing-skill/app-store-optimization +npx agent-skills-cli add alirezarezvani/claude-skills/marketing-skill/app-store-optimization # Social Media Analyzer -npx ai-agent-skills install alirezarezvani/claude-skills/marketing-skill/social-media-analyzer +npx agent-skills-cli add alirezarezvani/claude-skills/marketing-skill/social-media-analyzer ``` ### C-Level Advisory Skills ```bash # CEO Advisor -npx ai-agent-skills install alirezarezvani/claude-skills/c-level-advisor/ceo-advisor +npx agent-skills-cli add alirezarezvani/claude-skills/c-level-advisor/ceo-advisor # CTO Advisor -npx ai-agent-skills install alirezarezvani/claude-skills/c-level-advisor/cto-advisor +npx agent-skills-cli add alirezarezvani/claude-skills/c-level-advisor/cto-advisor ``` ### Product Team Skills ```bash # Product Manager Toolkit -npx ai-agent-skills install alirezarezvani/claude-skills/product-team/product-manager-toolkit +npx agent-skills-cli add alirezarezvani/claude-skills/product-team/product-manager-toolkit # Agile Product Owner -npx ai-agent-skills install alirezarezvani/claude-skills/product-team/agile-product-owner +npx agent-skills-cli add alirezarezvani/claude-skills/product-team/agile-product-owner # Product Strategist -npx ai-agent-skills install alirezarezvani/claude-skills/product-team/product-strategist +npx agent-skills-cli add alirezarezvani/claude-skills/product-team/product-strategist # UX Researcher Designer -npx ai-agent-skills install alirezarezvani/claude-skills/product-team/ux-researcher-designer +npx agent-skills-cli add alirezarezvani/claude-skills/product-team/ux-researcher-designer # UI Design System -npx ai-agent-skills install alirezarezvani/claude-skills/product-team/ui-design-system +npx agent-skills-cli add alirezarezvani/claude-skills/product-team/ui-design-system ``` ### Project 
Management Skills ```bash # Senior PM Expert -npx ai-agent-skills install alirezarezvani/claude-skills/project-management/senior-pm-expert +npx agent-skills-cli add alirezarezvani/claude-skills/project-management/senior-pm-expert # Scrum Master Expert -npx ai-agent-skills install alirezarezvani/claude-skills/project-management/scrum-master-expert +npx agent-skills-cli add alirezarezvani/claude-skills/project-management/scrum-master-expert # Atlassian Jira Expert -npx ai-agent-skills install alirezarezvani/claude-skills/project-management/atlassian-jira-expert +npx agent-skills-cli add alirezarezvani/claude-skills/project-management/atlassian-jira-expert # Atlassian Confluence Expert -npx ai-agent-skills install alirezarezvani/claude-skills/project-management/atlassian-confluence-expert +npx agent-skills-cli add alirezarezvani/claude-skills/project-management/atlassian-confluence-expert # Atlassian Administrator -npx ai-agent-skills install alirezarezvani/claude-skills/project-management/atlassian-administrator +npx agent-skills-cli add alirezarezvani/claude-skills/project-management/atlassian-administrator # Atlassian Template Creator -npx ai-agent-skills install alirezarezvani/claude-skills/project-management/atlassian-template-creator +npx agent-skills-cli add alirezarezvani/claude-skills/project-management/atlassian-template-creator ``` ### Engineering Team Skills ```bash # Core Engineering -npx ai-agent-skills install alirezarezvani/claude-skills/engineering-team/senior-architect -npx ai-agent-skills install alirezarezvani/claude-skills/engineering-team/senior-frontend -npx ai-agent-skills install alirezarezvani/claude-skills/engineering-team/senior-backend -npx ai-agent-skills install alirezarezvani/claude-skills/engineering-team/senior-fullstack -npx ai-agent-skills install alirezarezvani/claude-skills/engineering-team/senior-qa -npx ai-agent-skills install alirezarezvani/claude-skills/engineering-team/senior-devops -npx ai-agent-skills install 
alirezarezvani/claude-skills/engineering-team/senior-secops -npx ai-agent-skills install alirezarezvani/claude-skills/engineering-team/code-reviewer -npx ai-agent-skills install alirezarezvani/claude-skills/engineering-team/senior-security +npx agent-skills-cli add alirezarezvani/claude-skills/engineering-team/senior-architect +npx agent-skills-cli add alirezarezvani/claude-skills/engineering-team/senior-frontend +npx agent-skills-cli add alirezarezvani/claude-skills/engineering-team/senior-backend +npx agent-skills-cli add alirezarezvani/claude-skills/engineering-team/senior-fullstack +npx agent-skills-cli add alirezarezvani/claude-skills/engineering-team/senior-qa +npx agent-skills-cli add alirezarezvani/claude-skills/engineering-team/senior-devops +npx agent-skills-cli add alirezarezvani/claude-skills/engineering-team/senior-secops +npx agent-skills-cli add alirezarezvani/claude-skills/engineering-team/code-reviewer +npx agent-skills-cli add alirezarezvani/claude-skills/engineering-team/senior-security # Cloud & Enterprise -npx ai-agent-skills install alirezarezvani/claude-skills/engineering-team/aws-solution-architect -npx ai-agent-skills install alirezarezvani/claude-skills/engineering-team/ms365-tenant-manager +npx agent-skills-cli add alirezarezvani/claude-skills/engineering-team/aws-solution-architect +npx agent-skills-cli add alirezarezvani/claude-skills/engineering-team/ms365-tenant-manager # Development Tools -npx ai-agent-skills install alirezarezvani/claude-skills/engineering-team/tdd-guide -npx ai-agent-skills install alirezarezvani/claude-skills/engineering-team/tech-stack-evaluator +npx agent-skills-cli add alirezarezvani/claude-skills/engineering-team/tdd-guide +npx agent-skills-cli add alirezarezvani/claude-skills/engineering-team/tech-stack-evaluator # AI/ML/Data -npx ai-agent-skills install alirezarezvani/claude-skills/engineering-team/senior-data-scientist -npx ai-agent-skills install 
alirezarezvani/claude-skills/engineering-team/senior-data-engineer -npx ai-agent-skills install alirezarezvani/claude-skills/engineering-team/senior-ml-engineer -npx ai-agent-skills install alirezarezvani/claude-skills/engineering-team/senior-prompt-engineer -npx ai-agent-skills install alirezarezvani/claude-skills/engineering-team/senior-computer-vision +npx agent-skills-cli add alirezarezvani/claude-skills/engineering-team/senior-data-scientist +npx agent-skills-cli add alirezarezvani/claude-skills/engineering-team/senior-data-engineer +npx agent-skills-cli add alirezarezvani/claude-skills/engineering-team/senior-ml-engineer +npx agent-skills-cli add alirezarezvani/claude-skills/engineering-team/senior-prompt-engineer +npx agent-skills-cli add alirezarezvani/claude-skills/engineering-team/senior-computer-vision ``` ### Regulatory Affairs & Quality Management Skills ```bash # Regulatory & Quality Leadership -npx ai-agent-skills install alirezarezvani/claude-skills/ra-qm-team/regulatory-affairs-head -npx ai-agent-skills install alirezarezvani/claude-skills/ra-qm-team/quality-manager-qmr -npx ai-agent-skills install alirezarezvani/claude-skills/ra-qm-team/quality-manager-qms-iso13485 +npx agent-skills-cli add alirezarezvani/claude-skills/ra-qm-team/regulatory-affairs-head +npx agent-skills-cli add alirezarezvani/claude-skills/ra-qm-team/quality-manager-qmr +npx agent-skills-cli add alirezarezvani/claude-skills/ra-qm-team/quality-manager-qms-iso13485 # Quality Processes -npx ai-agent-skills install alirezarezvani/claude-skills/ra-qm-team/capa-officer -npx ai-agent-skills install alirezarezvani/claude-skills/ra-qm-team/quality-documentation-manager -npx ai-agent-skills install alirezarezvani/claude-skills/ra-qm-team/risk-management-specialist +npx agent-skills-cli add alirezarezvani/claude-skills/ra-qm-team/capa-officer +npx agent-skills-cli add alirezarezvani/claude-skills/ra-qm-team/quality-documentation-manager +npx agent-skills-cli add 
alirezarezvani/claude-skills/ra-qm-team/risk-management-specialist # Security & Privacy -npx ai-agent-skills install alirezarezvani/claude-skills/ra-qm-team/information-security-manager-iso27001 -npx ai-agent-skills install alirezarezvani/claude-skills/ra-qm-team/gdpr-dsgvo-expert +npx agent-skills-cli add alirezarezvani/claude-skills/ra-qm-team/information-security-manager-iso27001 +npx agent-skills-cli add alirezarezvani/claude-skills/ra-qm-team/gdpr-dsgvo-expert # Regional Compliance -npx ai-agent-skills install alirezarezvani/claude-skills/ra-qm-team/mdr-745-specialist -npx ai-agent-skills install alirezarezvani/claude-skills/ra-qm-team/fda-consultant-specialist +npx agent-skills-cli add alirezarezvani/claude-skills/ra-qm-team/mdr-745-specialist +npx agent-skills-cli add alirezarezvani/claude-skills/ra-qm-team/fda-consultant-specialist # Audit & Assessment -npx ai-agent-skills install alirezarezvani/claude-skills/ra-qm-team/qms-audit-expert -npx ai-agent-skills install alirezarezvani/claude-skills/ra-qm-team/isms-audit-expert +npx agent-skills-cli add alirezarezvani/claude-skills/ra-qm-team/qms-audit-expert +npx agent-skills-cli add alirezarezvani/claude-skills/ra-qm-team/isms-audit-expert ``` --- @@ -316,23 +323,23 @@ Install the same skills across different agents for team consistency: ```bash # Install marketing skills to Claude Code (for content strategist) -npx ai-agent-skills install alirezarezvani/claude-skills/marketing-skill/content-creator --agent claude +npx agent-skills-cli add alirezarezvani/claude-skills/marketing-skill/content-creator --agent claude # Install same skills to Cursor (for developer working on content) -npx ai-agent-skills install alirezarezvani/claude-skills/marketing-skill/content-creator --agent cursor +npx agent-skills-cli add alirezarezvani/claude-skills/marketing-skill/content-creator --agent cursor # Install to VS Code (for SEO specialist) -npx ai-agent-skills install 
alirezarezvani/claude-skills/marketing-skill/content-creator --agent vscode +npx agent-skills-cli add alirezarezvani/claude-skills/marketing-skill/content-creator --agent vscode ``` ### Example: Engineering Team Setup ```bash # Full engineering suite to Claude Code -npx ai-agent-skills install alirezarezvani/claude-skills/engineering-team --agent claude +npx agent-skills-cli add alirezarezvani/claude-skills/engineering-team --agent claude # Same suite to Cursor -npx ai-agent-skills install alirezarezvani/claude-skills/engineering-team --agent cursor +npx agent-skills-cli add alirezarezvani/claude-skills/engineering-team --agent cursor ``` --- @@ -557,7 +564,7 @@ rm -rf ~/.claude/skills/ mkdir -p ~/.claude/skills/ # Reinstall -npx ai-agent-skills install alirezarezvani/claude-skills --agent claude +npx agent-skills-cli add alirezarezvani/claude-skills --agent claude ``` #### Cursor @@ -630,10 +637,10 @@ OpenAI Codex users can install skills using the methods below. This repository p ```bash # Install all skills to Codex -npx ai-agent-skills install alirezarezvani/claude-skills --agent codex +npx agent-skills-cli add alirezarezvani/claude-skills --agent codex # Preview before installing -npx ai-agent-skills install alirezarezvani/claude-skills --agent codex --dry-run +npx agent-skills-cli add alirezarezvani/claude-skills --agent codex --dry-run ``` ### Method 2: Direct Installation Script @@ -746,7 +753,7 @@ See `.codex/skills-index.json` for the complete manifest with descriptions. **Installation Issues?** - Check [Troubleshooting](#troubleshooting) section above -- Review [ai-agent-skills documentation](https://github.com/skillcreatorai/Ai-Agent-Skills) +- Review [Agent Skills CLI documentation](https://github.com/Karanjot786/agent-skills-cli) - Open issue: https://github.com/alirezarezvani/claude-skills/issues **Feature Requests:** @@ -758,6 +765,6 @@ See `.codex/skills-index.json` for the complete manifest with descriptions. 
--- -**Last Updated:** January 2026 +**Last Updated:** February 2026 **Skills Version:** 1.0 (53 production skills) -**Universal Installer:** [ai-agent-skills](https://github.com/skillcreatorai/Ai-Agent-Skills) +**Universal Installer:** [Agent Skills CLI](https://github.com/Karanjot786/agent-skills-cli) diff --git a/README.md b/README.md index c790915..3f650ce 100644 --- a/README.md +++ b/README.md @@ -52,7 +52,7 @@ For OpenAI Codex users, install via universal installer or direct script: ```bash # Option A: Universal installer -npx ai-agent-skills install alirezarezvani/claude-skills --agent codex +npx agent-skills-cli add alirezarezvani/claude-skills --agent codex # Option B: Direct installation script git clone https://github.com/alirezarezvani/claude-skills.git @@ -80,19 +80,19 @@ Install to Claude Code, Cursor, VS Code, Amp, Goose, and more - all with one com ```bash # Install all 53 skills to all supported agents -npx ai-agent-skills install alirezarezvani/claude-skills +npx agent-skills-cli add alirezarezvani/claude-skills # Install to specific agent (Claude Code) -npx ai-agent-skills install alirezarezvani/claude-skills --agent claude +npx agent-skills-cli add alirezarezvani/claude-skills --agent claude # Install single skill -npx ai-agent-skills install alirezarezvani/claude-skills/marketing-skill/content-creator +npx agent-skills-cli add alirezarezvani/claude-skills/marketing-skill/content-creator # Install to Cursor -npx ai-agent-skills install alirezarezvani/claude-skills --agent cursor +npx agent-skills-cli add alirezarezvani/claude-skills --agent cursor # Preview before installing -npx ai-agent-skills install alirezarezvani/claude-skills --dry-run +npx agent-skills-cli add alirezarezvani/claude-skills --dry-run ``` **Benefits:** @@ -1541,7 +1541,7 @@ OpenAI Codex users can install and use these skills through the `.codex/skills/` ```bash # Install all 43 skills to Codex -npx ai-agent-skills install alirezarezvani/claude-skills --agent codex +npx 
agent-skills-cli add alirezarezvani/claude-skills --agent codex # Verify installation ls ~/.codex/skills/ @@ -1699,13 +1699,13 @@ Each skill package follows a consistent, modular structure: ```bash # Install all skills to Claude Code, Cursor, VS Code, Amp, Goose, etc. -npx ai-agent-skills install alirezarezvani/claude-skills +npx agent-skills-cli add alirezarezvani/claude-skills # Or install to specific agent -npx ai-agent-skills install alirezarezvani/claude-skills --agent claude +npx agent-skills-cli add alirezarezvani/claude-skills --agent claude # Or install single skill -npx ai-agent-skills install alirezarezvani/claude-skills/marketing-skill/content-creator +npx agent-skills-cli add alirezarezvani/claude-skills/marketing-skill/content-creator ``` **Supported Agents:** diff --git a/engineering-team/incident-commander/SKILL.md b/engineering-team/incident-commander/SKILL.md new file mode 100644 index 0000000..58b1a49 --- /dev/null +++ b/engineering-team/incident-commander/SKILL.md @@ -0,0 +1,693 @@ +--- +name: incident-commander +description: Production incident management with structured timeline analysis, severity classification (SEV1-4), automated postmortem generation, and SLA tracking. Features communication templates, escalation routing, 5-Whys root cause analysis, and MTTR/MTTD metrics for high-reliability engineering teams. +license: MIT +metadata: + version: 1.0.0 + author: Alireza Rezvani + category: engineering + domain: site-reliability + updated: 2026-02-16 + python-tools: incident_timeline_builder.py, severity_classifier.py, postmortem_generator.py + tech-stack: incident-management, sre, on-call, postmortem-analysis +--- + +# Incident Commander Expert + +Advanced incident management specializing in structured response coordination, severity-driven escalation, postmortem excellence, and SLA compliance. 
Combines PagerDuty/Google SRE/Atlassian incident management frameworks with quantitative reliability metrics for high-performance engineering organizations. + +--- + +## Table of Contents + +- [Capabilities](#capabilities) +- [Input Requirements](#input-requirements) +- [Analysis Tools](#analysis-tools) +- [Methodology](#methodology) +- [Templates & Assets](#templates--assets) +- [Reference Frameworks](#reference-frameworks) +- [Implementation Workflows](#implementation-workflows) +- [Assessment & Measurement](#assessment--measurement) +- [Best Practices](#best-practices) +- [Advanced Techniques](#advanced-techniques) +- [Limitations & Considerations](#limitations--considerations) +- [Success Metrics & Outcomes](#success-metrics--outcomes) + +--- + +## Capabilities + +### Incident Timeline Intelligence +- **Structured Timeline Construction**: Chronological event assembly from detection through resolution with gap identification via `incident_timeline_builder.py` +- **Phase Duration Analysis**: Automated calculation of time-in-phase for Detection, Triage, Mitigation, and Resolution with bottleneck identification +- **Communication Log Correlation**: Maps status updates, escalation events, and stakeholder notifications against incident progression +- **Gap Detection**: Identifies periods of inactivity or missing log entries that indicate process failures or documentation gaps +- **Multi-Source Aggregation**: Consolidates events from monitoring alerts, Slack messages, PagerDuty pages, and manual entries into a unified timeline + +### Severity Classification & Escalation +- **Impact-First Classification**: Four-tier severity model (SEV1-SEV4) driven by customer impact, revenue exposure, and data integrity risk via `severity_classifier.py` +- **Dynamic Re-Classification**: Continuous severity reassessment as incident scope changes, with automatic escalation triggers +- **Escalation Routing Matrix**: Role-based escalation paths with time-boxed response requirements per 
severity level +- **Blast Radius Estimation**: Quantitative assessment of affected users, services, and revenue based on incident metadata +- **SLA Threshold Mapping**: Automatic SLA timer activation and breach prediction based on classified severity + +### Postmortem Excellence +- **Automated Report Generation**: Structured postmortem documents from incident data with timeline, impact summary, and root cause sections via `postmortem_generator.py` +- **5-Whys Root Cause Analysis**: Guided causal chain construction with depth validation and contributing factor identification +- **Action Item Extraction**: Automated identification of remediation tasks with priority scoring and ownership assignment +- **Pattern Recognition**: Cross-incident analysis to surface recurring failure modes and systemic weaknesses +- **Blameless Framing**: Language analysis to ensure postmortem narratives focus on systems and processes, not individuals + +### SLA & Reliability Metrics +- **MTTR Tracking**: Mean Time to Resolve computed per severity level with trend analysis and target comparison +- **MTTD Monitoring**: Mean Time to Detect measuring observability effectiveness from incident onset to first alert +- **MTBF Calculation**: Mean Time Between Failures per service, providing reliability baselines for capacity planning +- **SLA Compliance Scoring**: Real-time compliance percentages against defined availability targets (99.9%, 99.95%, 99.99%) +- **Incident Frequency Analysis**: Trend detection in incident volume by severity, service, and time window + +--- + +## Input Requirements + +### Incident Data Structure +All analysis tools accept JSON input following this schema: + +```json +{ + "incident": { + "id": "INC-2026-0142", + "title": "Payment processing service degradation", + "severity": "SEV2", + "status": "resolved", + "commander": "Jane Chen", + "declared_at": "2026-02-15T14:23:00Z", + "resolved_at": "2026-02-15T16:47:00Z", + "services_affected": ["payment-api", 
"checkout-frontend", "order-service"], + "customer_impact": { + "affected_users": 12400, + "revenue_impact_usd": 84000, + "data_integrity": false + } + }, + "timeline": [ + { + "timestamp": "2026-02-15T14:18:00Z", + "type": "alert", + "source": "datadog", + "description": "P95 latency > 2000ms on payment-api", + "actor": "monitoring" + }, + { + "timestamp": "2026-02-15T14:23:00Z", + "type": "declaration", + "source": "slack", + "description": "SEV2 declared by on-call engineer", + "actor": "jane.chen" + } + ], + "root_cause": { + "summary": "Connection pool exhaustion due to upstream database failover", + "category": "infrastructure", + "five_whys": [ + "Payment API returned 503 errors", + "Connection pool was exhausted (0/50 available)", + "Database primary failed over to replica", + "Replica promotion took 47 seconds, exceeding 10s pool timeout", + "Failover health check interval was set to 30s instead of 5s" + ] + }, + "action_items": [ + { + "id": "AI-001", + "description": "Reduce database health check interval to 5 seconds", + "priority": "P1", + "owner": "platform-team", + "due_date": "2026-02-22", + "status": "open" + } + ], + "sla": { + "target_availability": 99.95, + "downtime_minutes": 144, + "monthly_budget_minutes": 21.6, + "remaining_budget_minutes": -122.4 + } +} +``` + +### Minimum Data Requirements +- **Timeline Builder**: Incident ID, declared_at timestamp, and 2+ timeline events with timestamps +- **Severity Classifier**: Services affected, customer impact metrics (affected users OR revenue impact), and incident description +- **Postmortem Generator**: Complete incident record with timeline (5+ events recommended), root cause summary, and at least 1 action item +- **SLA Analysis**: Target availability percentage and incident duration; historical incident data for trend analysis (6+ incidents recommended) + +--- + +## Analysis Tools + +### Incident Timeline Builder (`scripts/incident_timeline_builder.py`) +Constructs structured, chronological 
incident timelines from raw event data with phase analysis and gap detection. + +**Features**: +- Chronological event ordering with deduplication across sources +- Automatic phase classification (Detection, Triage, Mitigation, Resolution, Postmortem) +- Phase duration calculation with bottleneck identification +- Communication cadence analysis (flags gaps > 15 minutes during active incidents) +- Timeline gap detection for periods with no recorded activity +- Multi-format output (text table, JSON, markdown) + +**Usage**: +```bash +# File input with text output +python scripts/incident_timeline_builder.py incident.json --format text + +# File input with JSON output for downstream processing +python scripts/incident_timeline_builder.py incident.json --format json + +# Stdin support for pipeline integration +cat incident.json | python scripts/incident_timeline_builder.py --format text + +# Markdown output for postmortem documents +python scripts/incident_timeline_builder.py incident.json --format markdown + +# Filter events by phase +python scripts/incident_timeline_builder.py incident.json --phase mitigation --format text +``` + +**Options**: +| Flag | Description | Default | +|------|-------------|---------| +| `--format` | Output format: `text`, `json`, `markdown` | `text` | +| `--phase` | Filter to specific phase: `detection`, `triage`, `mitigation`, `resolution` | all | +| `--gap-threshold` | Minutes of silence before flagging a gap | `15` | +| `--include-comms` | Include communication events in timeline | `true` | +| `--verbose` | Show phase duration breakdown and statistics | `false` | + +**Output Description**: +- Ordered event list with timestamps, actors, sources, and phase tags +- Phase duration summary (e.g., "Triage: 12 minutes, Mitigation: 47 minutes") +- Communication cadence score (updates per 15-minute window) +- Gap warnings with recommended actions +- Total incident duration from first alert to resolution confirmation + +### Severity Classifier 
(`scripts/severity_classifier.py`) +Impact-driven severity classification with escalation routing and SLA timer activation. + +**Features**: +- Four-tier severity classification (SEV1-SEV4) based on quantitative impact thresholds +- Blast radius estimation: affected users, services, and revenue exposure +- Escalation path generation with role assignments and response time requirements +- SLA breach prediction based on current severity and elapsed time +- Re-classification recommendations when incident scope changes +- Confidence scoring for classification decisions + +**Classification Thresholds**: +- **SEV1** (Critical): >50% users affected OR >$500K/hour revenue impact OR data breach OR complete service outage +- **SEV2** (Major): >10% users affected OR >$50K/hour revenue impact OR major feature unavailable +- **SEV3** (Minor): >1% users affected OR >$5K/hour revenue impact OR degraded performance +- **SEV4** (Low): <1% users affected AND <$5K/hour revenue impact AND workaround available + +**Usage**: +```bash +# Classify from incident file +python scripts/severity_classifier.py incident.json --format text + +# Classify with JSON output for automation +python scripts/severity_classifier.py incident.json --format json + +# Stdin support +cat incident.json | python scripts/severity_classifier.py --format text + +# Re-classify with updated scope +python scripts/severity_classifier.py incident.json --reclassify --format text + +# Include escalation routing in output +python scripts/severity_classifier.py incident.json --with-escalation --format text +``` + +**Options**: +| Flag | Description | Default | +|------|-------------|---------| +| `--format` | Output format: `text`, `json` | `text` | +| `--reclassify` | Compare current vs. 
recommended severity | `false` | +| `--with-escalation` | Include escalation path and response times | `false` | +| `--sla-predict` | Predict SLA breach probability | `false` | +| `--verbose` | Show classification reasoning and confidence | `false` | + +**Output Description**: +- Severity level with confidence percentage (e.g., "SEV2 - 94% confidence") +- Impact summary: affected users, services, estimated revenue loss +- Escalation path: who to page, response time requirements, communication channels +- SLA status: time remaining before breach, recommended actions +- Re-classification recommendation if scope has changed + +### Postmortem Generator (`scripts/postmortem_generator.py`) +Automated blameless postmortem document generation with root cause analysis and action item tracking. + +**Features**: +- Complete postmortem document generation from incident data +- 5-Whys root cause chain validation (checks for depth and logical consistency) +- Action item extraction with priority scoring (P1-P4) and ownership assignment +- Impact quantification: downtime minutes, affected users, revenue loss, SLA budget consumed +- Contributing factor identification beyond primary root cause +- Cross-incident pattern matching for recurring failure modes +- Blameless language validation (flags accusatory phrasing) + +**Usage**: +```bash +# Generate postmortem in markdown format +python scripts/postmortem_generator.py incident.json --format markdown + +# Generate in JSON for integration with tracking systems +python scripts/postmortem_generator.py incident.json --format json + +# Stdin support +cat incident.json | python scripts/postmortem_generator.py --format markdown + +# Include cross-incident pattern analysis (requires historical data) +python scripts/postmortem_generator.py incident.json --history incidents/ --format markdown + +# Validate blameless language in existing postmortem +python scripts/postmortem_generator.py incident.json --validate-language --format text +``` + 
+**Options**: +| Flag | Description | Default | +|------|-------------|---------| +| `--format` | Output format: `markdown`, `json`, `text` | `markdown` | +| `--history` | Directory of historical incident JSON files for pattern analysis | none | +| `--validate-language` | Check for blame-assigning language patterns | `false` | +| `--include-timeline` | Embed full timeline in postmortem document | `true` | +| `--action-items-only` | Output only extracted action items | `false` | +| `--verbose` | Include classification reasoning and pattern details | `false` | + +**Output Description**: +- Complete postmortem document with: title, severity, duration, impact summary +- Chronological timeline embedded from timeline builder +- Root cause analysis with 5-Whys chain and contributing factors +- Action items table with ID, description, priority, owner, due date +- Lessons learned section with systemic improvement recommendations +- SLA impact statement with remaining monthly error budget + +--- + +## Methodology + +### The Incident Commander's Decision Framework + +#### Incident Lifecycle Model + +Every incident follows five phases. The Incident Commander owns the transitions between them. 
+ +**Phase 1 - Detection** (Target: <5 minutes from onset to alert) +- Monitoring systems fire alerts based on predefined thresholds +- On-call engineer acknowledges alert within defined SLA (2 minutes for SEV1, 5 minutes for SEV2) +- Initial triage determines whether to declare a formal incident +- If customer-reported: escalate classification by one severity level automatically + +**Phase 2 - Triage** (Target: <10 minutes) +- Incident Commander assigned or self-declared +- Severity classified using impact-first methodology (not cause-first) +- Communication channel established (dedicated Slack channel, bridge line) +- Stakeholder notification triggered per severity level +- Responder roles assigned: IC, Technical Lead, Communications Lead, Scribe + +**Phase 3 - Mitigation** (Target: varies by severity) +- Focus on restoring service, not finding root cause +- Time-boxed investigation windows (15-minute check-ins for SEV1, 30-minute for SEV2) +- Escalation triggers if mitigation stalls beyond defined thresholds +- Customer communication cadence: every 15 minutes for SEV1, every 30 minutes for SEV2 +- Decision framework: rollback vs. forward-fix vs. failover + +**Phase 4 - Resolution** (Target: confirmed stable for 15+ minutes) +- Service confirmed restored to baseline metrics +- Monitoring confirms stability for minimum observation window +- Customer-facing all-clear communication sent +- Incident record updated with resolution summary +- Postmortem scheduled within 48 hours (24 hours for SEV1) + +**Phase 5 - Postmortem** (Target: completed within 5 business days) +- Blameless postmortem meeting conducted with all responders +- Timeline reconstructed and validated by participants +- 5-Whys root cause analysis completed to systemic level +- Action items assigned with owners, priorities, and due dates +- Postmortem published to incident knowledge base + +#### Severity Classification Philosophy + +This framework uses **impact-first classification**, not cause-first. 
The severity of an incident is determined by its effect on customers and business, never by the technical cause. + +Rationale: A typo in a config file that takes down all of production is a SEV1. A complex distributed systems failure that affects 0.1% of users is a SEV3. Cause complexity is irrelevant to severity -- only impact matters. + +**Classification must happen within the first 5 minutes of declaration.** Reclassification is expected and encouraged as more information surfaces. Upgrading severity is always acceptable; downgrading requires IC approval and documented justification. + +#### Communication Cadence Protocol + +Silence during an incident is a failure mode. The Incident Commander enforces communication discipline: + +| Severity | Internal Update | Customer Update | Executive Update | +|----------|----------------|-----------------|------------------| +| SEV1 | Every 10 min | Every 15 min | Every 30 min | +| SEV2 | Every 15 min | Every 30 min | Every 60 min | +| SEV3 | Every 30 min | Every 60 min | On resolution | +| SEV4 | Every 60 min | On resolution | Not required | + +Updates must contain: current status, actions being taken, expected next update time, and any changes in severity or scope. + +#### Blameless Postmortem Culture + +Postmortems are the highest-leverage activity in incident management. They fail when they become blame sessions. + +**Non-Negotiable Principles:** +1. Humans do not cause incidents. Systems that allow humans to trigger failures cause incidents. +2. Every postmortem must produce at least one systemic action item (process, tooling, or architecture change). +3. The 5-Whys analysis must reach a systemic root cause. "Engineer made a mistake" is never a root cause -- the question is why the system allowed that mistake to cause an outage. +4. Postmortem attendance is mandatory for all incident responders. Optional for anyone else who wants to learn. +5. Action items without owners and due dates are not action items. 
They are wishes. + +--- + +## Templates & Assets + +### Incident Response Runbook (`assets/runbook_template.md`) +Step-by-step response protocol for active incidents including: +- Incident Commander checklist (declaration through resolution) +- Role assignments and responsibilities (IC, Tech Lead, Comms Lead, Scribe) +- Severity-specific escalation procedures with contact routing +- Communication templates for each update cadence +- Handoff protocol for long-running incidents (>4 hours) + +### Incident Report Template (`assets/incident_report_template.md`) +Production-ready blameless postmortem document featuring: +- Structured header with incident metadata (ID, severity, duration, commander) +- Impact quantification section (users, revenue, SLA budget) +- Chronological timeline with phase annotations +- 5-Whys root cause analysis framework +- Contributing factors and systemic weaknesses +- Action items table with priority, owner, due date, and tracking status +- Lessons learned and process improvement recommendations + +### Stakeholder Communication Templates (`assets/stakeholder_comms_templates.md`) +Pre-written communication templates for consistent messaging: +- Initial incident declaration (internal and external) +- Periodic status updates per severity level +- Resolution and all-clear notifications +- Executive briefing format for SEV1/SEV2 incidents +- Customer-facing status page update language +- Post-resolution follow-up communication + +### Sample Incident Data (`assets/sample_incident_data.json`) +Comprehensive incident dataset demonstrating: +- Multi-service payment processing outage with realistic timeline +- 24 timeline events across all five lifecycle phases +- Complete 5-Whys root cause chain with contributing factors +- 6 action items with varying priorities and ownership +- SLA impact calculation with monthly error budget tracking +- Cross-referenced monitoring alerts, Slack messages, and PagerDuty events + +--- + +## Reference Frameworks + +###
 SRE Incident Management Guide (`references/incident-response-framework.md`) +Comprehensive incident management methodology derived from Google SRE, PagerDuty, and Atlassian practices: +- Incident Commander role definition and authority boundaries +- On-call rotation best practices (follow-the-sun, escalation tiers) +- Severity classification decision trees with worked examples +- Communication protocols for internal, customer, and executive audiences +- Incident review cadence (weekly incident review, monthly trend analysis, quarterly reliability review) +- Tooling integration patterns (PagerDuty, OpsGenie, Slack, Datadog, Grafana) +- Regulatory incident reporting requirements (SOC2, HIPAA, PCI-DSS, GDPR) + +### SLA Management Guide (`references/sla-management-guide.md`) +Quantitative reliability measurement and target-setting guide: +- MTTR, MTTD, MTBF definitions with calculation formulas and edge cases +- SLA/SLO/SLI hierarchy with implementation guidance +- Error budget policy design and enforcement mechanisms +- Incident frequency analysis with statistical trend detection +- Service-level reliability tiering (Tier 1 critical, Tier 2 important, Tier 3 standard) +- Dashboard design for operational visibility (what to measure, what to alert on, what to ignore) +- Benchmarking data: industry-standard targets by company maturity and service tier + +--- + +## Implementation Workflows + +### Active Incident Response + +#### Step 1: Detection & Declaration (0-5 minutes) +1. **Alert fires** from monitoring system (Datadog, PagerDuty, CloudWatch, custom) +2. **On-call acknowledges** within response SLA (2 min SEV1, 5 min SEV2) +3. **Initial assessment**: Is this a real incident or a false positive? +4. **Declare incident**: Create incident channel, page Incident Commander + ``` + /incident declare --severity SEV2 --title "Payment API 503 errors" --channel #inc-20260215-payments + ``` +5.
**Classify severity** using `severity_classifier.py`: + ```bash + python scripts/severity_classifier.py incident.json --with-escalation --format text + ``` +6. **Assign roles**: IC, Technical Lead, Communications Lead, Scribe + +#### Step 2: Triage & Mobilization (5-15 minutes) +1. **IC confirms severity** and activates escalation path +2. **Page additional responders** based on affected services +3. **Establish communication rhythm**: Set timer for first status update +4. **Scribe begins timeline**: Record all events with timestamps +5. **Technical Lead begins investigation**: Check dashboards, recent deployments, dependency health +6. **Communications Lead sends initial notification** to stakeholders + +#### Step 3: Mitigation (15 minutes - varies) +1. **Focus on restoring service, not diagnosing root cause** +2. **Decision framework** at each check-in: + - Can we rollback the last deployment? (fastest) + - Can we failover to a healthy replica? (fast) + - Can we apply a targeted forward-fix? (moderate) + - Do we need to scale infrastructure? (slow) +3. **Time-boxed investigation**: If no progress in 15 minutes (SEV1) or 30 minutes (SEV2), escalate +4. **Customer communication**: Send status update per cadence protocol +5. **Re-classify severity** if scope changes: + ```bash + python scripts/severity_classifier.py incident_updated.json --reclassify --format text + ``` + +#### Step 4: Resolution & Verification (varies) +1. **Confirm fix deployed** and metrics returning to baseline +2. **Observation window**: 15 minutes stable for SEV1/SEV2, 30 minutes for SEV3/SEV4 +3. **Resolve incident**: Update status, send all-clear communication +4. **Schedule postmortem**: Within 24 hours for SEV1, 48 hours for SEV2, 5 business days for SEV3 +5. **On-call engineer writes initial incident summary** while context is fresh + +### Post-Incident Analysis + +#### Timeline Reconstruction (Day 1-2) +1. **Gather raw data** from all sources (monitoring, Slack, PagerDuty, git log) +2. 
**Build unified timeline**: + ```bash + python scripts/incident_timeline_builder.py incident.json --format markdown --verbose + ``` +3. **Identify gaps**: Missing events, unexplained delays, undocumented decisions +4. **Validate with responders**: Circulate timeline for corrections before postmortem meeting + +#### 5-Whys Root Cause Analysis (Postmortem Meeting) +1. **Start with the observable impact**: "Payment API returned 503 errors for 144 minutes" +2. **Ask "Why?" iteratively** -- each answer must be factual and verifiable +3. **Reach a systemic cause**: The final "why" must point to a process, tooling, or architecture gap +4. **Identify contributing factors**: What else made this incident worse or longer than necessary? +5. **Validate depth**: If the final cause is "human error," ask one more "why" + +#### Action Item Generation +1. **Categorize**: Prevention (stop recurrence), Detection (find faster), Mitigation (recover faster) +2. **Prioritize**: P1 items must be completed before next on-call rotation +3. **Assign ownership**: Every action item has exactly one owner (team, not individual) +4. **Set due dates**: P1 within 1 week, P2 within 2 weeks, P3 within 1 month +5. **Generate postmortem**: + ```bash + python scripts/postmortem_generator.py incident.json --format markdown --include-timeline + ``` + +### SLA Compliance Monitoring + +1. **Define SLOs per service tier**: + - Tier 1 (revenue-critical): 99.99% availability (52.6 min/year downtime budget) + - Tier 2 (customer-facing): 99.95% availability (4.38 hours/year) + - Tier 3 (internal tooling): 99.9% availability (8.77 hours/year) + +2. **Track error budget consumption**: Monthly rolling window with daily updates +3. **Trigger error budget policy** when >50% consumed: + - Freeze non-critical deployments + - Prioritize reliability work over feature work + - Require IC review for all production changes +4. 
**Monthly reliability review**: Present SLA compliance, incident trends, action item completion + +### On-Call Handoff Protocol + +1. **End-of-rotation summary**: Document active incidents, ongoing investigations, known risks +2. **Handoff meeting**: 15-minute synchronous handoff between outgoing and incoming on-call +3. **Runbook review**: Confirm incoming on-call has access to all runbooks and escalation paths +4. **Alert review**: Walk through any alerts that fired during the rotation and their resolutions +5. **Pending action items**: Transfer ownership of time-sensitive items to incoming on-call + +--- + +## Assessment & Measurement + +### Key Performance Indicators + +#### Response Effectiveness Metrics +- **MTTD (Mean Time to Detect)**: Time from incident onset to first alert. Target: <5 minutes for Tier 1 services, <15 minutes for Tier 2. Measures observability coverage and alert threshold quality. +- **MTTR (Mean Time to Resolve)**: Time from incident declaration to confirmed resolution. Target: <30 minutes for SEV1, <2 hours for SEV2, <8 hours for SEV3. The single most important operational metric. +- **MTBF (Mean Time Between Failures)**: Time between consecutive incidents per service. Target: increasing quarter-over-quarter. Measures systemic reliability improvement. +- **MTTA (Mean Time to Acknowledge)**: Time from alert to human acknowledgment. Target: <2 minutes for SEV1, <5 minutes for SEV2. Measures on-call responsiveness. + +#### Process Quality Metrics +- **Postmortem Completion Rate**: Percentage of SEV1-SEV3 incidents with completed postmortems. Target: 100% for SEV1-SEV2, >90% for SEV3. +- **Action Item Completion Rate**: Percentage of postmortem action items completed by due date. Target: >85% for P1, >70% for P2. Below 60% indicates systemic follow-through failure. +- **Postmortem Timeliness**: Days from resolution to published postmortem. Target: <3 business days for SEV1, <5 for SEV2. 
+- **Severity Accuracy**: Percentage of incidents where initial classification matched final assessment. Target: >80%. Low accuracy indicates classification training gaps. + +#### Reliability Metrics +- **SLA Compliance**: Percentage of time meeting availability targets per service tier. Target: 100% compliance with defined SLOs. +- **Error Budget Remaining**: Monthly remaining error budget as percentage. Target: >25% remaining at month-end. +- **Incident Frequency Trend**: Month-over-month incident count by severity. Target: decreasing or stable for SEV1-SEV2. +- **Repeat Incident Rate**: Percentage of incidents with same root cause as a previous incident. Target: <10%. Above 15% indicates postmortem action items are not effective. + +### Assessment Schedule +- **Per Incident**: MTTD, MTTR, severity accuracy, communication cadence adherence +- **Weekly**: Incident count review, open action item status, on-call load assessment +- **Monthly**: SLA compliance report, error budget status, MTTR trends, postmortem completion rates +- **Quarterly**: Reliability review with executive stakeholders, MTBF trends, incident pattern analysis, on-call health survey + +### Calibration & Validation +- Cross-reference MTTR calculations with customer-reported impact duration +- Validate severity classifications retrospectively during postmortem review +- Compare automated severity classifier output against IC decisions to improve model accuracy +- Audit action item effectiveness by tracking repeat incident rate per root cause category + +--- + +## Best Practices + +### "Declare Early, Declare Often" +The single highest-leverage behavior in incident management is lowering the threshold for declaring incidents. Every organization that improves at incident response does so by declaring more incidents, not fewer. + +**The cost of a false alarm is one wasted Slack channel. 
The cost of a missed incident is customer trust.** + +Specific guidance: +- If two engineers are discussing whether something is an incident, it is an incident. Declare it. +- Any customer-reported issue that affects more than one user is an incident. Declare it. +- Any alert that requires more than 5 minutes of investigation is an incident. Declare it. +- Declaring an incident does not mean waking people up. It means creating a structured record. + +### Anti-Patterns to Eliminate + +**Hero Culture**: One engineer who "always fixes things" is a single point of failure, not an asset. If your incident response depends on a specific person being available, your process is broken. Fix the runbooks, not the rotation. + +**Blame Games**: The moment a postmortem asks "who did this?" instead of "why did our systems allow this?", the entire process loses value. Engineers who fear blame will hide information. Engineers who trust the process will share everything. + +**Skipping Postmortems**: "We already know what happened" is the most dangerous sentence in incident management. The purpose of a postmortem is not to discover what happened -- it is to generate systemic improvements and share learnings across the organization. + +**Severity Inflation**: Classifying everything as SEV1 to get faster response trains the organization to ignore severity levels. Classify honestly. Respond proportionally. + +**Action Item Graveyards**: Postmortems that generate action items no one tracks are worse than no postmortem at all. They create a false sense of progress. If your action item completion rate is below 50%, stop generating new action items and complete the existing ones first. 
+ +### Communication During Incidents + +Template-driven communication eliminates cognitive load during high-stress situations: +- Never compose a customer update from scratch during an active incident +- Pre-written templates with fill-in-the-blank fields ensure consistent, professional communication +- The Communications Lead owns all external messaging; the IC approves content but does not write it +- Every update must answer three questions: What is happening? What are we doing about it? When is the next update? + +### On-Call Health and Burnout Prevention + +On-call is a tax on engineers' personal lives. Treating it as "just part of the job" without active management leads to burnout and attrition. + +**Non-Negotiable Standards:** +- Maximum on-call rotation: 1 week in 4 (25% on-call time). Below 1-in-3 requires immediate hiring. +- On-call engineers who are paged overnight get a late start or half-day the following day. No exceptions. +- Track pages-per-rotation. If any rotation consistently exceeds 5 pages, the alert thresholds need tuning. +- Quarterly on-call satisfaction surveys. Scores below 3/5 trigger mandatory process review. +- On-call compensation: either financial (on-call pay) or temporal (comp time). Uncompensated on-call is unacceptable. 
+ +--- + +## Advanced Techniques + +### Chaos Engineering Integration +Proactive reliability testing through controlled failure injection: +- **Pre-Incident Drills**: Run tabletop exercises using `postmortem_generator.py` output from past incidents as scenarios +- **Game Days**: Scheduled chaos experiments (Chaos Monkey, Litmus, Gremlin) with full incident response activation +- **Runbook Validation**: Use chaos experiments to verify runbook accuracy and completeness before real incidents test them +- **Detection Validation**: Inject known failures to verify MTTD targets are achievable with current monitoring + +### Automated Incident Detection +Reducing MTTD through intelligent alerting: +- **Anomaly Detection**: Statistical baselines (3-sigma) on key metrics with automatic incident creation above threshold +- **Composite Alerts**: Multi-signal correlation (latency + error rate + saturation) to reduce false positive rates below 5% +- **Customer Signal Integration**: Status page report volume, support ticket spike detection, social media monitoring +- **Deployment Correlation**: Automatic incident flagging when metric degradation occurs within 30 minutes of a deployment + +### Cross-Team Incident Coordination +Managing incidents that span organizational boundaries: +- **Unified Command Structure**: Single IC with authority across all affected teams, regardless of organizational reporting +- **Liaison Role**: Each affected team designates a liaison who communicates team-specific updates to the IC +- **Shared Timeline**: All teams contribute to a single timeline document, eliminating information silos +- **Joint Postmortems**: Cross-team postmortems with shared action items and joint ownership + +### Regulatory Incident Reporting +Meeting compliance obligations during incidents: +- **SOC2**: Document incident detection, response, and resolution within audit trail. Action items must be tracked to completion. 
+- **HIPAA**: Breach notification within 60 days for incidents involving PHI. Document risk assessment and mitigation steps. +- **PCI-DSS**: Immediate containment for cardholder data exposure. Forensic investigation required for confirmed breaches. +- **GDPR**: 72-hour notification to supervisory authority for personal data breaches. Document legal basis for processing decisions. +- **Automation**: `postmortem_generator.py --format json` output structured to feed directly into compliance reporting workflows + +--- + +## Limitations & Considerations + +### Data Quality Dependencies +- **Minimum Event Count**: Timeline analysis requires 5+ events for meaningful phase analysis; fewer events produce incomplete coverage +- **Timestamp Accuracy**: All analysis assumes synchronized timestamps (NTP); clock skew across systems degrades timeline accuracy +- **Source Coverage**: Timeline quality depends on capturing events from all relevant systems; missing sources create blind spots +- **Historical Data**: Cross-incident pattern analysis requires 10+ resolved incidents for statistically meaningful trends + +### Organizational Prerequisites +- **Blameless Culture**: Tools generate blameless framing, but cultural adoption requires sustained leadership commitment over 6+ months +- **On-Call Maturity**: Severity classification and escalation routing assume an established on-call rotation with defined response SLAs +- **Tooling Integration**: Full value requires integration with monitoring (Datadog/Grafana), communication (Slack), and paging (PagerDuty/OpsGenie) systems +- **Executive Buy-In**: Error budget policies and deployment freezes require executive sponsorship to enforce during business-critical periods + +### Scaling Considerations +- **Team Size**: Communication cadence protocols optimized for 3-8 responders; larger incidents require additional coordination roles (Operations Lead, Customer Liaison) +- **Incident Volume**: Organizations handling >20 incidents/week need 
automated triage to prevent IC fatigue and classification inconsistency +- **Geographic Distribution**: Follow-the-sun on-call requires adapted handoff protocols and timezone-aware SLA calculations +- **Multi-Product**: Shared infrastructure incidents affecting multiple products require product-specific impact assessment and communication tracks + +### Measurement Limitations +- **MTTR Variance**: Mean values obscure outliers; track P50, P90, and P99 MTTR for accurate performance assessment +- **Attribution Complexity**: Incidents with multiple contributing causes resist single-root-cause analysis; 5-Whys may oversimplify +- **Leading Indicators**: Most reliability metrics are lagging; invest in leading indicators (deployment frequency, change failure rate, alert noise ratio) +- **Comparison Pitfalls**: MTTR benchmarks vary dramatically by industry, company size, and service architecture; internal trends are more valuable than external comparisons + +--- + +## Success Metrics & Outcomes + +Organizations that implement this incident management framework consistently achieve: + +- **40-60% reduction in MTTR** within the first 6 months through structured response protocols and severity-driven escalation +- **70%+ reduction in MTTD** through improved monitoring coverage and composite alert configuration +- **90%+ postmortem completion rate** for SEV1-SEV2 incidents, up from the industry average of 40-50% +- **85%+ action item completion rate** within defined due dates, eliminating the "action item graveyard" anti-pattern +- **50% reduction in repeat incidents** (same root cause) within 12 months through systematic postmortem follow-through +- **30-40% improvement in on-call satisfaction scores** through rotation health management and burnout prevention +- **99.95%+ SLA compliance** for Tier 1 services through error budget policies and proactive reliability investment +- **Sub-5-minute severity classification** with >80% accuracy through impact-first methodology and 
trained Incident Commanders + +The framework transforms incident management from reactive firefighting into a structured, measurable engineering discipline. Teams stop treating incidents as exceptional events and start treating them as opportunities to systematically improve reliability, build organizational trust, and protect customer experience. + +--- + +*This skill combines Google SRE principles, PagerDuty operational best practices, and Atlassian incident management workflows into a unified, tool-supported framework. Success requires organizational commitment to blameless culture, consistent postmortem follow-through, and investment in observability. Adapt severity thresholds, communication cadences, and SLA targets to your specific organizational context and customer expectations.* diff --git a/engineering-team/incident-commander/assets/incident_report_template.md b/engineering-team/incident-commander/assets/incident_report_template.md new file mode 100644 index 0000000..6104d5d --- /dev/null +++ b/engineering-team/incident-commander/assets/incident_report_template.md @@ -0,0 +1,171 @@ +# Incident Report: [INC-YYYY-NNNN] [Title] + +**Severity:** SEV[1-4] +**Status:** [Active | Mitigated | Resolved] +**Incident Commander:** [Name] +**Date:** [YYYY-MM-DD] + +--- + +## Executive Summary + +[2-3 sentence summary of the incident: what happened, impact scope, resolution status. Written for executive audience — no jargon, focus on business impact.] 
+ +--- + +## Impact Statement + +| Metric | Value | +|--------|-------| +| **Duration** | [X hours Y minutes] | +| **Affected Users** | [number or percentage] | +| **Failed Transactions** | [number] | +| **Revenue Impact** | $[amount] | +| **Data Loss** | [Yes/No — if yes, detail below] | +| **SLA Impact** | [X.XX% availability for period] | +| **Affected Regions** | [list regions] | +| **Affected Services** | [list services] | + +### Customer-Facing Impact + +[Describe what customers experienced: error messages, degraded functionality, complete outage. Be specific about which user journeys were affected.] + +--- + +## Timeline + +| Time (UTC) | Phase | Event | +|------------|-------|-------| +| HH:MM | Detection | [First alert or report] | +| HH:MM | Declaration | [Incident declared, channel created] | +| HH:MM | Investigation | [Key investigation findings] | +| HH:MM | Mitigation | [Mitigation action taken] | +| HH:MM | Resolution | [Permanent fix applied] | +| HH:MM | Closure | [Incident closed, monitoring confirmed stable] | + +### Key Decision Points + +1. **[HH:MM] [Decision]** — [Rationale and outcome] +2. **[HH:MM] [Decision]** — [Rationale and outcome] + +### Timeline Gaps + +[Note any periods >15 minutes without logged events. These represent potential blind spots in the response.] + +--- + +## Root Cause Analysis + +### Root Cause + +[Clear, specific statement of the root cause. Not "human error" — describe the systemic failure.] + +### Contributing Factors + +1. **[Factor Category: Process/Tooling/Human/Environment]** — [Description] +2. **[Factor Category]** — [Description] +3. 
**[Factor Category]** — [Description] + +### 5-Whys Analysis + +**Why did the service degrade?** +→ [Answer] + +**Why did [answer above] happen?** +→ [Answer] + +**Why did [answer above] happen?** +→ [Answer] + +**Why did [answer above] happen?** +→ [Answer] + +**Why did [answer above] happen?** +→ [Root systemic cause] + +--- + +## Response Metrics + +| Metric | Value | Target | Status | +|--------|-------|--------|--------| +| **MTTD** (Mean Time to Detect) | [X min] | <5 min | [Met/Missed] | +| **Time to Declare** | [X min] | <10 min | [Met/Missed] | +| **Time to Mitigate** | [X min] | <60 min (SEV1) | [Met/Missed] | +| **MTTR** (Mean Time to Resolve) | [X min] | <4 hr (SEV1) | [Met/Missed] | +| **Postmortem Timeliness** | [X hours] | <72 hr | [Met/Missed] | + +--- + +## Action Items + +| # | Priority | Action | Owner | Deadline | Type | Status | +|---|----------|--------|-------|----------|------|--------| +| 1 | P1 | [Action description] | [owner] | [date] | Detection | Open | +| 2 | P1 | [Action description] | [owner] | [date] | Prevention | Open | +| 3 | P2 | [Action description] | [owner] | [date] | Prevention | Open | +| 4 | P2 | [Action description] | [owner] | [date] | Process | Open | + +### Action Item Types + +- **Detection**: Improve ability to detect this class of issue faster +- **Prevention**: Prevent this class of issue from occurring +- **Mitigation**: Reduce impact when this class of issue occurs +- **Process**: Improve response process and coordination + +--- + +## Lessons Learned + +### What Went Well + +- [Specific positive outcome from the response] +- [Specific positive outcome] + +### What Didn't Go Well + +- [Specific area for improvement] +- [Specific area for improvement] + +### Where We Got Lucky + +- [Things that could have made this worse but didn't] + +--- + +## Communication Log + +| Time (UTC) | Channel | Audience | Summary | +|------------|---------|----------|---------| +| HH:MM | Status Page | External | [Summary of update] | 
+| HH:MM | Slack #exec | Internal | [Summary of update] | +| HH:MM | Email | Customers | [Summary of notification] | + +--- + +## Participants + +| Name | Role | +|------|------| +| [Name] | Incident Commander | +| [Name] | Operations Lead | +| [Name] | Communications Lead | +| [Name] | Subject Matter Expert | + +--- + +## Appendix + +### Related Incidents + +- [INC-YYYY-NNNN] — [Brief description of related incident] + +### Reference Links + +- [Link to monitoring dashboard] +- [Link to deployment logs] +- [Link to incident channel archive] + +--- + +*This report follows the blameless postmortem principle. The goal is systemic improvement, not individual accountability. All contributing factors should trace to process, tooling, or environmental gaps that can be addressed with concrete action items.* diff --git a/engineering-team/incident-commander/assets/runbook_template.md b/engineering-team/incident-commander/assets/runbook_template.md new file mode 100644 index 0000000..8aa005d --- /dev/null +++ b/engineering-team/incident-commander/assets/runbook_template.md @@ -0,0 +1,289 @@ +# Runbook: [Service/Component Name] + +**Owner:** [Team Name] +**Last Updated:** [YYYY-MM-DD] +**Reviewed By:** [Name] +**Review Cadence:** Quarterly + +--- + +## Service Overview + +| Property | Value | +|----------|-------| +| **Service** | [service-name] | +| **Repository** | [repo URL] | +| **Dashboard** | [monitoring dashboard URL] | +| **On-Call Rotation** | [PagerDuty/OpsGenie schedule URL] | +| **SLA Tier** | [Tier 1/2/3] | +| **Availability Target** | [99.9% / 99.95% / 99.99%] | +| **Dependencies** | [list upstream/downstream services] | +| **Owner Team** | [team name] | +| **Escalation Contact** | [name/email] | + +### Architecture Summary + +[2-3 sentence description of the service architecture. Include key components, data stores, and external dependencies.] 
+ +--- + +## Alert Response Decision Tree + +### High Error Rate (>5%) + +``` +Error Rate Alert Fired +├── Check: Is this a deployment-related issue? +│ ├── YES → Go to "Recent Deployment Rollback" section +│ └── NO → Continue +├── Check: Is a downstream dependency failing? +│ ├── YES → Go to "Dependency Failure" section +│ └── NO → Continue +├── Check: Is there unusual traffic volume? +│ ├── YES → Go to "Traffic Spike" section +│ └── NO → Continue +└── Escalate: Engage on-call secondary + service owner +``` + +### High Latency (p99 > [threshold]ms) + +``` +Latency Alert Fired +├── Check: Database query latency elevated? +│ ├── YES → Go to "Database Performance" section +│ └── NO → Continue +├── Check: Connection pool utilization >80%? +│ ├── YES → Go to "Connection Pool Exhaustion" section +│ └── NO → Continue +├── Check: Memory/CPU pressure on service instances? +│ ├── YES → Go to "Resource Exhaustion" section +│ └── NO → Continue +└── Escalate: Engage on-call secondary + service owner +``` + +### Service Unavailable (Health Check Failing) + +``` +Health Check Alert Fired +├── Check: Are all instances down? +│ ├── YES → Go to "Complete Outage" section +│ └── NO → Continue +├── Check: Is only one AZ affected? +│ ├── YES → Go to "AZ Failure" section +│ └── NO → Continue +├── Check: Can instances be restarted? +│ ├── YES → Go to "Instance Restart" section +│ └── NO → Continue +└── Escalate: Declare incident, engage IC +``` + +--- + +## Common Scenarios + +### Recent Deployment Rollback + +**Symptoms:** Error rate spike or latency increase within 60 minutes of a deployment. + +**Diagnosis:** +1. Check deployment history: `kubectl rollout history deployment/[service-name]` +2. Compare error rate timing with deployment timestamp +3. Review deployment diff for risky changes + +**Mitigation:** +1. Initiate rollback: `kubectl rollout undo deployment/[service-name]` +2. Verify rollback: `kubectl rollout status deployment/[service-name]` +3. 
Confirm error rate returns to baseline (allow 5 minutes) +4. If rollback fails: escalate immediately + +**Communication:** If customer-impacting, update status page within 5 minutes of confirming impact. + +--- + +### Database Performance + +**Symptoms:** Elevated query latency, connection pool saturation, timeout errors. + +**Diagnosis:** +1. Check active queries: `SELECT * FROM pg_stat_activity WHERE state = 'active';` +2. Check for long-running queries: `SELECT pid, now() - pg_stat_activity.query_start AS duration, query FROM pg_stat_activity WHERE state != 'idle' ORDER BY duration DESC;` +3. Check connection count: `SELECT count(*) FROM pg_stat_activity;` +4. Check table bloat and vacuum status + +**Mitigation:** +1. Kill long-running queries if identified: `SELECT pg_terminate_backend([pid]);` +2. If connection pool exhausted: increase pool size via config (requires restart) +3. If read replica available: redirect read traffic +4. If write-heavy: identify and defer non-critical writes + +**Escalation Trigger:** If query latency >10s for >5 minutes, escalate to DBA on-call. + +--- + +### Connection Pool Exhaustion + +**Symptoms:** Connection timeout errors, pool utilization >90%, requests queuing. + +**Diagnosis:** +1. Check pool metrics: current size, active connections, waiting requests +2. Check for connection leaks: connections held >30s without activity +3. Review recent config changes or deployments + +**Mitigation:** +1. Increase pool size (if infrastructure allows): update config, rolling restart +2. Kill idle connections exceeding timeout +3. If caused by leak: identify and restart affected instances +4. Enable connection pool auto-scaling if available + +**Prevention:** Pool utilization alerting at 70% (warning) and 85% (critical). + +--- + +### Dependency Failure + +**Symptoms:** Errors correlated with downstream service failures, circuit breakers tripping. + +**Diagnosis:** +1. Check dependency status dashboards +2. 
Verify circuit breaker state: open/half-open/closed +3. Check for correlation with dependency deployments or incidents +4. Test dependency health endpoints directly + +**Mitigation:** +1. If circuit breaker not tripping: verify timeout/threshold configuration +2. Enable graceful degradation (serve cached/default responses) +3. If critical path: engage dependency team via incident process +4. If non-critical path: disable feature flag for affected functionality + +**Communication:** Coordinate with dependency team IC if both services have active incidents. + +--- + +### Traffic Spike + +**Symptoms:** Sudden traffic increase beyond normal patterns, resource saturation. + +**Diagnosis:** +1. Check traffic source: organic growth vs. bot traffic vs. DDoS +2. Review rate limiting effectiveness +3. Check auto-scaling status and capacity + +**Mitigation:** +1. If bot/DDoS: enable rate limiting, engage security team +2. If organic: trigger manual scale-up, increase auto-scaling limits +3. Enable request queuing or load shedding if at capacity +4. Consider feature flag toggles to reduce per-request cost + +--- + +### Complete Outage + +**Symptoms:** All instances unreachable, health checks failing across AZs. + +**Diagnosis:** +1. Check infrastructure status (AWS/GCP status page) +2. Verify network connectivity and DNS resolution +3. Check for infrastructure-level incidents (region outage) +4. Review recent infrastructure changes (Terraform, network config) + +**Mitigation:** +1. If infra provider issue: activate disaster recovery plan +2. If DNS issue: update DNS records, reduce TTL +3. If deployment corruption: redeploy last known good version +4. If data corruption: engage data recovery procedures + +**Escalation:** Immediately declare SEV1 incident. Engage infrastructure team and management. + +--- + +### Instance Restart + +**Symptoms:** Individual instances unhealthy, OOM kills, process crashes. + +**Diagnosis:** +1. Check instance logs for crash reason +2. 
Review memory/CPU usage patterns before crash +3. Check for memory leaks or resource exhaustion +4. Verify configuration consistency across instances + +**Mitigation:** +1. Restart unhealthy instances: `kubectl delete pod [pod-name]` +2. If recurring: cordon node and migrate workloads +3. If memory leak: schedule immediate patch with increased memory limit +4. Monitor for recurrence after restart + +--- + +### AZ Failure + +**Symptoms:** All instances in one availability zone failing, others healthy. + +**Diagnosis:** +1. Confirm AZ-specific failure vs. instance-specific issues +2. Check cloud provider AZ status +3. Verify load balancer is routing around failed AZ + +**Mitigation:** +1. Ensure load balancer marks AZ instances as unhealthy +2. Scale up remaining AZs to handle redirected traffic +3. If auto-scaling: verify it's responding to increased load +4. Monitor remaining AZs for cascade effects + +--- + +## Key Metrics & Dashboards + +| Metric | Normal Range | Warning | Critical | Dashboard | +|--------|-------------|---------|----------|-----------| +| Error Rate | <0.1% | >1% | >5% | [link] | +| p99 Latency | <200ms | >500ms | >2000ms | [link] | +| CPU Usage | <60% | >75% | >90% | [link] | +| Memory Usage | <70% | >80% | >90% | [link] | +| DB Pool Usage | <50% | >70% | >85% | [link] | +| Request Rate | [baseline]±20% | ±50% | ±100% | [link] | + +--- + +## Escalation Contacts + +| Level | Contact | When | +|-------|---------|------| +| L1: On-Call Primary | [name/rotation] | First responder | +| L2: On-Call Secondary | [name/rotation] | Primary unavailable or needs help | +| L3: Service Owner | [name] | Complex issues, architectural decisions | +| L4: Engineering Manager | [name] | SEV1/SEV2, customer impact, resource needs | +| L5: VP Engineering | [name] | SEV1 >30 min, major customer/revenue impact | + +--- + +## Maintenance Procedures + +### Planned Maintenance Checklist + +- [ ] Maintenance window scheduled and communicated (72 hours advance for Tier 1) 
+- [ ] Status page updated with planned maintenance notice +- [ ] Rollback plan documented and tested +- [ ] On-call notified of maintenance window +- [ ] Customer notification sent (if SLA-impacting) +- [ ] Post-maintenance verification plan ready + +### Health Verification After Changes + +1. Check all health endpoints return 200 +2. Verify error rate returns to baseline within 5 minutes +3. Confirm latency within normal range +4. Run synthetic transaction test +5. Monitor for 15 minutes before declaring success + +--- + +## Revision History + +| Date | Author | Change | +|------|--------|--------| +| [YYYY-MM-DD] | [Name] | Initial version | +| [YYYY-MM-DD] | [Name] | [Description of update] | + +--- + +*This runbook should be reviewed quarterly and updated after every incident that reveals missing procedures. The on-call engineer should be able to follow this document without prior context about the service. If any section requires tribal knowledge to execute, it needs to be expanded.* diff --git a/engineering-team/incident-commander/assets/sample_incident_data.json b/engineering-team/incident-commander/assets/sample_incident_data.json new file mode 100644 index 0000000..7dc667d --- /dev/null +++ b/engineering-team/incident-commander/assets/sample_incident_data.json @@ -0,0 +1,276 @@ +{ + "incident": { + "id": "INC-2024-0142", + "title": "Payment Service Degradation", + "severity": "SEV1", + "status": "resolved", + "declared_at": "2024-01-15T14:23:00Z", + "resolved_at": "2024-01-15T16:45:00Z", + "commander": "Jane Smith", + "service": "payment-gateway", + "affected_services": ["checkout", "subscription-billing"] + }, + "events": [ + { + "timestamp": "2024-01-15T14:15:00Z", + "type": "trigger", + "actor": "system", + "description": "Database connection pool utilization reaches 95% on payment-gateway primary", + "metadata": {"metric": "db_pool_utilization", "value": 95, "threshold": 90} + }, + { + "timestamp": "2024-01-15T14:20:00Z", + "type": "detection", + 
"actor": "monitoring", + "description": "PagerDuty alert fired: payment-gateway error rate >5% (current: 8.2%)", + "metadata": {"alert_id": "PD-98765", "source": "datadog", "error_rate": 8.2} + }, + { + "timestamp": "2024-01-15T14:21:00Z", + "type": "detection", + "actor": "monitoring", + "description": "Datadog alert: p99 latency on /api/payments exceeds 5000ms (current: 8500ms)", + "metadata": {"alert_id": "DD-54321", "source": "datadog", "latency_p99_ms": 8500} + }, + { + "timestamp": "2024-01-15T14:23:00Z", + "type": "declaration", + "actor": "Jane Smith", + "description": "SEV1 declared. Incident channel #inc-20240115-payment-degradation created. Bridge call started.", + "metadata": {"channel": "#inc-20240115-payment-degradation", "severity": "SEV1"} + }, + { + "timestamp": "2024-01-15T14:25:00Z", + "type": "investigation", + "actor": "Alice Chen", + "description": "Confirmed: database connection pool at 100% utilization. All new connections being rejected.", + "metadata": {"pool_size": 20, "active_connections": 20, "waiting_requests": 147} + }, + { + "timestamp": "2024-01-15T14:28:00Z", + "type": "investigation", + "actor": "Carol Davis", + "description": "Identified recent deployment of user-api v2.4.1 at 13:45 UTC. New ORM version (3.2.0) changed connection handling behavior.", + "metadata": {"deployment": "user-api-v2.4.1", "deployed_at": "2024-01-15T13:45:00Z"} + }, + { + "timestamp": "2024-01-15T14:30:00Z", + "type": "communication", + "actor": "Bob Kim", + "description": "Status page updated: Investigating - We are investigating increased error rates affecting payment processing.", + "metadata": {"channel": "status_page", "status": "investigating"} + }, + { + "timestamp": "2024-01-15T14:35:00Z", + "type": "escalation", + "actor": "Jane Smith", + "description": "Escalated to VP Engineering. 
Customer impact confirmed: 12,500+ users affected, failed transactions accumulating.", + "metadata": {"escalated_to": "VP Engineering", "reason": "revenue_impact"} + }, + { + "timestamp": "2024-01-15T14:40:00Z", + "type": "mitigation", + "actor": "Alice Chen", + "description": "Attempting mitigation: increasing connection pool size from 20 to 50 via config override.", + "metadata": {"action": "pool_resize", "old_value": 20, "new_value": 50} + }, + { + "timestamp": "2024-01-15T14:45:00Z", + "type": "communication", + "actor": "Bob Kim", + "description": "Status page updated: Identified - The issue has been identified as a database configuration problem. We are implementing a fix.", + "metadata": {"channel": "status_page", "status": "identified"} + }, + { + "timestamp": "2024-01-15T14:50:00Z", + "type": "investigation", + "actor": "Carol Davis", + "description": "Pool resize partially effective. Error rate dropped from 23% to 12%. ORM 3.2.0 opens 3x more connections per request than 3.1.2.", + "metadata": {"error_rate_before": 23.5, "error_rate_after": 12.1} + }, + { + "timestamp": "2024-01-15T15:00:00Z", + "type": "mitigation", + "actor": "Alice Chen", + "description": "Decision: roll back ORM version to 3.1.2. Initiating rollback deployment of user-api v2.3.9.", + "metadata": {"action": "rollback", "target_version": "2.3.9", "rollback_reason": "orm_connection_leak"} + }, + { + "timestamp": "2024-01-15T15:15:00Z", + "type": "mitigation", + "actor": "Alice Chen", + "description": "Rollback deployment complete. user-api v2.3.9 running in production. 
Connection pool utilization dropping.", + "metadata": {"deployment_duration_minutes": 15, "pool_utilization": 45} + }, + { + "timestamp": "2024-01-15T15:20:00Z", + "type": "communication", + "actor": "Bob Kim", + "description": "Status page updated: Monitoring - A fix has been implemented and we are monitoring the results.", + "metadata": {"channel": "status_page", "status": "monitoring"} + }, + { + "timestamp": "2024-01-15T15:30:00Z", + "type": "mitigation", + "actor": "Jane Smith", + "description": "Error rate back to baseline (<0.1%). Payment processing fully restored. Entering monitoring phase.", + "metadata": {"error_rate": 0.08, "pool_utilization": 32} + }, + { + "timestamp": "2024-01-15T16:30:00Z", + "type": "investigation", + "actor": "Carol Davis", + "description": "Confirmed stable for 60 minutes. No degradation detected. Root cause documented: ORM 3.2.0 connection pooling incompatibility.", + "metadata": {"monitoring_duration_minutes": 60, "stable": true} + }, + { + "timestamp": "2024-01-15T16:45:00Z", + "type": "resolution", + "actor": "Jane Smith", + "description": "Incident resolved. All services nominal. Postmortem scheduled for 2024-01-17 10:00 UTC.", + "metadata": {"postmortem_scheduled": "2024-01-17T10:00:00Z"} + }, + { + "timestamp": "2024-01-15T16:50:00Z", + "type": "communication", + "actor": "Bob Kim", + "description": "Status page updated: Resolved - The issue has been resolved. Payment processing is operating normally.", + "metadata": {"channel": "status_page", "status": "resolved"} + } + ], + "communications": [ + { + "timestamp": "2024-01-15T14:30:00Z", + "channel": "status_page", + "audience": "external", + "message": "Investigating - We are investigating increased error rates affecting payment processing. Some transactions may fail. We will provide an update within 15 minutes." + }, + { + "timestamp": "2024-01-15T14:35:00Z", + "channel": "slack_exec", + "audience": "internal", + "message": "SEV1 ACTIVE: Payment service degradation. 
~12,500 users affected. Failed transactions accumulating. IC: Jane Smith. Bridge: [link]. ETA for mitigation: investigating." + }, + { + "timestamp": "2024-01-15T14:45:00Z", + "channel": "status_page", + "audience": "external", + "message": "Identified - The issue has been identified as a database configuration problem following a recent deployment. We are implementing a fix. Next update in 15 minutes." + }, + { + "timestamp": "2024-01-15T15:20:00Z", + "channel": "status_page", + "audience": "external", + "message": "Monitoring - A fix has been implemented and we are monitoring the results. Payment processing is recovering. We will provide a final update once we confirm stability." + }, + { + "timestamp": "2024-01-15T16:50:00Z", + "channel": "status_page", + "audience": "external", + "message": "Resolved - The issue affecting payment processing has been resolved. All systems are operating normally. We will publish a full incident report within 48 hours." + } + ], + "impact": { + "revenue_impact": "high", + "affected_users_percentage": 45, + "affected_regions": ["us-east-1", "eu-west-1"], + "data_integrity_risk": false, + "security_breach": false, + "customer_facing": true, + "degradation_type": "partial", + "workaround_available": false + }, + "signals": { + "error_rate_percentage": 23.5, + "latency_p99_ms": 8500, + "affected_endpoints": ["/api/payments", "/api/checkout", "/api/subscriptions"], + "dependent_services": ["checkout", "subscription-billing", "order-service"], + "alert_count": 12, + "customer_reports": 8 + }, + "context": { + "recent_deployments": [ + { + "service": "user-api", + "deployed_at": "2024-01-15T13:45:00Z", + "version": "2.4.1", + "changes": "Upgraded ORM from 3.1.2 to 3.2.0" + } + ], + "ongoing_incidents": [], + "maintenance_windows": [], + "on_call": { + "primary": "alice@company.com", + "secondary": "bob@company.com", + "escalation_manager": "director-eng@company.com" + } + }, + "resolution": { + "root_cause": "Database connection pool 
exhaustion caused by ORM 3.2.0 opening 3x more connections per request than previous version 3.1.2, exceeding the pool size of 20", + "contributing_factors": [ + "Insufficient load testing of new ORM version under production-scale connection patterns", + "Connection pool monitoring alert threshold set too high (90%) with no warning at 70%", + "No canary deployment process for database configuration or ORM changes", + "Missing connection pool sizing documentation for service dependencies" + ], + "mitigation_steps": [ + "Increased connection pool size from 20 to 50 as temporary relief", + "Rolled back user-api from v2.4.1 (ORM 3.2.0) to v2.3.9 (ORM 3.1.2)" + ], + "permanent_fix": "Load test ORM 3.2.0 with production connection patterns, update pool sizing, implement canary deployment for ORM changes", + "customer_impact": { + "affected_users": 12500, + "failed_transactions": 342, + "revenue_impact_usd": 28500, + "data_loss": false + } + }, + "action_items": [ + { + "title": "Add connection pool utilization alerting at 70% warning and 85% critical thresholds", + "owner": "alice@company.com", + "priority": "P1", + "deadline": "2024-01-22", + "type": "detection", + "status": "open" + }, + { + "title": "Implement canary deployment pipeline for database configuration and ORM changes", + "owner": "bob@company.com", + "priority": "P1", + "deadline": "2024-02-01", + "type": "prevention", + "status": "open" + }, + { + "title": "Load test ORM v3.2.0 with production-scale connection patterns before re-deployment", + "owner": "carol@company.com", + "priority": "P2", + "deadline": "2024-01-29", + "type": "prevention", + "status": "open" + }, + { + "title": "Document connection pool sizing requirements for all services in runbook", + "owner": "alice@company.com", + "priority": "P2", + "deadline": "2024-02-05", + "type": "process", + "status": "open" + }, + { + "title": "Add ORM connection behavior to integration test suite", + "owner": "carol@company.com", + "priority": "P3", + 
"deadline": "2024-02-15", + "type": "prevention", + "status": "open" + } + ], + "participants": [ + {"name": "Jane Smith", "role": "Incident Commander"}, + {"name": "Alice Chen", "role": "Operations Lead"}, + {"name": "Bob Kim", "role": "Communications Lead"}, + {"name": "Carol Davis", "role": "Database SME"} + ] +} diff --git a/engineering-team/incident-commander/references/incident-response-framework.md b/engineering-team/incident-commander/references/incident-response-framework.md new file mode 100644 index 0000000..45fe3b9 --- /dev/null +++ b/engineering-team/incident-commander/references/incident-response-framework.md @@ -0,0 +1,372 @@ +# Incident Response Framework Reference + +Production-grade incident management knowledge base synthesizing PagerDuty, Google SRE, and Atlassian methodologies into a unified, opinionated framework. This document is the source of truth for incident commanders operating under pressure. + +--- + +## 1. Industry Framework Comparison + +### PagerDuty Incident Response Model + +PagerDuty's open-source incident response process defines four core roles and five process phases. The model prioritizes **speed of mobilization** over process perfection. + +**Roles:** +- **Incident Commander (IC):** Owns the incident end-to-end. Does NOT perform technical investigation. Delegates, coordinates, and makes final escalation decisions. The IC is the single point of authority; conflicting opinions are resolved by the IC, not by committee. +- **Scribe:** Captures timestamped decisions, actions, and findings in the incident channel. The scribe never participates in technical work. A good scribe reduces postmortem preparation time by 70%. +- **Subject Matter Expert (SME):** Pulled in on-demand for specific subsystems. SMEs report findings to the IC, not to each other. Parallel SME investigations must be coordinated through the IC to avoid duplicated effort. +- **Customer Liaison:** Owns all outbound customer communication. 
Drafts status page updates for IC approval. Shields the technical team from inbound customer inquiries during active incidents. + +**Process Phases:** Detect, Triage, Mobilize, Mitigate, Resolve, Postmortem. + +**Communication Protocol:** PagerDuty mandates a dedicated Slack channel per incident, a bridge call for SEV1/SEV2, and status updates at fixed cadences (every 15 min for SEV1, every 30 min for SEV2). All decisions are announced in the channel, never in DMs or side threads. + +### Google SRE: Managing Incidents (Chapter 14) + +Google's SRE model, documented in *Site Reliability Engineering* (O'Reilly, 2016), emphasizes **role separation** and **clear handoffs** as the primary mechanisms for preventing incident chaos. + +**Key Principles:** +- **Operational vs. Communication Tracks:** Google splits incident work into two parallel tracks. The operational track handles technical mitigation. The communication track handles stakeholder updates, executive briefings, and customer notifications. These tracks run independently with the IC bridging them. +- **Role Separation is Non-Negotiable:** The person debugging the system must never be the person updating stakeholders. Cognitive load from context-switching between technical work and communication degrades both outputs. Google measured a 40% increase in mean-time-to-resolution (MTTR) when a single person attempted both. +- **Clear Handoffs:** When an IC rotates out (recommended every 60-90 minutes for SEV1), the handoff includes: current status summary, active hypotheses, pending actions, and escalation state. Handoffs happen on the bridge call, not asynchronously. +- **Defined Command Post:** All communication flows through a single channel. Google uses the term "command post" -- a virtual or physical location where all incident participants converge. + +### Atlassian Incident Management Model + +Atlassian's model, published in their *Incident Management Handbook*, is **severity-driven** and **template-heavy**. 
It favors structured playbooks over improvisation. + +**Key Characteristics:** +- **Severity Levels Drive Everything:** The assigned severity determines who gets paged, what communication templates are used, response time SLAs, and postmortem requirements. Severity is assigned at triage and reassessed every 30 minutes. +- **Handbook-Driven Approach:** Atlassian maintains runbooks for every known failure mode. During incidents, responders follow documented playbooks before improvising. This reduces MTTR for known issues by 50-60% but requires significant upfront investment in documentation. +- **Communication Templates:** Pre-written templates for status page updates, customer emails, and executive summaries. Templates include severity-specific language and are reviewed quarterly. This eliminates wordsmithing during active incidents. +- **Values-Based Decisions:** When runbooks do not cover the situation, Atlassian defaults to a decision hierarchy: (1) protect customer data, (2) restore service, (3) preserve evidence for root cause analysis. 
+ +### Framework Comparison Table + +| Dimension | PagerDuty | Google SRE | Atlassian | +|-----------|-----------|------------|-----------| +| Primary strength | Speed of mobilization | Role separation discipline | Structured playbooks | +| IC authority model | IC has final say | IC coordinates, escalates to VP if blocked | IC follows handbook, escalates if off-script | +| Communication style | Dedicated channel + bridge | Command post with dual tracks | Template-driven status updates | +| Handoff protocol | Informal | Formal on-call handoff script | Rotation policy in handbook | +| Postmortem requirement | All SEV1/SEV2 | All incidents | SEV1/SEV2 mandatory, SEV3 optional | +| Best for | Fast-moving startups | Large-scale distributed systems | Regulated or process-heavy orgs | +| Weakness | Under-documented for edge cases | Heavyweight for small teams | Rigid, slow to adapt to novel failures | + +### When to Use Which Framework + +- **Teams under 20 engineers:** Start with PagerDuty's model. It is lightweight and prescriptive enough to work without heavy process investment. Add Atlassian-style runbooks as you identify recurring failure modes. +- **Teams running 50+ microservices:** Adopt Google SRE's dual-track model. The operational/communication split becomes critical when incidents span multiple teams and subsystems. +- **Regulated industries (finance, healthcare, government):** Use Atlassian's handbook-driven approach as the foundation. Regulatory auditors expect documented procedures, and templates satisfy compliance requirements for incident communication records. +- **Hybrid (recommended for most teams at scale):** Use PagerDuty's role definitions, Google's track separation, and Atlassian's template library. This is the approach codified in the rest of this document. + +--- + +## 2. 
Severity Definitions + +### Severity Classification Matrix + +| Severity | Impact | Response Time | Update Cadence | Escalation Trigger | Example | +|----------|--------|---------------|----------------|---------------------|---------| +| **SEV1** | Total service outage or data breach affecting all users. Revenue loss exceeding $10K/hour. Security incident with active exfiltration. | Page IC + on-call within 5 min. All hands mobilized within 15 min. | Every 15 min to stakeholders. Continuous updates in incident channel. | Immediate executive notification. Board notification for data breaches. | Primary database cluster down. Payment processing system offline. Active ransomware attack. | +| **SEV2** | Major feature degraded for >30% of users. Revenue impact $1K-$10K/hour. Data integrity concerns without confirmed loss. | IC assigned within 15 min. Responders mobilized within 30 min. | Every 30 min to stakeholders. Every 15 min in incident channel. | Executive notification if unresolved after 1 hour. Upgrade to SEV1 if impact expands. | Search functionality returning errors for 40% of queries. Checkout flow failing intermittently. Authentication latency exceeding 10s. | +| **SEV3** | Minor feature degraded or non-critical service impaired. Workaround available. No direct revenue impact. | Acknowledged within 1 hour. Investigation started within 4 hours. | Every 2 hours to stakeholders if actively worked. Daily if deferred. | Escalate to SEV2 if workaround fails or user complaints exceed 50 in 1 hour. | Admin dashboard loading slowly. Email notifications delayed by 30+ minutes. Non-critical API endpoint returning 5xx for <5% of requests. | +| **SEV4** | Cosmetic issue, minor bug, or internal tooling degradation. No user-facing impact or negligible impact. | Acknowledged within 1 business day. Prioritized against backlog. | No scheduled updates. Tracked in issue tracker. | Escalate to SEV3 if internal productivity impact exceeds 2 hours/day across team. 
| Logging pipeline dropping non-critical debug logs. Internal metrics dashboard showing stale data. Minor UI alignment issue on one browser. | + +### Customer-Facing Signals by Severity + +**SEV1 Signals:** Support ticket volume spikes >500% of baseline within 15 minutes. Social media mentions of outage trend upward. Revenue dashboards show >95% drop in transaction volume. Multiple monitoring systems alarm simultaneously. + +**SEV2 Signals:** Support ticket volume spikes 100-500% of baseline. Specific feature-related complaints cluster in support channels. Partial transaction failures visible in payment dashboards. Single monitoring system shows sustained alerting. + +**SEV3 Signals:** Sporadic support tickets with a common pattern (under 20/hour). Users report intermittent issues with workarounds. Monitoring shows degraded but not critical metrics. + +**SEV4 Signals:** Internal team notices issue during routine work. Occasional user mention with no pattern or urgency. Monitoring shows minor anomaly within acceptable thresholds. + +### Severity Upgrade and Downgrade Criteria + +**Upgrade from SEV2 to SEV1:** Impact expands to >80% of users, revenue impact confirmed above $10K/hour, data integrity compromise confirmed, or mitigation attempt fails after 45 minutes. + +**Downgrade from SEV1 to SEV2:** Partial mitigation restores service for >70% of users, revenue impact drops below $10K/hour, and no ongoing data integrity concern. + +**Downgrade from SEV2 to SEV3:** Workaround deployed and communicated, impact limited to <10% of users, and no revenue impact. + +Severity changes must be announced by the IC in the incident channel with justification. The scribe logs the timestamp and rationale. + + +--- + +## 3. Role Definitions + +### Incident Commander (IC) + +The IC is the single decision-maker during an incident. This role exists to eliminate decision-by-committee, which adds 20-40 minutes to MTTR in measured studies. 
+ +**Responsibilities:** +- Assign severity level at triage (reassess every 30 minutes) +- Assign all other incident roles +- Approve status page updates before publication +- Make go/no-go decisions on mitigation strategies (rollback, feature flag, scaling) +- Decide when to escalate to executive leadership +- Declare incident resolved and initiate postmortem scheduling + +**Decision Authority:** The IC can authorize rollbacks, page any team member regardless of org chart, approve customer communications, and override objections from individual contributors during active mitigation. The IC cannot approve financial expenditures above $50K or public press statements -- those require VP/C-level approval. + +**What the IC Must NOT Do:** Debug code, write queries, SSH into production servers, or perform any hands-on technical work. The moment an IC starts debugging, incident coordination degrades. If the IC is the only person with domain expertise, they must hand off IC duties before engaging technically. 
+ +### Communications Lead + +**Responsibilities:** +- Draft all status page updates using severity-appropriate templates +- Coordinate with Customer Liaison on outbound customer messaging +- Maintain the executive summary document (updated every 30 min for SEV1/SEV2) +- Manage the stakeholder notification list and delivery +- Post scheduled updates even when there is no new information ("We are continuing to investigate" is a valid update) + +### Operations Lead + +**Responsibilities:** +- Coordinate technical investigation across SMEs +- Maintain the running hypothesis list and assign investigation tasks +- Report technical findings to the IC in plain language +- Execute mitigation actions approved by the IC +- Track parallel workstreams and prevent duplicated effort + +### Scribe + +**Responsibilities:** +- Maintain a timestamped log of all decisions, actions, and findings +- Document who said what and when in the incident channel +- Capture rollback decisions, hypothesis changes, and escalation triggers +- Produce the initial postmortem timeline (saves 2-4 hours of postmortem prep) + +### Subject Matter Experts (SMEs) + +SMEs are paged on-demand by the IC for specific subsystems. They report findings to the Operations Lead, not directly to stakeholders. An SME who identifies a potential fix proposes it to the IC for approval before executing. SMEs are released from the incident explicitly by the IC when their subsystem is cleared. + +### Customer Liaison + +Owns the customer-facing voice during the incident. Monitors support channels for inbound customer reports. Drafts customer notification emails. Updates the public status page (after IC approval). Shields the technical team from direct customer inquiries during active mitigation. + +--- + +## 4. 
Communication Protocols + +### Incident Channel Naming Convention + +Format: `#inc-YYYYMMDD-brief-desc` + +Examples: +- `#inc-20260216-payment-api-timeout` +- `#inc-20260216-db-primary-failover` +- `#inc-20260216-auth-service-degraded` + +Channel topic must include: severity, IC name, bridge call link, status page link. +Example topic: `SEV1 | IC: @jane.smith | Bridge: https://meet.example.com/inc-20260216 | Status: https://status.example.com` + +### Internal Status Update Templates + +**SEV1/SEV2 Update Template (posted in incident channel and executive Slack channel):** +``` +INCIDENT UPDATE - [SEV1/SEV2] - [HH:MM UTC] +Status: [Investigating | Identified | Mitigating | Resolved] +Impact: [Specific user-facing impact in plain language] +Current Action: [What is actively being done right now] +Next Update: [HH:MM UTC] +IC: @[name] +``` + +**Executive Summary Template (for SEV1, updated every 30 min):** +``` +EXECUTIVE SUMMARY - [Incident Title] - [HH:MM UTC] +Severity: SEV1 +Duration: [X hours Y minutes] +Customer Impact: [Number of affected users/transactions] +Revenue Impact: [Estimated $ if known, "assessing" if not] +Current Status: [One sentence] +Mitigation ETA: [Estimated time or "unknown"] +Next Escalation Point: [What triggers executive action] +``` + +### Status Page Update Templates + +**SEV1 Initial Post:** +``` +Title: [Service Name] - Service Disruption +Body: We are currently experiencing a disruption affecting [service/feature]. +Users may encounter [specific symptom: errors, timeouts, inability to access]. +Our engineering team has been mobilized and is actively investigating. +We will provide an update within 15 minutes. +``` + +**SEV1 Update (mitigation in progress):** +``` +Title: [Service Name] - Service Disruption (Update) +Body: We have identified the cause of the disruption affecting [service/feature] +and are implementing a fix. Some users may continue to experience [symptom]. +We expect to have an update on resolution within [X] minutes. 
+``` + +**SEV1 Resolution:** +``` +Title: [Service Name] - Resolved +Body: The disruption affecting [service/feature] has been resolved as of [HH:MM UTC]. +Service has been restored to normal operation. Users should no longer experience +[symptom]. We will publish a full incident report within 48 hours. +We apologize for the inconvenience. +``` + +**SEV2 Initial Post:** +``` +Title: [Service Name] - Degraded Performance +Body: We are investigating reports of degraded performance affecting [feature]. +Some users may experience [specific symptom]. A workaround is [available/not yet available]. +Our team is actively investigating and we will provide an update within 30 minutes. +``` + +### Bridge Call / War Room Etiquette + +1. **Mute by default.** Unmute only when speaking to the IC or Operations Lead. +2. **Identify yourself before speaking.** "This is [name] from [team]." Every time. +3. **State findings, then recommendations.** "Database replication lag is 45 seconds and climbing. I recommend we fail over to the secondary cluster." +4. **IC confirms before action.** No unilateral action on production systems during an incident. The IC says "approved" or "hold" before anyone executes. +5. **No side conversations.** If two SMEs need to discuss a hypothesis, they take it to a breakout channel and report back findings to the main bridge. +6. **Time-box debugging.** The IC sets 15-minute timers for investigation threads. If a hypothesis is not confirmed or denied in 15 minutes, pivot to the next hypothesis or escalate. + +### Customer Notification Templates + +**SEV1 Customer Email (B2B, enterprise accounts):** +``` +Subject: [Company Name] Service Incident - [Date] + +Dear [Customer Name], + +We are writing to inform you of a service incident affecting [product/service] +that began at [HH:MM UTC] on [date]. 
+ +Impact: [Specific impact to this customer's usage] +Current Status: [Brief status] +Expected Resolution: [ETA if known, or "We are working to resolve this as quickly as possible"] + +We will continue to provide updates every [15/30] minutes until resolution. +Your dedicated account team is available at [contact info] for any questions. + +Sincerely, +[Name], [Title] +``` + +--- + +## 5. Escalation Matrix + +### Escalation Tiers + +**Tier 1 - Within Team (0-15 minutes):** +On-call engineer investigates. If the issue is within the team's domain and matches a known runbook, resolve without escalation. Page the IC if severity is SEV2 or higher, or if the issue is not resolved within 15 minutes. + +**Tier 2 - Cross-Team (15-45 minutes):** +IC pages SMEs from adjacent teams. Common cross-team escalations: database team for replication issues, networking team for connectivity failures, security team for suspicious activity. Cross-team SMEs join the incident channel and bridge call. + +**Tier 3 - Executive (45+ minutes or immediate for SEV1):** +VP of Engineering notified for all SEV1 incidents immediately. CTO notified if SEV1 exceeds 1 hour without mitigation progress. CEO notified if SEV1 involves data breach or regulatory implications. Executive involvement is for resource allocation and external communication decisions, not technical direction. + +### Time-Based Escalation Triggers + +| Elapsed Time | SEV1 Action | SEV2 Action | +|-------------|-------------|-------------| +| 0 min | Page IC + all on-call. Notify VP Eng. | Page IC + primary on-call. | +| 15 min | Confirm all roles staffed. Open bridge call. | IC assesses if additional SMEs needed. | +| 30 min | If no mitigation path identified, page backup on-call for all related services. | First stakeholder update. Reassess severity. | +| 45 min | Escalate to CTO if no progress. Consider customer notification. | If no progress, consider escalating to SEV1. | +| 60 min | CTO briefing. 
Initiate customer notification if not already done. | Notify VP Eng. Page cross-team SMEs. | +| 90 min | IC rotation (fresh IC takes over). Reassess all hypotheses. | IC rotation if needed. | +| 120 min | CEO briefing if data breach or regulatory risk. External PR team engaged. | Escalate to SEV1 if impact has not decreased. | + +### Escalation Path Examples + +**Database failover failure:** +On-call DBA (Tier 1, 0-15 min) -> IC + DBA team lead (Tier 2, 15 min) -> Infrastructure VP + cloud provider support (Tier 3, 45 min) + +**Payment processing outage:** +On-call payments engineer (Tier 1, 0-5 min) -> IC + payments team lead + payment provider liaison (Tier 2, 5 min, immediate due to revenue impact) -> CFO + VP Eng (Tier 3, 15 min if provider-side issue confirmed) + +**Security incident (suspected breach):** +Security on-call (Tier 1, 0-5 min) -> CISO + IC + legal counsel (Tier 2, immediate) -> CEO + external incident response firm (Tier 3, within 1 hour if breach confirmed) + +### On-Call Rotation Best Practices + +- **Primary + secondary on-call** for every critical service. Secondary is paged automatically if primary does not acknowledge within 5 minutes. +- **On-call shifts are 7 days maximum.** Longer rotations degrade alertness and response quality. +- **Handoff checklist:** Current open issues, recent deploys in the last 48 hours, known risks or maintenance windows, escalation contacts for dependent services. +- **On-call load budget:** No more than 2 pages per night on average, measured weekly. Exceeding this indicates systemic reliability issues that must be addressed with engineering investment, not heroic on-call effort. + +--- + +## 6. Incident Lifecycle Phases + +### Phase 1: Detection + +Detection comes from three sources, in order of preference: + +1. 
**Automated monitoring (preferred):** Alerting rules on latency (p99 > 2x baseline), error rates (5xx > 1% of requests), saturation (CPU > 85%, memory > 90%, disk > 80%), and business metrics (transaction volume drops > 20% from 15-minute rolling average). Alerts should fire within 60 seconds of threshold breach. +2. **Internal reports:** An engineer notices anomalous behavior during routine work. Internal detection typically adds 5-15 minutes to response time compared to automated monitoring. +3. **Customer reports:** Customers contact support about issues. This is the worst detection source. If customers detect incidents before monitoring, the monitoring coverage has a gap that must be closed in the postmortem. + +**Detection SLA:** SEV1 incidents must be detected within 5 minutes of impact onset. If detection latency exceeds this, the postmortem must include a monitoring improvement action item. + +### Phase 2: Triage + +The first responder performs initial triage within 5 minutes of detection: + +1. **Scope assessment:** How many users, services, or regions are affected? Check dashboards, not assumptions. +2. **Severity assignment:** Use the severity matrix in Section 2. When in doubt, assign higher severity. Downgrading is cheap; delayed escalation is expensive. +3. **IC assignment:** For SEV1/SEV2, page the on-call IC immediately. For SEV3, the first responder may self-assign IC duties. +4. **Initial hypothesis:** What changed in the last 2 hours? Check deploy logs, config changes, upstream dependency status, and traffic patterns. 70% of incidents correlate with a change deployed in the prior 2 hours. + +### Phase 3: Mobilization + +The IC executes mobilization within 10 minutes of assignment: + +1. **Create incident channel:** `#inc-YYYYMMDD-brief-desc`. Set topic with severity, IC name, bridge link. +2. **Assign roles:** Communications Lead, Operations Lead, Scribe. For SEV3/SEV4, the IC may cover multiple roles. +3. 
**Open bridge call (SEV1/SEV2):** Share link in incident channel. All responders join within 5 minutes. +4. **Post initial summary:** Current understanding, affected services, assigned roles, first actions. +5. **Notify stakeholders:** Page dependent teams. Notify customer support leadership. For SEV1, notify executive chain per escalation matrix. + +### Phase 4: Investigation + +Investigation runs as parallel workstreams coordinated by the Operations Lead: + +- **Workstream discipline:** Each SME investigates one hypothesis at a time. The Operations Lead tracks active hypotheses on a shared list. Completed investigations report: confirmed, denied, or inconclusive. +- **Hypothesis testing priority:** (1) Recent changes (deploys, configs, feature flags), (2) Upstream dependency failures, (3) Capacity exhaustion, (4) Data corruption, (5) Security compromise. +- **15-minute rule:** If a hypothesis is not confirmed or denied within 15 minutes, the IC decides whether to continue, pivot, or escalate. Unbounded investigation is the leading cause of extended MTTR. +- **Evidence collection:** Screenshots, log snippets, metric graphs, and query results are posted in the incident channel, not described verbally. The scribe tags evidence with timestamps. + +### Phase 5: Mitigation + +Mitigation prioritizes restoring service over finding root cause: + +- **Rollback first:** If a deploy correlates with the incident, roll it back before investigating further. A 5-minute rollback beats a 45-minute investigation. Rollback authority rests with the IC. +- **Feature flags:** Disable the suspected feature via feature flag if available. This is faster and less risky than a full rollback. +- **Scaling:** If the issue is capacity-related, scale horizontally before investigating the traffic source. +- **Failover:** If a primary system is unrecoverable, fail over to the secondary. Test failover procedures quarterly so this is a routine, not a gamble. 
+- **Customer workaround:** If mitigation will take time, publish a workaround for customers (e.g., "Use the mobile app while we restore web access"). + +**Mitigation verification:** After applying mitigation, monitor key metrics for 15 minutes before declaring the issue mitigated. Premature declarations that the issue is mitigated followed by recurrence damage team credibility and customer trust. + +### Phase 6: Resolution + +Resolution is declared when the root cause is addressed and service is operating normally: + +- **Verification checklist:** Error rates returned to baseline, latency returned to baseline, no ongoing customer reports, monitoring confirms stability for 30+ minutes. +- **Incident channel update:** IC posts final status with resolution summary, total duration, and next steps. +- **Status page update:** Post resolution notice within 15 minutes of declaring resolved. +- **Stand down:** IC explicitly releases all responders. SMEs return to normal work. Bridge call is closed. + +### Phase 7: Postmortem + +Postmortem is mandatory for SEV1 and SEV2. Optional but recommended for SEV3. Never conducted for SEV4. + +- **Timeline:** Postmortem document drafted within 24 hours. Postmortem meeting held within 72 hours (3 business days). Action items assigned and tracked in the team's issue tracker. +- **Blameless standard:** The postmortem examines systems, processes, and tools -- not individual performance. "Why did the system allow this?" not "Why did [person] do this?" +- **Required sections:** Timeline (from scribe's log), root cause analysis (using 5 Whys or fault tree), impact summary (users, revenue, duration), what went well, what went poorly, action items with owners and due dates. +- **Action items and recurrence:** Every postmortem produces 3-7 concrete action items. Items without owners and due dates are not action items. Teams should close 80%+ within 30 days. 
If the same root cause appears in two postmortems within 6 months, escalate to engineering leadership as a systemic reliability investment area. diff --git a/engineering-team/incident-commander/references/sla-management-guide.md b/engineering-team/incident-commander/references/sla-management-guide.md new file mode 100644 index 0000000..0b082cb --- /dev/null +++ b/engineering-team/incident-commander/references/sla-management-guide.md @@ -0,0 +1,566 @@ +# SLA Management Guide + +> Comprehensive reference for Service Level Agreements, Objectives, and Indicators. +> Designed for incident commanders who must understand, protect, and communicate SLA status during and after incidents. + +--- + +## 1. Definitions & Relationships + +### Service Level Indicator (SLI) + +An SLI is the quantitative measurement of a specific aspect of service quality. SLIs are the raw data that feed everything above them. They must be precisely defined, automatically collected, and unambiguous. + +**Common SLI types by service:** + +| Service Type | SLI | Measurement Method | +|---|---|---| +| Web Application | Request latency (p50, p95, p99) | Server-side histogram | +| Web Application | Availability (successful responses / total requests) | Load balancer logs | +| REST API | Error rate (5xx responses / total responses) | API gateway metrics | +| REST API | Throughput (requests per second) | Counter metric | +| Database | Query latency (p99) | Slow query log + APM | +| Database | Replication lag (seconds) | Replica monitoring | +| Message Queue | End-to-end delivery latency | Timestamp comparison | +| Message Queue | Message loss rate | Producer vs consumer counts | +| Storage | Durability (objects lost / objects stored) | Integrity checksums | +| CDN | Cache hit ratio | Edge server logs | + +**SLI specification formula:** + +``` +SLI = (good events / total events) x 100 +``` + +For availability: `SLI = (successful requests / total requests) x 100` +For latency: `SLI = (requests faster than 
threshold / total requests) x 100` + +### Service Level Objective (SLO) + +An SLO is the target value or range for an SLI. It defines the acceptable level of reliability. SLOs are internal goals that engineering teams commit to. + +**Setting meaningful SLOs:** + +1. Measure the current baseline over 30 days minimum +2. Subtract a safety margin (typically 0.05%-0.1% below actual performance) +3. Validate against user expectations and business requirements +4. Never set an SLO higher than what the system can sustain without heroics + +**Common pitfall:** Setting 99.99% availability when 99.9% meets every user need. The jump from 99.9% to 99.99% is a 10x reduction in allowed downtime and typically requires 3-5x the engineering investment. + +**SLO examples:** + +- `99.9% of HTTP requests return a non-5xx response within each calendar month` +- `95% of API requests complete in under 200ms (p95 latency)` +- `99.95% of messages are delivered within 30 seconds of production` + +### Service Level Agreement (SLA) + +An SLA is a formal contract between a service provider and its customers that specifies consequences for failing to meet defined service levels. SLAs must always be looser than SLOs to provide a buffer zone. + +**Rule of thumb:** If your SLO is 99.95%, your SLA should be 99.9% or lower. The gap between SLO and SLA is your safety margin. 
+ +### The Hierarchy + +``` + SLA (99.9%) ← Contract with customers, financial penalties + ↑ backs + SLO (99.95%) ← Internal target, triggers error budget policy + ↑ targets + SLI (measured) ← Raw metric: actual uptime = 99.97% this month +``` + +**Standard combinations by tier:** + +| Tier | SLI (Metric) | SLO (Target) | SLA (Contract) | Allowed Downtime/Month | +|---|---|---|---|---| +| Critical (payments) | Availability | 99.99% | 99.95% | SLO: 4.38 min / SLA: 21.9 min | +| High (core API) | Availability | 99.95% | 99.9% | SLO: 21.9 min / SLA: 43.8 min | +| Standard (dashboard) | Availability | 99.9% | 99.5% | SLO: 43.8 min / SLA: 3.65 hrs | +| Low (internal tools) | Availability | 99.5% | 99.0% | SLO: 3.65 hrs / SLA: 7.3 hrs | + +--- + +## 2. Error Budget Policy + +### What Is an Error Budget + +An error budget is the maximum amount of unreliability a service can have within a given period while still meeting its SLO. It is calculated as: + +``` +Error Budget = 1 - SLO target +``` + +For a 99.9% SLO over a 30-day month (43,200 minutes): + +``` +Error Budget = 1 - 0.999 = 0.001 = 0.1% +Allowed Downtime = 43,200 x 0.001 = 43.2 minutes +``` + +### Downtime Allowances by SLO + +| SLO | Error Budget | Monthly Downtime | Quarterly Downtime | Annual Downtime | +|---|---|---|---|---| +| 99.0% | 1.0% | 7 hrs 18 min | 21 hrs 54 min | 3 days 15 hrs | +| 99.5% | 0.5% | 3 hrs 39 min | 10 hrs 57 min | 1 day 19 hrs | +| 99.9% | 0.1% | 43.8 min | 2 hrs 11 min | 8 hrs 46 min | +| 99.95% | 0.05% | 21.9 min | 1 hr 6 min | 4 hrs 23 min | +| 99.99% | 0.01% | 4.38 min | 13.1 min | 52.6 min | +| 99.999% | 0.001% | 26.3 sec | 78.9 sec | 5.26 min | + +### Error Budget Consumption Tracking + +Track budget consumption as a percentage of the total budget used so far in the current window: + +``` +Budget Consumed (%) = (actual bad minutes / allowed bad minutes) x 100 +``` + +Example: SLO is 99.9% (43.8 min budget/month). On day 10, you have had 15 minutes of downtime. 
+ +``` +Budget Consumed = (15 / 43.8) x 100 = 34.2% +Expected consumption at day 10 = (10/30) x 100 = 33.3% +Status: Slightly over pace (34.2% consumed at 33.3% of month elapsed) +``` + +### Burn Rate + +Burn rate measures how fast the error budget is being consumed relative to the steady-state rate: + +``` +Burn Rate = (error rate observed / error rate allowed by SLO) +``` + +A burn rate of 1.0 means the budget will be exactly exhausted by the end of the window. A burn rate of 10 means the budget will be exhausted in 1/10th of the window. + +**Burn rate to time-to-exhaustion (30-day month):** + +| Burn Rate | Budget Exhausted In | Urgency | +|---|---|---| +| 1x | 30 days | On pace, monitoring only | +| 2x | 15 days | Elevated attention | +| 6x | 5 days | Active investigation required | +| 14.4x | 2.08 days (~50 hours) | Immediate page | +| 36x | 20 hours | Critical, all-hands | +| 720x | 1 hour | Total outage scenario | + +### Error Budget Exhaustion Policy + +When the error budget is consumed, the following actions trigger based on threshold: + +**Tier 1 - Budget at 75% consumed (Yellow):** +- Notify service team lead via automated alert +- Freeze non-critical deployments to the affected service +- Conduct pre-emptive review of upcoming changes for risk +- Increase monitoring sensitivity (lower alert thresholds) + +**Tier 2 - Budget at 100% consumed (Orange):** +- Hard feature freeze on the affected service +- Mandatory reliability sprint: all engineering effort redirected to reliability +- Daily status updates to engineering leadership +- Postmortem required for the incidents that consumed the budget +- Freeze lasts until budget replenishes to 50% or systemic fixes are verified + +**Tier 3 - Budget at 150% consumed / SLA breach imminent (Red):** +- Escalation to VP Engineering and CTO +- Cross-team war room if dependencies are involved +- Customer communication prepared and staged +- Legal and finance teams briefed on potential SLA credit obligations +- Recovery 
plan with specific milestones required within 24 hours + +### Error Budget Policy Template + +``` +SERVICE: [service-name] +SLO: [target]% availability over [rolling 30-day / calendar month] window +ERROR BUDGET: [calculated] minutes per window + +BUDGET THRESHOLDS: + - 50% consumed: Team notification, increased vigilance + - 75% consumed: Feature freeze for this service, reliability focus + - 100% consumed: Full feature freeze, reliability sprint mandatory + - SLA threshold crossed: Executive escalation, customer communication + +REVIEW CADENCE: Monthly budget review on [day], quarterly SLO adjustment + +EXCEPTIONS: Planned maintenance windows excluded if communicated 72+ hours in advance + and within agreed maintenance allowance. + +APPROVED BY: [Engineering Lead] / [Product Lead] / [Date] +``` + +--- + +## 3. SLA Breach Handling + +### Detection Methods + +**Automated detection (primary):** +- Real-time monitoring dashboards with SLA burn-rate alerts +- Automated SLA compliance calculations running every 5 minutes +- Threshold-based alerts when cumulative downtime approaches SLA limits +- Synthetic monitoring (external probes) for customer-perspective validation + +**Manual review (secondary):** +- Monthly SLA compliance reports generated on the 1st of each month +- Customer-reported incidents cross-referenced with internal metrics +- Quarterly audits comparing measured SLIs against contracted SLAs +- Discrepancy review between internal metrics and customer-perceived availability + +### Breach Classification + +**Minor Breach:** +- SLA missed by less than 0.05 percentage points (e.g., 99.85% vs 99.9% SLA) +- Fewer than 3 discrete incidents contributed +- No single incident exceeded 30 minutes +- Customer impact was limited or partial degradation only +- Financial credit: typically 5-10% of monthly service fee + +**Major Breach:** +- SLA missed by 0.05 to 0.5 percentage points +- Extended outage of 1-4 hours in a single incident, or multiple significant incidents 
+- Clear customer impact with support tickets generated +- Financial credit: typically 10-25% of monthly service fee + +**Critical Breach:** +- SLA missed by more than 0.5 percentage points +- Total outage exceeding 4 hours, or repeated major incidents in same window +- Data loss, security incident, or compliance violation involved +- Financial credit: typically 25-100% of monthly service fee +- May trigger contract termination clauses + +### Response Protocol + +**For Minor Breach (within 3 business days):** +1. Generate SLA compliance report with exact metrics +2. Document contributing incidents with root causes +3. Send proactive notification to customer success manager +4. Issue service credits if contractually required (do not wait for customer to ask) +5. File internal improvement ticket with 30-day remediation target + +**For Major Breach (within 24 hours):** +1. Incident commander confirms SLA impact calculation +2. Draft customer communication (see template below) +3. Executive sponsor reviews and approves communication +4. Issue service credits with detailed breakdown +5. Schedule root cause review with customer within 5 business days +6. Produce remediation plan with committed timelines + +**For Critical Breach (immediate):** +1. Activate executive escalation chain +2. Legal team reviews contractual exposure +3. Finance team calculates credit obligations +4. Customer communication from VP or C-level within 4 hours +5. Dedicated remediation task force assigned +6. Weekly status updates to customer until remediation complete +7. Formal postmortem document shared with customer within 10 business days + +### Customer Communication Template + +``` +Subject: Service Level Update - [Service Name] - [Month Year] + +Dear [Customer Name], + +We are writing to inform you that [Service Name] did not meet the committed +service level of [SLA target]% availability during [time period]. 
+ +MEASURED PERFORMANCE: [actual]% availability +COMMITTED SLA: [SLA target]% availability +SHORTFALL: [delta] percentage points + +CONTRIBUTING FACTORS: +- [Date/Time]: [Brief description of incident] ([duration] impact) +- [Date/Time]: [Brief description of incident] ([duration] impact) + +SERVICE CREDIT: In accordance with our agreement, a credit of [amount/percentage] +will be applied to your next invoice. + +REMEDIATION ACTIONS: +1. [Specific technical fix with completion date] +2. [Process improvement with implementation date] +3. [Monitoring enhancement with deployment date] + +We take our service commitments seriously. [Name], [Title] is personally +overseeing the remediation and is available to discuss further at your convenience. + +Sincerely, +[Name, Title] +``` + +### Legal and Compliance Considerations + +- Maintain auditable records of all SLA measurements for the full contract term plus 2 years +- SLA calculations must use the measurement methodology defined in the contract, not internal approximations +- Force majeure clauses typically exclude natural disasters, but verify per contract +- Planned maintenance exclusions must match the exact notification procedures in the contract +- Multi-region SLAs may have separate calculations per region; verify aggregation method + +--- + +## 4. Incident-to-SLA Mapping + +### Downtime Calculation Methodologies + +**Full outage:** Service completely unavailable. Every minute counts as a full minute of downtime. + +``` +Downtime = End Time - Start Time (in minutes) +``` + +**Partial degradation:** Service available but impaired. 
Apply a degradation factor: + +``` +Effective Downtime = Actual Duration x Degradation Factor +``` + +| Degradation Level | Factor | Description | +|---|---|---| +| Complete outage | 1.0 | Service fully unavailable | +| Severe degradation | 0.75 | >50% of requests failing or >10x latency | +| Moderate degradation | 0.5 | 10-50% of requests affected or 3-10x latency | +| Minor degradation | 0.25 | <10% of requests affected or <3x latency increase | +| Cosmetic / non-functional | 0.0 | No impact on core SLI metrics | + +**Note:** The exact degradation factors must be agreed upon in the SLA contract. The above are industry-standard starting points. + +### Planned vs Unplanned Downtime + +Most SLAs exclude pre-announced maintenance windows from availability calculations, subject to conditions: + +- Notification provided N hours/days in advance (commonly 72 hours) +- Maintenance occurs within an agreed window (e.g., Sunday 02:00-06:00 UTC) +- Total planned downtime does not exceed the monthly maintenance allowance (e.g., 4 hours/month) +- Any overrun beyond the planned window counts as unplanned downtime + +``` +SLA Availability = (Total Minutes - Excluded Maintenance - Unplanned Downtime) / (Total Minutes - Excluded Maintenance) x 100 +``` + +### Multi-Service SLA Composition + +When a customer-facing product depends on multiple services, composite SLA is calculated as: + +**Serial dependency (all must be up):** +``` +Composite SLA = SLA_A x SLA_B x SLA_C +Example: 99.9% x 99.95% x 99.99% = 99.84% +``` + +**Parallel / redundant (any one must be up):** +``` +Composite Availability = 1 - ((1 - SLA_A) x (1 - SLA_B)) +Example: 1 - ((1 - 0.999) x (1 - 0.999)) = 1 - 0.000001 = 99.9999% +``` + +This is critical during incidents: an outage in a shared dependency may breach SLAs for multiple customer-facing products simultaneously. 
+
+### Worked Examples
+
+**Example 1: Simple outage**
+- Service: Core API (SLA: 99.9%)
+- Month: 30 days = 43,200 minutes
+- Incident: Full outage from 14:23 to 14:38 UTC on the 12th (15 minutes)
+- No other incidents this month
+
+```
+Availability = (43,200 - 15) / 43,200 x 100 = 99.965%
+SLA Status: PASS (99.965% > 99.9%)
+Error Budget Consumed: 15 / 43.2 = 34.7%   (43.2 min = 0.1% of a 30-day month)
+```
+
+**Example 2: Partial degradation**
+- Service: Payment Processing (SLA: 99.95%)
+- Month: 30 days = 43,200 minutes
+- Incident: 50% of transactions failing for 4 hours (240 minutes)
+- Degradation factor: 0.5 (moderate - 50% of requests affected)
+
+```
+Effective Downtime = 240 x 0.5 = 120 minutes
+Availability = (43,200 - 120) / 43,200 x 100 = 99.722%
+SLA Status: FAIL (99.722% < 99.95%)
+Shortfall: 0.228 percentage points → Major Breach
+```
+
+**Example 3: Multiple incidents**
+- Service: Dashboard (SLA: 99.5%)
+- Month: 31 days = 44,640 minutes
+- Incident A: 45-minute full outage on the 5th
+- Incident B: 2-hour severe degradation (factor 0.75) on the 18th
+- Incident C: 30-minute full outage on the 25th
+
+```
+Total Effective Downtime = 45 + (120 x 0.75) + 30 = 45 + 90 + 30 = 165 minutes
+Availability = (44,640 - 165) / 44,640 x 100 = 99.630%
+SLA Status: PASS (99.630% > 99.5%)
+Error Budget Consumed: 165 / 223.2 = 73.9% → approaching the 75% Yellow threshold; freeze non-critical deployments and review upcoming changes
+```
+
+---
+
+## 5. SLO Best Practices
+
+### Start with User Journeys
+
+Do not set SLOs based on infrastructure metrics. Start from what users experience:
+
+1. Identify critical user journeys (e.g., "User completes checkout")
+2. Map each journey to the services and dependencies involved
+3. Define what "good" looks like for each journey (fast, error-free, complete)
+4. Select the SLIs that most directly measure that user experience
+5. Set SLO targets that reflect the minimum acceptable user experience
+
+A database with 99.99% uptime is meaningless if the API in front of it has a bug causing 5% error rates.
+ +### The Four Golden Signals as SLI Sources + +From Google SRE, the four golden signals provide comprehensive service health: + +| Signal | SLI Example | Typical SLO | +|---|---|---| +| Latency | p99 request duration < 500ms | 99% of requests under threshold | +| Traffic | Requests per second | N/A (capacity planning, not SLO) | +| Errors | 5xx rate as % of total requests | < 0.1% error rate over rolling window | +| Saturation | CPU/memory/queue depth | < 80% utilization (capacity SLI) | + +For most services, latency and error rate are the two most important SLIs to back with SLOs. + +### Setting SLO Targets + +1. Collect 90 days of historical SLI data +2. Calculate the 5th percentile performance (worst 5% of days) +3. Set SLO slightly above that baseline (this ensures the SLO is achievable without heroics) +4. Validate: would a breach at this level actually impact users negatively? +5. Adjust upward only if user impact analysis demands it + +**Never set SLOs by aspiration.** A 99.99% SLO on a service that has historically achieved 99.93% is a guaranteed source of perpetual firefighting with no reliability improvement. 
+ +### Review Cadence + +- **Weekly:** Review current error budget burn rate, flag services approaching thresholds +- **Monthly:** Full SLO compliance review, adjust alert thresholds if needed +- **Quarterly:** Reassess SLO targets based on 90-day data, review SLA contract alignment +- **Annually:** Strategic SLO review tied to product roadmap and infrastructure investments + +### Anti-Patterns + +| Anti-Pattern | Problem | Fix | +|---|---|---| +| Vanity SLOs | Setting 99.99% to impress, then ignoring breaches | Set achievable targets, enforce budget policy | +| SLO Inflation | Ratcheting SLOs up whenever performance is good | Only increase SLOs when users demonstrably need it | +| Unmeasured SLAs | Committing contractual SLAs without actual SLI measurement | Instrument SLIs before signing SLA contracts | +| Copy-Paste SLOs | Same SLO for every service regardless of criticality | Tier services by business impact, set SLOs accordingly | +| Ignoring Dependencies | Setting aggressive SLOs without accounting for dependency reliability | Calculate composite SLA; your SLO cannot exceed dependency chain | +| Alert-Free SLOs | Having SLOs but no automated alerting on budget consumption | Every SLO must have corresponding burn rate alerts | + +--- + +## 6. Monitoring & Alerting for SLAs + +### Multi-Window Burn Rate Alerting + +The Google SRE approach uses multiple time windows to balance speed of detection against alert noise. 
Each alert condition requires both a short window (for speed) and a long window (for confirmation): + +**Alert configuration matrix:** + +| Severity | Short Window | Short Threshold | Long Window | Long Threshold | Action | +|---|---|---|---|---|---| +| Critical (Page) | 5 minutes | > 14.4x burn rate | 1 hour | > 14.4x burn rate | Wake someone up | +| High (Page) | 30 minutes | > 6x burn rate | 6 hours | > 6x burn rate | Page on-call within 30 min | +| Medium (Ticket) | 6 hours | > 1x burn rate | 3 days | > 1x burn rate | Create ticket, next business day | + +**Why these specific numbers:** + +- 14.4x burn rate over 1 hour consumes 2% of monthly budget in that hour. At this rate, the entire 30-day budget is gone in ~50 hours. This demands immediate human attention. +- 6x burn rate over 6 hours consumes 5% of monthly budget. The budget will be exhausted in 5 days. Urgent but not wake-up-at-3am urgent. +- 1x burn rate over 3 days means you are on pace to exactly exhaust the budget. This needs investigation but is not an emergency. + +### Burn Rate Alert Formulas + +For a given time window, calculate the burn rate: + +``` +burn_rate = (error_count_in_window / request_count_in_window) / (1 - SLO_target) +``` + +Example for a 99.9% SLO, observing 50 errors out of 10,000 requests in a 1-hour window: + +``` +observed_error_rate = 50 / 10,000 = 0.005 (0.5%) +allowed_error_rate = 1 - 0.999 = 0.001 (0.1%) +burn_rate = 0.005 / 0.001 = 5.0 +``` + +A burn rate of 5.0 means the error budget is being consumed 5 times faster than the sustainable rate. 
+ +### Alert Severity to SLA Risk Mapping + +| Burn Rate | Budget Impact | SLA Risk | Response | +|---|---|---|---| +| < 1x | Under budget pace | None | Routine monitoring | +| 1x - 3x | On pace or slightly over | Low | Investigate next business day | +| 3x - 6x | Budget will exhaust in 5-10 days | Moderate | Investigate within 4 hours | +| 6x - 14.4x | Budget will exhaust in 2-5 days | High | Page on-call, respond in 30 min | +| > 14.4x | Budget will exhaust in < 2 days | Critical | Immediate page, incident declared | +| > 100x | Active major outage | SLA breach imminent | All-hands incident response | + +### Dashboard Design for SLA Tracking + +Every SLA-tracked service should have a dashboard with these panels: + +**Row 1 - Current Status:** +- Current availability (real-time, rolling 5-minute window) +- Current error rate (real-time) +- Current p99 latency (real-time) + +**Row 2 - Budget Status:** +- Error budget remaining (% of monthly budget, gauge visualization) +- Budget consumption timeline (line chart, actual vs expected burn) +- Budget burn rate (current 1h, 6h, and 3d burn rates) + +**Row 3 - Historical Context:** +- 30-day availability trend (daily granularity) +- SLA compliance status for current and previous 3 months +- Incident markers overlaid on availability timeline + +**Row 4 - Dependencies:** +- Upstream dependency availability (services this service depends on) +- Downstream impact scope (services that depend on this service) +- Composite SLA calculation for customer-facing products + +### Alert Fatigue Prevention + +Alert fatigue is the primary reason SLA monitoring fails in practice. Mitigation strategies: + +1. **Require dual-window confirmation.** Never page on a single short window. Always require both the short window (for speed) and long window (for persistence) to fire simultaneously. + +2. **Separate page-worthy from ticket-worthy.** Only two conditions should wake someone up: >14.4x burn rate sustained, or >6x burn rate sustained. 
Everything else is a ticket. + +3. **Deduplicate aggressively.** If the same service triggers both a latency and error rate alert for the same underlying issue, group them into a single notification. + +4. **Auto-resolve.** Alerts must auto-resolve when the burn rate drops below threshold. Never leave stale alerts open. + +5. **Review alert quality monthly.** Track the ratio of actionable alerts to total alerts. Target >80% actionable rate. If an alert fires and no human action is needed, tune or remove it. + +6. **Escalation, not repetition.** If an alert is not acknowledged within the response window, escalate to the next tier. Do not re-send the same alert every 5 minutes. + +### Practical Monitoring Stack + +| Layer | Tool Category | Purpose | +|---|---|---| +| Collection | Prometheus, OpenTelemetry, StatsD | Gather SLI metrics from services | +| Storage | Prometheus TSDB, Thanos, Mimir | Retain metrics for SLO window + 90 days | +| Calculation | Prometheus recording rules, Sloth | Pre-compute burn rates and budget consumption | +| Alerting | Alertmanager, PagerDuty, OpsGenie | Route alerts by severity and schedule | +| Visualization | Grafana, Datadog | Dashboards for real-time and historical SLA views | +| Reporting | Custom scripts, SLO generators | Monthly SLA compliance reports for customers | + +**Retention requirement:** SLI data must be retained for at least the SLA reporting period (typically monthly or quarterly) plus a 90-day dispute window. Annual SLA reviews require 12 months of data at daily granularity minimum. 
#!/usr/bin/env python3
"""
Incident Timeline Builder

Builds structured incident timelines with automatic phase detection, gap analysis,
communication template generation, and response metrics calculation. Produces
professional reports suitable for post-incident review and stakeholder briefing.

Usage:
    python incident_timeline_builder.py incident_data.json
    python incident_timeline_builder.py incident_data.json --format json
    python incident_timeline_builder.py incident_data.json --format markdown
    cat incident_data.json | python incident_timeline_builder.py --format text
"""

import argparse
import json
import sys
from datetime import datetime, timedelta
from typing import Any, Dict, List, Optional, Tuple


# ---------------------------------------------------------------------------
# Configuration Constants
# ---------------------------------------------------------------------------

# strftime/strptime pattern for UTC timestamps used in both input and output.
ISO_FORMAT = "%Y-%m-%dT%H:%M:%SZ"

# Event types recognized in the "type" field of timeline events.
EVENT_TYPES = [
    "detection", "declaration", "escalation", "investigation",
    "mitigation", "communication", "resolution", "action_item",
]

# Severity codes with display labels and relative ranking (1 = most severe).
SEVERITY_LEVELS = {
    "SEV1": {"label": "Critical", "rank": 1},
    "SEV2": {"label": "Major", "rank": 2},
    "SEV3": {"label": "Minor", "rank": 3},
    "SEV4": {"label": "Low", "rank": 4},
}

# Ordered lifecycle phases; the first event whose type appears in
# "trigger_types" opens the corresponding phase (see detect_phases).
PHASE_DEFINITIONS = [
    {"name": "Detection", "trigger_types": ["detection"],
     "description": "Issue detected via monitoring, alerting, or user report."},
    {"name": "Triage", "trigger_types": ["declaration", "escalation"],
     "description": "Incident declared, severity assessed, commander assigned."},
    {"name": "Investigation", "trigger_types": ["investigation"],
     "description": "Root cause analysis and impact assessment underway."},
    {"name": "Mitigation", "trigger_types": ["mitigation"],
     "description": "Active work to reduce or eliminate customer impact."},
    {"name": "Resolution", "trigger_types": ["resolution"],
     "description": "Service restored to normal operating parameters."},
]

# Minimum silence (in minutes) between consecutive events to flag as a gap.
GAP_THRESHOLD_MINUTES = 15

# Event types treated as key decision points in reports.
DECISION_EVENT_TYPES = {"escalation", "mitigation", "declaration", "resolution"}


# ---------------------------------------------------------------------------
# Data Model Classes
# ---------------------------------------------------------------------------

class IncidentEvent:
    """Represents a single event in the incident timeline."""

    def __init__(self, data: Dict[str, Any]):
        """Build an event from one raw JSON event record.

        Missing fields fall back to safe defaults. The timestamp is kept
        both raw (for lossless round-tripping via to_dict) and parsed
        (for time arithmetic); a parse failure leaves ``timestamp`` None.
        """
        self.timestamp_raw: str = data.get("timestamp", "")
        self.timestamp: Optional[datetime] = _parse_timestamp(self.timestamp_raw)
        # Normalize so comparisons against EVENT_TYPES / phase triggers work.
        self.type: str = data.get("type", "unknown").lower().strip()
        self.actor: str = data.get("actor", "unknown")
        self.description: str = data.get("description", "")
        self.metadata: Dict[str, Any] = data.get("metadata", {})

    def to_dict(self) -> Dict[str, Any]:
        """Serialize back to a JSON-friendly dict; metadata only when present."""
        result: Dict[str, Any] = {
            "timestamp": self.timestamp_raw, "type": self.type,
            "actor": self.actor, "description": self.description,
        }
        if self.metadata:
            result["metadata"] = self.metadata
        return result

    @property
    def is_decision_point(self) -> bool:
        """True when this event's type marks a key decision in the response."""
        return self.type in DECISION_EVENT_TYPES


class IncidentPhase:
    """Represents a detected phase of the incident lifecycle."""

    def __init__(self, name: str, description: str):
        self.name: str = name
        self.description: str = description
        # Start/end are filled in by detect_phases; None until assigned.
        self.start_time: Optional[datetime] = None
        self.end_time: Optional[datetime] = None
        self.events: List[IncidentEvent] = []

    @property
    def duration_minutes(self) -> Optional[float]:
        """Phase length in minutes, or None when either boundary is unset."""
        if self.start_time and self.end_time:
            return (self.end_time - self.start_time).total_seconds() / 60.0
        return None

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a JSON-friendly summary (timestamps as ISO strings)."""
        dur = self.duration_minutes
        return {
            "name": self.name, "description": self.description,
            "start_time": self.start_time.strftime(ISO_FORMAT) if self.start_time else None,
            "end_time": self.end_time.strftime(ISO_FORMAT) if self.end_time else None,
            "duration_minutes": round(dur, 1) if dur is not None else None,
            "event_count": len(self.events),
        }


class CommunicationTemplate:
    """A generated communication message for a specific audience."""

    def __init__(self, template_type: str, audience: str, subject: str, body: str):
        self.template_type = template_type
        self.audience = audience
        self.subject = subject
        self.body = body

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a JSON-friendly dict."""
        return {"template_type": self.template_type, "audience": self.audience,
                "subject": self.subject, "body": self.body}


class TimelineGap:
    """Represents a gap in the timeline where no events were logged."""

    def __init__(self, start: datetime, end: datetime, duration_minutes: float):
        self.start = start
        self.end = end
        self.duration_minutes = duration_minutes

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a JSON-friendly dict (timestamps as ISO strings)."""
        return {"start": self.start.strftime(ISO_FORMAT),
                "end": self.end.strftime(ISO_FORMAT),
                "duration_minutes": round(self.duration_minutes, 1)}


class TimelineAnalysis:
    """Holds the complete analysis result for an incident timeline.

    Fields are populated progressively by the pipeline: parse_incident_data
    fills the incident metadata and events; detect_phases, detect_gaps,
    identify_decision_points, calculate_metrics, and generate_communications
    each fill their corresponding field.
    """

    def __init__(self):
        self.incident_id: str = ""
        self.incident_title: str = ""
        self.severity: str = ""
        self.status: str = ""
        self.commander: str = ""
        self.service: str = ""
        self.affected_services: List[str] = []
        self.declared_at: Optional[datetime] = None
        self.resolved_at: Optional[datetime] = None
        self.events: List[IncidentEvent] = []
        self.phases: List[IncidentPhase] = []
        self.gaps: List[TimelineGap] = []
        self.decision_points: List[IncidentEvent] = []
        self.metrics: Dict[str, Any] = {}
        self.communications: List[CommunicationTemplate] = []
        # Non-fatal parse/validation problems, surfaced as report warnings.
        self.errors: List[str] = []


# ---------------------------------------------------------------------------
# Timestamp Helpers
# ---------------------------------------------------------------------------

def _parse_timestamp(raw: str) -> Optional[datetime]:
    """Parse an ISO-8601 timestamp string into a naive datetime.

    Accepts a trailing "Z" (converted to "+00:00" for fromisoformat) and
    strips any timezone info so all arithmetic happens in naive UTC.
    Returns None when the string cannot be parsed.
    """
    if not raw:
        return None
    cleaned = raw.replace("Z", "+00:00") if raw.endswith("Z") else raw
    try:
        return datetime.fromisoformat(cleaned).replace(tzinfo=None)
    except (ValueError, AttributeError):
        pass
    # Last-chance fallback for strict "%Y-%m-%dT%H:%M:%SZ" strings.
    try:
        return datetime.strptime(raw, ISO_FORMAT)
    except ValueError:
        return None


def _fmt_duration(minutes: Optional[float]) -> str:
    """Format a duration in minutes as a human-readable string.

    Examples: None -> "N/A", 0.5 -> "30s", 45 -> "45m", 120 -> "2h",
    150 -> "2h 30m".
    """
    if minutes is None:
        return "N/A"
    if minutes < 1:
        return f"{minutes * 60:.0f}s"
    if minutes < 60:
        return f"{minutes:.0f}m"
    hours, remaining = divmod(int(minutes), 60)
    return f"{hours}h" if remaining == 0 else f"{hours}h {remaining}m"


def _fmt_ts(dt: Optional[datetime]) -> str:
    """Format a datetime as HH:MM:SS for display; placeholder when unset."""
    return dt.strftime("%H:%M:%S") if dt else "??:??:??"
+ + +def _sev_label(sev: str) -> str: + """Return the human label for a severity code.""" + return SEVERITY_LEVELS.get(sev, {}).get("label", sev) + + +# --------------------------------------------------------------------------- +# Core Analysis Functions +# --------------------------------------------------------------------------- + +def parse_incident_data(data: Dict[str, Any]) -> TimelineAnalysis: + """Parse raw incident JSON into a TimelineAnalysis with populated fields.""" + a = TimelineAnalysis() + inc = data.get("incident", {}) + a.incident_id = inc.get("id", "UNKNOWN") + a.incident_title = inc.get("title", "Untitled Incident") + a.severity = inc.get("severity", "UNKNOWN").upper() + a.status = inc.get("status", "unknown").lower() + a.commander = inc.get("commander", "Unassigned") + a.service = inc.get("service", "unknown") + a.affected_services = inc.get("affected_services", []) + a.declared_at = _parse_timestamp(inc.get("declared_at", "")) + a.resolved_at = _parse_timestamp(inc.get("resolved_at", "")) + + raw_events = data.get("events", []) + if not raw_events: + a.errors.append("No events found in incident data.") + return a + + for raw in raw_events: + event = IncidentEvent(raw) + if event.timestamp is None: + a.errors.append(f"Skipping event with unparseable timestamp: {raw.get('timestamp', '')}") + continue + a.events.append(event) + + a.events.sort(key=lambda e: e.timestamp) # type: ignore[arg-type] + return a + + +def detect_phases(analysis: TimelineAnalysis) -> None: + """Detect incident lifecycle phases from the ordered event stream.""" + if not analysis.events: + return + + trigger_map: Dict[str, Dict[str, str]] = {} + for pdef in PHASE_DEFINITIONS: + for ttype in pdef["trigger_types"]: + trigger_map[ttype] = {"name": pdef["name"], "description": pdef["description"]} + + phase_by_name: Dict[str, IncidentPhase] = {} + phase_order: List[str] = [] + current: Optional[IncidentPhase] = None + + for event in analysis.events: + pinfo = 
trigger_map.get(event.type) + if pinfo and pinfo["name"] not in phase_by_name: + if current is not None: + current.end_time = event.timestamp + phase = IncidentPhase(pinfo["name"], pinfo["description"]) + phase.start_time = event.timestamp + phase_by_name[pinfo["name"]] = phase + phase_order.append(pinfo["name"]) + current = phase + if current is not None: + current.events.append(event) + + if current is not None: + current.end_time = analysis.resolved_at or analysis.events[-1].timestamp + + analysis.phases = [phase_by_name[n] for n in phase_order] + + +def detect_gaps(analysis: TimelineAnalysis) -> None: + """Identify gaps longer than GAP_THRESHOLD_MINUTES between consecutive events.""" + for i in range(len(analysis.events) - 1): + ts_a, ts_b = analysis.events[i].timestamp, analysis.events[i + 1].timestamp + if ts_a is None or ts_b is None: + continue + delta = (ts_b - ts_a).total_seconds() / 60.0 + if delta >= GAP_THRESHOLD_MINUTES: + analysis.gaps.append(TimelineGap(start=ts_a, end=ts_b, duration_minutes=delta)) + + +def identify_decision_points(analysis: TimelineAnalysis) -> None: + """Extract key decision-point events from the timeline.""" + analysis.decision_points = [e for e in analysis.events if e.is_decision_point] + + +def calculate_metrics(analysis: TimelineAnalysis) -> None: + """Calculate incident response metrics: MTTD, MTTR, phase durations.""" + m: Dict[str, Any] = {} + det = [e for e in analysis.events if e.type == "detection"] + first_det = det[0].timestamp if det else None + first_ts = analysis.events[0].timestamp if analysis.events else None + + # MTTD: first event to first detection. + if first_ts and first_det: + m["mttd_minutes"] = round((first_det - first_ts).total_seconds() / 60.0, 1) + else: + m["mttd_minutes"] = None + + # MTTR: detection to resolution. + if first_det and analysis.resolved_at: + m["mttr_minutes"] = round((analysis.resolved_at - first_det).total_seconds() / 60.0, 1) + else: + m["mttr_minutes"] = None + + # Total duration. 
+ if analysis.declared_at and analysis.resolved_at: + m["total_duration_minutes"] = round( + (analysis.resolved_at - analysis.declared_at).total_seconds() / 60.0, 1) + else: + m["total_duration_minutes"] = None + + # Phase durations. + m["phase_durations"] = { + p.name: (round(p.duration_minutes, 1) if p.duration_minutes is not None else None) + for p in analysis.phases + } + + # Event counts by type. + tc: Dict[str, int] = {} + for e in analysis.events: + tc[e.type] = tc.get(e.type, 0) + 1 + m["event_counts_by_type"] = tc + + # Gap statistics. + m["gap_count"] = len(analysis.gaps) + if analysis.gaps: + gm = [g.duration_minutes for g in analysis.gaps] + m["longest_gap_minutes"] = round(max(gm), 1) + m["total_gap_minutes"] = round(sum(gm), 1) + else: + m["longest_gap_minutes"] = 0 + m["total_gap_minutes"] = 0 + + m["total_events"] = len(analysis.events) + m["decision_point_count"] = len(analysis.decision_points) + m["phase_count"] = len(analysis.phases) + analysis.metrics = m + + +# --------------------------------------------------------------------------- +# Communication Template Generation +# --------------------------------------------------------------------------- + +def generate_communications(analysis: TimelineAnalysis) -> None: + """Generate four communication templates based on incident data.""" + sev, sl = analysis.severity, _sev_label(analysis.severity) + title, svc = analysis.incident_title, analysis.service + affected = ", ".join(analysis.affected_services) or "none identified" + cmd, iid = analysis.commander, analysis.incident_id + decl = analysis.declared_at.strftime("%Y-%m-%d %H:%M UTC") if analysis.declared_at else "TBD" + resv = analysis.resolved_at.strftime("%Y-%m-%d %H:%M UTC") if analysis.resolved_at else "TBD" + dur = _fmt_duration(analysis.metrics.get("total_duration_minutes")) + resolved = analysis.status == "resolved" + + # 1 -- Initial stakeholder notification + analysis.communications.append(CommunicationTemplate( + 
"initial_notification", "internal", f"[{sev}] Incident Declared: {title}", + f"An incident has been declared for {svc}.\n\n" + f"Incident ID: {iid}\nSeverity: {sev} ({sl})\nCommander: {cmd}\n" + f"Declared at: {decl}\nAffected services: {affected}\n\n" + f"The incident team is actively investigating. Updates will follow.", + )) + + # 2 -- Status page update + if resolved: + sp_subj = f"[Resolved] {title}" + sp_body = (f"The incident affecting {svc} has been resolved.\n\n" + f"Duration: {dur}\nAll affected services ({affected}) are restored. " + f"A post-incident review will be published within 48 hours.") + else: + sp_subj = f"[Investigating] {title}" + sp_body = (f"We are investigating degraded performance in {svc}. " + f"Affected services: {affected}.\n\n" + f"Our team is working to identify the root cause. Updates every 30 minutes.") + analysis.communications.append(CommunicationTemplate( + "status_page", "external", sp_subj, sp_body)) + + # 3 -- Executive summary + phase_lines = "\n".join( + f" - {p.name}: {_fmt_duration(p.duration_minutes)}" for p in analysis.phases + ) or " No phase data available." + mttd = _fmt_duration(analysis.metrics.get("mttd_minutes")) + mttr = _fmt_duration(analysis.metrics.get("mttr_minutes")) + analysis.communications.append(CommunicationTemplate( + "executive_summary", "executive", f"Executive Summary: {iid} - {title}", + f"Incident: {iid} - {title}\nSeverity: {sev} ({sl})\n" + f"Service: {svc}\nCommander: {cmd}\nStatus: {analysis.status.capitalize()}\n" + f"Declared: {decl}\nResolved: {resv}\nDuration: {dur}\n\n" + f"Key Metrics:\n - MTTD: {mttd}\n - MTTR: {mttr}\n" + f" - Timeline Gaps: {analysis.metrics.get('gap_count', 0)}\n\n" + f"Phase Breakdown:\n{phase_lines}\n\nAffected Services: {affected}", + )) + + # 4 -- Customer notification + if resolved: + cust_body = (f"We experienced an issue affecting {svc} starting at {decl}.\n\n" + f"The issue was resolved at {resv} (duration: {dur}). 
" + f"We apologize for any inconvenience and are reviewing to prevent recurrence.") + else: + cust_body = (f"We are experiencing an issue affecting {svc} starting at {decl}.\n\n" + f"Our engineering team is actively working to resolve this. " + f"We will provide updates as the situation develops. We apologize for the inconvenience.") + analysis.communications.append(CommunicationTemplate( + "customer_notification", "external", f"Service Update: {title}", cust_body)) + + +# --------------------------------------------------------------------------- +# Main Analysis Orchestrator +# --------------------------------------------------------------------------- + +def build_timeline(data: Dict[str, Any]) -> TimelineAnalysis: + """Run the full timeline analysis pipeline on raw incident data.""" + analysis = parse_incident_data(data) + if analysis.errors and not analysis.events: + return analysis + detect_phases(analysis) + detect_gaps(analysis) + identify_decision_points(analysis) + calculate_metrics(analysis) + generate_communications(analysis) + return analysis + + +# --------------------------------------------------------------------------- +# Output Formatters +# --------------------------------------------------------------------------- + +def format_text_output(analysis: TimelineAnalysis) -> str: + """Format the analysis as a human-readable text report.""" + L: List[str] = [] + w = 64 + + L.append("=" * w) + L.append("INCIDENT TIMELINE REPORT") + L.append("=" * w) + L.append("") + + if analysis.errors: + for err in analysis.errors: + L.append(f" WARNING: {err}") + L.append("") + if not analysis.events: + return "\n".join(L) + + # Summary + L.append("INCIDENT SUMMARY") + L.append("-" * 32) + L.append(f" ID: {analysis.incident_id}") + L.append(f" Title: {analysis.incident_title}") + L.append(f" Severity: {analysis.severity}") + L.append(f" Status: {analysis.status.capitalize()}") + L.append(f" Commander: {analysis.commander}") + L.append(f" Service: 
{analysis.service}") + if analysis.affected_services: + L.append(f" Affected: {', '.join(analysis.affected_services)}") + L.append(f" Duration: {_fmt_duration(analysis.metrics.get('total_duration_minutes'))}") + L.append("") + + # Key metrics + L.append("KEY METRICS") + L.append("-" * 32) + L.append(f" MTTD (Mean Time to Detect): {_fmt_duration(analysis.metrics.get('mttd_minutes'))}") + L.append(f" MTTR (Mean Time to Resolve): {_fmt_duration(analysis.metrics.get('mttr_minutes'))}") + L.append(f" Total Events: {analysis.metrics.get('total_events', 0)}") + L.append(f" Decision Points: {analysis.metrics.get('decision_point_count', 0)}") + L.append(f" Timeline Gaps (>{GAP_THRESHOLD_MINUTES}m): {analysis.metrics.get('gap_count', 0)}") + L.append("") + + # Phases + L.append("INCIDENT PHASES") + L.append("-" * 32) + if analysis.phases: + for p in analysis.phases: + L.append(f" [{_fmt_ts(p.start_time)} - {_fmt_ts(p.end_time)}] {p.name} ({_fmt_duration(p.duration_minutes)})") + L.append(f" {p.description}") + L.append(f" Events: {len(p.events)}") + else: + L.append(" No phases detected.") + L.append("") + + # Chronological timeline + L.append("CHRONOLOGICAL TIMELINE") + L.append("-" * 32) + for e in analysis.events: + marker = "*" if e.is_decision_point else " " + L.append(f" {_fmt_ts(e.timestamp)} {marker} [{e.type.upper():13s}] {e.actor}") + L.append(f" {e.description}") + L.append("") + L.append(" (* = key decision point)") + L.append("") + + # Gap warnings + if analysis.gaps: + L.append("GAP ANALYSIS") + L.append("-" * 32) + for g in analysis.gaps: + L.append(f" WARNING: {_fmt_duration(g.duration_minutes)} gap between {_fmt_ts(g.start)} and {_fmt_ts(g.end)}") + L.append("") + + # Decision points + if analysis.decision_points: + L.append("KEY DECISION POINTS") + L.append("-" * 32) + for dp in analysis.decision_points: + L.append(f" {_fmt_ts(dp.timestamp)} [{dp.type.upper()}] {dp.description}") + L.append("") + + # Communications + if analysis.communications: + 
L.append("GENERATED COMMUNICATIONS") + L.append("-" * 32) + for c in analysis.communications: + L.append(f" Type: {c.template_type}") + L.append(f" Audience: {c.audience}") + L.append(f" Subject: {c.subject}") + L.append(" ---") + for bl in c.body.split("\n"): + L.append(f" {bl}") + L.append("") + + L.append("=" * w) + L.append("END OF REPORT") + L.append("=" * w) + return "\n".join(L) + + +def format_json_output(analysis: TimelineAnalysis) -> Dict[str, Any]: + """Format the analysis as a structured JSON-serializable dictionary.""" + return { + "incident": { + "id": analysis.incident_id, "title": analysis.incident_title, + "severity": analysis.severity, "status": analysis.status, + "commander": analysis.commander, "service": analysis.service, + "affected_services": analysis.affected_services, + "declared_at": analysis.declared_at.strftime(ISO_FORMAT) if analysis.declared_at else None, + "resolved_at": analysis.resolved_at.strftime(ISO_FORMAT) if analysis.resolved_at else None, + }, + "timeline": [e.to_dict() for e in analysis.events], + "phases": [p.to_dict() for p in analysis.phases], + "gaps": [g.to_dict() for g in analysis.gaps], + "decision_points": [e.to_dict() for e in analysis.decision_points], + "metrics": analysis.metrics, + "communications": [c.to_dict() for c in analysis.communications], + "errors": analysis.errors if analysis.errors else [], + } + + +def format_markdown_output(analysis: TimelineAnalysis) -> str: + """Format the analysis as a professional Markdown report.""" + L: List[str] = [] + + L.append(f"# Incident Timeline Report: {analysis.incident_id}") + L.append("") + + if analysis.errors: + L.append("> **Warnings:**") + for err in analysis.errors: + L.append(f"> - {err}") + L.append("") + if not analysis.events: + return "\n".join(L) + + # Summary table + L.append("## Incident Summary") + L.append("") + L.append("| Field | Value |") + L.append("|-------|-------|") + L.append(f"| **ID** | {analysis.incident_id} |") + L.append(f"| **Title** | 
{analysis.incident_title} |") + L.append(f"| **Severity** | {analysis.severity} ({_sev_label(analysis.severity)}) |") + L.append(f"| **Status** | {analysis.status.capitalize()} |") + L.append(f"| **Commander** | {analysis.commander} |") + L.append(f"| **Service** | {analysis.service} |") + if analysis.affected_services: + L.append(f"| **Affected Services** | {', '.join(analysis.affected_services)} |") + L.append(f"| **Duration** | {_fmt_duration(analysis.metrics.get('total_duration_minutes'))} |") + L.append("") + + # Key metrics + L.append("## Key Metrics") + L.append("") + L.append(f"- **MTTD (Mean Time to Detect):** {_fmt_duration(analysis.metrics.get('mttd_minutes'))}") + L.append(f"- **MTTR (Mean Time to Resolve):** {_fmt_duration(analysis.metrics.get('mttr_minutes'))}") + L.append(f"- **Total Events:** {analysis.metrics.get('total_events', 0)}") + L.append(f"- **Decision Points:** {analysis.metrics.get('decision_point_count', 0)}") + L.append(f"- **Timeline Gaps (>{GAP_THRESHOLD_MINUTES}m):** {analysis.metrics.get('gap_count', 0)}") + if analysis.metrics.get("longest_gap_minutes", 0) > 0: + L.append(f"- **Longest Gap:** {_fmt_duration(analysis.metrics.get('longest_gap_minutes'))}") + L.append("") + + # Phases table + L.append("## Incident Phases") + L.append("") + if analysis.phases: + L.append("| Phase | Start | End | Duration | Events |") + L.append("|-------|-------|-----|----------|--------|") + for p in analysis.phases: + L.append(f"| {p.name} | {_fmt_ts(p.start_time)} | {_fmt_ts(p.end_time)} | {_fmt_duration(p.duration_minutes)} | {len(p.events)} |") + L.append("") + # ASCII bar chart + max_dur = max((p.duration_minutes for p in analysis.phases if p.duration_minutes), default=0) + if max_dur and max_dur > 0: + L.append("### Phase Duration Distribution") + L.append("") + L.append("```") + for p in analysis.phases: + d = p.duration_minutes or 0 + bar = "#" * int((d / max_dur) * 40) + L.append(f" {p.name:15s} |{bar} {_fmt_duration(d)}") + L.append("```") + 
L.append("") + else: + L.append("No phases detected.") + L.append("") + + # Chronological timeline + L.append("## Chronological Timeline") + L.append("") + for e in analysis.events: + dm = " **[KEY DECISION]**" if e.is_decision_point else "" + L.append(f"- `{_fmt_ts(e.timestamp)}` **{e.type.upper()}** ({e.actor}){dm}") + L.append(f" - {e.description}") + L.append("") + + # Gap analysis + if analysis.gaps: + L.append("## Gap Analysis") + L.append("") + L.append(f"> {len(analysis.gaps)} gap(s) of >{GAP_THRESHOLD_MINUTES} minutes detected. " + f"These may represent blind spots where important activity was not recorded.") + L.append("") + for g in analysis.gaps: + L.append(f"- **{_fmt_duration(g.duration_minutes)}** gap from `{_fmt_ts(g.start)}` to `{_fmt_ts(g.end)}`") + L.append("") + + # Decision points + if analysis.decision_points: + L.append("## Key Decision Points") + L.append("") + for dp in analysis.decision_points: + L.append(f"1. `{_fmt_ts(dp.timestamp)}` **{dp.type.upper()}** - {dp.description}") + L.append("") + + # Communications + if analysis.communications: + L.append("## Generated Communications") + L.append("") + for c in analysis.communications: + L.append(f"### {c.template_type.replace('_', ' ').title()} ({c.audience})") + L.append("") + L.append(f"**Subject:** {c.subject}") + L.append("") + for bl in c.body.split("\n"): + L.append(bl) + L.append("") + L.append("---") + L.append("") + + # Event type breakdown + tc = analysis.metrics.get("event_counts_by_type", {}) + if tc: + L.append("## Event Type Breakdown") + L.append("") + L.append("| Type | Count |") + L.append("|------|-------|") + for etype, count in sorted(tc.items(), key=lambda x: -x[1]): + L.append(f"| {etype} | {count} |") + L.append("") + + L.append("---") + L.append(f"*Report generated for incident {analysis.incident_id}. 
All timestamps in UTC.*") + return "\n".join(L) + + +# --------------------------------------------------------------------------- +# CLI Interface +# --------------------------------------------------------------------------- + +def main() -> int: + """Main CLI entry point.""" + parser = argparse.ArgumentParser( + description="Build structured incident timelines with phase detection and communication templates." + ) + parser.add_argument( + "data_file", nargs="?", default=None, + help="JSON file with incident data (reads stdin if omitted)", + ) + parser.add_argument( + "--format", choices=["text", "json", "markdown"], default="text", + help="Output format (default: text)", + ) + args = parser.parse_args() + + try: + if args.data_file: + try: + with open(args.data_file, "r") as f: + raw_data = json.load(f) + except FileNotFoundError: + print(f"Error: File '{args.data_file}' not found.", file=sys.stderr) + return 1 + except json.JSONDecodeError as e: + print(f"Error: Invalid JSON in '{args.data_file}': {e}", file=sys.stderr) + return 1 + else: + if sys.stdin.isatty(): + print("Error: No input file specified and stdin is a terminal. 
" + "Provide a file argument or pipe JSON to stdin.", file=sys.stderr) + return 1 + try: + raw_data = json.load(sys.stdin) + except json.JSONDecodeError as e: + print(f"Error: Invalid JSON on stdin: {e}", file=sys.stderr) + return 1 + + if not isinstance(raw_data, dict): + print("Error: Input must be a JSON object.", file=sys.stderr) + return 1 + if "incident" not in raw_data and "events" not in raw_data: + print("Error: Input must contain at least 'incident' or 'events' keys.", file=sys.stderr) + return 1 + + analysis = build_timeline(raw_data) + + if args.format == "json": + print(json.dumps(format_json_output(analysis), indent=2)) + elif args.format == "markdown": + print(format_markdown_output(analysis)) + else: + print(format_text_output(analysis)) + return 0 + + except Exception as e: + print(f"Error: {e}", file=sys.stderr) + return 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/engineering-team/incident-commander/scripts/postmortem_generator.py b/engineering-team/incident-commander/scripts/postmortem_generator.py new file mode 100644 index 0000000..868f425 --- /dev/null +++ b/engineering-team/incident-commander/scripts/postmortem_generator.py @@ -0,0 +1,804 @@ +#!/usr/bin/env python3 +""" +Postmortem Generator - Generate structured postmortem reports with 5-Whys analysis. + +Produces comprehensive incident postmortem documents from structured JSON input, +including root cause analysis, contributing factor classification, action item +validation, MTTD/MTTR metrics, and customer impact summaries. + +Usage: + python postmortem_generator.py incident_data.json + python postmortem_generator.py incident_data.json --format markdown + python postmortem_generator.py incident_data.json --format json + cat incident_data.json | python postmortem_generator.py + +Input: + JSON object with keys: incident, timeline, resolution, action_items, participants. + See SKILL.md for the full input schema. 
+""" + +import argparse +import json +import sys +from datetime import datetime, timezone +from typing import Any, Dict, List, Optional, Tuple + + +# ---------- Constants and Configuration ---------- + +VERSION = "1.0.0" +SEVERITY_ORDER = {"SEV0": 0, "SEV1": 1, "SEV2": 2, "SEV3": 3, "SEV4": 4} +FACTOR_CATEGORIES = ("process", "tooling", "human", "environment", "external") +ACTION_TYPES = ("detection", "prevention", "mitigation", "process") +PRIORITY_ORDER = {"P0": 0, "P1": 1, "P2": 2, "P3": 3, "P4": 4} +POSTMORTEM_TARGET_HOURS = 72 + +# Industry benchmarks for incident response (minutes, except postmortem) +BENCHMARKS = { + "SEV0": {"mttd": 5, "mttr": 60, "mitigate": 30, "declare": 5}, + "SEV1": {"mttd": 10, "mttr": 120, "mitigate": 60, "declare": 10}, + "SEV2": {"mttd": 30, "mttr": 480, "mitigate": 120, "declare": 30}, + "SEV3": {"mttd": 60, "mttr": 1440, "mitigate": 240, "declare": 60}, + "SEV4": {"mttd": 120, "mttr": 2880, "mitigate": 480, "declare": 120}, +} + +CAT_TO_ACTION = {"process": "process", "tooling": "detection", "human": "prevention", + "environment": "mitigation", "external": "prevention"} +CAT_WEIGHT = {"process": 1.0, "tooling": 0.9, "human": 0.8, "environment": 0.7, "external": 0.6} + +# Keywords used to classify contributing factors into categories +FACTOR_KEYWORDS = { + "process": ["process", "procedure", "workflow", "review", "approval", "checklist", + "runbook", "documentation", "policy", "standard", "protocol", "canary", + "deployment", "rollback", "change management"], + "tooling": ["tool", "monitor", "alert", "threshold", "automation", "test", "pipeline", + "ci/cd", "observability", "dashboard", "logging", "infrastructure", + "configuration", "config"], + "human": ["training", "knowledge", "experience", "communication", "handoff", "fatigue", + "oversight", "mistake", "error", "misunderstand", "assumption", "awareness"], + "environment": ["load", "traffic", "scale", "capacity", "resource", "network", "hardware", + "region", "latency", 
"timeout", "connection", "performance", "spike"], + "external": ["vendor", "third-party", "upstream", "downstream", "provider", "api", + "dependency", "partner", "dns", "cdn", "certificate"], +} + +# 5-Whys templates per category (each list is 5 why->answer steps) +WHY_TEMPLATES = { + "process": [ + "Why did this process gap exist? -> The existing process did not account for this scenario.", + "Why was the scenario not accounted for? -> It was not identified during the last process review.", + "Why was the process review incomplete? -> Reviews focus on known failure modes, not emerging risks.", + "Why are emerging risks not surfaced? -> No systematic mechanism to capture lessons from near-misses.", + "Why is there no near-miss capture mechanism? -> Incident learning is ad-hoc rather than systematic."], + "tooling": [ + "Why did the tooling fail to catch this? -> The relevant metric was not monitored or the threshold was misconfigured.", + "Why was the threshold misconfigured? -> It was set during initial deployment and never revisited.", + "Why was it never revisited? -> There is no scheduled review of monitoring configurations.", + "Why is there no scheduled review? -> Monitoring ownership is diffuse across teams.", + "Why is ownership diffuse? -> No clear operational runbook assigns monitoring review responsibilities."], + "human": [ + "Why did the human factor contribute? -> The individual lacked context needed to prevent the issue.", + "Why was context lacking? -> Knowledge was siloed and not documented accessibly.", + "Why was knowledge siloed? -> No structured onboarding or knowledge-sharing process for this area.", + "Why is there no knowledge-sharing process? -> Team capacity has been focused on feature delivery.", + "Why is capacity skewed toward features? -> Operational excellence is not weighted equally in planning."], + "environment": [ + "Why did the environment cause this failure? 
-> System capacity was insufficient for the load pattern.", + "Why was capacity insufficient? -> Load projections did not account for this traffic pattern.", + "Why were projections inaccurate? -> Load testing does not replicate production-scale variability.", + "Why doesn't load testing replicate production? -> Test environments lack realistic traffic generators.", + "Why are traffic generators missing? -> Investment in production-like test infrastructure was deferred."], + "external": [ + "Why did the external factor cause an incident? -> The system had a hard dependency with no fallback.", + "Why was there no fallback? -> The integration was assumed to be highly available.", + "Why was high availability assumed? -> SLA review of the external dependency was not performed.", + "Why was SLA review skipped? -> No standard checklist for evaluating third-party dependencies.", + "Why is there no evaluation checklist? -> Vendor management practices are informal and undocumented."], +} + +THEME_RECS = { + "process": ["Establish a quarterly process review cadence covering change management and deployment procedures.", + "Implement a near-miss tracking system to surface latent risks before they become incidents.", + "Create pre-deployment checklists that require sign-off from the service owner."], + "tooling": ["Schedule quarterly reviews of alerting thresholds and monitoring coverage.", + "Assign explicit monitoring ownership per service in operational runbooks.", + "Invest in synthetic monitoring and canary analysis for critical paths."], + "human": ["Build structured onboarding that covers incident-prone areas and past postmortems.", + "Implement blameless knowledge-sharing sessions after each incident.", + "Balance operational excellence work alongside feature delivery in sprint planning."], + "environment": ["Conduct periodic capacity planning reviews using production traffic replays.", + "Invest in production-like load-testing infrastructure with realistic traffic 
profiles.", + "Implement auto-scaling policies with validated upper-bound thresholds."], + "external": ["Perform formal SLA reviews for all third-party dependencies annually.", + "Implement circuit breakers and fallbacks for external service integrations.", + "Maintain a dependency registry with risk ratings and contingency plans."], +} + +MISSING_ACTION_TEMPLATES = { + "process": "Create or update runbook/checklist to prevent recurrence of this process gap", + "detection": "Add monitoring and alerting to detect this class of issue earlier", + "mitigation": "Implement auto-scaling or circuit-breaker to reduce blast radius", + "prevention": "Add automated safeguards (canary deploy, load test gate) to prevent recurrence", +} + + +# ---------- Data Model Classes ---------- + +class IncidentData: + """Parsed incident metadata.""" + def __init__(self, data: Dict[str, Any]) -> None: + self.id: str = data.get("id", "UNKNOWN") + self.title: str = data.get("title", "Untitled Incident") + self.severity: str = data.get("severity", "SEV3").upper() + self.commander: str = data.get("commander", "Unassigned") + self.service: str = data.get("service", "unknown-service") + self.affected_services: List[str] = data.get("affected_services", []) + + def to_dict(self) -> Dict[str, Any]: + return {"id": self.id, "title": self.title, "severity": self.severity, + "commander": self.commander, "service": self.service, + "affected_services": self.affected_services} + + +class TimelineMetrics: + """MTTD, MTTR, and other timing metrics computed from raw timestamps.""" + def __init__(self, timeline: Dict[str, str], severity: str) -> None: + self.severity = severity + self.issue_started = self._parse(timeline.get("issue_started")) + self.detected_at = self._parse(timeline.get("detected_at")) + self.declared_at = self._parse(timeline.get("declared_at")) + self.mitigated_at = self._parse(timeline.get("mitigated_at")) + self.resolved_at = self._parse(timeline.get("resolved_at")) + self.postmortem_at 
= self._parse(timeline.get("postmortem_at")) + + @staticmethod + def _parse(ts: Optional[str]) -> Optional[datetime]: + if ts is None: + return None + for fmt in ("%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%dT%H:%M:%S%z", "%Y-%m-%dT%H:%M:%S"): + try: + dt = datetime.strptime(ts, fmt) + return dt if dt.tzinfo else dt.replace(tzinfo=timezone.utc) + except ValueError: + continue + return None + + def _delta_min(self, start: Optional[datetime], end: Optional[datetime]) -> Optional[float]: + if start is None or end is None: + return None + return round((end - start).total_seconds() / 60.0, 1) + + @property + def mttd(self) -> Optional[float]: + return self._delta_min(self.issue_started, self.detected_at) + + @property + def mttr(self) -> Optional[float]: + return self._delta_min(self.detected_at, self.resolved_at) + + @property + def time_to_mitigate(self) -> Optional[float]: + return self._delta_min(self.detected_at, self.mitigated_at) + + @property + def time_to_declare(self) -> Optional[float]: + return self._delta_min(self.detected_at, self.declared_at) + + @property + def postmortem_timeliness_hours(self) -> Optional[float]: + m = self._delta_min(self.resolved_at, self.postmortem_at) + return round(m / 60.0, 1) if m is not None else None + + @property + def postmortem_on_time(self) -> Optional[bool]: + h = self.postmortem_timeliness_hours + return h <= POSTMORTEM_TARGET_HOURS if h is not None else None + + def benchmark_comparison(self) -> Dict[str, Dict[str, Any]]: + bench = BENCHMARKS.get(self.severity, BENCHMARKS["SEV3"]) + results: Dict[str, Dict[str, Any]] = {} + for name, actual, target in [("mttd", self.mttd, bench["mttd"]), + ("mttr", self.mttr, bench["mttr"]), + ("time_to_mitigate", self.time_to_mitigate, bench["mitigate"]), + ("time_to_declare", self.time_to_declare, bench["declare"])]: + if actual is not None: + results[name] = {"actual_minutes": actual, "benchmark_minutes": target, + "met_benchmark": actual <= target, + "delta_minutes": round(actual - target, 1)} + h 
= self.postmortem_timeliness_hours + if h is not None: + results["postmortem_timeliness"] = { + "actual_hours": h, "target_hours": POSTMORTEM_TARGET_HOURS, + "met_target": self.postmortem_on_time, "delta_hours": round(h - POSTMORTEM_TARGET_HOURS, 1)} + return results + + def to_dict(self) -> Dict[str, Any]: + return {"mttd_minutes": self.mttd, "mttr_minutes": self.mttr, + "time_to_mitigate_minutes": self.time_to_mitigate, + "time_to_declare_minutes": self.time_to_declare, + "postmortem_timeliness_hours": self.postmortem_timeliness_hours, + "postmortem_on_time": self.postmortem_on_time, + "benchmarks": self.benchmark_comparison()} + + +class ContributingFactor: + """A classified contributing factor with weight and action-type mapping.""" + def __init__(self, description: str, index: int) -> None: + self.description = description + self.index = index + self.category = self._classify() + self.weight = round(max(1.0 - index * 0.15, 0.3) * CAT_WEIGHT.get(self.category, 0.8), 2) + self.mapped_action_type = CAT_TO_ACTION.get(self.category, "process") + + def _classify(self) -> str: + lower = self.description.lower() + scores = {cat: sum(1 for kw in kws if kw in lower) for cat, kws in FACTOR_KEYWORDS.items()} + best = max(scores, key=lambda k: scores[k]) + return best if scores[best] > 0 else "process" + + def to_dict(self) -> Dict[str, Any]: + return {"description": self.description, "category": self.category, + "weight": self.weight, "mapped_action_type": self.mapped_action_type} + + +class FiveWhysAnalysis: + """Structured 5-Whys chain for a contributing factor.""" + def __init__(self, factor: ContributingFactor) -> None: + self.factor = factor + self.systemic_theme: str = factor.category + self.chain: List[str] = [f"Why? 
{factor.description}"] + \ + WHY_TEMPLATES.get(factor.category, WHY_TEMPLATES["process"]) + + def to_dict(self) -> Dict[str, Any]: + return {"factor": self.factor.description, "category": self.factor.category, + "chain": self.chain, "systemic_theme": self.systemic_theme} + + +class ActionItem: + """Parsed and validated action item.""" + def __init__(self, data: Dict[str, Any]) -> None: + self.title: str = data.get("title", "") + self.owner: str = data.get("owner", "") + self.priority: str = data.get("priority", "P3") + self.deadline: str = data.get("deadline", "") + self.type: str = data.get("type", "process") + self.status: str = data.get("status", "open") + self.validation_issues: List[str] = [] + self.quality_score: int = 0 + self._validate() + + def _validate(self) -> None: + self.validation_issues = [] + if not self.title: + self.validation_issues.append("Missing title") + if not self.owner: + self.validation_issues.append("Missing owner") + if not self.deadline: + self.validation_issues.append("Missing deadline") + if self.priority not in PRIORITY_ORDER: + self.validation_issues.append(f"Invalid priority: {self.priority}") + if self.type not in ACTION_TYPES: + self.validation_issues.append(f"Invalid type: {self.type}") + self.quality_score = self._score_quality() + + def _score_quality(self) -> int: + """Score 0-100: specific, measurable, achievable.""" + s = 0 + if len(self.title) > 10: s += 20 + if self.owner: s += 20 + if self.deadline: s += 20 + if self.priority in PRIORITY_ORDER: s += 10 + if self.type in ACTION_TYPES: s += 10 + if any(kw in self.title.lower() for kw in ["%", "threshold", "within", "before", + "after", "less than", "greater than"]): + s += 10 + if len(self.title.split()) >= 5: s += 10 + return min(s, 100) + + @property + def is_valid(self) -> bool: + return len(self.validation_issues) == 0 + + @property + def is_past_deadline(self) -> bool: + if not self.deadline or self.status != "open": + return False + try: + dl = 
class PostmortemReport:
    """Complete postmortem document assembled from all analysis components.

    Wires together incident metadata, timeline metrics, classified
    contributing factors, 5-Whys chains, and validated action items, and
    derives coverage gaps plus suggested follow-up actions.
    """

    def __init__(self, raw: Dict[str, Any]) -> None:
        self.raw = raw
        self.incident = IncidentData(raw.get("incident", {}))
        self.timeline = TimelineMetrics(raw.get("timeline", {}), self.incident.severity)
        self.resolution: Dict[str, Any] = raw.get("resolution", {})
        self.participants: List[Dict[str, str]] = raw.get("participants", [])
        # Derived analysis, computed once at construction time.
        descriptions = self.resolution.get("contributing_factors", [])
        self.contributing_factors = [ContributingFactor(text, pos)
                                     for pos, text in enumerate(descriptions)]
        self.five_whys = [FiveWhysAnalysis(cf) for cf in self.contributing_factors]
        self.action_items = [ActionItem(entry) for entry in raw.get("action_items", [])]
        self.factor_distribution = self._compute_factor_distribution()
        self.coverage_gaps = self._find_coverage_gaps()
        self.suggested_actions = self._suggest_missing_actions()
        self.theme_recommendations = self._build_theme_recommendations()

    def _compute_factor_distribution(self) -> Dict[str, float]:
        """Return each category's share of total factor weight, as percentages."""
        shares: Dict[str, float] = {cat: 0.0 for cat in FACTOR_CATEGORIES}
        for cf in self.contributing_factors:
            shares[cf.category] += cf.weight
        # "or 1.0" guards the no-factor case against division by zero.
        total = sum(cf.weight for cf in self.contributing_factors) or 1.0
        return {cat: round(weight / total * 100, 1) for cat, weight in shares.items()}

    def _find_coverage_gaps(self) -> List[str]:
        """Warn for factor categories with no action item of the mapped type."""
        present_types = {item.type for item in self.action_items}
        messages: List[str] = []
        for cat in {cf.category for cf in self.contributing_factors}:
            wanted = CAT_TO_ACTION.get(cat)
            if wanted and wanted not in present_types:
                messages.append(f"No '{wanted}' action item to address '{cat}' contributing factor")
        return messages

    def _suggest_missing_actions(self) -> List[Dict[str, str]]:
        """Suggest a template action for every uncovered factor category."""
        present_types = {item.type for item in self.action_items}
        suggestions: List[Dict[str, str]] = []
        for cat in {cf.category for cf in self.contributing_factors}:
            wanted = CAT_TO_ACTION.get(cat)
            if wanted and wanted not in present_types:
                suggestions.append({
                    "type": wanted,
                    "suggestion": MISSING_ACTION_TEMPLATES.get(wanted, "Add an action item for this gap"),
                    "reason": f"No action item addresses the '{cat}' contributing factor"})
        return suggestions

    def _build_theme_recommendations(self) -> Dict[str, List[str]]:
        """Map each systemic theme (first-occurrence order) to canned recommendations."""
        themes: Dict[str, List[str]] = {}
        for analysis in self.five_whys:
            themes.setdefault(analysis.systemic_theme,
                              THEME_RECS.get(analysis.systemic_theme, []))
        return themes

    def customer_impact_summary(self) -> Dict[str, Any]:
        """Summarise customer-facing impact and whether comms are required.

        Thresholds: comms required above 1,000 users, $10k revenue, or any
        data loss; severity is high above 10,000 users / $50k, medium
        above 1,000 users / $5k, otherwise low.
        """
        impact = self.resolution.get("customer_impact", {})
        affected = impact.get("affected_users", 0)
        failed_tx = impact.get("failed_transactions", 0)
        revenue = impact.get("revenue_impact_usd", 0)
        data_loss = impact.get("data_loss", False)
        comm_required = affected > 1000 or data_loss or revenue > 10000
        if affected > 10000 or revenue > 50000:
            sev = "high"
        elif affected > 1000 or revenue > 5000:
            sev = "medium"
        else:
            sev = "low"
        return {"affected_users": affected, "failed_transactions": failed_tx,
                "revenue_impact_usd": revenue, "data_loss": data_loss,
                "data_integrity": "compromised" if data_loss else "intact",
                "customer_communication_required": comm_required, "impact_severity": sev}

    def executive_summary(self) -> str:
        """One-paragraph summary suitable for leadership distribution."""
        mttr = self.timeline.mttr
        ci = self.customer_impact_summary()
        mttr_str = f"{mttr:.0f} minutes" if mttr is not None else "unknown duration"
        parts = [
            f"On {self._fmt_date(self.timeline.issue_started)}, a {self.incident.severity} "
            f"incident (\"{self.incident.title}\") impacted the {self.incident.service} service.",
            f"The root cause was identified as: {self.resolution.get('root_cause', 'Unknown root cause')}.",
            f"The incident was resolved in {mttr_str}, affecting approximately "
            f"{ci['affected_users']:,} users with an estimated revenue impact of ${ci['revenue_impact_usd']:,.2f}.",
            "Data loss was confirmed; affected customers must be notified." if ci["data_loss"]
            else "No data loss occurred during this incident."]
        return " ".join(parts)

    @staticmethod
    def _fmt_date(dt: Optional[datetime]) -> str:
        """Human-readable UTC date, or a placeholder when unknown."""
        return dt.strftime("%Y-%m-%d at %H:%M UTC") if dt else "an unknown date"

    def overdue_p1_items(self) -> List[Dict[str, str]]:
        """Open P0/P1 items whose deadline has passed."""
        return [{"title": item.title, "owner": item.owner, "deadline": item.deadline}
                for item in self.action_items
                if item.priority in ("P0", "P1") and item.is_past_deadline]

    def to_dict(self) -> Dict[str, Any]:
        """Serialise the full report for JSON output."""
        return {
            "version": VERSION, "incident": self.incident.to_dict(),
            "executive_summary": self.executive_summary(),
            "timeline_metrics": self.timeline.to_dict(),
            "customer_impact": self.customer_impact_summary(),
            "root_cause": self.resolution.get("root_cause", ""),
            "contributing_factors": [cf.to_dict() for cf in self.contributing_factors],
            "factor_distribution": self.factor_distribution,
            "five_whys_analysis": [fw.to_dict() for fw in self.five_whys],
            "theme_recommendations": self.theme_recommendations,
            "mitigation_steps": self.resolution.get("mitigation_steps", []),
            "permanent_fix": self.resolution.get("permanent_fix", ""),
            "action_items": [item.to_dict() for item in self.action_items],
            "action_item_coverage_gaps": self.coverage_gaps,
            "suggested_actions": self.suggested_actions,
            "overdue_p1_items": self.overdue_p1_items(),
            "participants": self.participants}
filled = int(round(pct / 100 * width)) + return "[" + "#" * filled + "." * (width - filled) + "]" + + +def _generate_lessons(report: PostmortemReport) -> List[str]: + """Derive lessons learned from the analysis.""" + lessons: List[str] = [] + bench = BENCHMARKS.get(report.incident.severity, BENCHMARKS["SEV3"]) + mttd = report.timeline.mttd + if mttd is not None and mttd > bench["mttd"]: + lessons.append( + f"Detection took {mttd:.0f} minutes, exceeding the {bench['mttd']}-minute " + f"benchmark for {report.incident.severity}. Invest in earlier detection mechanisms.") + dist = report.factor_distribution + dominant = max(dist, key=lambda k: dist[k]) + if dist[dominant] >= 50: + lessons.append( + f"The '{dominant}' category accounts for {dist[dominant]:.0f}% of contributing factors. " + f"Targeted improvements in this area will yield the highest return.") + if report.coverage_gaps: + lessons.append( + f"There are {len(report.coverage_gaps)} action item coverage gap(s). " + "Ensure every contributing factor category has a corresponding remediation action.") + avg_q = (sum(a.quality_score for a in report.action_items) / len(report.action_items) + if report.action_items else 0) + if avg_q < 70: + lessons.append( + f"Average action item quality score is {avg_q:.0f}/100. " + "Make action items more specific with measurable targets and clear ownership.") + if report.timeline.postmortem_on_time is False: + h = report.timeline.postmortem_timeliness_hours + lessons.append( + f"Postmortem was held {h:.0f} hours after resolution, exceeding the " + f"{POSTMORTEM_TARGET_HOURS}-hour target. Schedule postmortems sooner to capture context.") + if not lessons: + lessons.append("This incident was handled within benchmarks. 
def format_text(report: PostmortemReport) -> str:
    """Format the postmortem as plain text."""
    out: List[str] = []
    width = 72

    def banner(title: str) -> None:
        # Heavy "=" section divider.
        out.append("")
        out.append("=" * width)
        out.append(f" {title}")
        out.append("=" * width)

    def subhead(title: str) -> None:
        # Light "---" sub-section divider.
        out.append("")
        out.append(f"--- {title} ---")

    inc = report.incident
    banner(f"POSTMORTEM: {inc.title}")
    out.append(f" ID: {inc.id} | Severity: {inc.severity} | Service: {inc.service}")
    out.append(f" Commander: {inc.commander}")
    if inc.affected_services:
        out.append(f" Affected services: {', '.join(inc.affected_services)}")

    # Executive summary, re-flowed one sentence per line.
    banner("EXECUTIVE SUMMARY")
    out.append("")
    for raw_sentence in report.executive_summary().split(". "):
        sentence = raw_sentence.strip()
        if sentence and not sentence.endswith("."):
            sentence += "."
        if sentence:
            out.append(f" {sentence}")

    banner("TIMELINE METRICS")
    tm = report.timeline
    out.append("")
    metric_rows = [("MTTD (Time to Detect)", tm.mttd, "min"),
                   ("MTTR (Time to Resolve)", tm.mttr, "min"),
                   ("Time to Mitigate", tm.time_to_mitigate, "min"),
                   ("Time to Declare", tm.time_to_declare, "min"),
                   ("Postmortem Timeliness", tm.postmortem_timeliness_hours, "hrs")]
    for label, value, unit in metric_rows:
        out.append(f" {label:<30s} {f'{value:.1f} {unit}' if value is not None else 'N/A'}")
    subhead("Benchmark Comparison")
    for name, row in tm.benchmark_comparison().items():
        if "actual_minutes" in row:
            verdict = "PASS" if row["met_benchmark"] else "FAIL"
            out.append(f" {name:<25s} actual={row['actual_minutes']}min benchmark={row['benchmark_minutes']}min [{verdict}]")
        elif "actual_hours" in row:
            verdict = "PASS" if row["met_target"] else "FAIL"
            out.append(f" {name:<25s} actual={row['actual_hours']}hrs target={row['target_hours']}hrs [{verdict}]")

    banner("CUSTOMER IMPACT")
    ci = report.customer_impact_summary()
    out.append("")
    out.append(f" Affected users: {ci['affected_users']:,}")
    out.append(f" Failed transactions: {ci['failed_transactions']:,}")
    out.append(f" Revenue impact: ${ci['revenue_impact_usd']:,.2f}")
    out.append(f" Data integrity: {ci['data_integrity']}")
    out.append(f" Impact severity: {ci['impact_severity']}")
    out.append(f" Comms required: {'Yes' if ci['customer_communication_required'] else 'No'}")

    banner("ROOT CAUSE ANALYSIS")
    out.append("")
    out.append(f" {report.resolution.get('root_cause', 'Unknown')}")
    subhead("Contributing Factors")
    for cf in report.contributing_factors:
        out.append(f" [{cf.category.upper():<12s} w={cf.weight:.2f}] {cf.description}")
    subhead("Factor Distribution")
    for cat, pct in sorted(report.factor_distribution.items(), key=lambda pair: -pair[1]):
        if pct > 0:
            out.append(f" {cat:<14s} {pct:5.1f}% {_bar(pct)}")

    banner("5-WHYS ANALYSIS")
    for fw in report.five_whys:
        out.append("")
        out.append(f" Factor: {fw.factor.description}")
        out.append(f" Theme: {fw.systemic_theme}")
        for step_no, step in enumerate(fw.chain):
            out.append(f" {step_no}. {step}")
    subhead("Theme-Based Recommendations")
    for theme, recs in report.theme_recommendations.items():
        out.append(f" [{theme.upper()}]")
        for rec in recs:
            out.append(f" - {rec}")

    banner("MITIGATION AND RESOLUTION")
    subhead("Mitigation Steps Taken")
    for step in report.resolution.get("mitigation_steps", []):
        out.append(f" - {step}")
    subhead("Permanent Fix")
    out.append(f" {report.resolution.get('permanent_fix', 'TBD')}")

    banner("ACTION ITEMS")
    out.append("")
    header = f" {'Priority':<10s} {'Type':<14s} {'Owner':<25s} {'Deadline':<12s} {'Quality':<8s} Title"
    out.append(header)
    out.append(" " + "-" * (len(header) - 2))
    for item in sorted(report.action_items, key=lambda x: PRIORITY_ORDER.get(x.priority, 99)):
        flag = " *OVERDUE*" if item.is_past_deadline else ""
        out.append(f" {item.priority:<10s} {item.type:<14s} {item.owner:<25s} {item.deadline:<12s} "
                   f"{item.quality_score:<8d} {item.title}{flag}")
    if report.coverage_gaps:
        subhead("Coverage Gaps")
        for gap in report.coverage_gaps:
            out.append(f" WARNING: {gap}")
    if report.suggested_actions:
        subhead("Suggested Additional Actions")
        for sug in report.suggested_actions:
            out.append(f" [{sug['type'].upper()}] {sug['suggestion']}")
            out.append(f" Reason: {sug['reason']}")
    overdue = report.overdue_p1_items()
    if overdue:
        subhead("Overdue P0/P1 Items")
        for entry in overdue:
            out.append(f" OVERDUE: {entry['title']} (owner: {entry['owner']}, deadline: {entry['deadline']})")

    banner("PARTICIPANTS")
    out.append("")
    for person in report.participants:
        out.append(f" {person.get('name', 'Unknown'):<25s} {person.get('role', '')}")

    banner("LESSONS LEARNED")
    out.append("")
    for num, lesson in enumerate(_generate_lessons(report), 1):
        out.append(f" {num}. {lesson}")

    out.append("")
    out.append("=" * width)
    out.append(f" Generated by postmortem_generator v{VERSION}")
    out.append("=" * width)
    out.append("")
    return "\n".join(out)
{lesson}") + L.append("") + L.append("=" * W) + L.append(f" Generated by postmortem_generator v{VERSION}") + L.append("=" * W) + L.append("") + return "\n".join(L) + + +def format_json(report: PostmortemReport) -> str: + """Format the postmortem as JSON.""" + data = report.to_dict() + data["lessons_learned"] = _generate_lessons(report) + return json.dumps(data, indent=2, default=str) + + +def format_markdown(report: PostmortemReport) -> str: + """Format the postmortem as a Markdown document.""" + L: List[str] = [] + inc = report.incident + L.append(f"# Postmortem: {inc.title}") + L.append("") + L.append("| Field | Value |") + L.append("|-------|-------|") + L.append(f"| **ID** | {inc.id} |") + L.append(f"| **Severity** | {inc.severity} |") + L.append(f"| **Service** | {inc.service} |") + L.append(f"| **Commander** | {inc.commander} |") + if inc.affected_services: + L.append(f"| **Affected Services** | {', '.join(inc.affected_services)} |") + L.append("") + # Executive Summary + L.append("## Executive Summary\n") + L.append(report.executive_summary()) + L.append("") + # Timeline Metrics + L.append("## Timeline Metrics\n") + L.append("| Metric | Value | Benchmark | Status |") + L.append("|--------|-------|-----------|--------|") + labels = {"mttd": "MTTD (Time to Detect)", "mttr": "MTTR (Time to Resolve)", + "time_to_mitigate": "Time to Mitigate", "time_to_declare": "Time to Declare", + "postmortem_timeliness": "Postmortem Timeliness"} + for key, label in labels.items(): + b = report.timeline.benchmark_comparison().get(key) + if b and "actual_minutes" in b: + st = "PASS" if b["met_benchmark"] else "FAIL" + L.append(f"| {label} | {b['actual_minutes']} min | {b['benchmark_minutes']} min | {st} |") + elif b and "actual_hours" in b: + st = "PASS" if b["met_target"] else "FAIL" + L.append(f"| {label} | {b['actual_hours']} hrs | {b['target_hours']} hrs | {st} |") + L.append("") + # Customer Impact + L.append("## Customer Impact\n") + ci = report.customer_impact_summary() + 
def format_markdown(report: PostmortemReport) -> str:
    """Format the postmortem as a Markdown document."""
    md: List[str] = []
    inc = report.incident
    md.append(f"# Postmortem: {inc.title}")
    md.append("")
    md.append("| Field | Value |")
    md.append("|-------|-------|")
    md.append(f"| **ID** | {inc.id} |")
    md.append(f"| **Severity** | {inc.severity} |")
    md.append(f"| **Service** | {inc.service} |")
    md.append(f"| **Commander** | {inc.commander} |")
    if inc.affected_services:
        md.append(f"| **Affected Services** | {', '.join(inc.affected_services)} |")
    md.append("")

    md.append("## Executive Summary\n")
    md.append(report.executive_summary())
    md.append("")

    md.append("## Timeline Metrics\n")
    md.append("| Metric | Value | Benchmark | Status |")
    md.append("|--------|-------|-----------|--------|")
    labels = {"mttd": "MTTD (Time to Detect)", "mttr": "MTTR (Time to Resolve)",
              "time_to_mitigate": "Time to Mitigate", "time_to_declare": "Time to Declare",
              "postmortem_timeliness": "Postmortem Timeliness"}
    benchmarks = report.timeline.benchmark_comparison()
    for key, label in labels.items():
        row = benchmarks.get(key)
        if row and "actual_minutes" in row:
            status = "PASS" if row["met_benchmark"] else "FAIL"
            md.append(f"| {label} | {row['actual_minutes']} min | {row['benchmark_minutes']} min | {status} |")
        elif row and "actual_hours" in row:
            status = "PASS" if row["met_target"] else "FAIL"
            md.append(f"| {label} | {row['actual_hours']} hrs | {row['target_hours']} hrs | {status} |")
    md.append("")

    md.append("## Customer Impact\n")
    ci = report.customer_impact_summary()
    md.append(f"- **Affected users:** {ci['affected_users']:,}")
    md.append(f"- **Failed transactions:** {ci['failed_transactions']:,}")
    md.append(f"- **Revenue impact:** ${ci['revenue_impact_usd']:,.2f}")
    md.append(f"- **Data integrity:** {ci['data_integrity']}")
    md.append(f"- **Impact severity:** {ci['impact_severity']}")
    md.append(f"- **Customer communication required:** {'Yes' if ci['customer_communication_required'] else 'No'}")
    md.append("")

    md.append("## Root Cause Analysis\n")
    md.append(f"**Root cause:** {report.resolution.get('root_cause', 'Unknown')}")
    md.append("")
    md.append("### Contributing Factors\n")
    md.append("| # | Category | Weight | Description |")
    md.append("|---|----------|--------|-------------|")
    for idx, cf in enumerate(report.contributing_factors, 1):
        md.append(f"| {idx} | {cf.category} | {cf.weight:.2f} | {cf.description} |")
    md.append("")
    md.append("### Factor Distribution\n")
    md.append("```")
    for cat, pct in sorted(report.factor_distribution.items(), key=lambda pair: -pair[1]):
        if pct > 0:
            md.append(f" {cat:<14s} {pct:5.1f}% {_bar(pct, 25)}")
    md.append("```")
    md.append("")

    md.append("## 5-Whys Analysis\n")
    for fw in report.five_whys:
        md.append(f"### Factor: {fw.factor.description}")
        md.append(f"**Systemic theme:** {fw.systemic_theme}\n")
        for step_no, step in enumerate(fw.chain):
            md.append(f"{step_no}. {step}")
        md.append("")
    md.append("### Theme-Based Recommendations\n")
    for theme, recs in report.theme_recommendations.items():
        md.append(f"**{theme.capitalize()}:**")
        for rec in recs:
            md.append(f"- {rec}")
    md.append("")

    md.append("## Mitigation and Resolution\n")
    md.append("### Mitigation Steps Taken\n")
    for step in report.resolution.get("mitigation_steps", []):
        md.append(f"- {step}")
    md.append("")
    md.append("### Permanent Fix\n")
    md.append(report.resolution.get("permanent_fix", "TBD"))
    md.append("")

    md.append("## Action Items\n")
    md.append("| Priority | Type | Owner | Deadline | Quality | Title |")
    md.append("|----------|------|-------|----------|---------|-------|")
    for item in sorted(report.action_items, key=lambda x: PRIORITY_ORDER.get(x.priority, 99)):
        flag = " **OVERDUE**" if item.is_past_deadline else ""
        md.append(f"| {item.priority} | {item.type} | {item.owner} | {item.deadline} | {item.quality_score}/100 | {item.title}{flag} |")
    md.append("")
    if report.coverage_gaps:
        md.append("### Coverage Gaps\n")
        for gap in report.coverage_gaps:
            md.append(f"> **WARNING:** {gap}")
        md.append("")
    if report.suggested_actions:
        md.append("### Suggested Additional Actions\n")
        for sug in report.suggested_actions:
            md.append(f"- **[{sug['type'].upper()}]** {sug['suggestion']}")
            md.append(f"  - _Reason: {sug['reason']}_")
        md.append("")
    overdue = report.overdue_p1_items()
    if overdue:
        md.append("### Overdue P0/P1 Items\n")
        for entry in overdue:
            md.append(f"- **{entry['title']}** (owner: {entry['owner']}, deadline: {entry['deadline']})")
        md.append("")

    md.append("## Participants\n")
    md.append("| Name | Role |")
    md.append("|------|------|")
    for member in report.participants:
        md.append(f"| {member.get('name', 'Unknown')} | {member.get('role', '')} |")
    md.append("")

    md.append("## Lessons Learned\n")
    for num, lesson in enumerate(_generate_lessons(report), 1):
        md.append(f"{num}. {lesson}")
    md.append("")
    md.append("---")
    md.append(f"_Generated by postmortem_generator v{VERSION}_")
    md.append("")
    return "\n".join(md)
def load_input(filepath: Optional[str]) -> Dict[str, Any]:
    """Load incident data from a file path or stdin; exit(1) on any input error."""
    if filepath:
        try:
            with open(filepath, "r", encoding="utf-8") as handle:
                return json.load(handle)
        except FileNotFoundError:
            print(f"Error: File not found: {filepath}", file=sys.stderr)
            sys.exit(1)
        except json.JSONDecodeError as exc:
            print(f"Error: Invalid JSON in {filepath}: {exc}", file=sys.stderr)
            sys.exit(1)
    # No file argument: require piped data rather than an interactive terminal.
    if sys.stdin.isatty():
        print("Error: No input file specified and no data on stdin.", file=sys.stderr)
        print("Usage: postmortem_generator.py [data_file] or pipe JSON via stdin.", file=sys.stderr)
        sys.exit(1)
    try:
        return json.load(sys.stdin)
    except json.JSONDecodeError as exc:
        print(f"Error: Invalid JSON on stdin: {exc}", file=sys.stderr)
        sys.exit(1)


def validate_input(data: Dict[str, Any]) -> List[str]:
    """Return a list of validation warnings (non-fatal)."""
    warnings: List[str] = []
    warnings.extend(f"Missing '{key}' section"
                    for key in ("incident", "timeline", "resolution", "action_items")
                    if key not in data)
    timeline = data.get("timeline", {})
    warnings.extend(f"Missing timeline field: {ts}"
                    for ts in ("issue_started", "detected_at", "mitigated_at", "resolved_at")
                    if ts not in timeline)
    resolution = data.get("resolution", {})
    if "root_cause" not in resolution:
        warnings.append("Missing 'root_cause' in resolution")
    if not resolution.get("contributing_factors"):
        warnings.append("No contributing factors provided")
    return warnings


def main() -> None:
    """CLI entry point for postmortem generation."""
    parser = argparse.ArgumentParser(
        description="Generate structured postmortem reports with 5-Whys analysis.",
        epilog="Reads JSON from a file or stdin. Outputs text, JSON, or markdown.")
    parser.add_argument("data_file", nargs="?", default=None,
                        help="JSON file with incident + resolution data (reads stdin if omitted)")
    parser.add_argument("--format", choices=["text", "json", "markdown"], default="text",
                        dest="output_format", help="Output format (default: text)")
    args = parser.parse_args()

    data = load_input(args.data_file)
    # Warnings are advisory only; generation proceeds regardless.
    for warning in validate_input(data):
        print(f"Warning: {warning}", file=sys.stderr)

    report = PostmortemReport(data)
    renderers = {"text": format_text, "json": format_json, "markdown": format_markdown}
    print(renderers[args.output_format](report))


if __name__ == "__main__":
    main()
#!/usr/bin/env python3
"""
Severity Classifier - Classify incident severity and generate escalation paths.

Analyses incident data across multiple dimensions (revenue impact, user scope,
data/security risk, service criticality, blast radius) to produce a weighted
severity score and map it to SEV1-SEV4. Generates escalation paths, on-call
routing, SLA impact assessments, and immediate action plans.

Table of Contents:
    SeverityLevel - Enum-like severity definitions (SEV1-SEV4)
    ImpactAssessment - Parsed impact data from incident input
    SeverityScore - Multi-dimensional weighted scoring result
    EscalationPath - Generated escalation routing and timelines
    ActionPlan - Recommended immediate actions per severity
    SLAImpact - SLA breach risk and error-budget assessment

    parse_incident_data() - Validate and normalise raw JSON input
    compute_dimension_scores() - Score each weighted dimension
    classify_severity() - Map composite score to SEV1-SEV4
    build_escalation_path() - Generate escalation routing
    build_action_plan() - Generate immediate action checklist
    assess_sla_impact() - SLA breach risk assessment
    format_text() - Human-readable text output
    format_json() - Machine-readable JSON output
    format_markdown() - Markdown report output
    main() - CLI entry point

Usage:
    python severity_classifier.py incident.json
    python severity_classifier.py incident.json --format json
    python severity_classifier.py incident.json --format markdown
    cat incident.json | python severity_classifier.py --format text
    echo '{"incident":{...}}' | python severity_classifier.py
"""

import argparse
import json
import sys
from dataclasses import dataclass, field, asdict
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional, Tuple


# ---------- Severity Level Definitions ----------------------------------------

class SeverityLevel:
    """Enum-like container for SEV1 through SEV4 definitions.

    DEFINITIONS maps each level name to its label, human-readable
    description, the composite-score threshold at which it applies
    (inclusive lower bound), response/update timing targets in minutes,
    and whether executive notification / a war room is required.
    """

    SEV1 = "SEV1"
    SEV2 = "SEV2"
    SEV3 = "SEV3"
    SEV4 = "SEV4"

    DEFINITIONS: Dict[str, Dict[str, Any]] = {
        "SEV1": {
            "label": "Critical",
            "description": (
                "Complete service outage, confirmed data loss or corruption, "
                "active security breach, or more than 50% of users affected."
            ),
            "score_threshold": 0.75,
            "response_time_minutes": 5,
            "update_cadence_minutes": 15,
            "executive_notify": True,
            "war_room": True,
        },
        "SEV2": {
            "label": "Major",
            "description": (
                "Significant service degradation, more than 25% of users "
                "affected, no viable workaround, or high revenue impact."
            ),
            "score_threshold": 0.50,
            "response_time_minutes": 15,
            "update_cadence_minutes": 30,
            "executive_notify": False,
            "war_room": True,
        },
        "SEV3": {
            "label": "Moderate",
            "description": (
                "Partial degradation with workaround available, fewer than "
                "25% of users affected, limited blast radius."
            ),
            "score_threshold": 0.25,
            "response_time_minutes": 30,
            "update_cadence_minutes": 60,
            "executive_notify": False,
            "war_room": False,
        },
        "SEV4": {
            "label": "Minor",
            "description": (
                "Cosmetic issue, low impact, minimal user effect, "
                "informational or non-urgent."
            ),
            "score_threshold": 0.0,
            "response_time_minutes": 120,
            "update_cadence_minutes": 240,
            "executive_notify": False,
            "war_room": False,
        },
    }

    @classmethod
    def from_score(cls, score: float) -> str:
        """Return the severity level string for a given composite score."""
        # Check from most to least severe; first threshold met wins.
        for level in [cls.SEV1, cls.SEV2, cls.SEV3]:
            if score >= cls.DEFINITIONS[level]["score_threshold"]:
                return level
        return cls.SEV4

    @classmethod
    def get_definition(cls, level: str) -> Dict[str, Any]:
        """Return the definition dict for *level*; unknown levels fall back to SEV4."""
        return cls.DEFINITIONS.get(level, cls.DEFINITIONS[cls.SEV4])


# ---------- Configuration Constants -------------------------------------------

# Relative weight of each scoring dimension in the composite score
# (weights sum to 1.0; see compute_dimension_scores()).
DIMENSION_WEIGHTS: Dict[str, float] = {
    "revenue_impact": 0.25,
    "user_impact_scope": 0.25,
    "data_security_risk": 0.20,
    "service_criticality": 0.15,
    "blast_radius": 0.15,
}

# Base score for the declared revenue-impact rating; unknown ratings score 0.0.
REVENUE_IMPACT_SCORES: Dict[str, float] = {
    "critical": 1.0,
    "high": 0.8,
    "medium": 0.5,
    "low": 0.2,
    "none": 0.0,
}

# Score contribution per degradation type; unknown types score 0.0 where looked up.
DEGRADATION_SCORES: Dict[str, float] = {
    "complete": 1.0,
    "major": 0.75,
    "partial": 0.50,
    "minor": 0.25,
    "none": 0.0,
}

# (error-rate %, score) pairs, checked from highest to lowest; first match wins.
ERROR_RATE_THRESHOLDS: List[Tuple[float, float]] = [
    (50.0, 1.0),
    (25.0, 0.8),
    (10.0, 0.6),
    (5.0, 0.4),
    (1.0, 0.2),
]

# (P99 latency ms, score) pairs, checked from highest to lowest; first match wins.
LATENCY_P99_THRESHOLDS_MS: List[Tuple[float, float]] = [
    (10000, 1.0),
    (5000, 0.8),
    (2000, 0.6),
    (1000, 0.4),
    (500, 0.2),
]

# Per-severity SLA targets and monthly error budgets (minutes).
SLA_TIERS: Dict[str, Dict[str, Any]] = {
    "SEV1": {
        "target_resolution_hours": 1,
        "target_response_minutes": 5,
        "sla_percentage": 99.95,
        "monthly_error_budget_minutes": 21.6,
    },
    "SEV2": {
        "target_resolution_hours": 4,
        "target_response_minutes": 15,
        "sla_percentage": 99.9,
        "monthly_error_budget_minutes": 43.2,
    },
    "SEV3": {
        "target_resolution_hours": 24,
        "target_response_minutes": 60,
        "sla_percentage": 99.5,
        "monthly_error_budget_minutes": 216.0,
    },
    "SEV4": {
        "target_resolution_hours": 72,
        "target_response_minutes": 480,
        "sla_percentage": 99.0,
        "monthly_error_budget_minutes": 432.0,
    },
}

# Per-severity escalation routing defaults; role names like "on-call-primary"
# are resolved to concrete contacts in build_escalation_path().
ESCALATION_TEMPLATES: Dict[str, Dict[str, Any]] = {
    "SEV1": {
        "initial_notify": ["on-call-primary", "on-call-secondary", "engineering-manager"],
        "escalate_after_minutes": 15,
        "escalate_to": ["vp-engineering", "cto"],
        "bridge_required": True,
        "status_page_update": True,
        "customer_comms": True,
    },
    "SEV2": {
        "initial_notify": ["on-call-primary", "on-call-secondary"],
        "escalate_after_minutes": 30,
        "escalate_to": ["engineering-manager"],
        "bridge_required": True,
        "status_page_update": True,
        "customer_comms": False,
    },
    "SEV3": {
        "initial_notify": ["on-call-primary"],
        "escalate_after_minutes": 120,
        "escalate_to": ["on-call-secondary"],
        "bridge_required": False,
        "status_page_update": False,
        "customer_comms": False,
    },
    "SEV4": {
        "initial_notify": ["on-call-primary"],
        "escalate_after_minutes": 480,
        "escalate_to": [],
        "bridge_required": False,
        "status_page_update": False,
        "customer_comms": False,
    },
}
# ---------- Data Model Classes ------------------------------------------------

@dataclass
class ImpactAssessment:
    """Parsed and normalised impact data from incident input."""

    revenue_impact: str = "none"  # looked up in REVENUE_IMPACT_SCORES (unknown values score 0.0)
    affected_users_percentage: float = 0.0  # 0-100 scale
    affected_regions: List[str] = field(default_factory=list)
    data_integrity_risk: bool = False
    security_breach: bool = False
    customer_facing: bool = False
    degradation_type: str = "none"  # looked up in DEGRADATION_SCORES
    workaround_available: bool = True


@dataclass
class SeverityScore:
    """Multi-dimensional scoring result with per-dimension breakdown."""

    composite_score: float = 0.0  # weighted sum of dimensions, capped at 1.0
    severity_level: str = SeverityLevel.SEV4
    dimensions: Dict[str, float] = field(default_factory=dict)  # raw per-dimension scores
    weighted_dimensions: Dict[str, float] = field(default_factory=dict)  # raw * DIMENSION_WEIGHTS
    contributing_factors: List[str] = field(default_factory=list)
    auto_escalate_reasons: List[str] = field(default_factory=list)  # hard-floor overrides applied


@dataclass
class EscalationPath:
    """Generated escalation routing and notification schedule."""

    severity_level: str = SeverityLevel.SEV4
    immediate_notify: List[str] = field(default_factory=list)
    escalation_chain: List[Dict[str, Any]] = field(default_factory=list)  # time-triggered notify steps
    cross_team_notify: List[str] = field(default_factory=list)
    war_room_required: bool = False
    bridge_link: str = ""  # populated only when a bridge/war room is required
    status_page_update: bool = False
    customer_comms_required: bool = False
    suggested_smes: List[str] = field(default_factory=list)


@dataclass
class ActionPlan:
    """Recommended immediate actions checklist for the incident."""

    severity_level: str = SeverityLevel.SEV4
    immediate_actions: List[str] = field(default_factory=list)
    diagnostic_steps: List[str] = field(default_factory=list)
    communication_actions: List[str] = field(default_factory=list)
    rollback_assessment: Dict[str, Any] = field(default_factory=dict)  # deploy-correlation verdict


@dataclass
class SLAImpact:
    """SLA breach risk and error-budget assessment."""

    severity_level: str = SeverityLevel.SEV4
    sla_tier: Dict[str, Any] = field(default_factory=dict)  # entry from SLA_TIERS
    breach_risk: str = "low"  # one of: low / medium / high / critical
    error_budget_impact_minutes: float = 0.0  # budget minutes consumed per hour
    remaining_budget_percentage: float = 100.0
    estimated_time_to_breach_minutes: float = 0.0
    recommendations: List[str] = field(default_factory=list)


# ---------- Input Parsing -----------------------------------------------------

def parse_incident_data(raw: Dict[str, Any]) -> Tuple[Dict, ImpactAssessment, Dict, Dict]:
    """
    Validate and normalise raw JSON input into typed structures.

    Args:
        raw: Decoded JSON document; must contain an 'incident' mapping.
            'impact', 'signals', and 'context' sections are optional and
            default to empty / conservative values.

    Returns:
        (incident_info, impact_assessment, signals, context)

    Raises:
        ValueError: if the 'incident' key is missing or empty.
    """
    incident = raw.get("incident", {})
    if not incident:
        raise ValueError("Input must contain an 'incident' key with title and description.")

    impact_raw = raw.get("impact", {})
    # Coerce every field to its expected type so downstream scoring can
    # rely on floats/bools regardless of how the JSON was authored.
    impact = ImpactAssessment(
        revenue_impact=impact_raw.get("revenue_impact", "none"),
        affected_users_percentage=float(impact_raw.get("affected_users_percentage", 0)),
        affected_regions=impact_raw.get("affected_regions", []),
        data_integrity_risk=bool(impact_raw.get("data_integrity_risk", False)),
        security_breach=bool(impact_raw.get("security_breach", False)),
        customer_facing=bool(impact_raw.get("customer_facing", False)),
        degradation_type=impact_raw.get("degradation_type", "none"),
        workaround_available=bool(impact_raw.get("workaround_available", True)),
    )

    signals = raw.get("signals", {})
    context = raw.get("context", {})

    return incident, impact, signals, context
# ---------- Core Scoring Engine -----------------------------------------------

def _score_revenue_impact(impact: ImpactAssessment) -> Tuple[float, List[str]]:
    """Score the revenue impact dimension (0.0 - 1.0).

    Starts from the declared revenue-impact rating and applies additive
    boosts for customer-facing exposure and lack of a workaround.
    """
    notes: List[str] = []
    value = REVENUE_IMPACT_SCORES.get(impact.revenue_impact, 0.0)

    # Boosts only kick in once the base rating is already significant
    # (>= 0.5); each boost feeds into the next threshold check.
    if impact.customer_facing and value >= 0.5:
        value = min(1.0, value + 0.1)
        notes.append("Customer-facing service with revenue exposure")

    if not impact.workaround_available and value >= 0.5:
        value = min(1.0, value + 0.1)
        notes.append("No workaround available, prolonging revenue impact")

    if value >= 0.8:
        notes.append(f"Revenue impact rated '{impact.revenue_impact}'")

    return value, notes


def _score_user_impact(impact: ImpactAssessment, signals: Dict) -> Tuple[float, List[str]]:
    """Score the user impact scope dimension (0.0 - 1.0)."""
    notes: List[str] = []
    pct = impact.affected_users_percentage

    # Base score from the affected-user percentage band (first band met wins).
    value = 0.1
    for floor, banded in ((75, 1.0), (50, 0.85), (25, 0.65), (10, 0.45), (1, 0.25)):
        if pct >= floor:
            value = banded
            break

    if pct > 0:
        notes.append(f"{pct}% of users affected")

    reports = signals.get("customer_reports", 0)
    if reports > 20:
        value = min(1.0, value + 0.15)
        notes.append(f"{reports} customer reports received")
    elif reports > 5:
        value = min(1.0, value + 0.08)
        notes.append(f"{reports} customer reports received")

    # Degradation type adds a proportional boost (at most +0.15).
    value = min(1.0, value + DEGRADATION_SCORES.get(impact.degradation_type, 0.0) * 0.15)
    if impact.degradation_type in ("complete", "major"):
        notes.append(f"Degradation type: {impact.degradation_type}")

    return value, notes


def _score_data_security(impact: ImpactAssessment) -> Tuple[float, List[str]]:
    """Score the data/security risk dimension (0.0 - 1.0)."""
    notes: List[str] = []

    # A confirmed breach dominates; otherwise integrity risk sets the base.
    if impact.security_breach:
        value = 1.0
        notes.append("Active security breach confirmed")
    elif impact.data_integrity_risk:
        value = 0.8
        notes.append("Data integrity at risk")
    else:
        value = 0.0

    if impact.customer_facing and impact.data_integrity_risk:
        value = min(1.0, value + 0.1)
        notes.append("Customer data potentially affected")

    return value, notes
def _score_service_criticality(signals: Dict, context: Dict) -> Tuple[float, List[str]]:
    """Score service criticality based on signals and dependency graph.

    Note: *context* is accepted for signature symmetry with the other
    scorers but is not currently consulted.
    """
    notes: List[str] = []

    # Dependent-service fan-out sets the base score.
    deps = signals.get("dependent_services", [])
    n_deps = len(deps)
    if n_deps >= 5:
        value = 1.0
        notes.append(f"{n_deps} dependent services (critical hub)")
    elif n_deps >= 3:
        value = 0.75
        notes.append(f"{n_deps} dependent services")
    elif n_deps >= 1:
        value = 0.5
        notes.append(f"{n_deps} dependent service(s)")
    else:
        value = 0.2

    # Breadth of affected endpoints adds a capped boost.
    n_endpoints = len(signals.get("affected_endpoints", []))
    if n_endpoints >= 5:
        value = min(1.0, value + 0.15)
        notes.append(f"{n_endpoints} endpoints affected")
    elif n_endpoints >= 2:
        value = min(1.0, value + 0.08)
        notes.append(f"{n_endpoints} endpoints affected")

    return value, notes


def _score_blast_radius(
    impact: ImpactAssessment, signals: Dict
) -> Tuple[float, List[str]]:
    """Score blast radius from region spread, alert volume, and error rate."""
    notes: List[str] = []

    # Region spread sets the base score.
    n_regions = len(impact.affected_regions)
    if n_regions >= 3:
        value = 0.9
        notes.append(f"Spanning {n_regions} regions")
    elif n_regions == 2:
        value = 0.6
        notes.append(f"Spanning {n_regions} regions")
    elif n_regions == 1:
        value = 0.3
    else:
        value = 0.0

    # Error rate and latency each raise the score to their banded level
    # (max, not additive); thresholds are ordered high-to-low so the
    # first match wins.
    err_rate = signals.get("error_rate_percentage", 0.0)
    for floor, banded in ERROR_RATE_THRESHOLDS:
        if err_rate >= floor:
            value = max(value, banded)
            notes.append(f"Error rate at {err_rate}%")
            break

    p99 = signals.get("latency_p99_ms", 0)
    for floor, banded in LATENCY_P99_THRESHOLDS_MS:
        if p99 >= floor:
            value = max(value, banded)
            notes.append(f"P99 latency at {p99}ms")
            break

    # Alert volume adds a capped boost on top.
    alerts = signals.get("alert_count", 0)
    if alerts >= 20:
        value = min(1.0, value + 0.15)
        notes.append(f"{alerts} alerts firing")
    elif alerts >= 10:
        value = min(1.0, value + 0.08)
        notes.append(f"{alerts} alerts firing")

    return value, notes
def compute_dimension_scores(
    impact: ImpactAssessment, signals: Dict, context: Dict
) -> SeverityScore:
    """Score each weighted dimension and produce a composite severity score.

    The composite is the weight-scaled sum of the five dimension scores,
    subject to hard "auto-escalation" floors for security breaches,
    customer-facing integrity risk, and large complete outages.
    """
    # Fixed reporting order: (dimension key, raw score, factor strings).
    scored = [
        ("revenue_impact", *_score_revenue_impact(impact)),
        ("user_impact_scope", *_score_user_impact(impact, signals)),
        ("data_security_risk", *_score_data_security(impact)),
        ("service_criticality", *_score_service_criticality(signals, context)),
        ("blast_radius", *_score_blast_radius(impact, signals)),
    ]

    dimensions: Dict[str, float] = {}
    weighted: Dict[str, float] = {}
    all_factors: List[str] = []
    for key, raw, notes in scored:
        dimensions[key] = round(raw, 3)
        weighted[key] = round(raw * DIMENSION_WEIGHTS[key], 3)
        all_factors.extend(notes)

    composite = sum(weighted.values())

    # Hard floors that force escalation regardless of the weighted sum.
    auto_escalate: List[str] = []
    if impact.security_breach:
        composite = max(composite, 0.85)
        auto_escalate.append("Security breach triggers automatic SEV1 escalation")
    if impact.data_integrity_risk and impact.customer_facing:
        composite = max(composite, 0.76)
        auto_escalate.append("Customer-facing data integrity risk triggers SEV1 floor")
    if impact.affected_users_percentage >= 50 and impact.degradation_type == "complete":
        composite = max(composite, 0.80)
        auto_escalate.append("Complete outage affecting 50%+ users triggers SEV1 floor")

    composite = min(1.0, round(composite, 3))

    return SeverityScore(
        composite_score=composite,
        severity_level=SeverityLevel.from_score(composite),
        dimensions=dimensions,
        weighted_dimensions=weighted,
        contributing_factors=all_factors,
        auto_escalate_reasons=auto_escalate,
    )


# ---------- Classification Wrapper --------------------------------------------

def classify_severity(
    incident: Dict, impact: ImpactAssessment, signals: Dict, context: Dict
) -> SeverityScore:
    """
    Top-level classification: compute scores and return the final
    SeverityScore including the resolved severity level.

    Note: *incident* is accepted for API symmetry but not used in scoring.
    """
    return compute_dimension_scores(impact, signals, context)
# ---------- Escalation Path Builder -------------------------------------------

def build_escalation_path(
    severity_score: SeverityScore,
    signals: Dict,
    context: Dict,
) -> EscalationPath:
    """Generate the escalation routing based on severity and context."""
    level = severity_score.severity_level
    template = ESCALATION_TEMPLATES.get(level, ESCALATION_TEMPLATES["SEV4"])

    # Resolve abstract on-call roles to concrete contacts from context,
    # leaving any other role name (e.g. "engineering-manager") as-is.
    on_call = context.get("on_call", {})
    role_contacts = {
        "on-call-primary": on_call.get("primary", "on-call-primary@company.com"),
        "on-call-secondary": on_call.get("secondary", "on-call-secondary@company.com"),
    }
    immediate = [role_contacts.get(role, role) for role in template["initial_notify"]]

    # Time-triggered escalation steps.
    chain: List[Dict[str, Any]] = []
    if template["escalate_to"]:
        wait = template["escalate_after_minutes"]
        chain.append({
            "trigger_after_minutes": wait,
            "notify": template["escalate_to"],
            "reason": f"No resolution within {wait} minutes",
        })

    sev_def = SeverityLevel.get_definition(level)
    if sev_def.get("executive_notify"):
        chain.append({
            "trigger_after_minutes": 15,
            "notify": ["vp-engineering", "cto"],
            "reason": "SEV1 executive notification policy",
        })

    deps = signals.get("dependent_services", [])
    cross_team = [f"{svc}-team" for svc in deps]

    # Subject-matter experts suggested from affected surface area.
    smes: List[str] = []
    endpoints = signals.get("affected_endpoints", [])
    if endpoints:
        smes.append(f"API owner for: {', '.join(endpoints[:3])}")
    if deps:
        smes.append(f"Service owners: {', '.join(deps[:3])}")
    if context.get("ongoing_incidents", []):
        smes.append("Incident coordinator (multiple active incidents)")

    bridge = ""
    if template["bridge_required"]:
        bridge = f"https://bridge.company.com/incident-{level.lower()}"

    return EscalationPath(
        severity_level=level,
        immediate_notify=immediate,
        escalation_chain=chain,
        cross_team_notify=cross_team,
        war_room_required=template["bridge_required"],
        bridge_link=bridge,
        status_page_update=template["status_page_update"],
        customer_comms_required=template.get("customer_comms", False),
        suggested_smes=smes,
    )
room / bridge call" if sev_def["war_room"] else "Open incident channel", + f"Post status update every {sev_def['update_cadence_minutes']} minutes", + ] + + if level in (SeverityLevel.SEV1, SeverityLevel.SEV2): + immediate.append("Page secondary on-call if primary unresponsive within 5 minutes") + immediate.append("Begin impact quantification for executive update") + + if impact.security_breach: + immediate.insert(0, "CRITICAL: Initiate security incident response playbook") + immediate.append("Engage security team immediately") + immediate.append("Preserve forensic evidence -- do not restart services yet") + + if impact.data_integrity_risk: + immediate.append("Halt writes to affected data stores if safe to do so") + immediate.append("Begin data integrity verification") + + # -- Diagnostic steps -- + diagnostics: List[str] = [ + "Check service dashboards and recent metric trends", + "Review application logs for error spikes", + "Verify upstream and downstream dependency health", + ] + + error_rate = signals.get("error_rate_percentage", 0) + if error_rate > 10: + diagnostics.append(f"Investigate error rate spike ({error_rate}%)") + + latency = signals.get("latency_p99_ms", 0) + if latency > 2000: + diagnostics.append(f"Investigate latency degradation (P99 = {latency}ms)") + + affected_endpoints = signals.get("affected_endpoints", []) + if affected_endpoints: + diagnostics.append( + f"Trace requests to affected endpoints: {', '.join(affected_endpoints[:5])}" + ) + + dependent_services = signals.get("dependent_services", []) + if dependent_services: + diagnostics.append( + f"Check health of dependent services: {', '.join(dependent_services)}" + ) + + # -- Communication actions -- + comms: List[str] = [] + if sev_def.get("executive_notify"): + comms.append("Draft executive summary within 15 minutes") + if level in (SeverityLevel.SEV1, SeverityLevel.SEV2): + comms.append("Post initial status page update") + comms.append("Notify customer success team for proactive 
outreach") + comms.append(f"Schedule post-incident review within 48 hours") + + # -- Rollback assessment -- + recent_deploys = context.get("recent_deployments", []) + rollback: Dict[str, Any] = {"recent_deployment_detected": False, "recommendation": ""} + + if recent_deploys: + latest = recent_deploys[0] + rollback["recent_deployment_detected"] = True + rollback["service"] = latest.get("service", "unknown") + rollback["version"] = latest.get("version", "unknown") + rollback["deployed_at"] = latest.get("deployed_at", "unknown") + + detected_at = incident.get("detected_at", "") + deploy_time = latest.get("deployed_at", "") + if detected_at and deploy_time: + try: + det = datetime.fromisoformat(detected_at.replace("Z", "+00:00")) + dep = datetime.fromisoformat(deploy_time.replace("Z", "+00:00")) + delta_minutes = (det - dep).total_seconds() / 60 + rollback["minutes_since_deploy"] = round(delta_minutes, 1) + if 0 < delta_minutes < 120: + rollback["recommendation"] = ( + f"STRONG: Deployment of {latest.get('service')} v{latest.get('version')} " + f"occurred {round(delta_minutes)} minutes before detection. " + "Consider immediate rollback." + ) + else: + rollback["recommendation"] = ( + "Recent deployment is outside the typical correlation window. " + "Investigate other root causes first." + ) + except (ValueError, TypeError): + rollback["recommendation"] = ( + "Unable to parse timestamps. Manually assess deployment correlation." + ) + else: + rollback["recommendation"] = ( + "No recent deployments detected. Focus on infrastructure and dependency investigation." 
+ ) + + return ActionPlan( + severity_level=level, + immediate_actions=immediate, + diagnostic_steps=diagnostics, + communication_actions=comms, + rollback_assessment=rollback, + ) + + +# ---------- SLA Impact Assessment --------------------------------------------- + +def assess_sla_impact( + severity_score: SeverityScore, + impact: ImpactAssessment, + signals: Dict, +) -> SLAImpact: + """Calculate SLA breach risk and error-budget consumption.""" + level = severity_score.severity_level + tier = SLA_TIERS.get(level, SLA_TIERS["SEV4"]) + + # Estimate ongoing burn rate (minutes of budget consumed per real minute) + user_pct = impact.affected_users_percentage / 100.0 + degradation_factor = DEGRADATION_SCORES.get(impact.degradation_type, 0.25) + burn_rate = user_pct * degradation_factor + if burn_rate <= 0: + burn_rate = 0.01 # minimum if incident is open + + monthly_budget = tier["monthly_error_budget_minutes"] + + # Assume 30% of budget already consumed this month for conservative estimate + assumed_consumed_pct = 30.0 + remaining_budget = monthly_budget * (1 - assumed_consumed_pct / 100.0) + + if burn_rate > 0: + time_to_breach = remaining_budget / burn_rate + else: + time_to_breach = float("inf") + + # Classify breach risk + if time_to_breach <= 30: + breach_risk = "critical" + elif time_to_breach <= 120: + breach_risk = "high" + elif time_to_breach <= 480: + breach_risk = "medium" + else: + breach_risk = "low" + + budget_impact_per_hour = burn_rate * 60 + error_budget_impact = round(budget_impact_per_hour, 2) + + remaining_pct = round( + max(0.0, (remaining_budget / monthly_budget) * 100.0), 1 + ) + + recommendations: List[str] = [] + if breach_risk == "critical": + recommendations.append( + "SLA breach imminent. Prioritize resolution above all other work." + ) + recommendations.append( + "Prepare customer communication about potential SLA credit." + ) + elif breach_risk == "high": + recommendations.append( + "SLA breach likely within hours. 
Escalate to ensure rapid resolution." + ) + elif breach_risk == "medium": + recommendations.append( + "Monitor error budget consumption. Resolve before end of business." + ) + else: + recommendations.append( + "SLA impact is contained. Continue standard incident response." + ) + + recommendations.append( + f"Current burn rate: {round(burn_rate * 100, 1)}% of error budget per minute" + ) + recommendations.append( + f"Estimated time to SLA breach: {round(time_to_breach, 0)} minutes " + f"({round(time_to_breach / 60, 1)} hours)" + ) + + return SLAImpact( + severity_level=level, + sla_tier=tier, + breach_risk=breach_risk, + error_budget_impact_minutes=error_budget_impact, + remaining_budget_percentage=remaining_pct, + estimated_time_to_breach_minutes=round(time_to_breach, 1), + recommendations=recommendations, + ) + + +# ---------- Output Formatters ------------------------------------------------- + +def _header_line(char: str, width: int = 72) -> str: + return char * width + + +def format_text( + incident: Dict, + severity_score: SeverityScore, + escalation: EscalationPath, + action_plan: ActionPlan, + sla_impact: SLAImpact, +) -> str: + """Render a human-readable text report.""" + lines: List[str] = [] + w = 72 + + lines.append(_header_line("=", w)) + lines.append("INCIDENT SEVERITY CLASSIFICATION REPORT") + lines.append(_header_line("=", w)) + lines.append("") + + # -- Incident Summary -- + lines.append(f"Title: {incident.get('title', 'N/A')}") + lines.append(f"Service: {incident.get('service', 'N/A')}") + lines.append(f"Detected: {incident.get('detected_at', 'N/A')}") + lines.append(f"Reporter: {incident.get('reporter', 'N/A')}") + lines.append("") + + # -- Severity -- + sev_def = SeverityLevel.get_definition(severity_score.severity_level) + lines.append(_header_line("-", w)) + lines.append(f"SEVERITY: {severity_score.severity_level} ({sev_def['label']})") + lines.append(f"Composite Score: {severity_score.composite_score:.3f}") + lines.append(_header_line("-", w)) 
def format_text(
    incident: Dict,
    severity_score: SeverityScore,
    escalation: EscalationPath,
    action_plan: ActionPlan,
    sla_impact: SLAImpact,
) -> str:
    """Render a human-readable text report.

    Sections: incident summary, severity with dimension breakdown,
    escalation path, action plan, and SLA impact. Returns the report
    as a single newline-joined string.
    """
    lines: List[str] = []
    w = 72  # rule width for section separators

    lines.append(_header_line("=", w))
    lines.append("INCIDENT SEVERITY CLASSIFICATION REPORT")
    lines.append(_header_line("=", w))
    lines.append("")

    # -- Incident Summary --
    lines.append(f"Title: {incident.get('title', 'N/A')}")
    lines.append(f"Service: {incident.get('service', 'N/A')}")
    lines.append(f"Detected: {incident.get('detected_at', 'N/A')}")
    lines.append(f"Reporter: {incident.get('reporter', 'N/A')}")
    lines.append("")

    # -- Severity --
    sev_def = SeverityLevel.get_definition(severity_score.severity_level)
    lines.append(_header_line("-", w))
    lines.append(f"SEVERITY: {severity_score.severity_level} ({sev_def['label']})")
    lines.append(f"Composite Score: {severity_score.composite_score:.3f}")
    lines.append(_header_line("-", w))
    lines.append(f"  {sev_def['description']}")
    lines.append("")

    # -- Dimension Breakdown --
    lines.append("Dimension Scores:")
    for dim, raw in severity_score.dimensions.items():
        wt = severity_score.weighted_dimensions.get(dim, 0)
        weight_cfg = DIMENSION_WEIGHTS.get(dim, 0)
        label = dim.replace("_", " ").title()
        lines.append(f"  {label:<25s} raw={raw:.3f} weight={weight_cfg:.2f} weighted={wt:.3f}")
    lines.append("")

    if severity_score.contributing_factors:
        lines.append("Contributing Factors:")
        for f in severity_score.contributing_factors:
            lines.append(f"  - {f}")
        lines.append("")

    if severity_score.auto_escalate_reasons:
        lines.append("Auto-Escalation Overrides:")
        for r in severity_score.auto_escalate_reasons:
            lines.append(f"  * {r}")
        lines.append("")

    # -- Escalation Path --
    lines.append(_header_line("-", w))
    lines.append("ESCALATION PATH")
    lines.append(_header_line("-", w))
    lines.append(f"Immediate Notify: {', '.join(escalation.immediate_notify)}")
    if escalation.war_room_required:
        lines.append(f"War Room: Required ({escalation.bridge_link})")
    else:
        lines.append("War Room: Not required")
    lines.append(f"Status Page: {'Update required' if escalation.status_page_update else 'No update needed'}")
    lines.append(f"Customer Comms: {'Required' if escalation.customer_comms_required else 'Not required'}")
    lines.append("")

    if escalation.escalation_chain:
        lines.append("Escalation Chain:")
        for step in escalation.escalation_chain:
            lines.append(
                f"  After {step['trigger_after_minutes']}min -> "
                f"Notify: {', '.join(step['notify'])} ({step['reason']})"
            )
        lines.append("")

    if escalation.cross_team_notify:
        lines.append(f"Cross-Team Notify: {', '.join(escalation.cross_team_notify)}")
    if escalation.suggested_smes:
        lines.append("Suggested SMEs:")
        for sme in escalation.suggested_smes:
            lines.append(f"  - {sme}")
    lines.append("")

    # -- Action Plan --
    lines.append(_header_line("-", w))
    lines.append("ACTION PLAN")
    lines.append(_header_line("-", w))

    lines.append("Immediate Actions:")
    for i, action in enumerate(action_plan.immediate_actions, 1):
        lines.append(f"  {i}. {action}")
    lines.append("")

    lines.append("Diagnostic Steps:")
    for i, step in enumerate(action_plan.diagnostic_steps, 1):
        lines.append(f"  {i}. {step}")
    lines.append("")

    lines.append("Communication Actions:")
    for i, action in enumerate(action_plan.communication_actions, 1):
        lines.append(f"  {i}. {action}")
    lines.append("")

    # Rollback details are only present when a recent deploy was detected.
    rb = action_plan.rollback_assessment
    lines.append("Rollback Assessment:")
    if rb.get("recent_deployment_detected"):
        lines.append(f"  Recent Deploy: {rb.get('service', '?')} v{rb.get('version', '?')}")
        lines.append(f"  Deployed At: {rb.get('deployed_at', '?')}")
        if "minutes_since_deploy" in rb:
            lines.append(f"  Minutes Before Detection: {rb['minutes_since_deploy']}")
    lines.append(f"  Recommendation: {rb.get('recommendation', 'N/A')}")
    lines.append("")

    # -- SLA Impact --
    lines.append(_header_line("-", w))
    lines.append("SLA IMPACT ASSESSMENT")
    lines.append(_header_line("-", w))
    lines.append(f"Breach Risk: {sla_impact.breach_risk.upper()}")
    lines.append(f"Error Budget Impact: {sla_impact.error_budget_impact_minutes} min/hr")
    lines.append(f"Remaining Budget: {sla_impact.remaining_budget_percentage}%")
    lines.append(f"Est. Time to Breach: {sla_impact.estimated_time_to_breach_minutes} min")
    tier = sla_impact.sla_tier
    lines.append(f"Target Resolution: {tier.get('target_resolution_hours', '?')} hours")
    lines.append(f"Target Response: {tier.get('target_response_minutes', '?')} minutes")
    lines.append("")

    if sla_impact.recommendations:
        lines.append("SLA Recommendations:")
        for rec in sla_impact.recommendations:
            lines.append(f"  - {rec}")
        lines.append("")
    lines.append(_header_line("=", w))

    return "\n".join(lines)


def format_json(
    incident: Dict,
    severity_score: SeverityScore,
    escalation: EscalationPath,
    action_plan: ActionPlan,
    sla_impact: SLAImpact,
) -> str:
    """Render a machine-readable JSON report.

    Dataclasses are serialised via asdict(); default=str stringifies any
    remaining non-JSON-native values.
    """
    report = {
        "classification_timestamp": datetime.now(timezone.utc).isoformat(),
        "incident": incident,
        "severity": asdict(severity_score),
        "severity_definition": SeverityLevel.get_definition(severity_score.severity_level),
        "escalation": asdict(escalation),
        "action_plan": asdict(action_plan),
        "sla_impact": asdict(sla_impact),
    }
    return json.dumps(report, indent=2, default=str)
{incident.get('detected_at', 'N/A')} |") + lines.append(f"| Reporter | {incident.get('reporter', 'N/A')} |") + lines.append("") + + lines.append("## Severity Classification") + lines.append("") + lines.append( + f"> **{severity_score.severity_level} -- {sev_def['label']}** " + f"(Score: {severity_score.composite_score:.3f})" + ) + lines.append(f">") + lines.append(f"> {sev_def['description']}") + lines.append("") + + lines.append("### Dimension Scores") + lines.append("") + lines.append("| Dimension | Raw | Weight | Weighted |") + lines.append("|-----------|-----|--------|----------|") + for dim, raw in severity_score.dimensions.items(): + wt = severity_score.weighted_dimensions.get(dim, 0) + weight_cfg = DIMENSION_WEIGHTS.get(dim, 0) + label = dim.replace("_", " ").title() + lines.append(f"| {label} | {raw:.3f} | {weight_cfg:.2f} | {wt:.3f} |") + lines.append("") + + if severity_score.contributing_factors: + lines.append("### Contributing Factors") + lines.append("") + for f in severity_score.contributing_factors: + lines.append(f"- {f}") + lines.append("") + + if severity_score.auto_escalate_reasons: + lines.append("### Auto-Escalation Overrides") + lines.append("") + for r in severity_score.auto_escalate_reasons: + lines.append(f"- **{r}**") + lines.append("") + + lines.append("## Escalation Path") + lines.append("") + lines.append(f"**Immediate Notify:** {', '.join(escalation.immediate_notify)}") + lines.append("") + + if escalation.war_room_required: + lines.append(f"**War Room:** [Join Bridge]({escalation.bridge_link})") + else: + lines.append("**War Room:** Not required") + lines.append("") + + if escalation.escalation_chain: + lines.append("### Escalation Chain") + lines.append("") + for step in escalation.escalation_chain: + lines.append( + f"- **After {step['trigger_after_minutes']} min:** " + f"Notify {', '.join(step['notify'])} -- {step['reason']}" + ) + lines.append("") + + if escalation.cross_team_notify: + lines.append(f"**Cross-Team:** {', 
'.join(escalation.cross_team_notify)}") + lines.append("") + + if escalation.suggested_smes: + lines.append("### Suggested SMEs") + lines.append("") + for sme in escalation.suggested_smes: + lines.append(f"- {sme}") + lines.append("") + + lines.append("## Action Plan") + lines.append("") + + lines.append("### Immediate Actions") + lines.append("") + for i, action in enumerate(action_plan.immediate_actions, 1): + lines.append(f"{i}. {action}") + lines.append("") + + lines.append("### Diagnostic Steps") + lines.append("") + for i, step in enumerate(action_plan.diagnostic_steps, 1): + lines.append(f"{i}. {step}") + lines.append("") + + lines.append("### Communication") + lines.append("") + for i, action in enumerate(action_plan.communication_actions, 1): + lines.append(f"{i}. {action}") + lines.append("") + + rb = action_plan.rollback_assessment + lines.append("### Rollback Assessment") + lines.append("") + if rb.get("recent_deployment_detected"): + lines.append( + f"| Deploy | {rb.get('service', '?')} v{rb.get('version', '?')} |" + ) + lines.append(f"|--------|------|") + lines.append(f"| Deployed At | {rb.get('deployed_at', '?')} |") + if "minutes_since_deploy" in rb: + lines.append(f"| Minutes Before Detection | {rb['minutes_since_deploy']} |") + lines.append("") + lines.append(f"**Recommendation:** {rb.get('recommendation', 'N/A')}") + lines.append("") + + lines.append("## SLA Impact") + lines.append("") + tier = sla_impact.sla_tier + lines.append(f"| Metric | Value |") + lines.append(f"|--------|-------|") + lines.append(f"| Breach Risk | **{sla_impact.breach_risk.upper()}** |") + lines.append(f"| Error Budget Impact | {sla_impact.error_budget_impact_minutes} min/hr |") + lines.append(f"| Remaining Budget | {sla_impact.remaining_budget_percentage}% |") + lines.append(f"| Est. 
Time to Breach | {sla_impact.estimated_time_to_breach_minutes} min |") + lines.append(f"| Target Resolution | {tier.get('target_resolution_hours', '?')} hours |") + lines.append(f"| Target Response | {tier.get('target_response_minutes', '?')} minutes |") + lines.append("") + + if sla_impact.recommendations: + lines.append("### SLA Recommendations") + lines.append("") + for rec in sla_impact.recommendations: + lines.append(f"- {rec}") + lines.append("") + + lines.append("---") + lines.append("*Generated by severity_classifier.py*") + + return "\n".join(lines) + + +# ---------- CLI Entry Point --------------------------------------------------- + +def main() -> None: + """Parse arguments, read input, classify, and emit output.""" + parser = argparse.ArgumentParser( + description="Classify incident severity and generate escalation paths.", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog="""\ +examples: + %(prog)s incident.json + %(prog)s incident.json --format json + %(prog)s incident.json --format markdown + cat incident.json | %(prog)s + cat incident.json | %(prog)s --format json +""", + ) + + parser.add_argument( + "data_file", + nargs="?", + default=None, + help="JSON file with incident data (reads stdin if omitted)", + ) + parser.add_argument( + "--format", + choices=["text", "json", "markdown"], + default="text", + dest="output_format", + help="Output format (default: text)", + ) + + args = parser.parse_args() + + # -- Read input -- + try: + if args.data_file: + with open(args.data_file, "r", encoding="utf-8") as fh: + raw_data = json.load(fh) + else: + if sys.stdin.isatty(): + parser.error("No input file provided and stdin is a terminal. 
Pipe JSON or pass a file.") + raw_data = json.load(sys.stdin) + except json.JSONDecodeError as exc: + print(f"Error: invalid JSON input -- {exc}", file=sys.stderr) + sys.exit(1) + except FileNotFoundError: + print(f"Error: file not found -- {args.data_file}", file=sys.stderr) + sys.exit(1) + except IOError as exc: + print(f"Error: could not read input -- {exc}", file=sys.stderr) + sys.exit(1) + + # -- Parse and validate -- + try: + incident, impact, signals, context = parse_incident_data(raw_data) + except ValueError as exc: + print(f"Error: {exc}", file=sys.stderr) + sys.exit(1) + + # -- Classify -- + severity_score = classify_severity(incident, impact, signals, context) + + # -- Build outputs -- + escalation = build_escalation_path(severity_score, signals, context) + action_plan = build_action_plan(severity_score, incident, impact, signals, context) + sla_impact = assess_sla_impact(severity_score, impact, signals) + + # -- Format and print -- + if args.output_format == "json": + output = format_json(incident, severity_score, escalation, action_plan, sla_impact) + elif args.output_format == "markdown": + output = format_markdown(incident, severity_score, escalation, action_plan, sla_impact) + else: + output = format_text(incident, severity_score, escalation, action_plan, sla_impact) + + print(output) + + # -- Exit code reflects severity -- + if severity_score.severity_level == SeverityLevel.SEV1: + sys.exit(2) + elif severity_score.severity_level == SeverityLevel.SEV2: + sys.exit(1) + else: + sys.exit(0) + + +if __name__ == "__main__": + main()