Files
claude-skills-reference/engineering-team/incident-commander/scripts/postmortem_generator.py
Leo f6f50f5282 Fix CI workflows and installation documentation
- Replace non-existent anthropics/claude-code-action@v1 with direct bash steps in smart-sync.yml and pr-issue-auto-close.yml
- Add missing checkout steps to both workflows for WORKFLOW_KILLSWITCH access
- Fix Issue #189: Replace broken 'npx ai-agent-skills install' with working 'npx agent-skills-cli add' command
- Update README.md and INSTALLATION.md with correct Agent Skills CLI commands and repository links
- Verified: agent-skills-cli detects all 53 skills and works with 42+ AI agents

Fixes: Two GitHub Actions workflows that broke on PR #191 merge
Closes: #189
2026-02-16 11:30:18 +00:00

805 lines
38 KiB
Python

#!/usr/bin/env python3
"""
Postmortem Generator - Generate structured postmortem reports with 5-Whys analysis.
Produces comprehensive incident postmortem documents from structured JSON input,
including root cause analysis, contributing factor classification, action item
validation, MTTD/MTTR metrics, and customer impact summaries.
Usage:
python postmortem_generator.py incident_data.json
python postmortem_generator.py incident_data.json --format markdown
python postmortem_generator.py incident_data.json --format json
cat incident_data.json | python postmortem_generator.py
Input:
JSON object with keys: incident, timeline, resolution, action_items, participants.
See SKILL.md for the full input schema.
"""
import argparse
import json
import sys
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional, Tuple
# ---------- Constants and Configuration ----------
VERSION = "1.0.0"
# Severity rank: lower number = more severe.
SEVERITY_ORDER = {"SEV0": 0, "SEV1": 1, "SEV2": 2, "SEV3": 3, "SEV4": 4}
# Closed set of contributing-factor categories used by classification.
FACTOR_CATEGORIES = ("process", "tooling", "human", "environment", "external")
# Valid action-item types.
ACTION_TYPES = ("detection", "prevention", "mitigation", "process")
# Priority rank: lower number = more urgent.
PRIORITY_ORDER = {"P0": 0, "P1": 1, "P2": 2, "P3": 3, "P4": 4}
# Postmortems should be held within this many hours of resolution.
POSTMORTEM_TARGET_HOURS = 72
# Industry benchmarks for incident response (minutes, except postmortem)
BENCHMARKS = {
    "SEV0": {"mttd": 5, "mttr": 60, "mitigate": 30, "declare": 5},
    "SEV1": {"mttd": 10, "mttr": 120, "mitigate": 60, "declare": 10},
    "SEV2": {"mttd": 30, "mttr": 480, "mitigate": 120, "declare": 30},
    "SEV3": {"mttd": 60, "mttr": 1440, "mitigate": 240, "declare": 60},
    "SEV4": {"mttd": 120, "mttr": 2880, "mitigate": 480, "declare": 120},
}
# Maps a factor category to the action-item type expected to address it.
CAT_TO_ACTION = {"process": "process", "tooling": "detection", "human": "prevention",
                 "environment": "mitigation", "external": "prevention"}
# Per-category multiplier applied to a factor's positional base weight.
CAT_WEIGHT = {"process": 1.0, "tooling": 0.9, "human": 0.8, "environment": 0.7, "external": 0.6}
# Keywords used to classify contributing factors into categories
FACTOR_KEYWORDS = {
    "process": ["process", "procedure", "workflow", "review", "approval", "checklist",
                "runbook", "documentation", "policy", "standard", "protocol", "canary",
                "deployment", "rollback", "change management"],
    "tooling": ["tool", "monitor", "alert", "threshold", "automation", "test", "pipeline",
                "ci/cd", "observability", "dashboard", "logging", "infrastructure",
                "configuration", "config"],
    "human": ["training", "knowledge", "experience", "communication", "handoff", "fatigue",
              "oversight", "mistake", "error", "misunderstand", "assumption", "awareness"],
    "environment": ["load", "traffic", "scale", "capacity", "resource", "network", "hardware",
                    "region", "latency", "timeout", "connection", "performance", "spike"],
    "external": ["vendor", "third-party", "upstream", "downstream", "provider", "api",
                 "dependency", "partner", "dns", "cdn", "certificate"],
}
# 5-Whys templates per category (each list is 5 why->answer steps)
WHY_TEMPLATES = {
    "process": [
        "Why did this process gap exist? -> The existing process did not account for this scenario.",
        "Why was the scenario not accounted for? -> It was not identified during the last process review.",
        "Why was the process review incomplete? -> Reviews focus on known failure modes, not emerging risks.",
        "Why are emerging risks not surfaced? -> No systematic mechanism to capture lessons from near-misses.",
        "Why is there no near-miss capture mechanism? -> Incident learning is ad-hoc rather than systematic."],
    "tooling": [
        "Why did the tooling fail to catch this? -> The relevant metric was not monitored or the threshold was misconfigured.",
        "Why was the threshold misconfigured? -> It was set during initial deployment and never revisited.",
        "Why was it never revisited? -> There is no scheduled review of monitoring configurations.",
        "Why is there no scheduled review? -> Monitoring ownership is diffuse across teams.",
        "Why is ownership diffuse? -> No clear operational runbook assigns monitoring review responsibilities."],
    "human": [
        "Why did the human factor contribute? -> The individual lacked context needed to prevent the issue.",
        "Why was context lacking? -> Knowledge was siloed and not documented accessibly.",
        "Why was knowledge siloed? -> No structured onboarding or knowledge-sharing process for this area.",
        "Why is there no knowledge-sharing process? -> Team capacity has been focused on feature delivery.",
        "Why is capacity skewed toward features? -> Operational excellence is not weighted equally in planning."],
    "environment": [
        "Why did the environment cause this failure? -> System capacity was insufficient for the load pattern.",
        "Why was capacity insufficient? -> Load projections did not account for this traffic pattern.",
        "Why were projections inaccurate? -> Load testing does not replicate production-scale variability.",
        "Why doesn't load testing replicate production? -> Test environments lack realistic traffic generators.",
        "Why are traffic generators missing? -> Investment in production-like test infrastructure was deferred."],
    "external": [
        "Why did the external factor cause an incident? -> The system had a hard dependency with no fallback.",
        "Why was there no fallback? -> The integration was assumed to be highly available.",
        "Why was high availability assumed? -> SLA review of the external dependency was not performed.",
        "Why was SLA review skipped? -> No standard checklist for evaluating third-party dependencies.",
        "Why is there no evaluation checklist? -> Vendor management practices are informal and undocumented."],
}
# Canned recommendations per systemic theme surfaced by the 5-Whys analysis.
THEME_RECS = {
    "process": ["Establish a quarterly process review cadence covering change management and deployment procedures.",
                "Implement a near-miss tracking system to surface latent risks before they become incidents.",
                "Create pre-deployment checklists that require sign-off from the service owner."],
    "tooling": ["Schedule quarterly reviews of alerting thresholds and monitoring coverage.",
                "Assign explicit monitoring ownership per service in operational runbooks.",
                "Invest in synthetic monitoring and canary analysis for critical paths."],
    "human": ["Build structured onboarding that covers incident-prone areas and past postmortems.",
              "Implement blameless knowledge-sharing sessions after each incident.",
              "Balance operational excellence work alongside feature delivery in sprint planning."],
    "environment": ["Conduct periodic capacity planning reviews using production traffic replays.",
                    "Invest in production-like load-testing infrastructure with realistic traffic profiles.",
                    "Implement auto-scaling policies with validated upper-bound thresholds."],
    "external": ["Perform formal SLA reviews for all third-party dependencies annually.",
                 "Implement circuit breakers and fallbacks for external service integrations.",
                 "Maintain a dependency registry with risk ratings and contingency plans."],
}
# Suggested action-item text used when a factor category has no matching action.
MISSING_ACTION_TEMPLATES = {
    "process": "Create or update runbook/checklist to prevent recurrence of this process gap",
    "detection": "Add monitoring and alerting to detect this class of issue earlier",
    "mitigation": "Implement auto-scaling or circuit-breaker to reduce blast radius",
    "prevention": "Add automated safeguards (canary deploy, load test gate) to prevent recurrence",
}
# ---------- Data Model Classes ----------
class IncidentData:
    """Incident metadata parsed from the 'incident' section of the input JSON."""

    def __init__(self, data: Dict[str, Any]) -> None:
        # Every field falls back to a safe placeholder so a sparse input
        # still produces a renderable report.
        self.id: str = data.get("id", "UNKNOWN")
        self.title: str = data.get("title", "Untitled Incident")
        self.severity: str = data.get("severity", "SEV3").upper()
        self.commander: str = data.get("commander", "Unassigned")
        self.service: str = data.get("service", "unknown-service")
        self.affected_services: List[str] = data.get("affected_services", [])

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the incident metadata back to a plain dict."""
        return {
            "id": self.id,
            "title": self.title,
            "severity": self.severity,
            "commander": self.commander,
            "service": self.service,
            "affected_services": self.affected_services,
        }
class TimelineMetrics:
    """Timing metrics (MTTD, MTTR, etc.) computed from raw timeline timestamps."""

    def __init__(self, timeline: Dict[str, str], severity: str) -> None:
        self.severity = severity
        # Missing or unparseable timestamps become None, and any metric that
        # depends on them is reported as None rather than raising.
        self.issue_started = self._parse(timeline.get("issue_started"))
        self.detected_at = self._parse(timeline.get("detected_at"))
        self.declared_at = self._parse(timeline.get("declared_at"))
        self.mitigated_at = self._parse(timeline.get("mitigated_at"))
        self.resolved_at = self._parse(timeline.get("resolved_at"))
        self.postmortem_at = self._parse(timeline.get("postmortem_at"))

    @staticmethod
    def _parse(ts: Optional[str]) -> Optional[datetime]:
        """Parse an ISO-like timestamp string; naive values are assumed UTC."""
        if ts is None:
            return None
        for fmt in ("%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%dT%H:%M:%S%z", "%Y-%m-%dT%H:%M:%S"):
            try:
                parsed = datetime.strptime(ts, fmt)
            except ValueError:
                continue
            return parsed if parsed.tzinfo else parsed.replace(tzinfo=timezone.utc)
        return None

    def _delta_min(self, start: Optional[datetime], end: Optional[datetime]) -> Optional[float]:
        """Minutes from start to end, rounded to 0.1; None if either is missing."""
        if start is None or end is None:
            return None
        return round((end - start).total_seconds() / 60.0, 1)

    @property
    def mttd(self) -> Optional[float]:
        # Mean time to detect: issue start -> detection.
        return self._delta_min(self.issue_started, self.detected_at)

    @property
    def mttr(self) -> Optional[float]:
        # Mean time to resolve: detection -> resolution.
        return self._delta_min(self.detected_at, self.resolved_at)

    @property
    def time_to_mitigate(self) -> Optional[float]:
        return self._delta_min(self.detected_at, self.mitigated_at)

    @property
    def time_to_declare(self) -> Optional[float]:
        return self._delta_min(self.detected_at, self.declared_at)

    @property
    def postmortem_timeliness_hours(self) -> Optional[float]:
        """Hours from resolution until the postmortem meeting, or None."""
        minutes = self._delta_min(self.resolved_at, self.postmortem_at)
        return None if minutes is None else round(minutes / 60.0, 1)

    @property
    def postmortem_on_time(self) -> Optional[bool]:
        hours = self.postmortem_timeliness_hours
        return None if hours is None else hours <= POSTMORTEM_TARGET_HOURS

    def benchmark_comparison(self) -> Dict[str, Dict[str, Any]]:
        """Compare each available metric against the severity's benchmark.

        Metrics whose timestamps are missing are omitted from the result.
        Unknown severities fall back to the SEV3 benchmarks.
        """
        bench = BENCHMARKS.get(self.severity, BENCHMARKS["SEV3"])
        minute_metrics = (
            ("mttd", self.mttd, bench["mttd"]),
            ("mttr", self.mttr, bench["mttr"]),
            ("time_to_mitigate", self.time_to_mitigate, bench["mitigate"]),
            ("time_to_declare", self.time_to_declare, bench["declare"]),
        )
        results: Dict[str, Dict[str, Any]] = {}
        for name, actual, target in minute_metrics:
            if actual is None:
                continue
            results[name] = {
                "actual_minutes": actual,
                "benchmark_minutes": target,
                "met_benchmark": actual <= target,
                "delta_minutes": round(actual - target, 1),
            }
        hours = self.postmortem_timeliness_hours
        if hours is not None:
            results["postmortem_timeliness"] = {
                "actual_hours": hours,
                "target_hours": POSTMORTEM_TARGET_HOURS,
                "met_target": self.postmortem_on_time,
                "delta_hours": round(hours - POSTMORTEM_TARGET_HOURS, 1),
            }
        return results

    def to_dict(self) -> Dict[str, Any]:
        """Serialize all metrics plus the benchmark comparison."""
        return {
            "mttd_minutes": self.mttd,
            "mttr_minutes": self.mttr,
            "time_to_mitigate_minutes": self.time_to_mitigate,
            "time_to_declare_minutes": self.time_to_declare,
            "postmortem_timeliness_hours": self.postmortem_timeliness_hours,
            "postmortem_on_time": self.postmortem_on_time,
            "benchmarks": self.benchmark_comparison(),
        }
class ContributingFactor:
    """A contributing factor classified by keyword, with weight and action mapping."""

    def __init__(self, description: str, index: int) -> None:
        self.description = description
        self.index = index  # position in the input list; earlier factors weigh more
        self.category = self._classify()
        # Positional base weight decays 0.15 per slot (floor 0.3), then is
        # scaled by the category's relative multiplier.
        base = max(1.0 - index * 0.15, 0.3)
        self.weight = round(base * CAT_WEIGHT.get(self.category, 0.8), 2)
        self.mapped_action_type = CAT_TO_ACTION.get(self.category, "process")

    def _classify(self) -> str:
        """Pick the category whose keywords match most often; ties keep the
        earliest category in FACTOR_KEYWORDS order, defaulting to 'process'."""
        text = self.description.lower()
        best_cat, best_hits = "process", 0
        for cat, keywords in FACTOR_KEYWORDS.items():
            hits = sum(kw in text for kw in keywords)
            if hits > best_hits:
                best_cat, best_hits = cat, hits
        return best_cat

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the classified factor."""
        return {
            "description": self.description,
            "category": self.category,
            "weight": self.weight,
            "mapped_action_type": self.mapped_action_type,
        }
class FiveWhysAnalysis:
    """A 5-Whys causal chain built from a contributing factor's category template."""

    def __init__(self, factor: ContributingFactor) -> None:
        self.factor = factor
        self.systemic_theme: str = factor.category
        # The first link restates the factor itself; the remaining five come
        # from the category's template (falling back to the 'process' one).
        template = WHY_TEMPLATES.get(factor.category, WHY_TEMPLATES["process"])
        self.chain: List[str] = [f"Why? {factor.description}", *template]

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the analysis chain."""
        return {
            "factor": self.factor.description,
            "category": self.factor.category,
            "chain": self.chain,
            "systemic_theme": self.systemic_theme,
        }
class ActionItem:
    """An action item parsed from input, validated and quality-scored."""

    def __init__(self, data: Dict[str, Any]) -> None:
        self.title: str = data.get("title", "")
        self.owner: str = data.get("owner", "")
        self.priority: str = data.get("priority", "P3")
        self.deadline: str = data.get("deadline", "")
        self.type: str = data.get("type", "process")
        self.status: str = data.get("status", "open")
        self.validation_issues: List[str] = []
        self.quality_score: int = 0
        self._validate()

    def _validate(self) -> None:
        """Populate validation_issues and recompute the quality score."""
        issues: List[str] = []
        if not self.title:
            issues.append("Missing title")
        if not self.owner:
            issues.append("Missing owner")
        if not self.deadline:
            issues.append("Missing deadline")
        if self.priority not in PRIORITY_ORDER:
            issues.append(f"Invalid priority: {self.priority}")
        if self.type not in ACTION_TYPES:
            issues.append(f"Invalid type: {self.type}")
        self.validation_issues = issues
        self.quality_score = self._score_quality()

    def _score_quality(self) -> int:
        """Heuristic 0-100 score: specific, measurable, achievable."""
        measurable_kws = ("%", "threshold", "within", "before",
                          "after", "less than", "greater than")
        title_lower = self.title.lower()
        checks = (
            (len(self.title) > 10, 20),          # non-trivial title
            (bool(self.owner), 20),              # has an owner
            (bool(self.deadline), 20),           # has a deadline
            (self.priority in PRIORITY_ORDER, 10),
            (self.type in ACTION_TYPES, 10),
            (any(kw in title_lower for kw in measurable_kws), 10),  # measurable
            (len(self.title.split()) >= 5, 10),  # descriptive
        )
        total = sum(points for passed, points in checks if passed)
        return min(total, 100)

    @property
    def is_valid(self) -> bool:
        return not self.validation_issues

    @property
    def is_past_deadline(self) -> bool:
        """True when an open item's deadline (YYYY-MM-DD, treated as UTC) has passed."""
        if not self.deadline or self.status != "open":
            return False
        try:
            due = datetime.strptime(self.deadline, "%Y-%m-%d").replace(tzinfo=timezone.utc)
        except ValueError:
            # Malformed deadlines never count as overdue.
            return False
        return datetime.now(timezone.utc) > due

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the item with its validation results."""
        return {
            "title": self.title,
            "owner": self.owner,
            "priority": self.priority,
            "deadline": self.deadline,
            "type": self.type,
            "status": self.status,
            "is_valid": self.is_valid,
            "validation_issues": self.validation_issues,
            "quality_score": self.quality_score,
            "is_past_deadline": self.is_past_deadline,
        }
class PostmortemReport:
    """Assembles all analysis components into a complete postmortem document."""

    def __init__(self, raw: Dict[str, Any]) -> None:
        self.raw = raw
        self.incident = IncidentData(raw.get("incident", {}))
        self.timeline = TimelineMetrics(raw.get("timeline", {}), self.incident.severity)
        self.resolution: Dict[str, Any] = raw.get("resolution", {})
        self.participants: List[Dict[str, str]] = raw.get("participants", [])
        # Derived analysis artifacts.
        factor_texts = self.resolution.get("contributing_factors", [])
        self.contributing_factors = [ContributingFactor(text, idx)
                                     for idx, text in enumerate(factor_texts)]
        self.five_whys = [FiveWhysAnalysis(cf) for cf in self.contributing_factors]
        self.action_items = [ActionItem(item) for item in raw.get("action_items", [])]
        self.factor_distribution = self._compute_factor_distribution()
        self.coverage_gaps = self._find_coverage_gaps()
        self.suggested_actions = self._suggest_missing_actions()
        self.theme_recommendations = self._build_theme_recommendations()

    def _compute_factor_distribution(self) -> Dict[str, float]:
        """Percentage of total factor weight per category (all categories listed)."""
        totals = {cat: 0.0 for cat in FACTOR_CATEGORIES}
        for cf in self.contributing_factors:
            totals[cf.category] += cf.weight
        # Guard against division by zero when there are no factors.
        denom = sum(cf.weight for cf in self.contributing_factors) or 1.0
        return {cat: round(weight / denom * 100, 1) for cat, weight in totals.items()}

    def _find_coverage_gaps(self) -> List[str]:
        """Warnings for factor categories with no matching action-item type."""
        present_types = {item.type for item in self.action_items}
        gaps: List[str] = []
        for cat in {cf.category for cf in self.contributing_factors}:
            needed = CAT_TO_ACTION.get(cat)
            if needed and needed not in present_types:
                gaps.append(f"No '{needed}' action item to address '{cat}' contributing factor")
        return gaps

    def _suggest_missing_actions(self) -> List[Dict[str, str]]:
        """Templated action suggestions for each coverage gap."""
        present_types = {item.type for item in self.action_items}
        suggestions: List[Dict[str, str]] = []
        for cat in {cf.category for cf in self.contributing_factors}:
            needed = CAT_TO_ACTION.get(cat)
            if needed and needed not in present_types:
                suggestions.append({
                    "type": needed,
                    "suggestion": MISSING_ACTION_TEMPLATES.get(needed, "Add an action item for this gap"),
                    "reason": f"No action item addresses the '{cat}' contributing factor"})
        return suggestions

    def _build_theme_recommendations(self) -> Dict[str, List[str]]:
        """Recommendations keyed by systemic theme, de-duplicated in first-seen order."""
        recs: Dict[str, List[str]] = {}
        for analysis in self.five_whys:
            theme = analysis.systemic_theme
            if theme not in recs:
                recs[theme] = THEME_RECS.get(theme, [])
        return recs

    def customer_impact_summary(self) -> Dict[str, Any]:
        """Quantify customer impact and whether external communication is needed."""
        impact = self.resolution.get("customer_impact", {})
        affected = impact.get("affected_users", 0)
        failed_tx = impact.get("failed_transactions", 0)
        revenue = impact.get("revenue_impact_usd", 0)
        data_loss = impact.get("data_loss", False)
        if affected > 10000 or revenue > 50000:
            sev = "high"
        elif affected > 1000 or revenue > 5000:
            sev = "medium"
        else:
            sev = "low"
        return {
            "affected_users": affected,
            "failed_transactions": failed_tx,
            "revenue_impact_usd": revenue,
            "data_loss": data_loss,
            "data_integrity": "compromised" if data_loss else "intact",
            # Notify customers on broad impact, any data loss, or large revenue hit.
            "customer_communication_required": affected > 1000 or data_loss or revenue > 10000,
            "impact_severity": sev,
        }

    def executive_summary(self) -> str:
        """One-paragraph narrative summary suitable for leadership."""
        impact = self.customer_impact_summary()
        mttr = self.timeline.mttr
        duration = f"{mttr:.0f} minutes" if mttr is not None else "unknown duration"
        sentences = [
            f"On {self._fmt_date(self.timeline.issue_started)}, a {self.incident.severity} "
            f"incident (\"{self.incident.title}\") impacted the {self.incident.service} service.",
            f"The root cause was identified as: {self.resolution.get('root_cause', 'Unknown root cause')}.",
            f"The incident was resolved in {duration}, affecting approximately "
            f"{impact['affected_users']:,} users with an estimated revenue impact of ${impact['revenue_impact_usd']:,.2f}.",
        ]
        if impact["data_loss"]:
            sentences.append("Data loss was confirmed; affected customers must be notified.")
        else:
            sentences.append("No data loss occurred during this incident.")
        return " ".join(sentences)

    @staticmethod
    def _fmt_date(dt: Optional[datetime]) -> str:
        """Human-readable UTC date, with a fallback for missing timestamps."""
        if dt is None:
            return "an unknown date"
        return dt.strftime("%Y-%m-%d at %H:%M UTC")

    def overdue_p1_items(self) -> List[Dict[str, str]]:
        """Open P0/P1 action items whose deadline has already passed."""
        return [
            {"title": item.title, "owner": item.owner, "deadline": item.deadline}
            for item in self.action_items
            if item.priority in ("P0", "P1") and item.is_past_deadline
        ]

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the full report for the JSON formatter."""
        return {
            "version": VERSION, "incident": self.incident.to_dict(),
            "executive_summary": self.executive_summary(),
            "timeline_metrics": self.timeline.to_dict(),
            "customer_impact": self.customer_impact_summary(),
            "root_cause": self.resolution.get("root_cause", ""),
            "contributing_factors": [cf.to_dict() for cf in self.contributing_factors],
            "factor_distribution": self.factor_distribution,
            "five_whys_analysis": [a.to_dict() for a in self.five_whys],
            "theme_recommendations": self.theme_recommendations,
            "mitigation_steps": self.resolution.get("mitigation_steps", []),
            "permanent_fix": self.resolution.get("permanent_fix", ""),
            "action_items": [item.to_dict() for item in self.action_items],
            "action_item_coverage_gaps": self.coverage_gaps,
            "suggested_actions": self.suggested_actions,
            "overdue_p1_items": self.overdue_p1_items(),
            "participants": self.participants,
        }
# ---------- Core Analysis Helpers ----------
def _bar(pct: float, width: int = 30) -> str:
"""Render a text-based horizontal bar chart segment."""
filled = int(round(pct / 100 * width))
return "[" + "#" * filled + "." * (width - filled) + "]"
def _generate_lessons(report: PostmortemReport) -> List[str]:
    """Derive lessons learned from the analysis.

    Emits one lesson per detected weakness (slow detection, a dominant factor
    category, coverage gaps, weak action items, late postmortem); falls back
    to a single positive lesson when nothing is flagged.
    """
    lessons: List[str] = []
    severity = report.incident.severity
    bench = BENCHMARKS.get(severity, BENCHMARKS["SEV3"])

    # Slow detection relative to the severity benchmark.
    mttd = report.timeline.mttd
    if mttd is not None and mttd > bench["mttd"]:
        lessons.append(
            f"Detection took {mttd:.0f} minutes, exceeding the {bench['mttd']}-minute "
            f"benchmark for {severity}. Invest in earlier detection mechanisms.")

    # A single category carrying at least half of the factor weight.
    dist = report.factor_distribution
    dominant = max(dist, key=dist.get)
    if dist[dominant] >= 50:
        lessons.append(
            f"The '{dominant}' category accounts for {dist[dominant]:.0f}% of contributing factors. "
            f"Targeted improvements in this area will yield the highest return.")

    # Factor categories with no remediation action.
    if report.coverage_gaps:
        lessons.append(
            f"There are {len(report.coverage_gaps)} action item coverage gap(s). "
            "Ensure every contributing factor category has a corresponding remediation action.")

    # Weak average action-item quality.
    if report.action_items:
        avg_quality = sum(a.quality_score for a in report.action_items) / len(report.action_items)
    else:
        avg_quality = 0
    if avg_quality < 70:
        lessons.append(
            f"Average action item quality score is {avg_quality:.0f}/100. "
            "Make action items more specific with measurable targets and clear ownership.")

    # Late postmortem (explicit `is False` check: None means "unknown", not late).
    if report.timeline.postmortem_on_time is False:
        hours = report.timeline.postmortem_timeliness_hours
        lessons.append(
            f"Postmortem was held {hours:.0f} hours after resolution, exceeding the "
            f"{POSTMORTEM_TARGET_HOURS}-hour target. Schedule postmortems sooner to capture context.")

    if not lessons:
        lessons.append("This incident was handled within benchmarks. Continue reinforcing "
                       "current practices and share this postmortem for organizational learning.")
    return lessons
# ---------- Output Formatters ----------
def format_text(report: PostmortemReport) -> str:
    """Format the postmortem as plain text.

    Sections: header, executive summary, timeline metrics with benchmark
    comparison, customer impact, root cause, 5-Whys, mitigation, action
    items, participants, and lessons learned.
    """
    L: List[str] = []  # accumulated output lines, joined at the end
    W = 72  # banner width for section rules

    # Section-heading helpers: h1 draws a full banner, h2 a lighter divider.
    def h1(title: str) -> None:
        L.append(""); L.append("=" * W); L.append(f" {title}"); L.append("=" * W)
    def h2(title: str) -> None:
        L.append(""); L.append(f"--- {title} ---")

    inc = report.incident
    h1(f"POSTMORTEM: {inc.title}")
    L.append(f" ID: {inc.id} | Severity: {inc.severity} | Service: {inc.service}")
    L.append(f" Commander: {inc.commander}")
    if inc.affected_services:
        L.append(f" Affected services: {', '.join(inc.affected_services)}")
    # Executive Summary
    h1("EXECUTIVE SUMMARY")
    L.append("")
    # One sentence per line; re-add the trailing period lost by the split.
    for sentence in report.executive_summary().split(". "):
        s = sentence.strip()
        if s and not s.endswith("."): s += "."
        if s: L.append(f" {s}")
    # Timeline Metrics
    h1("TIMELINE METRICS")
    tm = report.timeline
    L.append("")
    for label, val, unit in [("MTTD (Time to Detect)", tm.mttd, "min"),
                             ("MTTR (Time to Resolve)", tm.mttr, "min"),
                             ("Time to Mitigate", tm.time_to_mitigate, "min"),
                             ("Time to Declare", tm.time_to_declare, "min"),
                             ("Postmortem Timeliness", tm.postmortem_timeliness_hours, "hrs")]:
        L.append(f" {label:<30s} {f'{val:.1f} {unit}' if val is not None else 'N/A'}")
    h2("Benchmark Comparison")
    # Minute-based metrics and the hour-based postmortem metric carry
    # different key sets; branch on which keys are present.
    for name, d in tm.benchmark_comparison().items():
        if "actual_minutes" in d:
            st = "PASS" if d["met_benchmark"] else "FAIL"
            L.append(f" {name:<25s} actual={d['actual_minutes']}min benchmark={d['benchmark_minutes']}min [{st}]")
        elif "actual_hours" in d:
            st = "PASS" if d["met_target"] else "FAIL"
            L.append(f" {name:<25s} actual={d['actual_hours']}hrs target={d['target_hours']}hrs [{st}]")
    # Customer Impact
    h1("CUSTOMER IMPACT")
    ci = report.customer_impact_summary()
    L.append("")
    L.append(f" Affected users: {ci['affected_users']:,}")
    L.append(f" Failed transactions: {ci['failed_transactions']:,}")
    L.append(f" Revenue impact: ${ci['revenue_impact_usd']:,.2f}")
    L.append(f" Data integrity: {ci['data_integrity']}")
    L.append(f" Impact severity: {ci['impact_severity']}")
    L.append(f" Comms required: {'Yes' if ci['customer_communication_required'] else 'No'}")
    # Root Cause
    h1("ROOT CAUSE ANALYSIS")
    L.append("")
    L.append(f" {report.resolution.get('root_cause', 'Unknown')}")
    h2("Contributing Factors")
    for f in report.contributing_factors:
        L.append(f" [{f.category.upper():<12s} w={f.weight:.2f}] {f.description}")
    h2("Factor Distribution")
    # Sorted by descending share; zero-weight categories are omitted.
    for cat, pct in sorted(report.factor_distribution.items(), key=lambda x: -x[1]):
        if pct > 0:
            L.append(f" {cat:<14s} {pct:5.1f}% {_bar(pct)}")
    # 5-Whys
    h1("5-WHYS ANALYSIS")
    for analysis in report.five_whys:
        L.append("")
        L.append(f" Factor: {analysis.factor.description}")
        L.append(f" Theme: {analysis.systemic_theme}")
        for i, step in enumerate(analysis.chain):
            L.append(f" {i}. {step}")
    h2("Theme-Based Recommendations")
    for theme, recs in report.theme_recommendations.items():
        L.append(f" [{theme.upper()}]")
        for rec in recs:
            L.append(f" - {rec}")
    # Mitigation & Fix
    h1("MITIGATION AND RESOLUTION")
    h2("Mitigation Steps Taken")
    for step in report.resolution.get("mitigation_steps", []):
        L.append(f" - {step}")
    h2("Permanent Fix")
    L.append(f" {report.resolution.get('permanent_fix', 'TBD')}")
    # Action Items
    h1("ACTION ITEMS")
    L.append("")
    hdr = f" {'Priority':<10s} {'Type':<14s} {'Owner':<25s} {'Deadline':<12s} {'Quality':<8s} Title"
    L.append(hdr)
    L.append(" " + "-" * (len(hdr) - 2))
    # Highest priority first; unknown priorities sink to the bottom (rank 99).
    for a in sorted(report.action_items, key=lambda x: PRIORITY_ORDER.get(x.priority, 99)):
        flag = " *OVERDUE*" if a.is_past_deadline else ""
        L.append(f" {a.priority:<10s} {a.type:<14s} {a.owner:<25s} {a.deadline:<12s} "
                 f"{a.quality_score:<8d} {a.title}{flag}")
    if report.coverage_gaps:
        h2("Coverage Gaps")
        for gap in report.coverage_gaps:
            L.append(f" WARNING: {gap}")
    if report.suggested_actions:
        h2("Suggested Additional Actions")
        for s in report.suggested_actions:
            L.append(f" [{s['type'].upper()}] {s['suggestion']}")
            L.append(f" Reason: {s['reason']}")
    overdue = report.overdue_p1_items()
    if overdue:
        h2("Overdue P0/P1 Items")
        for item in overdue:
            L.append(f" OVERDUE: {item['title']} (owner: {item['owner']}, deadline: {item['deadline']})")
    # Participants
    h1("PARTICIPANTS")
    L.append("")
    for p in report.participants:
        L.append(f" {p.get('name', 'Unknown'):<25s} {p.get('role', '')}")
    # Lessons Learned
    h1("LESSONS LEARNED")
    L.append("")
    for i, lesson in enumerate(_generate_lessons(report), 1):
        L.append(f" {i}. {lesson}")
    # Footer banner.
    L.append("")
    L.append("=" * W)
    L.append(f" Generated by postmortem_generator v{VERSION}")
    L.append("=" * W)
    L.append("")
    return "\n".join(L)
def format_json(report: PostmortemReport) -> str:
    """Serialize the postmortem (plus lessons learned) as pretty-printed JSON."""
    payload = report.to_dict()
    payload["lessons_learned"] = _generate_lessons(report)
    # default=str covers values (e.g. datetimes) json cannot serialize natively.
    return json.dumps(payload, indent=2, default=str)
def format_markdown(report: PostmortemReport) -> str:
    """Format the postmortem as a Markdown document."""
    L: List[str] = []  # accumulated output lines, joined at the end
    inc = report.incident
    # Header metadata table.
    L.append(f"# Postmortem: {inc.title}")
    L.append("")
    L.append("| Field | Value |")
    L.append("|-------|-------|")
    L.append(f"| **ID** | {inc.id} |")
    L.append(f"| **Severity** | {inc.severity} |")
    L.append(f"| **Service** | {inc.service} |")
    L.append(f"| **Commander** | {inc.commander} |")
    if inc.affected_services:
        L.append(f"| **Affected Services** | {', '.join(inc.affected_services)} |")
    L.append("")
    # Executive Summary
    L.append("## Executive Summary\n")
    L.append(report.executive_summary())
    L.append("")
    # Timeline Metrics
    L.append("## Timeline Metrics\n")
    L.append("| Metric | Value | Benchmark | Status |")
    L.append("|--------|-------|-----------|--------|")
    labels = {"mttd": "MTTD (Time to Detect)", "mttr": "MTTR (Time to Resolve)",
              "time_to_mitigate": "Time to Mitigate", "time_to_declare": "Time to Declare",
              "postmortem_timeliness": "Postmortem Timeliness"}
    for key, label in labels.items():
        # NOTE(review): benchmark_comparison() is recomputed on each iteration;
        # harmless at this size but could be hoisted out of the loop.
        b = report.timeline.benchmark_comparison().get(key)
        if b and "actual_minutes" in b:
            st = "PASS" if b["met_benchmark"] else "FAIL"
            L.append(f"| {label} | {b['actual_minutes']} min | {b['benchmark_minutes']} min | {st} |")
        elif b and "actual_hours" in b:
            st = "PASS" if b["met_target"] else "FAIL"
            L.append(f"| {label} | {b['actual_hours']} hrs | {b['target_hours']} hrs | {st} |")
    L.append("")
    # Customer Impact
    L.append("## Customer Impact\n")
    ci = report.customer_impact_summary()
    L.append(f"- **Affected users:** {ci['affected_users']:,}")
    L.append(f"- **Failed transactions:** {ci['failed_transactions']:,}")
    L.append(f"- **Revenue impact:** ${ci['revenue_impact_usd']:,.2f}")
    L.append(f"- **Data integrity:** {ci['data_integrity']}")
    L.append(f"- **Impact severity:** {ci['impact_severity']}")
    L.append(f"- **Customer communication required:** {'Yes' if ci['customer_communication_required'] else 'No'}")
    L.append("")
    # Root Cause Analysis
    L.append("## Root Cause Analysis\n")
    L.append(f"**Root cause:** {report.resolution.get('root_cause', 'Unknown')}")
    L.append("")
    L.append("### Contributing Factors\n")
    L.append("| # | Category | Weight | Description |")
    L.append("|---|----------|--------|-------------|")
    for i, f in enumerate(report.contributing_factors, 1):
        L.append(f"| {i} | {f.category} | {f.weight:.2f} | {f.description} |")
    L.append("")
    L.append("### Factor Distribution\n")
    L.append("```")
    # Descending share; zero-weight categories omitted.
    for cat, pct in sorted(report.factor_distribution.items(), key=lambda x: -x[1]):
        if pct > 0:
            L.append(f" {cat:<14s} {pct:5.1f}% {_bar(pct, 25)}")
    L.append("```")
    L.append("")
    # 5-Whys
    L.append("## 5-Whys Analysis\n")
    for analysis in report.five_whys:
        L.append(f"### Factor: {analysis.factor.description}")
        L.append(f"**Systemic theme:** {analysis.systemic_theme}\n")
        for i, step in enumerate(analysis.chain):
            L.append(f"{i}. {step}")
        # Blank line terminates the ordered list before the next heading.
        L.append("")
    L.append("### Theme-Based Recommendations\n")
    for theme, recs in report.theme_recommendations.items():
        L.append(f"**{theme.capitalize()}:**")
        for rec in recs:
            L.append(f"- {rec}")
        L.append("")
    # Mitigation
    L.append("## Mitigation and Resolution\n")
    L.append("### Mitigation Steps Taken\n")
    for step in report.resolution.get("mitigation_steps", []):
        L.append(f"- {step}")
    L.append("")
    L.append("### Permanent Fix\n")
    L.append(report.resolution.get("permanent_fix", "TBD"))
    L.append("")
    # Action Items
    L.append("## Action Items\n")
    L.append("| Priority | Type | Owner | Deadline | Quality | Title |")
    L.append("|----------|------|-------|----------|---------|-------|")
    # Highest priority first; unknown priorities sink to the bottom (rank 99).
    for a in sorted(report.action_items, key=lambda x: PRIORITY_ORDER.get(x.priority, 99)):
        flag = " **OVERDUE**" if a.is_past_deadline else ""
        L.append(f"| {a.priority} | {a.type} | {a.owner} | {a.deadline} | {a.quality_score}/100 | {a.title}{flag} |")
    L.append("")
    if report.coverage_gaps:
        L.append("### Coverage Gaps\n")
        for gap in report.coverage_gaps:
            L.append(f"> **WARNING:** {gap}")
        L.append("")
    if report.suggested_actions:
        L.append("### Suggested Additional Actions\n")
        for s in report.suggested_actions:
            L.append(f"- **[{s['type'].upper()}]** {s['suggestion']}")
            L.append(f" - _Reason: {s['reason']}_")
        L.append("")
    overdue = report.overdue_p1_items()
    if overdue:
        L.append("### Overdue P0/P1 Items\n")
        for item in overdue:
            L.append(f"- **{item['title']}** (owner: {item['owner']}, deadline: {item['deadline']})")
        L.append("")
    # Participants
    L.append("## Participants\n")
    L.append("| Name | Role |")
    L.append("|------|------|")
    for p in report.participants:
        L.append(f"| {p.get('name', 'Unknown')} | {p.get('role', '')} |")
    L.append("")
    # Lessons Learned
    L.append("## Lessons Learned\n")
    for i, lesson in enumerate(_generate_lessons(report), 1):
        L.append(f"{i}. {lesson}")
    L.append("")
    L.append("---")
    L.append(f"_Generated by postmortem_generator v{VERSION}_")
    L.append("")
    return "\n".join(L)
# ---------- Input Loading ----------
def load_input(filepath: Optional[str]) -> Dict[str, Any]:
    """Load incident data from *filepath*, or from stdin when no path is given.

    Prints a message to stderr and exits with status 1 on a missing file,
    invalid JSON, or an interactive stdin with no piped data.
    """
    if filepath:
        try:
            with open(filepath, "r", encoding="utf-8") as fh:
                return json.load(fh)
        except FileNotFoundError:
            print(f"Error: File not found: {filepath}", file=sys.stderr)
            sys.exit(1)
        except json.JSONDecodeError as exc:
            print(f"Error: Invalid JSON in {filepath}: {exc}", file=sys.stderr)
            sys.exit(1)
    # No path given: fall back to stdin, but refuse to block on a terminal.
    if sys.stdin.isatty():
        print("Error: No input file specified and no data on stdin.", file=sys.stderr)
        print("Usage: postmortem_generator.py [data_file] or pipe JSON via stdin.", file=sys.stderr)
        sys.exit(1)
    try:
        return json.load(sys.stdin)
    except json.JSONDecodeError as exc:
        print(f"Error: Invalid JSON on stdin: {exc}", file=sys.stderr)
        sys.exit(1)
def validate_input(data: Dict[str, Any]) -> List[str]:
    """Return a list of validation warnings (non-fatal); empty means clean."""
    warnings: List[str] = []
    # Required top-level sections.
    warnings.extend(f"Missing '{key}' section"
                    for key in ("incident", "timeline", "resolution", "action_items")
                    if key not in data)
    # Timestamps needed to compute the core metrics.
    timeline = data.get("timeline", {})
    warnings.extend(f"Missing timeline field: {ts}"
                    for ts in ("issue_started", "detected_at", "mitigated_at", "resolved_at")
                    if ts not in timeline)
    resolution = data.get("resolution", {})
    if "root_cause" not in resolution:
        warnings.append("Missing 'root_cause' in resolution")
    if not resolution.get("contributing_factors"):
        warnings.append("No contributing factors provided")
    return warnings
# ---------- CLI Entry Point ----------
def main() -> None:
    """CLI entry point: parse arguments, load and validate input, render the report."""
    parser = argparse.ArgumentParser(
        description="Generate structured postmortem reports with 5-Whys analysis.",
        epilog="Reads JSON from a file or stdin. Outputs text, JSON, or markdown.")
    parser.add_argument("data_file", nargs="?", default=None,
                        help="JSON file with incident + resolution data (reads stdin if omitted)")
    parser.add_argument("--format", choices=["text", "json", "markdown"], default="text",
                        dest="output_format", help="Output format (default: text)")
    args = parser.parse_args()

    data = load_input(args.data_file)
    # Validation warnings are advisory only; generation proceeds regardless.
    for warning in validate_input(data):
        print(f"Warning: {warning}", file=sys.stderr)

    report = PostmortemReport(data)
    renderers = {"text": format_text, "json": format_json, "markdown": format_markdown}
    print(renderers[args.output_format](report))


if __name__ == "__main__":
    main()