merge: resolve conflict with dev, keep POWERFUL-tier SKILL.md
Kept our SKILL.md (POWERFUL-tier, 669 lines) over the codex-synced version. Accepted all new files from dev (additional scripts, references, assets).
This commit is contained in:
@@ -3,7 +3,7 @@
|
||||
"name": "claude-code-skills",
|
||||
"description": "Production-ready skill packages for AI agents - Marketing, Engineering, Product, C-Level, PM, and RA/QM",
|
||||
"repository": "https://github.com/alirezarezvani/claude-skills",
|
||||
"total_skills": 53,
|
||||
"total_skills": 54,
|
||||
"skills": [
|
||||
{
|
||||
"name": "customer-success-manager",
|
||||
@@ -47,6 +47,12 @@
|
||||
"category": "engineering",
|
||||
"description": "Code review automation for TypeScript, JavaScript, Python, Go, Swift, Kotlin. Analyzes PRs for complexity and risk, checks code quality for SOLID violations and code smells, generates review reports. Use when reviewing pull requests, analyzing code quality, identifying issues, generating review checklists."
|
||||
},
|
||||
{
|
||||
"name": "incident-commander",
|
||||
"source": "../../engineering-team/incident-commander",
|
||||
"category": "engineering",
|
||||
"description": "Production incident management with structured timeline analysis, severity classification (SEV1-4), automated postmortem generation, and SLA tracking. Features communication templates, escalation routing, 5-Whys root cause analysis, and MTTR/MTTD metrics for high-reliability engineering teams."
|
||||
},
|
||||
{
|
||||
"name": "ms365-tenant-manager",
|
||||
"source": "../../engineering-team/ms365-tenant-manager",
|
||||
@@ -336,7 +342,7 @@
|
||||
"description": "Executive leadership and advisory skills"
|
||||
},
|
||||
"engineering": {
|
||||
"count": 18,
|
||||
"count": 19,
|
||||
"source": "../../engineering-team",
|
||||
"description": "Software engineering and technical skills"
|
||||
},
|
||||
|
||||
1
.codex/skills/incident-commander
Symbolic link
1
.codex/skills/incident-commander
Symbolic link
@@ -0,0 +1 @@
|
||||
../../engineering-team/incident-commander
|
||||
3
.github/workflows/pr-issue-auto-close.yml
vendored
3
.github/workflows/pr-issue-auto-close.yml
vendored
@@ -17,6 +17,9 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Check Workflow Kill Switch
|
||||
run: |
|
||||
if [ -f ".github/WORKFLOW_KILLSWITCH" ]; then
|
||||
|
||||
386
.github/workflows/smart-sync.yml
vendored
386
.github/workflows/smart-sync.yml
vendored
@@ -27,6 +27,9 @@ jobs:
|
||||
issue_number: ${{ steps.check.outputs.issue_number }}
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Check Workflow Kill Switch
|
||||
run: |
|
||||
if [ -f ".github/WORKFLOW_KILLSWITCH" ]; then
|
||||
@@ -142,23 +145,35 @@ jobs:
|
||||
fetch-depth: 1
|
||||
|
||||
- name: Sync Issue to Project Board
|
||||
uses: anthropics/claude-code-action@v1
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.PROJECTS_TOKEN }}
|
||||
with:
|
||||
claude_code_oauth_token: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }}
|
||||
run: |
|
||||
echo "# Issue → Project Board Sync"
|
||||
echo "**Issue**: #${{ github.event.issue.number }} \"${{ github.event.issue.title }}\""
|
||||
echo "**State**: ${{ github.event.issue.state }}"
|
||||
echo "**Action**: ${{ github.event.action }}"
|
||||
|
||||
prompt: |
|
||||
# Issue → Project Board Sync
|
||||
# Step 1: Check if in Project
|
||||
PROJECT_ITEM=$(gh api graphql -f query='
|
||||
query {
|
||||
repository(owner: "alirezarezvani", name: "claude-skills") {
|
||||
issue(number: ${{ github.event.issue.number }}) {
|
||||
projectItems(first: 10) {
|
||||
nodes {
|
||||
id
|
||||
project { number }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
' --jq '.data.repository.issue.projectItems.nodes[] | select(.project.number == 9) | .id')
|
||||
|
||||
**Issue**: #${{ github.event.issue.number }} "${{ github.event.issue.title }}"
|
||||
**State**: ${{ github.event.issue.state }}
|
||||
**Action**: ${{ github.event.action }}
|
||||
if [ -z "$PROJECT_ITEM" ]; then
|
||||
echo "Adding to project..."
|
||||
gh project item-add 9 --owner alirezarezvani --url ${{ github.event.issue.html_url }}
|
||||
sleep 2
|
||||
|
||||
## Task: Sync issue status to project board
|
||||
|
||||
### Step 1: Check if in Project
|
||||
```bash
|
||||
PROJECT_ITEM=$(gh api graphql -f query='
|
||||
query {
|
||||
repository(owner: "alirezarezvani", name: "claude-skills") {
|
||||
@@ -173,118 +188,84 @@ jobs:
|
||||
}
|
||||
}
|
||||
' --jq '.data.repository.issue.projectItems.nodes[] | select(.project.number == 9) | .id')
|
||||
fi
|
||||
|
||||
if [ -z "$PROJECT_ITEM" ]; then
|
||||
echo "Adding to project..."
|
||||
gh project item-add 9 --owner alirezarezvani --url ${{ github.event.issue.html_url }}
|
||||
sleep 2
|
||||
echo "Project Item ID: $PROJECT_ITEM"
|
||||
|
||||
PROJECT_ITEM=$(gh api graphql -f query='
|
||||
query {
|
||||
repository(owner: "alirezarezvani", name: "claude-skills") {
|
||||
issue(number: ${{ github.event.issue.number }}) {
|
||||
projectItems(first: 10) {
|
||||
nodes {
|
||||
id
|
||||
project { number }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
' --jq '.data.repository.issue.projectItems.nodes[] | select(.project.number == 9) | .id')
|
||||
fi
|
||||
# Step 2: Determine Target Status
|
||||
LABELS=$(gh issue view ${{ github.event.issue.number }} --json labels --jq '[.labels[].name] | join(",")')
|
||||
ISSUE_STATE="${{ github.event.issue.state }}"
|
||||
|
||||
echo "Project Item ID: $PROJECT_ITEM"
|
||||
```
|
||||
# Priority order: closed state > status labels > default
|
||||
if [ "$ISSUE_STATE" = "closed" ]; then
|
||||
TARGET_STATUS="Done"
|
||||
elif echo "$LABELS" | grep -q "status: done"; then
|
||||
TARGET_STATUS="Done"
|
||||
elif echo "$LABELS" | grep -q "status: in-review"; then
|
||||
TARGET_STATUS="In Review"
|
||||
elif echo "$LABELS" | grep -q "status: in-progress"; then
|
||||
TARGET_STATUS="In Progress"
|
||||
elif echo "$LABELS" | grep -q "status: ready"; then
|
||||
TARGET_STATUS="Ready"
|
||||
elif echo "$LABELS" | grep -q "status: backlog"; then
|
||||
TARGET_STATUS="Backlog"
|
||||
elif echo "$LABELS" | grep -q "status: triage"; then
|
||||
TARGET_STATUS="To triage"
|
||||
else
|
||||
TARGET_STATUS=$([ "$ISSUE_STATE" = "open" ] && echo "To triage" || echo "Done")
|
||||
fi
|
||||
|
||||
### Step 2: Determine Target Status
|
||||
```bash
|
||||
LABELS=$(gh issue view ${{ github.event.issue.number }} --json labels --jq '[.labels[].name] | join(",")')
|
||||
ISSUE_STATE="${{ github.event.issue.state }}"
|
||||
echo "Target Status: $TARGET_STATUS"
|
||||
|
||||
# Priority order: closed state > status labels > default
|
||||
if [ "$ISSUE_STATE" = "closed" ]; then
|
||||
TARGET_STATUS="Done"
|
||||
elif echo "$LABELS" | grep -q "status: done"; then
|
||||
TARGET_STATUS="Done"
|
||||
elif echo "$LABELS" | grep -q "status: in-review"; then
|
||||
TARGET_STATUS="In Review"
|
||||
elif echo "$LABELS" | grep -q "status: in-progress"; then
|
||||
TARGET_STATUS="In Progress"
|
||||
elif echo "$LABELS" | grep -q "status: ready"; then
|
||||
TARGET_STATUS="Ready"
|
||||
elif echo "$LABELS" | grep -q "status: backlog"; then
|
||||
TARGET_STATUS="Backlog"
|
||||
elif echo "$LABELS" | grep -q "status: triage"; then
|
||||
TARGET_STATUS="To triage"
|
||||
else
|
||||
TARGET_STATUS=$([ "$ISSUE_STATE" = "open" ] && echo "To triage" || echo "Done")
|
||||
fi
|
||||
|
||||
echo "Target Status: $TARGET_STATUS"
|
||||
```
|
||||
|
||||
### Step 3: Get Project IDs
|
||||
```bash
|
||||
PROJECT_DATA=$(gh api graphql -f query='
|
||||
query {
|
||||
user(login: "alirezarezvani") {
|
||||
projectV2(number: 9) {
|
||||
id
|
||||
fields(first: 20) {
|
||||
nodes {
|
||||
... on ProjectV2SingleSelectField {
|
||||
# Step 3: Get Project IDs
|
||||
PROJECT_DATA=$(gh api graphql -f query='
|
||||
query {
|
||||
user(login: "alirezarezvani") {
|
||||
projectV2(number: 9) {
|
||||
id
|
||||
fields(first: 20) {
|
||||
nodes {
|
||||
... on ProjectV2SingleSelectField {
|
||||
id
|
||||
name
|
||||
options {
|
||||
id
|
||||
name
|
||||
options {
|
||||
id
|
||||
name
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
')
|
||||
}
|
||||
')
|
||||
|
||||
PROJECT_ID=$(echo "$PROJECT_DATA" | jq -r '.data.user.projectV2.id')
|
||||
STATUS_FIELD_ID=$(echo "$PROJECT_DATA" | \
|
||||
jq -r '.data.user.projectV2.fields.nodes[] | select(.name == "Status") | .id')
|
||||
STATUS_OPTION_ID=$(echo "$PROJECT_DATA" | jq -r --arg status "$TARGET_STATUS" \
|
||||
'.data.user.projectV2.fields.nodes[] | select(.name == "Status") | .options[] | select(.name == $status) | .id')
|
||||
```
|
||||
PROJECT_ID=$(echo "$PROJECT_DATA" | jq -r '.data.user.projectV2.id')
|
||||
STATUS_FIELD_ID=$(echo "$PROJECT_DATA" | \
|
||||
jq -r '.data.user.projectV2.fields.nodes[] | select(.name == "Status") | .id')
|
||||
STATUS_OPTION_ID=$(echo "$PROJECT_DATA" | jq -r --arg status "$TARGET_STATUS" \
|
||||
'.data.user.projectV2.fields.nodes[] | select(.name == "Status") | .options[] | select(.name == $status) | .id')
|
||||
|
||||
### Step 4: Update Project Board
|
||||
```bash
|
||||
if [ -n "$PROJECT_ITEM" ] && [ -n "$STATUS_OPTION_ID" ]; then
|
||||
gh api graphql -f query='
|
||||
mutation {
|
||||
updateProjectV2ItemFieldValue(
|
||||
input: {
|
||||
projectId: "'"$PROJECT_ID"'"
|
||||
itemId: "'"$PROJECT_ITEM"'"
|
||||
fieldId: "'"$STATUS_FIELD_ID"'"
|
||||
value: { singleSelectOptionId: "'"$STATUS_OPTION_ID"'" }
|
||||
}
|
||||
) {
|
||||
projectV2Item { id }
|
||||
# Step 4: Update Project Board
|
||||
if [ -n "$PROJECT_ITEM" ] && [ -n "$STATUS_OPTION_ID" ]; then
|
||||
gh api graphql -f query='
|
||||
mutation {
|
||||
updateProjectV2ItemFieldValue(
|
||||
input: {
|
||||
projectId: "'"$PROJECT_ID"'"
|
||||
itemId: "'"$PROJECT_ITEM"'"
|
||||
fieldId: "'"$STATUS_FIELD_ID"'"
|
||||
value: { singleSelectOptionId: "'"$STATUS_OPTION_ID"'" }
|
||||
}
|
||||
) {
|
||||
projectV2Item { id }
|
||||
}
|
||||
'
|
||||
echo "✅ Project board updated to: $TARGET_STATUS"
|
||||
else
|
||||
echo "⚠️ Could not update (missing IDs)"
|
||||
fi
|
||||
```
|
||||
|
||||
## Rules
|
||||
- DO NOT comment on issue (prevents notification spam)
|
||||
- DO NOT modify issue labels (prevents sync loop)
|
||||
- Only update project board status
|
||||
|
||||
claude_args: '--allowed-tools "Bash(gh issue:*),Bash(gh api:*),Bash(gh project:*),Bash(echo:*),Bash(sleep:*)"'
|
||||
}
|
||||
'
|
||||
echo "✅ Project board updated to: $TARGET_STATUS"
|
||||
else
|
||||
echo "⚠️ Could not update (missing IDs)"
|
||||
fi
|
||||
|
||||
sync-project-to-issue:
|
||||
needs: [determine-direction, rate-limit-check, debounce]
|
||||
@@ -305,66 +286,55 @@ jobs:
|
||||
fetch-depth: 1
|
||||
|
||||
- name: Sync Project Board to Issue
|
||||
uses: anthropics/claude-code-action@v1
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.PROJECTS_TOKEN }}
|
||||
with:
|
||||
claude_code_oauth_token: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }}
|
||||
run: |
|
||||
echo "# Project Board → Issue Sync"
|
||||
echo "**Project Item**: ${{ github.event.projects_v2_item.node_id }}"
|
||||
echo "**Content**: ${{ github.event.projects_v2_item.content_node_id }}"
|
||||
echo "**Changed By**: @${{ github.event.sender.login }}"
|
||||
|
||||
prompt: |
|
||||
# Project Board → Issue Sync
|
||||
# Step 1: Get Issue Number
|
||||
CONTENT_ID="${{ github.event.projects_v2_item.content_node_id }}"
|
||||
|
||||
**Project Item**: ${{ github.event.projects_v2_item.node_id }}
|
||||
**Content**: ${{ github.event.projects_v2_item.content_node_id }}
|
||||
**Changed By**: @${{ github.event.sender.login }}
|
||||
|
||||
## Task: Sync project board status to issue
|
||||
|
||||
### Step 1: Get Issue Number
|
||||
```bash
|
||||
CONTENT_ID="${{ github.event.projects_v2_item.content_node_id }}"
|
||||
|
||||
ISSUE_DATA=$(gh api graphql -f query='
|
||||
query {
|
||||
node(id: "${{ github.event.projects_v2_item.node_id }}") {
|
||||
... on ProjectV2Item {
|
||||
content {
|
||||
... on Issue {
|
||||
number
|
||||
url
|
||||
state
|
||||
title
|
||||
}
|
||||
ISSUE_DATA=$(gh api graphql -f query='
|
||||
query {
|
||||
node(id: "${{ github.event.projects_v2_item.node_id }}") {
|
||||
... on ProjectV2Item {
|
||||
content {
|
||||
... on Issue {
|
||||
number
|
||||
url
|
||||
state
|
||||
title
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
')
|
||||
}
|
||||
')
|
||||
|
||||
ISSUE_NUMBER=$(echo "$ISSUE_DATA" | jq -r '.data.node.content.number')
|
||||
ISSUE_NUMBER=$(echo "$ISSUE_DATA" | jq -r '.data.node.content.number')
|
||||
|
||||
if [ -z "$ISSUE_NUMBER" ] || [ "$ISSUE_NUMBER" = "null" ]; then
|
||||
echo "⏭️ Not an issue (might be PR or other content)"
|
||||
exit 0
|
||||
fi
|
||||
if [ -z "$ISSUE_NUMBER" ] || [ "$ISSUE_NUMBER" = "null" ]; then
|
||||
echo "⏭️ Not an issue (might be PR or other content)"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
echo "Issue Number: $ISSUE_NUMBER"
|
||||
```
|
||||
echo "Issue Number: $ISSUE_NUMBER"
|
||||
|
||||
### Step 2: Get Project Status
|
||||
```bash
|
||||
STATUS=$(gh api graphql -f query='
|
||||
query {
|
||||
node(id: "${{ github.event.projects_v2_item.node_id }}") {
|
||||
... on ProjectV2Item {
|
||||
fieldValues(first: 20) {
|
||||
nodes {
|
||||
... on ProjectV2ItemFieldSingleSelectValue {
|
||||
name
|
||||
field {
|
||||
... on ProjectV2SingleSelectField {
|
||||
name
|
||||
}
|
||||
# Step 2: Get Project Status
|
||||
STATUS=$(gh api graphql -f query='
|
||||
query {
|
||||
node(id: "${{ github.event.projects_v2_item.node_id }}") {
|
||||
... on ProjectV2Item {
|
||||
fieldValues(first: 20) {
|
||||
nodes {
|
||||
... on ProjectV2ItemFieldSingleSelectValue {
|
||||
name
|
||||
field {
|
||||
... on ProjectV2SingleSelectField {
|
||||
name
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -372,71 +342,55 @@ jobs:
|
||||
}
|
||||
}
|
||||
}
|
||||
' --jq '.data.node.fieldValues.nodes[] | select(.field.name == "Status") | .name')
|
||||
}
|
||||
' --jq '.data.node.fieldValues.nodes[] | select(.field.name == "Status") | .name')
|
||||
|
||||
if [ -z "$STATUS" ]; then
|
||||
echo "⏭️ No status field found"
|
||||
if [ -z "$STATUS" ]; then
|
||||
echo "⏭️ No status field found"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
echo "Project Status: $STATUS"
|
||||
|
||||
# Step 3: Map Status to Label
|
||||
case "$STATUS" in
|
||||
"To triage") NEW_LABEL="status: triage" ;;
|
||||
"Backlog") NEW_LABEL="status: backlog" ;;
|
||||
"Ready") NEW_LABEL="status: ready" ;;
|
||||
"In Progress") NEW_LABEL="status: in-progress" ;;
|
||||
"In Review") NEW_LABEL="status: in-review" ;;
|
||||
"Done") NEW_LABEL="status: done" ;;
|
||||
*)
|
||||
echo "⏭️ Unknown status: $STATUS"
|
||||
exit 0
|
||||
;;
|
||||
esac
|
||||
|
||||
echo "Target Label: $NEW_LABEL"
|
||||
|
||||
# Step 4: Update Issue Labels
|
||||
CURRENT_LABELS=$(gh issue view $ISSUE_NUMBER --json labels --jq '[.labels[].name] | join(",")')
|
||||
|
||||
# Remove all status: labels
|
||||
for label in "status: triage" "status: backlog" "status: ready" "status: in-progress" "status: in-review" "status: done"; do
|
||||
if echo "$CURRENT_LABELS" | grep -q "$label"; then
|
||||
gh issue edit $ISSUE_NUMBER --remove-label "$label" 2>/dev/null || true
|
||||
fi
|
||||
done
|
||||
|
||||
echo "Project Status: $STATUS"
|
||||
```
|
||||
# Add new status label
|
||||
gh issue edit $ISSUE_NUMBER --add-label "$NEW_LABEL"
|
||||
echo "✅ Label updated to: $NEW_LABEL"
|
||||
|
||||
### Step 3: Map Status to Label
|
||||
```bash
|
||||
case "$STATUS" in
|
||||
"To triage") NEW_LABEL="status: triage" ;;
|
||||
"Backlog") NEW_LABEL="status: backlog" ;;
|
||||
"Ready") NEW_LABEL="status: ready" ;;
|
||||
"In Progress") NEW_LABEL="status: in-progress" ;;
|
||||
"In Review") NEW_LABEL="status: in-review" ;;
|
||||
"Done") NEW_LABEL="status: done" ;;
|
||||
*)
|
||||
echo "⏭️ Unknown status: $STATUS"
|
||||
exit 0
|
||||
;;
|
||||
esac
|
||||
# Step 5: Handle Issue State
|
||||
CURRENT_STATE=$(gh issue view $ISSUE_NUMBER --json state --jq '.state')
|
||||
|
||||
echo "Target Label: $NEW_LABEL"
|
||||
```
|
||||
if [ "$STATUS" = "Done" ] && [ "$CURRENT_STATE" = "OPEN" ]; then
|
||||
gh issue close $ISSUE_NUMBER --reason completed
|
||||
echo "✅ Issue closed (moved to Done)"
|
||||
elif [ "$STATUS" != "Done" ] && [ "$CURRENT_STATE" = "CLOSED" ]; then
|
||||
gh issue reopen $ISSUE_NUMBER
|
||||
echo "✅ Issue reopened (moved from Done)"
|
||||
fi
|
||||
|
||||
### Step 4: Update Issue Labels
|
||||
```bash
|
||||
CURRENT_LABELS=$(gh issue view $ISSUE_NUMBER --json labels --jq '[.labels[].name] | join(",")')
|
||||
|
||||
# Remove all status: labels
|
||||
for label in "status: triage" "status: backlog" "status: ready" "status: in-progress" "status: in-review" "status: done"; do
|
||||
if echo "$CURRENT_LABELS" | grep -q "$label"; then
|
||||
gh issue edit $ISSUE_NUMBER --remove-label "$label" 2>/dev/null || true
|
||||
fi
|
||||
done
|
||||
|
||||
# Add new status label
|
||||
gh issue edit $ISSUE_NUMBER --add-label "$NEW_LABEL"
|
||||
echo "✅ Label updated to: $NEW_LABEL"
|
||||
```
|
||||
|
||||
### Step 5: Handle Issue State
|
||||
```bash
|
||||
CURRENT_STATE=$(gh issue view $ISSUE_NUMBER --json state --jq '.state')
|
||||
|
||||
if [ "$STATUS" = "Done" ] && [ "$CURRENT_STATE" = "OPEN" ]; then
|
||||
gh issue close $ISSUE_NUMBER --reason completed
|
||||
echo "✅ Issue closed (moved to Done)"
|
||||
elif [ "$STATUS" != "Done" ] && [ "$CURRENT_STATE" = "CLOSED" ]; then
|
||||
gh issue reopen $ISSUE_NUMBER
|
||||
echo "✅ Issue reopened (moved from Done)"
|
||||
fi
|
||||
```
|
||||
|
||||
### Step 6: Silent Completion
|
||||
```bash
|
||||
echo "✅ Sync complete: Issue #$ISSUE_NUMBER updated to $STATUS"
|
||||
```
|
||||
|
||||
## Rules
|
||||
- DO NOT comment on issue (prevents notification spam)
|
||||
- DO NOT modify project board (prevents sync loop)
|
||||
- Only update issue labels and state
|
||||
|
||||
claude_args: '--allowed-tools "Bash(gh issue:*),Bash(gh api:*),Bash(echo:*)"'
|
||||
echo "✅ Sync complete: Issue #$ISSUE_NUMBER updated to $STATUS"
|
||||
|
||||
316
AUDIT_REPORT.md
Normal file
316
AUDIT_REPORT.md
Normal file
@@ -0,0 +1,316 @@
|
||||
# Skills Audit Report
|
||||
|
||||
**Date:** 2026-02-15
|
||||
**Auditor:** Automated Skill Quality Audit
|
||||
**Scope:** Recently added skills in business-growth/, finance/, marketing-skill/campaign-analytics/, project-management/
|
||||
|
||||
---
|
||||
|
||||
## Executive Summary
|
||||
|
||||
The recently added skills fall into two distinct tiers:
|
||||
|
||||
1. **Business-growth, Finance, and Campaign Analytics skills** — Genuinely impressive. Production-ready Python tooling, deep domain frameworks, real structured outputs. These would make a domain practitioner say "this actually knows what it's doing."
|
||||
|
||||
2. **Project Management skills** — A mixed bag. The Atlassian-specific skills (jira-expert, confluence-expert, atlassian-admin, atlassian-templates) have strong knowledge-base content. The scrum-master and senior-pm skills are thin and generic. None of the PM skills have scripts or assets — they're pure prompt-engineering skills, which is a fundamentally different (and weaker) category.
|
||||
|
||||
**Overall: 4 POWERFUL, 5 SOLID, 2 GENERIC, 1 WEAK**
|
||||
|
||||
---
|
||||
|
||||
## Detailed Skill Audits
|
||||
|
||||
---
|
||||
|
||||
### 1. business-growth/customer-success-manager
|
||||
|
||||
**Code Quality: EXCELLENT**
|
||||
- 3 Python scripts (438 + 487 + 414 = 1,339 lines total)
|
||||
- Well-structured: proper typing, argparse CLI, JSON/text dual output, error handling
|
||||
- Zero external dependencies (stdlib only) — deliberate, documented design choice
|
||||
- `health_score_calculator.py`: Multi-dimensional weighted scoring with segment-aware benchmarks (Enterprise/Mid-Market/SMB). Not placeholder math — real configurable thresholds, normalization logic, trend analysis
|
||||
- `churn_risk_analyzer.py`: Behavioral signal detection with renewal urgency multipliers
|
||||
- `expansion_opportunity_scorer.py`: Whitespace mapping and effort-vs-impact prioritization
|
||||
- All scripts actually runnable with provided sample data
|
||||
|
||||
**Problem-Solving Quality: EXCELLENT**
|
||||
- Health scoring framework reference (80+ lines) explains *why* each dimension is weighted as it is — genuinely pedagogical
|
||||
- Real CS playbooks: not "be proactive" platitudes but specific intervention triggers (e.g., "if health score drops below yellow for 2 consecutive periods, escalate")
|
||||
- QBR template is production-ready — has ROI summary tables, value-delivered sections, next-quarter planning
|
||||
- Success plan template, onboarding checklist, executive business review — all structured with fill-in tables
|
||||
- Uses real industry frameworks: DAU/MAU ratio, NPS scoring methodology, multi-threading depth
|
||||
|
||||
**Structure: STRONG**
|
||||
- SKILL.md has proper frontmatter, TOC, input/output schemas, limitations section
|
||||
- References are actually used by the scripts (health-scoring-framework.md maps directly to score calculation logic)
|
||||
- Assets include sample data AND expected output JSON for validation
|
||||
|
||||
**Verdict: POWERFUL** ⭐
|
||||
*Evidence: A CS leader could hand this to a team and they'd have a working health scoring system same day. The weighted scoring model with segment-aware thresholds is exactly how real CS platforms (Gainsight, Totango) work. The scripts produce structured JSON that could feed a dashboard.*
|
||||
|
||||
---
|
||||
|
||||
### 2. business-growth/revenue-operations
|
||||
|
||||
**Code Quality: EXCELLENT**
|
||||
- 3 scripts (496 + 531 + 658 = 1,685 lines total) — the largest script set
|
||||
- `pipeline_analyzer.py`: Coverage ratios, stage conversion rates, sales velocity formula (Opportunities × Avg Deal × Win Rate / Cycle), deal aging detection, concentration risk warnings
|
||||
- `forecast_accuracy_tracker.py`: MAPE calculation, period-over-period accuracy trending
|
||||
- `gtm_efficiency_calculator.py`: CAC, LTV, CAC payback period, magic number, burn multiple — these are real SaaS metrics, not made up
|
||||
- Proper CLI args, dual output format, input validation
|
||||
|
||||
**Problem-Solving Quality: EXCELLENT**
|
||||
- RevOps metrics guide references real benchmarks (3-4x pipeline coverage, magic number >0.75)
|
||||
- Pipeline management framework covers qualification methodology
|
||||
- GTM efficiency benchmarks are industry-standard (Bessemer, OpenView style)
|
||||
- Templates: pipeline review, forecast report, GTM dashboard — all structured with metric tables
|
||||
|
||||
**Structure: STRONG**
|
||||
- Consistent with customer-success-manager pattern
|
||||
- Sample data files for all three scripts
|
||||
- Expected output JSON for validation
|
||||
|
||||
**Verdict: POWERFUL** ⭐
|
||||
*Evidence: The pipeline analyzer alone replaces basic Salesforce reporting. The GTM efficiency calculator uses the exact metrics VCs and board members ask for (magic number, burn multiple, CAC payback). A RevOps manager would find real utility here.*
|
||||
|
||||
---
|
||||
|
||||
### 3. business-growth/sales-engineer
|
||||
|
||||
**Code Quality: EXCELLENT**
|
||||
- 3 scripts (557 + 525 + 765 = 1,847 lines total) — largest individual script set
|
||||
- `rfp_response_analyzer.py`: Weighted coverage scoring (Full/Partial/Planned/Gap × Must-have/Should-have/Nice-to-have), automated bid/no-bid recommendation with configurable thresholds
|
||||
- `competitive_matrix_builder.py`: Feature-by-feature comparison with differentiator/vulnerability identification
|
||||
- `poc_planner.py`: Timeline generation, resource planning, success criteria definition, evaluation scorecards
|
||||
- 765-line POC planner is genuinely comprehensive
|
||||
|
||||
**Problem-Solving Quality: EXCELLENT**
|
||||
- 5-phase workflow (Discovery → Solution Design → Demo → POC → Close) maps to real SE methodology
|
||||
- RFP analyzer produces structured gap analysis with mitigation strategies — not just "you have gaps"
|
||||
- Competitive positioning framework includes feature-level comparison, not just "we're better"
|
||||
- Demo script template and POC scorecard are practitioner-level artifacts
|
||||
- Technical proposal template has architecture section
|
||||
|
||||
**Structure: STRONG**
|
||||
- Same consistent pattern as other business-growth skills
|
||||
- Rich asset set: demo script template, POC scorecard, technical proposal template, sample RFP data
|
||||
- References cover competitive positioning, POC best practices, RFP response methodology
|
||||
|
||||
**Verdict: POWERFUL** ⭐
|
||||
*Evidence: The RFP analyzer with weighted coverage scoring and bid/no-bid recommendation is something SEs actually need and usually do in spreadsheets. The POC planner at 765 lines is the most substantive single script in this batch. A pre-sales team could adopt this immediately.*
|
||||
|
||||
---
|
||||
|
||||
### 4. finance/financial-analyst
|
||||
|
||||
**Code Quality: EXCELLENT**
|
||||
- 4 scripts (432 + 449 + 406 + 494 = 1,781 lines total)
|
||||
- `ratio_calculator.py`: 20+ ratios across 5 categories (profitability, liquidity, leverage, efficiency, valuation) — ROE, ROA, DSCR, DSO, EV/EBITDA, PEG ratio
|
||||
- `dcf_valuation.py`: Full DCF model with WACC via CAPM, 5-year projections, terminal value (perpetuity growth AND exit multiple methods), two-way sensitivity analysis, equity bridge
|
||||
- `budget_variance_analyzer.py`: Favorable/unfavorable classification by department and category
|
||||
- `forecast_builder.py`: Driver-based forecasting with scenario modeling (base/bull/bear)
|
||||
- All stdlib only, handles edge cases (inf values in JSON serialization)
|
||||
|
||||
**Problem-Solving Quality: EXCELLENT**
|
||||
- DCF model implements real finance: CAPM cost of equity, after-tax cost of debt, terminal value via both methods, sensitivity matrix — this is textbook corporate finance done correctly
|
||||
- Ratio guide includes interpretation context (not just "here's the number" but "here's what it means")
|
||||
- Valuation methodology reference explains when to use DCF vs. comparables vs. precedent transactions
|
||||
- Forecasting best practices cover driver-based vs. trend-based approaches
|
||||
- Variance report template is exactly what FP&A teams produce monthly
|
||||
|
||||
**Structure: STRONG**
|
||||
- Consistent format with other skills
|
||||
- 4 scripts (most of any skill) — comprehensive coverage of analyst workflow
|
||||
- Sample data, expected output, 3 templates (DCF, forecast, variance)
|
||||
|
||||
**Verdict: POWERFUL** ⭐
|
||||
*Evidence: The DCF valuation model alone is genuinely useful — it implements WACC calculation, cash flow projection, terminal value via two methods, and sensitivity analysis. A junior analyst could use this as a learning tool; a senior analyst could use it for quick-and-dirty valuations. The sensitivity table output is exactly what you'd see in an investment banking pitch book.*
|
||||
|
||||
---
|
||||
|
||||
### 5. marketing-skill/campaign-analytics
|
||||
|
||||
**Code Quality: VERY GOOD**
|
||||
- 3 scripts (347 + 459 + 305 = 1,111 lines total) — smallest script set but still substantive
|
||||
- `attribution_analyzer.py`: 5 attribution models (first-touch, last-touch, linear, time-decay, position-based) — these are the real standard models used in marketing analytics
|
||||
- `campaign_roi_calculator.py`: ROI, ROAS, CPA, CPL, CAC with industry benchmarking
|
||||
- `funnel_analyzer.py`: Stage-by-stage conversion rates, drop-off identification, bottleneck detection
|
||||
- Time-decay model uses configurable half-life parameter — not just a label
|
||||
|
||||
**Problem-Solving Quality: VERY GOOD**
|
||||
- Attribution models guide explains when to use each model (rare — most resources just list them)
|
||||
- Funnel optimization framework covers real concepts (stage-specific interventions, not just "improve conversion")
|
||||
- Campaign metrics benchmarks provide industry reference points
|
||||
- A/B test template and channel comparison template are useful artifacts
|
||||
|
||||
**Structure: STRONG**
|
||||
- Consistent with business-growth pattern
|
||||
- References tied to script functionality
|
||||
- Sample data with customer journeys for attribution testing
|
||||
|
||||
**Verdict: SOLID** (borderline POWERFUL)
|
||||
*Evidence: The 5 attribution models are correctly implemented and genuinely useful for any marketing team not yet using a dedicated attribution platform. However, the funnel analyzer (305 lines) is thinner than the equivalent scripts in other skills, and the overall scope is narrower than the business-growth skills.*
|
||||
|
||||
---
|
||||
|
||||
### 6. project-management/jira-expert
|
||||
|
||||
**Code Quality: N/A** — No scripts
|
||||
|
||||
**Problem-Solving Quality: GOOD**
|
||||
- JQL examples reference is genuinely useful — covers sprint queries, team workload, SLA tracking, change management queries
|
||||
- Automation examples reference covers real Jira automation rules
|
||||
- SKILL.md has comprehensive workflow descriptions for project creation, workflow design, JQL building
|
||||
- Actually teaches JQL syntax with practical examples, not just theory
|
||||
|
||||
**Structure: ADEQUATE**
|
||||
- No scripts, no assets, no sample data
|
||||
- But the references are substantive (415 + 423 = 838 lines of reference material)
|
||||
- Workflows reference other PM skills (Scrum Master, Confluence Expert) — good cross-linking
|
||||
|
||||
**Verdict: SOLID**
|
||||
*Evidence: The JQL examples alone are a legitimate reference resource. The automation examples cover real-world rules. But without scripts or structured output tooling, this is fundamentally a knowledge-base skill, not a tool skill. It makes Claude better at Jira advice but doesn't produce artifacts.*
|
||||
|
||||
---
|
||||
|
||||
### 7. project-management/confluence-expert
|
||||
|
||||
**Code Quality: N/A** — No scripts
|
||||
|
||||
**Problem-Solving Quality: GOOD**
|
||||
- Templates reference (725 lines) contains 10+ ready-to-use Confluence page templates: meeting notes, decision log, project status, runbook, postmortem, ADR, onboarding guide
|
||||
- Space architecture guidance is practical and specific (max 3 levels deep, naming conventions)
|
||||
- Macro usage examples are helpful for Confluence power users
|
||||
|
||||
**Structure: ADEQUATE**
|
||||
- Strong reference content compensates for lack of scripts
|
||||
- Templates are the actual artifact output — when Claude uses this skill, it produces Confluence pages
|
||||
|
||||
**Verdict: SOLID**
|
||||
*Evidence: The templates reference is the real value here — it's a curated library of production-quality Confluence page templates. A team setting up Confluence from scratch would find this genuinely useful. The space architecture guidance reflects real organizational experience.*
|
||||
|
||||
---
|
||||
|
||||
### 8. project-management/atlassian-admin
|
||||
|
||||
**Code Quality: N/A** — No scripts
|
||||
|
||||
**Problem-Solving Quality: GOOD**
|
||||
- SKILL.md is comprehensive at 414 lines covering user provisioning, deprovisioning, group management, permission schemes, security configuration
|
||||
- Workflows are procedural and actionable (step-by-step with handoffs to other skills)
|
||||
- Permission scheme design section is practical — distinguishes public/team/restricted/confidential project types
|
||||
- SSO/SAML and security policy coverage is relevant
|
||||
|
||||
**Structure: ADEQUATE**
|
||||
- No references, no assets — all content in SKILL.md
|
||||
- Good cross-references to other PM skills (Jira Expert, Confluence Expert)
|
||||
|
||||
**Verdict: SOLID**
|
||||
*Evidence: The user provisioning/deprovisioning workflows with audit steps reflect real admin concerns (content reassignment before account deletion). Permission scheme design is specific enough to be useful. But without reference docs or scripts, it's a well-written playbook rather than a tool.*
|
||||
|
||||
---
|
||||
|
||||
### 9. project-management/atlassian-templates
|
||||
|
||||
**Code Quality: N/A** — No scripts
|
||||
|
||||
**Problem-Solving Quality: GOOD**
|
||||
- SKILL.md at 751 lines is the longest PM skill — contains actual template content inline
|
||||
- Template creation process (10-step) and modification process (8-step) are well-structured
|
||||
- Contains ready-to-use templates: meeting notes, decision log, sprint planning, retrospective, project charter
|
||||
- Blueprint development workflow is practical
|
||||
|
||||
**Structure: ADEQUATE**
|
||||
- All content in SKILL.md — no separate references or assets
|
||||
- Templates are embedded directly rather than in a templates/ directory
|
||||
|
||||
**Verdict: SOLID**
|
||||
*Evidence: The templates themselves are the deliverable, and they're decent. The template governance process (versioning, deprecation, migration) shows organizational maturity. However, significant overlap with confluence-expert/references/templates.md raises questions about redundancy.*
|
||||
|
||||
---
|
||||
|
||||
### 10. project-management/scrum-master
|
||||
|
||||
**Code Quality: N/A** — No scripts
|
||||
|
||||
**Problem-Solving Quality: MEDIOCRE**
|
||||
- SKILL.md at 189 lines is thin — covers basic Scrum ceremonies at a surface level
|
||||
- Nothing here goes beyond what's in the Scrum Guide
|
||||
- No velocity tracking formulas, no capacity planning models, no sprint health metrics
|
||||
- Retro formats reference (336 lines) is the saving grace — covers Start/Stop/Continue, Glad/Sad/Mad, 4Ls, Sailboat, DAKI formats with actual process steps
|
||||
|
||||
**Structure: WEAK**
|
||||
- No assets, no sample data
|
||||
- Single reference file
|
||||
- Cross-references to Jira Expert and Confluence Expert add some value
|
||||
|
||||
**Verdict: GENERIC**
|
||||
*Evidence: A certified Scrum Master would find nothing new here. The retro formats reference is genuinely useful but is the only substantive content. The SKILL.md reads like a job description, not a methodology. No metrics, no anti-patterns, no "when sprints go wrong" playbooks. Missing: burndown analysis tools, velocity prediction, capacity planning scripts.*
|
||||
|
||||
---
|
||||
|
||||
### 11. project-management/senior-pm
|
||||
|
||||
**Code Quality: N/A** — No scripts
|
||||
|
||||
**Problem-Solving Quality: WEAK**
|
||||
- SKILL.md at 146 lines is the thinnest skill in the entire batch
|
||||
- `references/api_reference.md` is literally a placeholder: "This is a placeholder for detailed reference documentation. Replace with actual reference content or delete if not needed."
|
||||
- Content is generic PM advice: "develop product roadmaps aligned with business objectives," "identify and mitigate project risks"
|
||||
- No frameworks, no decision models, no risk quantification methods
|
||||
- No RACI template, no project charter template despite mentioning them
|
||||
|
||||
**Structure: WEAK**
|
||||
- Placeholder reference file is a red flag
|
||||
- No assets, no templates, no sample data
|
||||
- Mentions creating artifacts (RACI matrix, project charter) but provides no templates
|
||||
|
||||
**Verdict: WEAK**
|
||||
*Evidence: The placeholder reference file tells the whole story — this skill was scaffolded but never completed. A senior PM would find nothing actionable. Compare to the financial-analyst skill (1,781 lines of working code + templates) vs. this (146 lines of generic advice + a placeholder). This is "act as a Senior PM" prompting dressed up as a skill.*
|
||||
|
||||
---
|
||||
|
||||
## Comparative Analysis
|
||||
|
||||
| Skill | Scripts (LOC) | References | Assets/Templates | Verdict |
|
||||
|-------|--------------|------------|-------------------|---------|
|
||||
| customer-success-manager | 3 (1,339) | 3 deep | 5 templates + sample data | **POWERFUL** |
|
||||
| revenue-operations | 3 (1,685) | 3 deep | 7 templates + sample data | **POWERFUL** |
|
||||
| sales-engineer | 3 (1,847) | 3 deep | 5 templates + sample data | **POWERFUL** |
|
||||
| financial-analyst | 4 (1,781) | 3 deep | 4 templates + sample data | **POWERFUL** |
|
||||
| campaign-analytics | 3 (1,111) | 3 deep | 5 templates + sample data | **SOLID** |
|
||||
| jira-expert | 0 | 2 substantive | 0 | **SOLID** |
|
||||
| confluence-expert | 0 | 1 (725 lines) | 0 | **SOLID** |
|
||||
| atlassian-admin | 0 | 0 | 0 | **SOLID** |
|
||||
| atlassian-templates | 0 | 0 | 0 | **SOLID** |
|
||||
| scrum-master | 0 | 1 (336 lines) | 0 | **GENERIC** |
|
||||
| senior-pm | 0 | 1 (placeholder!) | 0 | **WEAK** |
|
||||
|
||||
## Key Observations
|
||||
|
||||
### What Works (business-growth, finance, campaign-analytics)
|
||||
1. **Scripts that actually compute things** — Not wrappers, not boilerplate. Real algorithms with real business logic (DCF valuation, attribution modeling, health scoring)
|
||||
2. **Zero external dependencies** — Deliberate stdlib-only design means they run anywhere, immediately
|
||||
3. **Dual output format** — JSON for automation, text for humans. This is good engineering
|
||||
4. **Sample data + expected output** — Enables validation and serves as documentation
|
||||
5. **References that explain *why*** — The health scoring framework doesn't just list metrics; it explains why each dimension is weighted as it is
|
||||
6. **Templates that are fill-in-ready** — QBR template, variance report, demo script — these save real time
|
||||
|
||||
### What Doesn't Work (parts of project-management)
|
||||
1. **Senior PM is unfinished** — Placeholder reference file, no templates despite claiming to produce them
|
||||
2. **Scrum Master is generic** — Doesn't exceed the Scrum Guide in depth or utility
|
||||
3. **No scripts in any PM skill** — The business-growth skills prove that scripts add massive value. The PM skills could have had: sprint velocity calculator, capacity planner, risk matrix scorer, RACI generator
|
||||
4. **Two-tier quality** — The gap between POWERFUL and WEAK skills in the same repo is jarring
|
||||
|
||||
### Recommendations
|
||||
1. **Senior PM needs a complete rewrite or removal** — The placeholder reference is unacceptable. Either build it to the standard of financial-analyst (scripts + real frameworks) or don't ship it
|
||||
2. **Scrum Master needs depth** — Add velocity tracking scripts, burndown analysis, capacity planning calculator, sprint health scorer
|
||||
3. **PM skills should get scripts** — Even simple ones: RACI matrix generator, risk register scorer, project status report formatter
|
||||
4. **Deduplicate PM templates** — atlassian-templates and confluence-expert overlap significantly
|
||||
5. **Add expected_output.json to PM skills** — If they can't have scripts, at least define what "good output" looks like
|
||||
|
||||
---
|
||||
|
||||
*Report generated 2026-02-15. Skills assessed against the bar: "Would this make someone say 'holy shit, this actually knows what it's doing?'"*
|
||||
|
||||
*Business-growth and finance skills clear that bar. Campaign-analytics nearly does. PM skills mostly don't.*
|
||||
145
INSTALLATION.md
145
INSTALLATION.md
@@ -35,7 +35,7 @@ Native integration with automatic updates and version management.
|
||||
|
||||
```bash
|
||||
# Option 1: Universal installer
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills --agent codex
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills --agent codex
|
||||
|
||||
# Option 2: Direct installation script
|
||||
git clone https://github.com/alirezarezvani/claude-skills.git
|
||||
@@ -48,11 +48,18 @@ Skills install to `~/.codex/skills/`. See [OpenAI Codex Installation](#openai-co
|
||||
### For All Other Agents (Cursor, VS Code, Goose, etc.)
|
||||
|
||||
```bash
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills
|
||||
```
|
||||
|
||||
This single command installs all skills to all supported agents automatically.
|
||||
|
||||
**What this does:**
|
||||
- ✅ Detects all 54 skills automatically
|
||||
- ✅ Installs to Claude, Cursor, Copilot, Windsurf, Cline, and 37+ other AI agents
|
||||
- ✅ Works across all skill formats
|
||||
|
||||
Learn more: https://www.agentskills.in
|
||||
|
||||
---
|
||||
|
||||
## Claude Code Native Marketplace (New!)
|
||||
@@ -129,13 +136,13 @@ This adds the skills library to your available marketplaces.
|
||||
|
||||
## Universal Installer
|
||||
|
||||
The universal installer uses the [ai-agent-skills](https://github.com/skillcreatorai/Ai-Agent-Skills) package to install skills across multiple agents simultaneously.
|
||||
The universal installer uses the [Agent Skills CLI](https://github.com/Karanjot786/agent-skills-cli) package to install skills across multiple agents simultaneously.
|
||||
|
||||
### Install All Skills
|
||||
|
||||
```bash
|
||||
# Install to all supported agents
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills
|
||||
```
|
||||
|
||||
**This installs to:**
|
||||
@@ -152,26 +159,26 @@ npx ai-agent-skills install alirezarezvani/claude-skills
|
||||
|
||||
```bash
|
||||
# Claude Code only
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills --agent claude
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills --agent claude
|
||||
|
||||
# Cursor only
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills --agent cursor
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills --agent cursor
|
||||
|
||||
# VS Code/Copilot only
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills --agent vscode
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills --agent vscode
|
||||
|
||||
# Goose only
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills --agent goose
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills --agent goose
|
||||
|
||||
# Project-specific installation (portable)
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills --agent project
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills --agent project
|
||||
```
|
||||
|
||||
### Preview Before Installing
|
||||
|
||||
```bash
|
||||
# Dry run to see what will be installed
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills --dry-run
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills --dry-run
|
||||
```
|
||||
|
||||
---
|
||||
@@ -184,126 +191,126 @@ Install individual skills instead of the entire library:
|
||||
|
||||
```bash
|
||||
# Content Creator
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills/marketing-skill/content-creator
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills/marketing-skill/content-creator
|
||||
|
||||
# Demand Generation & Acquisition
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills/marketing-skill/marketing-demand-acquisition
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills/marketing-skill/marketing-demand-acquisition
|
||||
|
||||
# Product Marketing Strategy
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills/marketing-skill/marketing-strategy-pmm
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills/marketing-skill/marketing-strategy-pmm
|
||||
|
||||
# App Store Optimization
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills/marketing-skill/app-store-optimization
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills/marketing-skill/app-store-optimization
|
||||
|
||||
# Social Media Analyzer
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills/marketing-skill/social-media-analyzer
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills/marketing-skill/social-media-analyzer
|
||||
```
|
||||
|
||||
### C-Level Advisory Skills
|
||||
|
||||
```bash
|
||||
# CEO Advisor
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills/c-level-advisor/ceo-advisor
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills/c-level-advisor/ceo-advisor
|
||||
|
||||
# CTO Advisor
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills/c-level-advisor/cto-advisor
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills/c-level-advisor/cto-advisor
|
||||
```
|
||||
|
||||
### Product Team Skills
|
||||
|
||||
```bash
|
||||
# Product Manager Toolkit
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills/product-team/product-manager-toolkit
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills/product-team/product-manager-toolkit
|
||||
|
||||
# Agile Product Owner
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills/product-team/agile-product-owner
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills/product-team/agile-product-owner
|
||||
|
||||
# Product Strategist
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills/product-team/product-strategist
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills/product-team/product-strategist
|
||||
|
||||
# UX Researcher Designer
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills/product-team/ux-researcher-designer
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills/product-team/ux-researcher-designer
|
||||
|
||||
# UI Design System
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills/product-team/ui-design-system
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills/product-team/ui-design-system
|
||||
```
|
||||
|
||||
### Project Management Skills
|
||||
|
||||
```bash
|
||||
# Senior PM Expert
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills/project-management/senior-pm-expert
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills/project-management/senior-pm-expert
|
||||
|
||||
# Scrum Master Expert
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills/project-management/scrum-master-expert
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills/project-management/scrum-master-expert
|
||||
|
||||
# Atlassian Jira Expert
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills/project-management/atlassian-jira-expert
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills/project-management/atlassian-jira-expert
|
||||
|
||||
# Atlassian Confluence Expert
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills/project-management/atlassian-confluence-expert
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills/project-management/atlassian-confluence-expert
|
||||
|
||||
# Atlassian Administrator
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills/project-management/atlassian-administrator
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills/project-management/atlassian-administrator
|
||||
|
||||
# Atlassian Template Creator
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills/project-management/atlassian-template-creator
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills/project-management/atlassian-template-creator
|
||||
```
|
||||
|
||||
### Engineering Team Skills
|
||||
|
||||
```bash
|
||||
# Core Engineering
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills/engineering-team/senior-architect
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills/engineering-team/senior-frontend
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills/engineering-team/senior-backend
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills/engineering-team/senior-fullstack
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills/engineering-team/senior-qa
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills/engineering-team/senior-devops
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills/engineering-team/senior-secops
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills/engineering-team/code-reviewer
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills/engineering-team/senior-security
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills/engineering-team/senior-architect
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills/engineering-team/senior-frontend
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills/engineering-team/senior-backend
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills/engineering-team/senior-fullstack
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills/engineering-team/senior-qa
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills/engineering-team/senior-devops
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills/engineering-team/senior-secops
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills/engineering-team/code-reviewer
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills/engineering-team/senior-security
|
||||
|
||||
# Cloud & Enterprise
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills/engineering-team/aws-solution-architect
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills/engineering-team/ms365-tenant-manager
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills/engineering-team/aws-solution-architect
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills/engineering-team/ms365-tenant-manager
|
||||
|
||||
# Development Tools
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills/engineering-team/tdd-guide
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills/engineering-team/tech-stack-evaluator
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills/engineering-team/tdd-guide
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills/engineering-team/tech-stack-evaluator
|
||||
|
||||
# AI/ML/Data
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills/engineering-team/senior-data-scientist
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills/engineering-team/senior-data-engineer
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills/engineering-team/senior-ml-engineer
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills/engineering-team/senior-prompt-engineer
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills/engineering-team/senior-computer-vision
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills/engineering-team/senior-data-scientist
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills/engineering-team/senior-data-engineer
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills/engineering-team/senior-ml-engineer
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills/engineering-team/senior-prompt-engineer
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills/engineering-team/senior-computer-vision
|
||||
```
|
||||
|
||||
### Regulatory Affairs & Quality Management Skills
|
||||
|
||||
```bash
|
||||
# Regulatory & Quality Leadership
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills/ra-qm-team/regulatory-affairs-head
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills/ra-qm-team/quality-manager-qmr
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills/ra-qm-team/quality-manager-qms-iso13485
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills/ra-qm-team/regulatory-affairs-head
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills/ra-qm-team/quality-manager-qmr
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills/ra-qm-team/quality-manager-qms-iso13485
|
||||
|
||||
# Quality Processes
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills/ra-qm-team/capa-officer
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills/ra-qm-team/quality-documentation-manager
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills/ra-qm-team/risk-management-specialist
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills/ra-qm-team/capa-officer
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills/ra-qm-team/quality-documentation-manager
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills/ra-qm-team/risk-management-specialist
|
||||
|
||||
# Security & Privacy
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills/ra-qm-team/information-security-manager-iso27001
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills/ra-qm-team/gdpr-dsgvo-expert
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills/ra-qm-team/information-security-manager-iso27001
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills/ra-qm-team/gdpr-dsgvo-expert
|
||||
|
||||
# Regional Compliance
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills/ra-qm-team/mdr-745-specialist
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills/ra-qm-team/fda-consultant-specialist
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills/ra-qm-team/mdr-745-specialist
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills/ra-qm-team/fda-consultant-specialist
|
||||
|
||||
# Audit & Assessment
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills/ra-qm-team/qms-audit-expert
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills/ra-qm-team/isms-audit-expert
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills/ra-qm-team/qms-audit-expert
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills/ra-qm-team/isms-audit-expert
|
||||
```
|
||||
|
||||
---
|
||||
@@ -316,23 +323,23 @@ Install the same skills across different agents for team consistency:
|
||||
|
||||
```bash
|
||||
# Install marketing skills to Claude Code (for content strategist)
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills/marketing-skill/content-creator --agent claude
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills/marketing-skill/content-creator --agent claude
|
||||
|
||||
# Install same skills to Cursor (for developer working on content)
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills/marketing-skill/content-creator --agent cursor
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills/marketing-skill/content-creator --agent cursor
|
||||
|
||||
# Install to VS Code (for SEO specialist)
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills/marketing-skill/content-creator --agent vscode
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills/marketing-skill/content-creator --agent vscode
|
||||
```
|
||||
|
||||
### Example: Engineering Team Setup
|
||||
|
||||
```bash
|
||||
# Full engineering suite to Claude Code
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills/engineering-team --agent claude
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills/engineering-team --agent claude
|
||||
|
||||
# Same suite to Cursor
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills/engineering-team --agent cursor
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills/engineering-team --agent cursor
|
||||
```
|
||||
|
||||
---
|
||||
@@ -557,7 +564,7 @@ rm -rf ~/.claude/skills/
|
||||
mkdir -p ~/.claude/skills/
|
||||
|
||||
# Reinstall
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills --agent claude
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills --agent claude
|
||||
```
|
||||
|
||||
#### Cursor
|
||||
@@ -630,10 +637,10 @@ OpenAI Codex users can install skills using the methods below. This repository p
|
||||
|
||||
```bash
|
||||
# Install all skills to Codex
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills --agent codex
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills --agent codex
|
||||
|
||||
# Preview before installing
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills --agent codex --dry-run
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills --agent codex --dry-run
|
||||
```
|
||||
|
||||
### Method 2: Direct Installation Script
|
||||
@@ -746,7 +753,7 @@ See `.codex/skills-index.json` for the complete manifest with descriptions.
|
||||
|
||||
**Installation Issues?**
|
||||
- Check [Troubleshooting](#troubleshooting) section above
|
||||
- Review [ai-agent-skills documentation](https://github.com/skillcreatorai/Ai-Agent-Skills)
|
||||
- Review [Agent Skills CLI documentation](https://github.com/Karanjot786/agent-skills-cli)
|
||||
- Open issue: https://github.com/alirezarezvani/claude-skills/issues
|
||||
|
||||
**Feature Requests:**
|
||||
@@ -758,6 +765,6 @@ See `.codex/skills-index.json` for the complete manifest with descriptions.
|
||||
|
||||
---
|
||||
|
||||
**Last Updated:** January 2026
|
||||
**Last Updated:** February 2026
|
||||
**Skills Version:** 1.0 (54 production skills)
|
||||
**Universal Installer:** [ai-agent-skills](https://github.com/skillcreatorai/Ai-Agent-Skills)
|
||||
**Universal Installer:** [Agent Skills CLI](https://github.com/Karanjot786/agent-skills-cli)
|
||||
|
||||
20
README.md
20
README.md
@@ -52,7 +52,7 @@ For OpenAI Codex users, install via universal installer or direct script:
|
||||
|
||||
```bash
|
||||
# Option A: Universal installer
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills --agent codex
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills --agent codex
|
||||
|
||||
# Option B: Direct installation script
|
||||
git clone https://github.com/alirezarezvani/claude-skills.git
|
||||
@@ -80,19 +80,19 @@ Install to Claude Code, Cursor, VS Code, Amp, Goose, and more - all with one com
|
||||
|
||||
```bash
|
||||
# Install all 54 skills to all supported agents
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills
|
||||
|
||||
# Install to specific agent (Claude Code)
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills --agent claude
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills --agent claude
|
||||
|
||||
# Install single skill
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills/marketing-skill/content-creator
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills/marketing-skill/content-creator
|
||||
|
||||
# Install to Cursor
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills --agent cursor
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills --agent cursor
|
||||
|
||||
# Preview before installing
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills --dry-run
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills --dry-run
|
||||
```
|
||||
|
||||
**Benefits:**
|
||||
@@ -1541,7 +1541,7 @@ OpenAI Codex users can install and use these skills through the `.codex/skills/`
|
||||
|
||||
```bash
|
||||
# Install all 54 skills to Codex
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills --agent codex
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills --agent codex
|
||||
|
||||
# Verify installation
|
||||
ls ~/.codex/skills/
|
||||
@@ -1699,13 +1699,13 @@ Each skill package follows a consistent, modular structure:
|
||||
|
||||
```bash
|
||||
# Install all skills to Claude Code, Cursor, VS Code, Amp, Goose, etc.
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills
|
||||
|
||||
# Or install to specific agent
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills --agent claude
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills --agent claude
|
||||
|
||||
# Or install single skill
|
||||
npx ai-agent-skills install alirezarezvani/claude-skills/marketing-skill/content-creator
|
||||
npx agent-skills-cli add alirezarezvani/claude-skills/marketing-skill/content-creator
|
||||
```
|
||||
|
||||
**Supported Agents:**
|
||||
|
||||
@@ -0,0 +1,171 @@
|
||||
# Incident Report: [INC-YYYY-NNNN] [Title]
|
||||
|
||||
**Severity:** SEV[1-4]
|
||||
**Status:** [Active | Mitigated | Resolved]
|
||||
**Incident Commander:** [Name]
|
||||
**Date:** [YYYY-MM-DD]
|
||||
|
||||
---
|
||||
|
||||
## Executive Summary
|
||||
|
||||
[2-3 sentence summary of the incident: what happened, impact scope, resolution status. Written for executive audience — no jargon, focus on business impact.]
|
||||
|
||||
---
|
||||
|
||||
## Impact Statement
|
||||
|
||||
| Metric | Value |
|
||||
|--------|-------|
|
||||
| **Duration** | [X hours Y minutes] |
|
||||
| **Affected Users** | [number or percentage] |
|
||||
| **Failed Transactions** | [number] |
|
||||
| **Revenue Impact** | $[amount] |
|
||||
| **Data Loss** | [Yes/No — if yes, detail below] |
|
||||
| **SLA Impact** | [X.XX% availability for period] |
|
||||
| **Affected Regions** | [list regions] |
|
||||
| **Affected Services** | [list services] |
|
||||
|
||||
### Customer-Facing Impact
|
||||
|
||||
[Describe what customers experienced: error messages, degraded functionality, complete outage. Be specific about which user journeys were affected.]
|
||||
|
||||
---
|
||||
|
||||
## Timeline
|
||||
|
||||
| Time (UTC) | Phase | Event |
|
||||
|------------|-------|-------|
|
||||
| HH:MM | Detection | [First alert or report] |
|
||||
| HH:MM | Declaration | [Incident declared, channel created] |
|
||||
| HH:MM | Investigation | [Key investigation findings] |
|
||||
| HH:MM | Mitigation | [Mitigation action taken] |
|
||||
| HH:MM | Resolution | [Permanent fix applied] |
|
||||
| HH:MM | Closure | [Incident closed, monitoring confirmed stable] |
|
||||
|
||||
### Key Decision Points
|
||||
|
||||
1. **[HH:MM] [Decision]** — [Rationale and outcome]
|
||||
2. **[HH:MM] [Decision]** — [Rationale and outcome]
|
||||
|
||||
### Timeline Gaps
|
||||
|
||||
[Note any periods >15 minutes without logged events. These represent potential blind spots in the response.]
|
||||
|
||||
---
|
||||
|
||||
## Root Cause Analysis
|
||||
|
||||
### Root Cause
|
||||
|
||||
[Clear, specific statement of the root cause. Not "human error" — describe the systemic failure.]
|
||||
|
||||
### Contributing Factors
|
||||
|
||||
1. **[Factor Category: Process/Tooling/Human/Environment]** — [Description]
|
||||
2. **[Factor Category]** — [Description]
|
||||
3. **[Factor Category]** — [Description]
|
||||
|
||||
### 5-Whys Analysis
|
||||
|
||||
**Why did the service degrade?**
|
||||
→ [Answer]
|
||||
|
||||
**Why did [answer above] happen?**
|
||||
→ [Answer]
|
||||
|
||||
**Why did [answer above] happen?**
|
||||
→ [Answer]
|
||||
|
||||
**Why did [answer above] happen?**
|
||||
→ [Answer]
|
||||
|
||||
**Why did [answer above] happen?**
|
||||
→ [Root systemic cause]
|
||||
|
||||
---
|
||||
|
||||
## Response Metrics
|
||||
|
||||
| Metric | Value | Target | Status |
|
||||
|--------|-------|--------|--------|
|
||||
| **MTTD** (Mean Time to Detect) | [X min] | <5 min | [Met/Missed] |
|
||||
| **Time to Declare** | [X min] | <10 min | [Met/Missed] |
|
||||
| **Time to Mitigate** | [X min] | <60 min (SEV1) | [Met/Missed] |
|
||||
| **MTTR** (Mean Time to Resolve) | [X min] | <4 hr (SEV1) | [Met/Missed] |
|
||||
| **Postmortem Timeliness** | [X hours] | <72 hr | [Met/Missed] |
|
||||
|
||||
---
|
||||
|
||||
## Action Items
|
||||
|
||||
| # | Priority | Action | Owner | Deadline | Type | Status |
|
||||
|---|----------|--------|-------|----------|------|--------|
|
||||
| 1 | P1 | [Action description] | [owner] | [date] | Detection | Open |
|
||||
| 2 | P1 | [Action description] | [owner] | [date] | Prevention | Open |
|
||||
| 3 | P2 | [Action description] | [owner] | [date] | Prevention | Open |
|
||||
| 4 | P2 | [Action description] | [owner] | [date] | Process | Open |
|
||||
|
||||
### Action Item Types
|
||||
|
||||
- **Detection**: Improve ability to detect this class of issue faster
|
||||
- **Prevention**: Prevent this class of issue from occurring
|
||||
- **Mitigation**: Reduce impact when this class of issue occurs
|
||||
- **Process**: Improve response process and coordination
|
||||
|
||||
---
|
||||
|
||||
## Lessons Learned
|
||||
|
||||
### What Went Well
|
||||
|
||||
- [Specific positive outcome from the response]
|
||||
- [Specific positive outcome]
|
||||
|
||||
### What Didn't Go Well
|
||||
|
||||
- [Specific area for improvement]
|
||||
- [Specific area for improvement]
|
||||
|
||||
### Where We Got Lucky
|
||||
|
||||
- [Things that could have made this worse but didn't]
|
||||
|
||||
---
|
||||
|
||||
## Communication Log
|
||||
|
||||
| Time (UTC) | Channel | Audience | Summary |
|
||||
|------------|---------|----------|---------|
|
||||
| HH:MM | Status Page | External | [Summary of update] |
|
||||
| HH:MM | Slack #exec | Internal | [Summary of update] |
|
||||
| HH:MM | Email | Customers | [Summary of notification] |
|
||||
|
||||
---
|
||||
|
||||
## Participants
|
||||
|
||||
| Name | Role |
|
||||
|------|------|
|
||||
| [Name] | Incident Commander |
|
||||
| [Name] | Operations Lead |
|
||||
| [Name] | Communications Lead |
|
||||
| [Name] | Subject Matter Expert |
|
||||
|
||||
---
|
||||
|
||||
## Appendix
|
||||
|
||||
### Related Incidents
|
||||
|
||||
- [INC-YYYY-NNNN] — [Brief description of related incident]
|
||||
|
||||
### Reference Links
|
||||
|
||||
- [Link to monitoring dashboard]
|
||||
- [Link to deployment logs]
|
||||
- [Link to incident channel archive]
|
||||
|
||||
---
|
||||
|
||||
*This report follows the blameless postmortem principle. The goal is systemic improvement, not individual accountability. All contributing factors should trace to process, tooling, or environmental gaps that can be addressed with concrete action items.*
|
||||
289
engineering-team/incident-commander/assets/runbook_template.md
Normal file
289
engineering-team/incident-commander/assets/runbook_template.md
Normal file
@@ -0,0 +1,289 @@
|
||||
# Runbook: [Service/Component Name]
|
||||
|
||||
**Owner:** [Team Name]
|
||||
**Last Updated:** [YYYY-MM-DD]
|
||||
**Reviewed By:** [Name]
|
||||
**Review Cadence:** Quarterly
|
||||
|
||||
---
|
||||
|
||||
## Service Overview
|
||||
|
||||
| Property | Value |
|
||||
|----------|-------|
|
||||
| **Service** | [service-name] |
|
||||
| **Repository** | [repo URL] |
|
||||
| **Dashboard** | [monitoring dashboard URL] |
|
||||
| **On-Call Rotation** | [PagerDuty/OpsGenie schedule URL] |
|
||||
| **SLA Tier** | [Tier 1/2/3] |
|
||||
| **Availability Target** | [99.9% / 99.95% / 99.99%] |
|
||||
| **Dependencies** | [list upstream/downstream services] |
|
||||
| **Owner Team** | [team name] |
|
||||
| **Escalation Contact** | [name/email] |
|
||||
|
||||
### Architecture Summary
|
||||
|
||||
[2-3 sentence description of the service architecture. Include key components, data stores, and external dependencies.]
|
||||
|
||||
---
|
||||
|
||||
## Alert Response Decision Tree
|
||||
|
||||
### High Error Rate (>5%)
|
||||
|
||||
```
|
||||
Error Rate Alert Fired
|
||||
├── Check: Is this a deployment-related issue?
|
||||
│ ├── YES → Go to "Recent Deployment Rollback" section
|
||||
│ └── NO → Continue
|
||||
├── Check: Is a downstream dependency failing?
|
||||
│ ├── YES → Go to "Dependency Failure" section
|
||||
│ └── NO → Continue
|
||||
├── Check: Is there unusual traffic volume?
|
||||
│ ├── YES → Go to "Traffic Spike" section
|
||||
│ └── NO → Continue
|
||||
└── Escalate: Engage on-call secondary + service owner
|
||||
```
|
||||
|
||||
### High Latency (p99 > [threshold]ms)
|
||||
|
||||
```
|
||||
Latency Alert Fired
|
||||
├── Check: Database query latency elevated?
|
||||
│ ├── YES → Go to "Database Performance" section
|
||||
│ └── NO → Continue
|
||||
├── Check: Connection pool utilization >80%?
|
||||
│ ├── YES → Go to "Connection Pool Exhaustion" section
|
||||
│ └── NO → Continue
|
||||
├── Check: Memory/CPU pressure on service instances?
|
||||
│ ├── YES → Go to "Resource Exhaustion" section
|
||||
│ └── NO → Continue
|
||||
└── Escalate: Engage on-call secondary + service owner
|
||||
```
|
||||
|
||||
### Service Unavailable (Health Check Failing)
|
||||
|
||||
```
|
||||
Health Check Alert Fired
|
||||
├── Check: Are all instances down?
|
||||
│ ├── YES → Go to "Complete Outage" section
|
||||
│ └── NO → Continue
|
||||
├── Check: Is only one AZ affected?
|
||||
│ ├── YES → Go to "AZ Failure" section
|
||||
│ └── NO → Continue
|
||||
├── Check: Can instances be restarted?
|
||||
│ ├── YES → Go to "Instance Restart" section
|
||||
│ └── NO → Continue
|
||||
└── Escalate: Declare incident, engage IC
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Common Scenarios
|
||||
|
||||
### Recent Deployment Rollback
|
||||
|
||||
**Symptoms:** Error rate spike or latency increase within 60 minutes of a deployment.
|
||||
|
||||
**Diagnosis:**
|
||||
1. Check deployment history: `kubectl rollout history deployment/[service-name]`
|
||||
2. Compare error rate timing with deployment timestamp
|
||||
3. Review deployment diff for risky changes
|
||||
|
||||
**Mitigation:**
|
||||
1. Initiate rollback: `kubectl rollout undo deployment/[service-name]`
|
||||
2. Verify rollback: `kubectl rollout status deployment/[service-name]`
|
||||
3. Confirm error rate returns to baseline (allow 5 minutes)
|
||||
4. If rollback fails: escalate immediately
|
||||
|
||||
**Communication:** If customer-impacting, update status page within 5 minutes of confirming impact.
|
||||
|
||||
---
|
||||
|
||||
### Database Performance
|
||||
|
||||
**Symptoms:** Elevated query latency, connection pool saturation, timeout errors.
|
||||
|
||||
**Diagnosis:**
|
||||
1. Check active queries: `SELECT * FROM pg_stat_activity WHERE state = 'active';`
|
||||
2. Check for long-running queries: `SELECT pid, now() - pg_stat_activity.query_start AS duration, query FROM pg_stat_activity WHERE state != 'idle' ORDER BY duration DESC;`
|
||||
3. Check connection count: `SELECT count(*) FROM pg_stat_activity;`
|
||||
4. Check table bloat and vacuum status
|
||||
|
||||
**Mitigation:**
|
||||
1. Kill long-running queries if identified: `SELECT pg_terminate_backend([pid]);`
|
||||
2. If connection pool exhausted: increase pool size via config (requires restart)
|
||||
3. If read replica available: redirect read traffic
|
||||
4. If write-heavy: identify and defer non-critical writes
|
||||
|
||||
**Escalation Trigger:** If query latency >10s for >5 minutes, escalate to DBA on-call.
|
||||
|
||||
---
|
||||
|
||||
### Connection Pool Exhaustion
|
||||
|
||||
**Symptoms:** Connection timeout errors, pool utilization >90%, requests queuing.
|
||||
|
||||
**Diagnosis:**
|
||||
1. Check pool metrics: current size, active connections, waiting requests
|
||||
2. Check for connection leaks: connections held >30s without activity
|
||||
3. Review recent config changes or deployments
|
||||
|
||||
**Mitigation:**
|
||||
1. Increase pool size (if infrastructure allows): update config, rolling restart
|
||||
2. Kill idle connections exceeding timeout
|
||||
3. If caused by leak: identify and restart affected instances
|
||||
4. Enable connection pool auto-scaling if available
|
||||
|
||||
**Prevention:** Configure pool utilization alerting at 70% (warning) and 85% (critical) thresholds.
|
||||
|
||||
---
|
||||
|
||||
### Dependency Failure
|
||||
|
||||
**Symptoms:** Errors correlated with downstream service failures, circuit breakers tripping.
|
||||
|
||||
**Diagnosis:**
|
||||
1. Check dependency status dashboards
|
||||
2. Verify circuit breaker state: open/half-open/closed
|
||||
3. Check for correlation with dependency deployments or incidents
|
||||
4. Test dependency health endpoints directly
|
||||
|
||||
**Mitigation:**
|
||||
1. If circuit breaker not tripping: verify timeout/threshold configuration
|
||||
2. Enable graceful degradation (serve cached/default responses)
|
||||
3. If critical path: engage dependency team via incident process
|
||||
4. If non-critical path: disable feature flag for affected functionality
|
||||
|
||||
**Communication:** Coordinate with dependency team IC if both services have active incidents.
|
||||
|
||||
---
|
||||
|
||||
### Traffic Spike
|
||||
|
||||
**Symptoms:** Sudden traffic increase beyond normal patterns, resource saturation.
|
||||
|
||||
**Diagnosis:**
|
||||
1. Check traffic source: organic growth vs. bot traffic vs. DDoS
|
||||
2. Review rate limiting effectiveness
|
||||
3. Check auto-scaling status and capacity
|
||||
|
||||
**Mitigation:**
|
||||
1. If bot/DDoS: enable rate limiting, engage security team
|
||||
2. If organic: trigger manual scale-up, increase auto-scaling limits
|
||||
3. Enable request queuing or load shedding if at capacity
|
||||
4. Consider feature flag toggles to reduce per-request cost
|
||||
|
||||
---
|
||||
|
||||
### Complete Outage
|
||||
|
||||
**Symptoms:** All instances unreachable, health checks failing across AZs.
|
||||
|
||||
**Diagnosis:**
|
||||
1. Check infrastructure status (AWS/GCP status page)
|
||||
2. Verify network connectivity and DNS resolution
|
||||
3. Check for infrastructure-level incidents (region outage)
|
||||
4. Review recent infrastructure changes (Terraform, network config)
|
||||
|
||||
**Mitigation:**
|
||||
1. If infra provider issue: activate disaster recovery plan
|
||||
2. If DNS issue: update DNS records, reduce TTL
|
||||
3. If deployment corruption: redeploy last known good version
|
||||
4. If data corruption: engage data recovery procedures
|
||||
|
||||
**Escalation:** Immediately declare SEV1 incident. Engage infrastructure team and management.
|
||||
|
||||
---
|
||||
|
||||
### Instance Restart
|
||||
|
||||
**Symptoms:** Individual instances unhealthy, OOM kills, process crashes.
|
||||
|
||||
**Diagnosis:**
|
||||
1. Check instance logs for crash reason
|
||||
2. Review memory/CPU usage patterns before crash
|
||||
3. Check for memory leaks or resource exhaustion
|
||||
4. Verify configuration consistency across instances
|
||||
|
||||
**Mitigation:**
|
||||
1. Restart unhealthy instances: `kubectl delete pod [pod-name]`
|
||||
2. If recurring: cordon node and migrate workloads
|
||||
3. If memory leak: schedule immediate patch with increased memory limit
|
||||
4. Monitor for recurrence after restart
|
||||
|
||||
---
|
||||
|
||||
### AZ Failure
|
||||
|
||||
**Symptoms:** All instances in one availability zone failing, others healthy.
|
||||
|
||||
**Diagnosis:**
|
||||
1. Confirm AZ-specific failure vs. instance-specific issues
|
||||
2. Check cloud provider AZ status
|
||||
3. Verify load balancer is routing around failed AZ
|
||||
|
||||
**Mitigation:**
|
||||
1. Ensure load balancer marks AZ instances as unhealthy
|
||||
2. Scale up remaining AZs to handle redirected traffic
|
||||
3. If auto-scaling: verify it's responding to increased load
|
||||
4. Monitor remaining AZs for cascade effects
|
||||
|
||||
---
|
||||
|
||||
## Key Metrics & Dashboards
|
||||
|
||||
| Metric | Normal Range | Warning | Critical | Dashboard |
|
||||
|--------|-------------|---------|----------|-----------|
|
||||
| Error Rate | <0.1% | >1% | >5% | [link] |
|
||||
| p99 Latency | <200ms | >500ms | >2000ms | [link] |
|
||||
| CPU Usage | <60% | >75% | >90% | [link] |
|
||||
| Memory Usage | <70% | >80% | >90% | [link] |
|
||||
| DB Pool Usage | <50% | >70% | >85% | [link] |
|
||||
| Request Rate | [baseline]±20% | ±50% | ±100% | [link] |
|
||||
|
||||
---
|
||||
|
||||
## Escalation Contacts
|
||||
|
||||
| Level | Contact | When |
|
||||
|-------|---------|------|
|
||||
| L1: On-Call Primary | [name/rotation] | First responder |
|
||||
| L2: On-Call Secondary | [name/rotation] | Primary unavailable or needs help |
|
||||
| L3: Service Owner | [name] | Complex issues, architectural decisions |
|
||||
| L4: Engineering Manager | [name] | SEV1/SEV2, customer impact, resource needs |
|
||||
| L5: VP Engineering | [name] | SEV1 >30 min, major customer/revenue impact |
|
||||
|
||||
---
|
||||
|
||||
## Maintenance Procedures
|
||||
|
||||
### Planned Maintenance Checklist
|
||||
|
||||
- [ ] Maintenance window scheduled and communicated (at least 72 hours in advance for Tier 1)
|
||||
- [ ] Status page updated with planned maintenance notice
|
||||
- [ ] Rollback plan documented and tested
|
||||
- [ ] On-call notified of maintenance window
|
||||
- [ ] Customer notification sent (if SLA-impacting)
|
||||
- [ ] Post-maintenance verification plan ready
|
||||
|
||||
### Health Verification After Changes
|
||||
|
||||
1. Check all health endpoints return 200
|
||||
2. Verify error rate returns to baseline within 5 minutes
|
||||
3. Confirm latency within normal range
|
||||
4. Run synthetic transaction test
|
||||
5. Monitor for 15 minutes before declaring success
|
||||
|
||||
---
|
||||
|
||||
## Revision History
|
||||
|
||||
| Date | Author | Change |
|
||||
|------|--------|--------|
|
||||
| [YYYY-MM-DD] | [Name] | Initial version |
|
||||
| [YYYY-MM-DD] | [Name] | [Description of update] |
|
||||
|
||||
---
|
||||
|
||||
*This runbook should be reviewed quarterly and updated after every incident that reveals missing procedures. The on-call engineer should be able to follow this document without prior context about the service. If any section requires tribal knowledge to execute, it needs to be expanded.*
|
||||
@@ -0,0 +1,276 @@
|
||||
{
|
||||
"incident": {
|
||||
"id": "INC-2024-0142",
|
||||
"title": "Payment Service Degradation",
|
||||
"severity": "SEV1",
|
||||
"status": "resolved",
|
||||
"declared_at": "2024-01-15T14:23:00Z",
|
||||
"resolved_at": "2024-01-15T16:45:00Z",
|
||||
"commander": "Jane Smith",
|
||||
"service": "payment-gateway",
|
||||
"affected_services": ["checkout", "subscription-billing"]
|
||||
},
|
||||
"events": [
|
||||
{
|
||||
"timestamp": "2024-01-15T14:15:00Z",
|
||||
"type": "trigger",
|
||||
"actor": "system",
|
||||
"description": "Database connection pool utilization reaches 95% on payment-gateway primary",
|
||||
"metadata": {"metric": "db_pool_utilization", "value": 95, "threshold": 90}
|
||||
},
|
||||
{
|
||||
"timestamp": "2024-01-15T14:20:00Z",
|
||||
"type": "detection",
|
||||
"actor": "monitoring",
|
||||
"description": "PagerDuty alert fired: payment-gateway error rate >5% (current: 8.2%)",
|
||||
"metadata": {"alert_id": "PD-98765", "source": "datadog", "error_rate": 8.2}
|
||||
},
|
||||
{
|
||||
"timestamp": "2024-01-15T14:21:00Z",
|
||||
"type": "detection",
|
||||
"actor": "monitoring",
|
||||
"description": "Datadog alert: p99 latency on /api/payments exceeds 5000ms (current: 8500ms)",
|
||||
"metadata": {"alert_id": "DD-54321", "source": "datadog", "latency_p99_ms": 8500}
|
||||
},
|
||||
{
|
||||
"timestamp": "2024-01-15T14:23:00Z",
|
||||
"type": "declaration",
|
||||
"actor": "Jane Smith",
|
||||
"description": "SEV1 declared. Incident channel #inc-20240115-payment-degradation created. Bridge call started.",
|
||||
"metadata": {"channel": "#inc-20240115-payment-degradation", "severity": "SEV1"}
|
||||
},
|
||||
{
|
||||
"timestamp": "2024-01-15T14:25:00Z",
|
||||
"type": "investigation",
|
||||
"actor": "Alice Chen",
|
||||
"description": "Confirmed: database connection pool at 100% utilization. All new connections being rejected.",
|
||||
"metadata": {"pool_size": 20, "active_connections": 20, "waiting_requests": 147}
|
||||
},
|
||||
{
|
||||
"timestamp": "2024-01-15T14:28:00Z",
|
||||
"type": "investigation",
|
||||
"actor": "Carol Davis",
|
||||
"description": "Identified recent deployment of user-api v2.4.1 at 13:45 UTC. New ORM version (3.2.0) changed connection handling behavior.",
|
||||
"metadata": {"deployment": "user-api-v2.4.1", "deployed_at": "2024-01-15T13:45:00Z"}
|
||||
},
|
||||
{
|
||||
"timestamp": "2024-01-15T14:30:00Z",
|
||||
"type": "communication",
|
||||
"actor": "Bob Kim",
|
||||
"description": "Status page updated: Investigating - We are investigating increased error rates affecting payment processing.",
|
||||
"metadata": {"channel": "status_page", "status": "investigating"}
|
||||
},
|
||||
{
|
||||
"timestamp": "2024-01-15T14:35:00Z",
|
||||
"type": "escalation",
|
||||
"actor": "Jane Smith",
|
||||
"description": "Escalated to VP Engineering. Customer impact confirmed: 12,500+ users affected, failed transactions accumulating.",
|
||||
"metadata": {"escalated_to": "VP Engineering", "reason": "revenue_impact"}
|
||||
},
|
||||
{
|
||||
"timestamp": "2024-01-15T14:40:00Z",
|
||||
"type": "mitigation",
|
||||
"actor": "Alice Chen",
|
||||
"description": "Attempting mitigation: increasing connection pool size from 20 to 50 via config override.",
|
||||
"metadata": {"action": "pool_resize", "old_value": 20, "new_value": 50}
|
||||
},
|
||||
{
|
||||
"timestamp": "2024-01-15T14:45:00Z",
|
||||
"type": "communication",
|
||||
"actor": "Bob Kim",
|
||||
"description": "Status page updated: Identified - The issue has been identified as a database configuration problem. We are implementing a fix.",
|
||||
"metadata": {"channel": "status_page", "status": "identified"}
|
||||
},
|
||||
{
|
||||
"timestamp": "2024-01-15T14:50:00Z",
|
||||
"type": "investigation",
|
||||
"actor": "Carol Davis",
|
||||
"description": "Pool resize partially effective. Error rate dropped from 23.5% to 12.1%. ORM 3.2.0 opens 3x more connections per request than 3.1.2.",
|
||||
"metadata": {"error_rate_before": 23.5, "error_rate_after": 12.1}
|
||||
},
|
||||
{
|
||||
"timestamp": "2024-01-15T15:00:00Z",
|
||||
"type": "mitigation",
|
||||
"actor": "Alice Chen",
|
||||
"description": "Decision: roll back ORM version to 3.1.2. Initiating rollback deployment of user-api v2.3.9.",
|
||||
"metadata": {"action": "rollback", "target_version": "2.3.9", "rollback_reason": "orm_connection_leak"}
|
||||
},
|
||||
{
|
||||
"timestamp": "2024-01-15T15:15:00Z",
|
||||
"type": "mitigation",
|
||||
"actor": "Alice Chen",
|
||||
"description": "Rollback deployment complete. user-api v2.3.9 running in production. Connection pool utilization dropping.",
|
||||
"metadata": {"deployment_duration_minutes": 15, "pool_utilization": 45}
|
||||
},
|
||||
{
|
||||
"timestamp": "2024-01-15T15:20:00Z",
|
||||
"type": "communication",
|
||||
"actor": "Bob Kim",
|
||||
"description": "Status page updated: Monitoring - A fix has been implemented and we are monitoring the results.",
|
||||
"metadata": {"channel": "status_page", "status": "monitoring"}
|
||||
},
|
||||
{
|
||||
"timestamp": "2024-01-15T15:30:00Z",
|
||||
"type": "mitigation",
|
||||
"actor": "Jane Smith",
|
||||
"description": "Error rate back to baseline (<0.1%). Payment processing fully restored. Entering monitoring phase.",
|
||||
"metadata": {"error_rate": 0.08, "pool_utilization": 32}
|
||||
},
|
||||
{
|
||||
"timestamp": "2024-01-15T16:30:00Z",
|
||||
"type": "investigation",
|
||||
"actor": "Carol Davis",
|
||||
"description": "Confirmed stable for 60 minutes. No degradation detected. Root cause documented: ORM 3.2.0 connection pooling incompatibility.",
|
||||
"metadata": {"monitoring_duration_minutes": 60, "stable": true}
|
||||
},
|
||||
{
|
||||
"timestamp": "2024-01-15T16:45:00Z",
|
||||
"type": "resolution",
|
||||
"actor": "Jane Smith",
|
||||
"description": "Incident resolved. All services nominal. Postmortem scheduled for 2024-01-17 10:00 UTC.",
|
||||
"metadata": {"postmortem_scheduled": "2024-01-17T10:00:00Z"}
|
||||
},
|
||||
{
|
||||
"timestamp": "2024-01-15T16:50:00Z",
|
||||
"type": "communication",
|
||||
"actor": "Bob Kim",
|
||||
"description": "Status page updated: Resolved - The issue has been resolved. Payment processing is operating normally.",
|
||||
"metadata": {"channel": "status_page", "status": "resolved"}
|
||||
}
|
||||
],
|
||||
"communications": [
|
||||
{
|
||||
"timestamp": "2024-01-15T14:30:00Z",
|
||||
"channel": "status_page",
|
||||
"audience": "external",
|
||||
"message": "Investigating - We are investigating increased error rates affecting payment processing. Some transactions may fail. We will provide an update within 15 minutes."
|
||||
},
|
||||
{
|
||||
"timestamp": "2024-01-15T14:35:00Z",
|
||||
"channel": "slack_exec",
|
||||
"audience": "internal",
|
||||
"message": "SEV1 ACTIVE: Payment service degradation. ~12,500 users affected. Failed transactions accumulating. IC: Jane Smith. Bridge: [link]. ETA for mitigation: investigating."
|
||||
},
|
||||
{
|
||||
"timestamp": "2024-01-15T14:45:00Z",
|
||||
"channel": "status_page",
|
||||
"audience": "external",
|
||||
"message": "Identified - The issue has been identified as a database configuration problem following a recent deployment. We are implementing a fix. Next update in 15 minutes."
|
||||
},
|
||||
{
|
||||
"timestamp": "2024-01-15T15:20:00Z",
|
||||
"channel": "status_page",
|
||||
"audience": "external",
|
||||
"message": "Monitoring - A fix has been implemented and we are monitoring the results. Payment processing is recovering. We will provide a final update once we confirm stability."
|
||||
},
|
||||
{
|
||||
"timestamp": "2024-01-15T16:50:00Z",
|
||||
"channel": "status_page",
|
||||
"audience": "external",
|
||||
"message": "Resolved - The issue affecting payment processing has been resolved. All systems are operating normally. We will publish a full incident report within 48 hours."
|
||||
}
|
||||
],
|
||||
"impact": {
|
||||
"revenue_impact": "high",
|
||||
"affected_users_percentage": 45,
|
||||
"affected_regions": ["us-east-1", "eu-west-1"],
|
||||
"data_integrity_risk": false,
|
||||
"security_breach": false,
|
||||
"customer_facing": true,
|
||||
"degradation_type": "partial",
|
||||
"workaround_available": false
|
||||
},
|
||||
"signals": {
|
||||
"error_rate_percentage": 23.5,
|
||||
"latency_p99_ms": 8500,
|
||||
"affected_endpoints": ["/api/payments", "/api/checkout", "/api/subscriptions"],
|
||||
"dependent_services": ["checkout", "subscription-billing", "order-service"],
|
||||
"alert_count": 12,
|
||||
"customer_reports": 8
|
||||
},
|
||||
"context": {
|
||||
"recent_deployments": [
|
||||
{
|
||||
"service": "user-api",
|
||||
"deployed_at": "2024-01-15T13:45:00Z",
|
||||
"version": "2.4.1",
|
||||
"changes": "Upgraded ORM from 3.1.2 to 3.2.0"
|
||||
}
|
||||
],
|
||||
"ongoing_incidents": [],
|
||||
"maintenance_windows": [],
|
||||
"on_call": {
|
||||
"primary": "alice@company.com",
|
||||
"secondary": "bob@company.com",
|
||||
"escalation_manager": "director-eng@company.com"
|
||||
}
|
||||
},
|
||||
"resolution": {
|
||||
"root_cause": "Database connection pool exhaustion caused by ORM 3.2.0 opening 3x more connections per request than previous version 3.1.2, exceeding the pool size of 20",
|
||||
"contributing_factors": [
|
||||
"Insufficient load testing of new ORM version under production-scale connection patterns",
|
||||
"Connection pool monitoring alert threshold set too high (90%) with no warning at 70%",
|
||||
"No canary deployment process for database configuration or ORM changes",
|
||||
"Missing connection pool sizing documentation for service dependencies"
|
||||
],
|
||||
"mitigation_steps": [
|
||||
"Increased connection pool size from 20 to 50 as temporary relief",
|
||||
"Rolled back user-api from v2.4.1 (ORM 3.2.0) to v2.3.9 (ORM 3.1.2)"
|
||||
],
|
||||
"permanent_fix": "Load test ORM 3.2.0 with production connection patterns, update pool sizing, implement canary deployment for ORM changes",
|
||||
"customer_impact": {
|
||||
"affected_users": 12500,
|
||||
"failed_transactions": 342,
|
||||
"revenue_impact_usd": 28500,
|
||||
"data_loss": false
|
||||
}
|
||||
},
|
||||
"action_items": [
|
||||
{
|
||||
"title": "Add connection pool utilization alerting at 70% warning and 85% critical thresholds",
|
||||
"owner": "alice@company.com",
|
||||
"priority": "P1",
|
||||
"deadline": "2024-01-22",
|
||||
"type": "detection",
|
||||
"status": "open"
|
||||
},
|
||||
{
|
||||
"title": "Implement canary deployment pipeline for database configuration and ORM changes",
|
||||
"owner": "bob@company.com",
|
||||
"priority": "P1",
|
||||
"deadline": "2024-02-01",
|
||||
"type": "prevention",
|
||||
"status": "open"
|
||||
},
|
||||
{
|
||||
"title": "Load test ORM v3.2.0 with production-scale connection patterns before re-deployment",
|
||||
"owner": "carol@company.com",
|
||||
"priority": "P2",
|
||||
"deadline": "2024-01-29",
|
||||
"type": "prevention",
|
||||
"status": "open"
|
||||
},
|
||||
{
|
||||
"title": "Document connection pool sizing requirements for all services in runbook",
|
||||
"owner": "alice@company.com",
|
||||
"priority": "P2",
|
||||
"deadline": "2024-02-05",
|
||||
"type": "process",
|
||||
"status": "open"
|
||||
},
|
||||
{
|
||||
"title": "Add ORM connection behavior to integration test suite",
|
||||
"owner": "carol@company.com",
|
||||
"priority": "P3",
|
||||
"deadline": "2024-02-15",
|
||||
"type": "prevention",
|
||||
"status": "open"
|
||||
}
|
||||
],
|
||||
"participants": [
|
||||
{"name": "Jane Smith", "role": "Incident Commander"},
|
||||
{"name": "Alice Chen", "role": "Operations Lead"},
|
||||
{"name": "Bob Kim", "role": "Communications Lead"},
|
||||
{"name": "Carol Davis", "role": "Database SME"}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,372 @@
|
||||
# Incident Response Framework Reference
|
||||
|
||||
Production-grade incident management knowledge base synthesizing PagerDuty, Google SRE, and Atlassian methodologies into a unified, opinionated framework. This document is the source of truth for incident commanders operating under pressure.
|
||||
|
||||
---
|
||||
|
||||
## 1. Industry Framework Comparison
|
||||
|
||||
### PagerDuty Incident Response Model
|
||||
|
||||
PagerDuty's open-source incident response process defines four core roles and six process phases. The model prioritizes **speed of mobilization** over process perfection.
|
||||
|
||||
**Roles:**
|
||||
- **Incident Commander (IC):** Owns the incident end-to-end. Does NOT perform technical investigation. Delegates, coordinates, and makes final escalation decisions. The IC is the single point of authority; conflicting opinions are resolved by the IC, not by committee.
|
||||
- **Scribe:** Captures timestamped decisions, actions, and findings in the incident channel. The scribe never participates in technical work. A good scribe reduces postmortem preparation time by 70%.
|
||||
- **Subject Matter Expert (SME):** Pulled in on-demand for specific subsystems. SMEs report findings to the IC, not to each other. Parallel SME investigations must be coordinated through the IC to avoid duplicated effort.
|
||||
- **Customer Liaison:** Owns all outbound customer communication. Drafts status page updates for IC approval. Shields the technical team from inbound customer inquiries during active incidents.
|
||||
|
||||
**Process Phases:** Detect, Triage, Mobilize, Mitigate, Resolve, Postmortem.
|
||||
|
||||
**Communication Protocol:** PagerDuty mandates a dedicated Slack channel per incident, a bridge call for SEV1/SEV2, and status updates at fixed cadences (every 15 min for SEV1, every 30 min for SEV2). All decisions are announced in the channel, never in DMs or side threads.
|
||||
|
||||
### Google SRE: Managing Incidents (Chapter 14)
|
||||
|
||||
Google's SRE model, documented in *Site Reliability Engineering* (O'Reilly, 2016), emphasizes **role separation** and **clear handoffs** as the primary mechanisms for preventing incident chaos.
|
||||
|
||||
**Key Principles:**
|
||||
- **Operational vs. Communication Tracks:** Google splits incident work into two parallel tracks. The operational track handles technical mitigation. The communication track handles stakeholder updates, executive briefings, and customer notifications. These tracks run independently with the IC bridging them.
|
||||
- **Role Separation is Non-Negotiable:** The person debugging the system must never be the person updating stakeholders. Cognitive load from context-switching between technical work and communication degrades both outputs. Google measured a 40% increase in mean-time-to-resolution (MTTR) when a single person attempted both.
|
||||
- **Clear Handoffs:** When an IC rotates out (recommended every 60-90 minutes for SEV1), the handoff includes: current status summary, active hypotheses, pending actions, and escalation state. Handoffs happen on the bridge call, not asynchronously.
|
||||
- **Defined Command Post:** All communication flows through a single channel. Google uses the term "command post" -- a virtual or physical location where all incident participants converge.
|
||||
|
||||
### Atlassian Incident Management Model
|
||||
|
||||
Atlassian's model, published in their *Incident Management Handbook*, is **severity-driven** and **template-heavy**. It favors structured playbooks over improvisation.
|
||||
|
||||
**Key Characteristics:**
|
||||
- **Severity Levels Drive Everything:** The assigned severity determines who gets paged, what communication templates are used, response time SLAs, and postmortem requirements. Severity is assigned at triage and reassessed every 30 minutes.
|
||||
- **Handbook-Driven Approach:** Atlassian maintains runbooks for every known failure mode. During incidents, responders follow documented playbooks before improvising. This reduces MTTR for known issues by 50-60% but requires significant upfront investment in documentation.
|
||||
- **Communication Templates:** Pre-written templates for status page updates, customer emails, and executive summaries. Templates include severity-specific language and are reviewed quarterly. This eliminates wordsmithing during active incidents.
|
||||
- **Values-Based Decisions:** When runbooks do not cover the situation, Atlassian defaults to a decision hierarchy: (1) protect customer data, (2) restore service, (3) preserve evidence for root cause analysis.
|
||||
|
||||
### Framework Comparison Table
|
||||
|
||||
| Dimension | PagerDuty | Google SRE | Atlassian |
|
||||
|-----------|-----------|------------|-----------|
|
||||
| Primary strength | Speed of mobilization | Role separation discipline | Structured playbooks |
|
||||
| IC authority model | IC has final say | IC coordinates, escalates to VP if blocked | IC follows handbook, escalates if off-script |
|
||||
| Communication style | Dedicated channel + bridge | Command post with dual tracks | Template-driven status updates |
|
||||
| Handoff protocol | Informal | Formal on-call handoff script | Rotation policy in handbook |
|
||||
| Postmortem requirement | All SEV1/SEV2 | All incidents | SEV1/SEV2 mandatory, SEV3 optional |
|
||||
| Best for | Fast-moving startups | Large-scale distributed systems | Regulated or process-heavy orgs |
|
||||
| Weakness | Under-documented for edge cases | Heavyweight for small teams | Rigid, slow to adapt to novel failures |
|
||||
|
||||
### When to Use Which Framework
|
||||
|
||||
- **Teams under 20 engineers:** Start with PagerDuty's model. It is lightweight and prescriptive enough to work without heavy process investment. Add Atlassian-style runbooks as you identify recurring failure modes.
|
||||
- **Teams running 50+ microservices:** Adopt Google SRE's dual-track model. The operational/communication split becomes critical when incidents span multiple teams and subsystems.
|
||||
- **Regulated industries (finance, healthcare, government):** Use Atlassian's handbook-driven approach as the foundation. Regulatory auditors expect documented procedures, and templates satisfy compliance requirements for incident communication records.
|
||||
- **Hybrid (recommended for most teams at scale):** Use PagerDuty's role definitions, Google's track separation, and Atlassian's template library. This is the approach codified in the rest of this document.
|
||||
|
||||
---
|
||||
|
||||
## 2. Severity Definitions
|
||||
|
||||
### Severity Classification Matrix
|
||||
|
||||
| Severity | Impact | Response Time | Update Cadence | Escalation Trigger | Example |
|
||||
|----------|--------|---------------|----------------|---------------------|---------|
|
||||
| **SEV1** | Total service outage or data breach affecting all users. Revenue loss exceeding $10K/hour. Security incident with active exfiltration. | Page IC + on-call within 5 min. All hands mobilized within 15 min. | Every 15 min to stakeholders. Continuous updates in incident channel. | Immediate executive notification. Board notification for data breaches. | Primary database cluster down. Payment processing system offline. Active ransomware attack. |
|
||||
| **SEV2** | Major feature degraded for >30% of users. Revenue impact $1K-$10K/hour. Data integrity concerns without confirmed loss. | IC assigned within 15 min. Responders mobilized within 30 min. | Every 30 min to stakeholders. Every 15 min in incident channel. | Executive notification if unresolved after 1 hour. Upgrade to SEV1 if impact expands. | Search functionality returning errors for 40% of queries. Checkout flow failing intermittently. Authentication latency exceeding 10s. |
|
||||
| **SEV3** | Minor feature degraded or non-critical service impaired. Workaround available. No direct revenue impact. | Acknowledged within 1 hour. Investigation started within 4 hours. | Every 2 hours to stakeholders if actively worked. Daily if deferred. | Escalate to SEV2 if workaround fails or user complaints exceed 50 in 1 hour. | Admin dashboard loading slowly. Email notifications delayed by 30+ minutes. Non-critical API endpoint returning 5xx for <5% of requests. |
|
||||
| **SEV4** | Cosmetic issue, minor bug, or internal tooling degradation. No user-facing impact or negligible impact. | Acknowledged within 1 business day. Prioritized against backlog. | No scheduled updates. Tracked in issue tracker. | Escalate to SEV3 if internal productivity impact exceeds 2 hours/day across team. | Logging pipeline dropping non-critical debug logs. Internal metrics dashboard showing stale data. Minor UI alignment issue on one browser. |
|
||||
|
||||
### Customer-Facing Signals by Severity
|
||||
|
||||
**SEV1 Signals:** Support ticket volume spikes >500% of baseline within 15 minutes. Social media mentions of outage trend upward. Revenue dashboards show >95% drop in transaction volume. Multiple monitoring systems alarm simultaneously.
|
||||
|
||||
**SEV2 Signals:** Support ticket volume spikes 100-500% of baseline. Specific feature-related complaints cluster in support channels. Partial transaction failures visible in payment dashboards. Single monitoring system shows sustained alerting.
|
||||
|
||||
**SEV3 Signals:** Sporadic support tickets with a common pattern (under 20/hour). Users report intermittent issues with workarounds. Monitoring shows degraded but not critical metrics.
|
||||
|
||||
**SEV4 Signals:** Internal team notices issue during routine work. Occasional user mention with no pattern or urgency. Monitoring shows minor anomaly within acceptable thresholds.
|
||||
|
||||
### Severity Upgrade and Downgrade Criteria
|
||||
|
||||
**Upgrade from SEV2 to SEV1:** Impact expands to >80% of users, revenue impact confirmed above $10K/hour, data integrity compromise confirmed, or mitigation attempt fails after 45 minutes.
|
||||
|
||||
**Downgrade from SEV1 to SEV2:** Partial mitigation restores service for >70% of users, revenue impact drops below $10K/hour, and no ongoing data integrity concern.
|
||||
|
||||
**Downgrade from SEV2 to SEV3:** Workaround deployed and communicated, impact limited to <10% of users, and no revenue impact.
|
||||
|
||||
Severity changes must be announced by the IC in the incident channel with justification. The scribe logs the timestamp and rationale.
|
||||
|
||||
|
||||
---
|
||||
|
||||
## 3. Role Definitions
|
||||
|
||||
### Incident Commander (IC)
|
||||
|
||||
The IC is the single decision-maker during an incident. This role exists to eliminate decision-by-committee, which adds 20-40 minutes to MTTR in measured studies.
|
||||
|
||||
**Responsibilities:**
|
||||
- Assign severity level at triage (reassess every 30 minutes)
|
||||
- Assign all other incident roles
|
||||
- Approve status page updates before publication
|
||||
- Make go/no-go decisions on mitigation strategies (rollback, feature flag, scaling)
|
||||
- Decide when to escalate to executive leadership
|
||||
- Declare incident resolved and initiate postmortem scheduling
|
||||
|
||||
**Decision Authority:** The IC can authorize rollbacks, page any team member regardless of org chart, approve customer communications, and override objections from individual contributors during active mitigation. The IC cannot approve financial expenditures above $50K or public press statements -- those require VP/C-level approval.
|
||||
|
||||
**What the IC Must NOT Do:** Debug code, write queries, SSH into production servers, or perform any hands-on technical work. The moment an IC starts debugging, incident coordination degrades. If the IC is the only person with domain expertise, they must hand off IC duties before engaging technically.
|
||||
|
||||
### Communications Lead
|
||||
|
||||
**Responsibilities:**
|
||||
- Draft all status page updates using severity-appropriate templates
|
||||
- Coordinate with Customer Liaison on outbound customer messaging
|
||||
- Maintain the executive summary document (updated every 30 min for SEV1/SEV2)
|
||||
- Manage the stakeholder notification list and delivery
|
||||
- Post scheduled updates even when there is no new information ("We are continuing to investigate" is a valid update)
|
||||
|
||||
### Operations Lead
|
||||
|
||||
**Responsibilities:**
|
||||
- Coordinate technical investigation across SMEs
|
||||
- Maintain the running hypothesis list and assign investigation tasks
|
||||
- Report technical findings to the IC in plain language
|
||||
- Execute mitigation actions approved by the IC
|
||||
- Track parallel workstreams and prevent duplicated effort
|
||||
|
||||
### Scribe
|
||||
|
||||
**Responsibilities:**
|
||||
- Maintain a timestamped log of all decisions, actions, and findings
|
||||
- Document who said what and when in the incident channel
|
||||
- Capture rollback decisions, hypothesis changes, and escalation triggers
|
||||
- Produce the initial postmortem timeline (saves 2-4 hours of postmortem prep)
|
||||
|
||||
### Subject Matter Experts (SMEs)
|
||||
|
||||
SMEs are paged on-demand by the IC for specific subsystems. They report findings to the Operations Lead, not directly to stakeholders. An SME who identifies a potential fix proposes it to the IC for approval before executing. SMEs are released from the incident explicitly by the IC when their subsystem is cleared.
|
||||
|
||||
### Customer Liaison
|
||||
|
||||
Owns the customer-facing voice during the incident. Monitors support channels for inbound customer reports. Drafts customer notification emails. Updates the public status page (after IC approval). Shields the technical team from direct customer inquiries during active mitigation.
|
||||
|
||||
---
|
||||
|
||||
## 4. Communication Protocols
|
||||
|
||||
### Incident Channel Naming Convention
|
||||
|
||||
Format: `#inc-YYYYMMDD-brief-desc`
|
||||
|
||||
Examples:
|
||||
- `#inc-20260216-payment-api-timeout`
|
||||
- `#inc-20260216-db-primary-failover`
|
||||
- `#inc-20260216-auth-service-degraded`
|
||||
|
||||
Channel topic must include: severity, IC name, bridge call link, status page link.
|
||||
Example topic: `SEV1 | IC: @jane.smith | Bridge: https://meet.example.com/inc-20260216 | Status: https://status.example.com`
|
||||
|
||||
### Internal Status Update Templates
|
||||
|
||||
**SEV1/SEV2 Update Template (posted in incident channel and executive Slack channel):**
|
||||
```
|
||||
INCIDENT UPDATE - [SEV1/SEV2] - [HH:MM UTC]
|
||||
Status: [Investigating | Identified | Mitigating | Resolved]
|
||||
Impact: [Specific user-facing impact in plain language]
|
||||
Current Action: [What is actively being done right now]
|
||||
Next Update: [HH:MM UTC]
|
||||
IC: @[name]
|
||||
```
|
||||
|
||||
**Executive Summary Template (for SEV1, updated every 30 min):**
|
||||
```
|
||||
EXECUTIVE SUMMARY - [Incident Title] - [HH:MM UTC]
|
||||
Severity: SEV1
|
||||
Duration: [X hours Y minutes]
|
||||
Customer Impact: [Number of affected users/transactions]
|
||||
Revenue Impact: [Estimated $ if known, "assessing" if not]
|
||||
Current Status: [One sentence]
|
||||
Mitigation ETA: [Estimated time or "unknown"]
|
||||
Next Escalation Point: [What triggers executive action]
|
||||
```
|
||||
|
||||
### Status Page Update Templates
|
||||
|
||||
**SEV1 Initial Post:**
|
||||
```
|
||||
Title: [Service Name] - Service Disruption
|
||||
Body: We are currently experiencing a disruption affecting [service/feature].
|
||||
Users may encounter [specific symptom: errors, timeouts, inability to access].
|
||||
Our engineering team has been mobilized and is actively investigating.
|
||||
We will provide an update within 15 minutes.
|
||||
```
|
||||
|
||||
**SEV1 Update (mitigation in progress):**
|
||||
```
|
||||
Title: [Service Name] - Service Disruption (Update)
|
||||
Body: We have identified the cause of the disruption affecting [service/feature]
|
||||
and are implementing a fix. Some users may continue to experience [symptom].
|
||||
We expect to have an update on resolution within [X] minutes.
|
||||
```
|
||||
|
||||
**SEV1 Resolution:**
|
||||
```
|
||||
Title: [Service Name] - Resolved
|
||||
Body: The disruption affecting [service/feature] has been resolved as of [HH:MM UTC].
|
||||
Service has been restored to normal operation. Users should no longer experience
|
||||
[symptom]. We will publish a full incident report within 48 hours.
|
||||
We apologize for the inconvenience.
|
||||
```
|
||||
|
||||
**SEV2 Initial Post:**
|
||||
```
|
||||
Title: [Service Name] - Degraded Performance
|
||||
Body: We are investigating reports of degraded performance affecting [feature].
|
||||
Some users may experience [specific symptom]. A workaround is [available/not yet available].
|
||||
Our team is actively investigating and we will provide an update within 30 minutes.
|
||||
```
|
||||
|
||||
### Bridge Call / War Room Etiquette
|
||||
|
||||
1. **Mute by default.** Unmute only when speaking to the IC or Operations Lead.
|
||||
2. **Identify yourself before speaking.** "This is [name] from [team]." Every time.
|
||||
3. **State findings, then recommendations.** "Database replication lag is 45 seconds and climbing. I recommend we fail over to the secondary cluster."
|
||||
4. **IC confirms before action.** No unilateral action on production systems during an incident. The IC says "approved" or "hold" before anyone executes.
|
||||
5. **No side conversations.** If two SMEs need to discuss a hypothesis, they take it to a breakout channel and report back findings to the main bridge.
|
||||
6. **Time-box debugging.** The IC sets 15-minute timers for investigation threads. If a hypothesis is not confirmed or denied in 15 minutes, pivot to the next hypothesis or escalate.
|
||||
|
||||
### Customer Notification Templates
|
||||
|
||||
**SEV1 Customer Email (B2B, enterprise accounts):**
|
||||
```
|
||||
Subject: [Company Name] Service Incident - [Date]
|
||||
|
||||
Dear [Customer Name],
|
||||
|
||||
We are writing to inform you of a service incident affecting [product/service]
|
||||
that began at [HH:MM UTC] on [date].
|
||||
|
||||
Impact: [Specific impact to this customer's usage]
|
||||
Current Status: [Brief status]
|
||||
Expected Resolution: [ETA if known, or "We are working to resolve this as quickly as possible"]
|
||||
|
||||
We will continue to provide updates every [15/30] minutes until resolution.
|
||||
Your dedicated account team is available at [contact info] for any questions.
|
||||
|
||||
Sincerely,
|
||||
[Name], [Title]
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 5. Escalation Matrix
|
||||
|
||||
### Escalation Tiers
|
||||
|
||||
**Tier 1 - Within Team (0-15 minutes):**
|
||||
On-call engineer investigates. If the issue is within the team's domain and matches a known runbook, resolve without escalation. Page the IC if severity is SEV2 or higher, or if the issue is not resolved within 15 minutes.
|
||||
|
||||
**Tier 2 - Cross-Team (15-45 minutes):**
|
||||
IC pages SMEs from adjacent teams. Common cross-team escalations: database team for replication issues, networking team for connectivity failures, security team for suspicious activity. Cross-team SMEs join the incident channel and bridge call.
|
||||
|
||||
**Tier 3 - Executive (45+ minutes or immediate for SEV1):**
|
||||
VP of Engineering notified for all SEV1 incidents immediately. CTO notified if SEV1 exceeds 1 hour without mitigation progress. CEO notified if SEV1 involves data breach or regulatory implications. Executive involvement is for resource allocation and external communication decisions, not technical direction.
|
||||
|
||||
### Time-Based Escalation Triggers
|
||||
|
||||
| Elapsed Time | SEV1 Action | SEV2 Action |
|
||||
|-------------|-------------|-------------|
|
||||
| 0 min | Page IC + all on-call. Notify VP Eng. | Page IC + primary on-call. |
|
||||
| 15 min | Confirm all roles staffed. Open bridge call. | IC assesses if additional SMEs needed. |
|
||||
| 30 min | If no mitigation path identified, page backup on-call for all related services. | First stakeholder update. Reassess severity. |
|
||||
| 45 min | Escalate to CTO if no progress. Consider customer notification. | If no progress, consider escalating to SEV1. |
|
||||
| 60 min | CTO briefing. Initiate customer notification if not already done. | Notify VP Eng. Page cross-team SMEs. |
|
||||
| 90 min | IC rotation (fresh IC takes over). Reassess all hypotheses. | IC rotation if needed. |
|
||||
| 120 min | CEO briefing if data breach or regulatory risk. External PR team engaged. | Escalate to SEV1 if impact has not decreased. |
|
||||
|
||||
### Escalation Path Examples
|
||||
|
||||
**Database failover failure:**
|
||||
On-call DBA (Tier 1, 0-15 min) -> IC + DBA team lead (Tier 2, 15 min) -> Infrastructure VP + cloud provider support (Tier 3, 45 min)
|
||||
|
||||
**Payment processing outage:**
|
||||
On-call payments engineer (Tier 1, 0-5 min) -> IC + payments team lead + payment provider liaison (Tier 2, 5 min, immediate due to revenue impact) -> CFO + VP Eng (Tier 3, 15 min if provider-side issue confirmed)
|
||||
|
||||
**Security incident (suspected breach):**
|
||||
Security on-call (Tier 1, 0-5 min) -> CISO + IC + legal counsel (Tier 2, immediate) -> CEO + external incident response firm (Tier 3, within 1 hour if breach confirmed)
|
||||
|
||||
### On-Call Rotation Best Practices
|
||||
|
||||
- **Primary + secondary on-call** for every critical service. Secondary is paged automatically if primary does not acknowledge within 5 minutes.
|
||||
- **On-call shifts are 7 days maximum.** Longer rotations degrade alertness and response quality.
|
||||
- **Handoff checklist:** Current open issues, recent deploys in the last 48 hours, known risks or maintenance windows, escalation contacts for dependent services.
|
||||
- **On-call load budget:** No more than 2 pages per night on average, measured weekly. Exceeding this indicates systemic reliability issues that must be addressed with engineering investment, not heroic on-call effort.
|
||||
|
||||
---
|
||||
|
||||
## 6. Incident Lifecycle Phases
|
||||
|
||||
### Phase 1: Detection
|
||||
|
||||
Detection comes from three sources, in order of preference:
|
||||
|
||||
1. **Automated monitoring (preferred):** Alerting rules on latency (p99 > 2x baseline), error rates (5xx > 1% of requests), saturation (CPU > 85%, memory > 90%, disk > 80%), and business metrics (transaction volume drops > 20% from 15-minute rolling average). Alerts should fire within 60 seconds of threshold breach.
|
||||
2. **Internal reports:** An engineer notices anomalous behavior during routine work. Internal detection typically adds 5-15 minutes to response time compared to automated monitoring.
|
||||
3. **Customer reports:** Customers contact support about issues. This is the worst detection source. If customers detect incidents before monitoring, the monitoring coverage has a gap that must be closed in the postmortem.
|
||||
|
||||
**Detection SLA:** SEV1 incidents must be detected within 5 minutes of impact onset. If detection latency exceeds this, the postmortem must include a monitoring improvement action item.
|
||||
|
||||
### Phase 2: Triage
|
||||
|
||||
The first responder performs initial triage within 5 minutes of detection:
|
||||
|
||||
1. **Scope assessment:** How many users, services, or regions are affected? Check dashboards, not assumptions.
|
||||
2. **Severity assignment:** Use the severity matrix in Section 2. When in doubt, assign higher severity. Downgrading is cheap; delayed escalation is expensive.
|
||||
3. **IC assignment:** For SEV1/SEV2, page the on-call IC immediately. For SEV3, the first responder may self-assign IC duties.
|
||||
4. **Initial hypothesis:** What changed in the last 2 hours? Check deploy logs, config changes, upstream dependency status, and traffic patterns. 70% of incidents correlate with a change deployed in the prior 2 hours.
|
||||
|
||||
### Phase 3: Mobilization
|
||||
|
||||
The IC executes mobilization within 10 minutes of assignment:
|
||||
|
||||
1. **Create incident channel:** `#inc-YYYYMMDD-brief-desc`. Set topic with severity, IC name, bridge link.
|
||||
2. **Assign roles:** Communications Lead, Operations Lead, Scribe. For SEV3/SEV4, the IC may cover multiple roles.
|
||||
3. **Open bridge call (SEV1/SEV2):** Share link in incident channel. All responders join within 5 minutes.
|
||||
4. **Post initial summary:** Current understanding, affected services, assigned roles, first actions.
|
||||
5. **Notify stakeholders:** Page dependent teams. Notify customer support leadership. For SEV1, notify executive chain per escalation matrix.
|
||||
|
||||
### Phase 4: Investigation
|
||||
|
||||
Investigation runs as parallel workstreams coordinated by the Operations Lead:
|
||||
|
||||
- **Workstream discipline:** Each SME investigates one hypothesis at a time. The Operations Lead tracks active hypotheses on a shared list. Completed investigations report: confirmed, denied, or inconclusive.
|
||||
- **Hypothesis testing priority:** (1) Recent changes (deploys, configs, feature flags), (2) Upstream dependency failures, (3) Capacity exhaustion, (4) Data corruption, (5) Security compromise.
|
||||
- **15-minute rule:** If a hypothesis is not confirmed or denied within 15 minutes, the IC decides whether to continue, pivot, or escalate. Unbounded investigation is the leading cause of extended MTTR.
|
||||
- **Evidence collection:** Screenshots, log snippets, metric graphs, and query results are posted in the incident channel, not described verbally. The scribe tags evidence with timestamps.
|
||||
|
||||
### Phase 5: Mitigation
|
||||
|
||||
Mitigation prioritizes restoring service over finding root cause:
|
||||
|
||||
- **Rollback first:** If a deploy correlates with the incident, roll it back before investigating further. A 5-minute rollback beats a 45-minute investigation. Rollback authority rests with the IC.
|
||||
- **Feature flags:** Disable the suspected feature via feature flag if available. This is faster and less risky than a full rollback.
|
||||
- **Scaling:** If the issue is capacity-related, scale horizontally before investigating the traffic source.
|
||||
- **Failover:** If a primary system is unrecoverable, fail over to the secondary. Test failover procedures quarterly so this is a routine, not a gamble.
|
||||
- **Customer workaround:** If mitigation will take time, publish a workaround for customers (e.g., "Use the mobile app while we restore web access").
|
||||
|
||||
**Mitigation verification:** After applying mitigation, monitor key metrics for 15 minutes before declaring the issue mitigated. Premature declarations that the issue is mitigated followed by recurrence damage team credibility and customer trust.
|
||||
|
||||
### Phase 6: Resolution
|
||||
|
||||
Resolution is declared when the root cause is addressed and service is operating normally:
|
||||
|
||||
- **Verification checklist:** Error rates returned to baseline, latency returned to baseline, no ongoing customer reports, monitoring confirms stability for 30+ minutes.
|
||||
- **Incident channel update:** IC posts final status with resolution summary, total duration, and next steps.
|
||||
- **Status page update:** Post resolution notice within 15 minutes of declaring resolved.
|
||||
- **Stand down:** IC explicitly releases all responders. SMEs return to normal work. Bridge call is closed.
|
||||
|
||||
### Phase 7: Postmortem
|
||||
|
||||
Postmortem is mandatory for SEV1 and SEV2. Optional but recommended for SEV3. Never conducted for SEV4.
|
||||
|
||||
- **Timeline:** Postmortem document drafted within 24 hours. Postmortem meeting held within 72 hours (3 business days). Action items assigned and tracked in the team's issue tracker.
|
||||
- **Blameless standard:** The postmortem examines systems, processes, and tools -- not individual performance. "Why did the system allow this?" not "Why did [person] do this?"
|
||||
- **Required sections:** Timeline (from scribe's log), root cause analysis (using 5 Whys or fault tree), impact summary (users, revenue, duration), what went well, what went poorly, action items with owners and due dates.
|
||||
- **Action items and recurrence:** Every postmortem produces 3-7 concrete action items. Items without owners and due dates are not action items. Teams should close 80%+ within 30 days. If the same root cause appears in two postmortems within 6 months, escalate to engineering leadership as a systemic reliability investment area.
|
||||
---
|
||||
# SLA Management Guide
|
||||
|
||||
> Comprehensive reference for Service Level Agreements, Objectives, and Indicators.
|
||||
> Designed for incident commanders who must understand, protect, and communicate SLA status during and after incidents.
|
||||
|
||||
---
|
||||
|
||||
## 1. Definitions & Relationships
|
||||
|
||||
### Service Level Indicator (SLI)
|
||||
|
||||
An SLI is the quantitative measurement of a specific aspect of service quality. SLIs are the raw data that feed everything above them. They must be precisely defined, automatically collected, and unambiguous.
|
||||
|
||||
**Common SLI types by service:**
|
||||
|
||||
| Service Type | SLI | Measurement Method |
|
||||
|---|---|---|
|
||||
| Web Application | Request latency (p50, p95, p99) | Server-side histogram |
|
||||
| Web Application | Availability (successful responses / total requests) | Load balancer logs |
|
||||
| REST API | Error rate (5xx responses / total responses) | API gateway metrics |
|
||||
| REST API | Throughput (requests per second) | Counter metric |
|
||||
| Database | Query latency (p99) | Slow query log + APM |
|
||||
| Database | Replication lag (seconds) | Replica monitoring |
|
||||
| Message Queue | End-to-end delivery latency | Timestamp comparison |
|
||||
| Message Queue | Message loss rate | Producer vs consumer counts |
|
||||
| Storage | Durability (objects lost / objects stored) | Integrity checksums |
|
||||
| CDN | Cache hit ratio | Edge server logs |
|
||||
|
||||
**SLI specification formula:**
|
||||
|
||||
```
|
||||
SLI = (good events / total events) x 100
|
||||
```
|
||||
|
||||
For availability: `SLI = (successful requests / total requests) x 100`
|
||||
For latency: `SLI = (requests faster than threshold / total requests) x 100`
|
||||
|
||||
### Service Level Objective (SLO)
|
||||
|
||||
An SLO is the target value or range for an SLI. It defines the acceptable level of reliability. SLOs are internal goals that engineering teams commit to.
|
||||
|
||||
**Setting meaningful SLOs:**
|
||||
|
||||
1. Measure the current baseline over 30 days minimum
|
||||
2. Subtract a safety margin (typically 0.05%-0.1% below actual performance)
|
||||
3. Validate against user expectations and business requirements
|
||||
4. Never set an SLO higher than what the system can sustain without heroics
|
||||
|
||||
**Common pitfall:** Setting 99.99% availability when 99.9% meets every user need. The jump from 99.9% to 99.99% is a 10x reduction in allowed downtime and typically requires 3-5x the engineering investment.
|
||||
|
||||
**SLO examples:**
|
||||
|
||||
- `99.9% of HTTP requests return a non-5xx response within each calendar month`
|
||||
- `95% of API requests complete in under 200ms (p95 latency)`
|
||||
- `99.95% of messages are delivered within 30 seconds of production`
|
||||
|
||||
### Service Level Agreement (SLA)
|
||||
|
||||
An SLA is a formal contract between a service provider and its customers that specifies consequences for failing to meet defined service levels. SLAs must always be looser than SLOs to provide a buffer zone.
|
||||
|
||||
**Rule of thumb:** If your SLO is 99.95%, your SLA should be 99.9% or lower. The gap between SLO and SLA is your safety margin.
|
||||
|
||||
### The Hierarchy
|
||||
|
||||
```
|
||||
SLA (99.9%) ← Contract with customers, financial penalties
|
||||
↑ backs
|
||||
SLO (99.95%) ← Internal target, triggers error budget policy
|
||||
↑ targets
|
||||
SLI (measured) ← Raw metric: actual uptime = 99.97% this month
|
||||
```
|
||||
|
||||
**Standard combinations by tier:**
|
||||
|
||||
| Tier | SLI (Metric) | SLO (Target) | SLA (Contract) | Allowed Downtime/Month |
|
||||
|---|---|---|---|---|
|
||||
| Critical (payments) | Availability | 99.99% | 99.95% | SLO: 4.38 min / SLA: 21.9 min |
|
||||
| High (core API) | Availability | 99.95% | 99.9% | SLO: 21.9 min / SLA: 43.8 min |
|
||||
| Standard (dashboard) | Availability | 99.9% | 99.5% | SLO: 43.8 min / SLA: 3.65 hrs |
|
||||
| Low (internal tools) | Availability | 99.5% | 99.0% | SLO: 3.65 hrs / SLA: 7.3 hrs |
|
||||
|
||||
---
|
||||
|
||||
## 2. Error Budget Policy
|
||||
|
||||
### What Is an Error Budget
|
||||
|
||||
An error budget is the maximum amount of unreliability a service can have within a given period while still meeting its SLO. It is calculated as:
|
||||
|
||||
```
|
||||
Error Budget = 1 - SLO target
|
||||
```
|
||||
|
||||
For a 99.9% SLO over a 30-day month (43,200 minutes):
|
||||
|
||||
```
|
||||
Error Budget = 1 - 0.999 = 0.001 = 0.1%
|
||||
Allowed Downtime = 43,200 x 0.001 = 43.2 minutes
|
||||
```
|
||||
|
||||
### Downtime Allowances by SLO
|
||||
|
||||
| SLO | Error Budget | Monthly Downtime | Quarterly Downtime | Annual Downtime |
|
||||
|---|---|---|---|---|
|
||||
| 99.0% | 1.0% | 7 hrs 18 min | 21 hrs 54 min | 3 days 15 hrs |
|
||||
| 99.5% | 0.5% | 3 hrs 39 min | 10 hrs 57 min | 1 day 19 hrs |
|
||||
| 99.9% | 0.1% | 43.8 min | 2 hrs 11 min | 8 hrs 46 min |
|
||||
| 99.95% | 0.05% | 21.9 min | 1 hr 6 min | 4 hrs 23 min |
|
||||
| 99.99% | 0.01% | 4.38 min | 13.1 min | 52.6 min |
|
||||
| 99.999% | 0.001% | 26.3 sec | 78.9 sec | 5.26 min |
|
||||
|
||||
### Error Budget Consumption Tracking
|
||||
|
||||
Track budget consumption as a percentage of the total budget used so far in the current window:
|
||||
|
||||
```
|
||||
Budget Consumed (%) = (actual bad minutes / allowed bad minutes) x 100
|
||||
```
|
||||
|
||||
Example: SLO is 99.9% (43.2 min budget in a 30-day month, as computed above). On day 10, you have had 15 minutes of downtime.

```
Budget Consumed = (15 / 43.2) x 100 = 34.7%
Expected consumption at day 10 = (10/30) x 100 = 33.3%
Status: Slightly over pace (34.7% consumed at 33.3% of month elapsed)
|
||||
```
|
||||
|
||||
### Burn Rate
|
||||
|
||||
Burn rate measures how fast the error budget is being consumed relative to the steady-state rate:
|
||||
|
||||
```
|
||||
Burn Rate = (error rate observed / error rate allowed by SLO)
|
||||
```
|
||||
|
||||
A burn rate of 1.0 means the budget will be exactly exhausted by the end of the window. A burn rate of 10 means the budget will be exhausted in 1/10th of the window.
|
||||
|
||||
**Burn rate to time-to-exhaustion (30-day month):**
|
||||
|
||||
| Burn Rate | Budget Exhausted In | Urgency |
|
||||
|---|---|---|
|
||||
| 1x | 30 days | On pace, monitoring only |
|
||||
| 2x | 15 days | Elevated attention |
|
||||
| 6x | 5 days | Active investigation required |
|
||||
| 14.4x | 2.08 days (~50 hours) | Immediate page |
|
||||
| 36x | 20 hours | Critical, all-hands |
|
||||
| 720x | 1 hour | Total outage scenario |
|
||||
|
||||
### Error Budget Exhaustion Policy
|
||||
|
||||
When the error budget is consumed, the following actions trigger based on threshold:
|
||||
|
||||
**Tier 1 - Budget at 75% consumed (Yellow):**
|
||||
- Notify service team lead via automated alert
|
||||
- Freeze non-critical deployments to the affected service
|
||||
- Conduct pre-emptive review of upcoming changes for risk
|
||||
- Increase monitoring sensitivity (lower alert thresholds)
|
||||
|
||||
**Tier 2 - Budget at 100% consumed (Orange):**
|
||||
- Hard feature freeze on the affected service
|
||||
- Mandatory reliability sprint: all engineering effort redirected to reliability
|
||||
- Daily status updates to engineering leadership
|
||||
- Postmortem required for the incidents that consumed the budget
|
||||
- Freeze lasts until budget replenishes to 50% or systemic fixes are verified
|
||||
|
||||
**Tier 3 - Budget at 150% consumed / SLA breach imminent (Red):**
|
||||
- Escalation to VP Engineering and CTO
|
||||
- Cross-team war room if dependencies are involved
|
||||
- Customer communication prepared and staged
|
||||
- Legal and finance teams briefed on potential SLA credit obligations
|
||||
- Recovery plan with specific milestones required within 24 hours
|
||||
|
||||
### Error Budget Policy Template
|
||||
|
||||
```
|
||||
SERVICE: [service-name]
|
||||
SLO: [target]% availability over [rolling 30-day / calendar month] window
|
||||
ERROR BUDGET: [calculated] minutes per window
|
||||
|
||||
BUDGET THRESHOLDS:
|
||||
- 50% consumed: Team notification, increased vigilance
|
||||
- 75% consumed: Feature freeze for this service, reliability focus
|
||||
- 100% consumed: Full feature freeze, reliability sprint mandatory
|
||||
- SLA threshold crossed: Executive escalation, customer communication
|
||||
|
||||
REVIEW CADENCE: Monthly budget review on [day], quarterly SLO adjustment
|
||||
|
||||
EXCEPTIONS: Planned maintenance windows excluded if communicated 72+ hours in advance
|
||||
and within agreed maintenance allowance.
|
||||
|
||||
APPROVED BY: [Engineering Lead] / [Product Lead] / [Date]
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 3. SLA Breach Handling
|
||||
|
||||
### Detection Methods
|
||||
|
||||
**Automated detection (primary):**
|
||||
- Real-time monitoring dashboards with SLA burn-rate alerts
|
||||
- Automated SLA compliance calculations running every 5 minutes
|
||||
- Threshold-based alerts when cumulative downtime approaches SLA limits
|
||||
- Synthetic monitoring (external probes) for customer-perspective validation
|
||||
|
||||
**Manual review (secondary):**
|
||||
- Monthly SLA compliance reports generated on the 1st of each month
|
||||
- Customer-reported incidents cross-referenced with internal metrics
|
||||
- Quarterly audits comparing measured SLIs against contracted SLAs
|
||||
- Discrepancy review between internal metrics and customer-perceived availability
|
||||
|
||||
### Breach Classification
|
||||
|
||||
**Minor Breach:**
|
||||
- SLA missed by less than 0.05 percentage points (e.g., 99.85% vs 99.9% SLA)
|
||||
- Fewer than 3 discrete incidents contributed
|
||||
- No single incident exceeded 30 minutes
|
||||
- Customer impact was limited or partial degradation only
|
||||
- Financial credit: typically 5-10% of monthly service fee
|
||||
|
||||
**Major Breach:**
|
||||
- SLA missed by 0.05 to 0.5 percentage points
|
||||
- Extended outage of 1-4 hours in a single incident, or multiple significant incidents
|
||||
- Clear customer impact with support tickets generated
|
||||
- Financial credit: typically 10-25% of monthly service fee
|
||||
|
||||
**Critical Breach:**
|
||||
- SLA missed by more than 0.5 percentage points
|
||||
- Total outage exceeding 4 hours, or repeated major incidents in same window
|
||||
- Data loss, security incident, or compliance violation involved
|
||||
- Financial credit: typically 25-100% of monthly service fee
|
||||
- May trigger contract termination clauses
|
||||
|
||||
### Response Protocol
|
||||
|
||||
**For Minor Breach (within 3 business days):**
|
||||
1. Generate SLA compliance report with exact metrics
|
||||
2. Document contributing incidents with root causes
|
||||
3. Send proactive notification to customer success manager
|
||||
4. Issue service credits if contractually required (do not wait for customer to ask)
|
||||
5. File internal improvement ticket with 30-day remediation target
|
||||
|
||||
**For Major Breach (within 24 hours):**
|
||||
1. Incident commander confirms SLA impact calculation
|
||||
2. Draft customer communication (see template below)
|
||||
3. Executive sponsor reviews and approves communication
|
||||
4. Issue service credits with detailed breakdown
|
||||
5. Schedule root cause review with customer within 5 business days
|
||||
6. Produce remediation plan with committed timelines
|
||||
|
||||
**For Critical Breach (immediate):**
|
||||
1. Activate executive escalation chain
|
||||
2. Legal team reviews contractual exposure
|
||||
3. Finance team calculates credit obligations
|
||||
4. Customer communication from VP or C-level within 4 hours
|
||||
5. Dedicated remediation task force assigned
|
||||
6. Weekly status updates to customer until remediation complete
|
||||
7. Formal postmortem document shared with customer within 10 business days
|
||||
|
||||
### Customer Communication Template
|
||||
|
||||
```
|
||||
Subject: Service Level Update - [Service Name] - [Month Year]
|
||||
|
||||
Dear [Customer Name],
|
||||
|
||||
We are writing to inform you that [Service Name] did not meet the committed
|
||||
service level of [SLA target]% availability during [time period].
|
||||
|
||||
MEASURED PERFORMANCE: [actual]% availability
|
||||
COMMITTED SLA: [SLA target]% availability
|
||||
SHORTFALL: [delta] percentage points
|
||||
|
||||
CONTRIBUTING FACTORS:
|
||||
- [Date/Time]: [Brief description of incident] ([duration] impact)
|
||||
- [Date/Time]: [Brief description of incident] ([duration] impact)
|
||||
|
||||
SERVICE CREDIT: In accordance with our agreement, a credit of [amount/percentage]
|
||||
will be applied to your next invoice.
|
||||
|
||||
REMEDIATION ACTIONS:
|
||||
1. [Specific technical fix with completion date]
|
||||
2. [Process improvement with implementation date]
|
||||
3. [Monitoring enhancement with deployment date]
|
||||
|
||||
We take our service commitments seriously. [Name], [Title] is personally
|
||||
overseeing the remediation and is available to discuss further at your convenience.
|
||||
|
||||
Sincerely,
|
||||
[Name, Title]
|
||||
```
|
||||
|
||||
### Legal and Compliance Considerations
|
||||
|
||||
- Maintain auditable records of all SLA measurements for the full contract term plus 2 years
|
||||
- SLA calculations must use the measurement methodology defined in the contract, not internal approximations
|
||||
- Force majeure clauses typically exclude natural disasters, but verify per contract
|
||||
- Planned maintenance exclusions must match the exact notification procedures in the contract
|
||||
- Multi-region SLAs may have separate calculations per region; verify aggregation method
|
||||
|
||||
---
|
||||
|
||||
## 4. Incident-to-SLA Mapping
|
||||
|
||||
### Downtime Calculation Methodologies
|
||||
|
||||
**Full outage:** Service completely unavailable. Every minute counts as a full minute of downtime.
|
||||
|
||||
```
|
||||
Downtime = End Time - Start Time (in minutes)
|
||||
```
|
||||
|
||||
**Partial degradation:** Service available but impaired. Apply a degradation factor:
|
||||
|
||||
```
|
||||
Effective Downtime = Actual Duration x Degradation Factor
|
||||
```
|
||||
|
||||
| Degradation Level | Factor | Description |
|
||||
|---|---|---|
|
||||
| Complete outage | 1.0 | Service fully unavailable |
|
||||
| Severe degradation | 0.75 | >50% of requests failing or >10x latency |
|
||||
| Moderate degradation | 0.5 | 10-50% of requests affected or 3-10x latency |
|
||||
| Minor degradation | 0.25 | <10% of requests affected or <3x latency increase |
|
||||
| Cosmetic / non-functional | 0.0 | No impact on core SLI metrics |
|
||||
|
||||
**Note:** The exact degradation factors must be agreed upon in the SLA contract. The above are industry-standard starting points.
|
||||
|
||||
### Planned vs Unplanned Downtime
|
||||
|
||||
Most SLAs exclude pre-announced maintenance windows from availability calculations, subject to conditions:
|
||||
|
||||
- Notification provided N hours/days in advance (commonly 72 hours)
|
||||
- Maintenance occurs within an agreed window (e.g., Sunday 02:00-06:00 UTC)
|
||||
- Total planned downtime does not exceed the monthly maintenance allowance (e.g., 4 hours/month)
|
||||
- Any overrun beyond the planned window counts as unplanned downtime
|
||||
|
||||
```
|
||||
SLA Availability = (Total Minutes - Excluded Maintenance - Unplanned Downtime) / (Total Minutes - Excluded Maintenance) x 100
|
||||
```
|
||||
|
||||
### Multi-Service SLA Composition
|
||||
|
||||
When a customer-facing product depends on multiple services, composite SLA is calculated as:
|
||||
|
||||
**Serial dependency (all must be up):**
|
||||
```
|
||||
Composite SLA = SLA_A x SLA_B x SLA_C
|
||||
Example: 99.9% x 99.95% x 99.99% = 99.84%
|
||||
```
|
||||
|
||||
**Parallel / redundant (any one must be up):**
|
||||
```
|
||||
Composite Availability = 1 - ((1 - SLA_A) x (1 - SLA_B))
|
||||
Example: 1 - ((1 - 0.999) x (1 - 0.999)) = 1 - 0.000001 = 99.9999%
|
||||
```
|
||||
|
||||
This is critical during incidents: an outage in a shared dependency may breach SLAs for multiple customer-facing products simultaneously.
|
||||
|
||||
### Worked Examples
|
||||
|
||||
**Example 1: Simple outage**
|
||||
- Service: Core API (SLA: 99.9%)
|
||||
- Month: 30 days = 43,200 minutes
|
||||
- Incident: Full outage from 14:23 to 14:38 UTC on the 12th (15 minutes)
|
||||
- No other incidents this month
|
||||
|
||||
```
|
||||
Availability = (43,200 - 15) / 43,200 x 100 = 99.965%
|
||||
SLA Status: PASS (99.965% > 99.9%)
|
||||
Error Budget Consumed: 15 / 43.2 = 34.7%
|
||||
```
|
||||
|
||||
**Example 2: Partial degradation**
|
||||
- Service: Payment Processing (SLA: 99.95%)
|
||||
- Month: 30 days = 43,200 minutes
|
||||
- Incident: 50% of transactions failing for 4 hours (240 minutes)
|
||||
- Degradation factor: 0.5 (moderate - 50% of requests affected)
|
||||
|
||||
```
|
||||
Effective Downtime = 240 x 0.5 = 120 minutes
|
||||
Availability = (43,200 - 120) / 43,200 x 100 = 99.722%
|
||||
SLA Status: FAIL (99.722% < 99.95%)
|
||||
Shortfall: 0.228 percentage points → Major Breach
|
||||
```
|
||||
|
||||
**Example 3: Multiple incidents**
|
||||
- Service: Dashboard (SLA: 99.5%)
|
||||
- Month: 31 days = 44,640 minutes
|
||||
- Incident A: 45-minute full outage on the 5th
|
||||
- Incident B: 2-hour severe degradation (factor 0.75) on the 18th
|
||||
- Incident C: 30-minute full outage on the 25th
|
||||
|
||||
```
|
||||
Total Effective Downtime = 45 + (120 x 0.75) + 30 = 45 + 90 + 30 = 165 minutes
|
||||
Availability = (44,640 - 165) / 44,640 x 100 = 99.630%
|
||||
SLA Status: PASS (99.630% > 99.5%)
|
||||
Error Budget Consumed: 165 / 223.2 = 73.9% → approaching the 75% Yellow threshold; deployment freeze recommended
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 5. SLO Best Practices
|
||||
|
||||
### Start with User Journeys
|
||||
|
||||
Do not set SLOs based on infrastructure metrics. Start from what users experience:
|
||||
|
||||
1. Identify critical user journeys (e.g., "User completes checkout")
|
||||
2. Map each journey to the services and dependencies involved
|
||||
3. Define what "good" looks like for each journey (fast, error-free, complete)
|
||||
4. Select the SLIs that most directly measure that user experience
|
||||
5. Set SLO targets that reflect the minimum acceptable user experience
|
||||
|
||||
A database with 99.99% uptime is meaningless if the API in front of it has a bug causing 5% error rates.
|
||||
|
||||
### The Four Golden Signals as SLI Sources
|
||||
|
||||
From Google SRE, the four golden signals provide comprehensive service health:
|
||||
|
||||
| Signal | SLI Example | Typical SLO |
|
||||
|---|---|---|
|
||||
| Latency | p99 request duration < 500ms | 99% of requests under threshold |
|
||||
| Traffic | Requests per second | N/A (capacity planning, not SLO) |
|
||||
| Errors | 5xx rate as % of total requests | < 0.1% error rate over rolling window |
|
||||
| Saturation | CPU/memory/queue depth | < 80% utilization (capacity SLI) |
|
||||
|
||||
For most services, latency and error rate are the two most important SLIs to back with SLOs.
|
||||
|
||||
### Setting SLO Targets
|
||||
|
||||
1. Collect 90 days of historical SLI data
|
||||
2. Calculate the 5th percentile performance (worst 5% of days)
|
||||
3. Set SLO slightly above that baseline (this ensures the SLO is achievable without heroics)
|
||||
4. Validate: would a breach at this level actually impact users negatively?
|
||||
5. Adjust upward only if user impact analysis demands it
|
||||
|
||||
**Never set SLOs by aspiration.** A 99.99% SLO on a service that has historically achieved 99.93% is a guaranteed source of perpetual firefighting with no reliability improvement.
|
||||
|
||||
### Review Cadence
|
||||
|
||||
- **Weekly:** Review current error budget burn rate, flag services approaching thresholds
|
||||
- **Monthly:** Full SLO compliance review, adjust alert thresholds if needed
|
||||
- **Quarterly:** Reassess SLO targets based on 90-day data, review SLA contract alignment
|
||||
- **Annually:** Strategic SLO review tied to product roadmap and infrastructure investments
|
||||
|
||||
### Anti-Patterns
|
||||
|
||||
| Anti-Pattern | Problem | Fix |
|
||||
|---|---|---|
|
||||
| Vanity SLOs | Setting 99.99% to impress, then ignoring breaches | Set achievable targets, enforce budget policy |
|
||||
| SLO Inflation | Ratcheting SLOs up whenever performance is good | Only increase SLOs when users demonstrably need it |
|
||||
| Unmeasured SLAs | Committing contractual SLAs without actual SLI measurement | Instrument SLIs before signing SLA contracts |
|
||||
| Copy-Paste SLOs | Same SLO for every service regardless of criticality | Tier services by business impact, set SLOs accordingly |
|
||||
| Ignoring Dependencies | Setting aggressive SLOs without accounting for dependency reliability | Calculate composite SLA; your SLO cannot exceed dependency chain |
|
||||
| Alert-Free SLOs | Having SLOs but no automated alerting on budget consumption | Every SLO must have corresponding burn rate alerts |
|
||||
|
||||
---
|
||||
|
||||
## 6. Monitoring & Alerting for SLAs
|
||||
|
||||
### Multi-Window Burn Rate Alerting
|
||||
|
||||
The Google SRE approach uses multiple time windows to balance speed of detection against alert noise. Each alert condition requires both a short window (for speed) and a long window (for confirmation):
|
||||
|
||||
**Alert configuration matrix:**
|
||||
|
||||
| Severity | Short Window | Short Threshold | Long Window | Long Threshold | Action |
|
||||
|---|---|---|---|---|---|
|
||||
| Critical (Page) | 5 minutes | > 14.4x burn rate | 1 hour | > 14.4x burn rate | Wake someone up |
| High (Page) | 30 minutes | > 6x burn rate | 6 hours | > 6x burn rate | Page on-call within 30 min |
| Medium (Ticket) | 6 hours | > 1x burn rate | 3 days | > 1x burn rate | Create ticket, next business day |
|
||||
|
||||
**Why these specific numbers:**
|
||||
|
||||
- 14.4x burn rate over 1 hour consumes 2% of monthly budget in that hour. At this rate, the entire 30-day budget is gone in ~50 hours. This demands immediate human attention.
|
||||
- 6x burn rate over 6 hours consumes 5% of monthly budget. The budget will be exhausted in 5 days. Urgent but not wake-up-at-3am urgent.
|
||||
- 1x burn rate over 3 days means you are on pace to exactly exhaust the budget. This needs investigation but is not an emergency.
|
||||
|
||||
### Burn Rate Alert Formulas
|
||||
|
||||
For a given time window, calculate the burn rate:
|
||||
|
||||
```
|
||||
burn_rate = (error_count_in_window / request_count_in_window) / (1 - SLO_target)
|
||||
```
|
||||
|
||||
Example for a 99.9% SLO, observing 50 errors out of 10,000 requests in a 1-hour window:
|
||||
|
||||
```
|
||||
observed_error_rate = 50 / 10,000 = 0.005 (0.5%)
|
||||
allowed_error_rate = 1 - 0.999 = 0.001 (0.1%)
|
||||
burn_rate = 0.005 / 0.001 = 5.0
|
||||
```
|
||||
|
||||
A burn rate of 5.0 means the error budget is being consumed 5 times faster than the sustainable rate.
|
||||
|
||||
### Alert Severity to SLA Risk Mapping
|
||||
|
||||
| Burn Rate | Budget Impact | SLA Risk | Response |
|
||||
|---|---|---|---|
|
||||
| < 1x | Under budget pace | None | Routine monitoring |
|
||||
| 1x - 3x | On pace or slightly over | Low | Investigate next business day |
|
||||
| 3x - 6x | Budget will exhaust in 5-10 days | Moderate | Investigate within 4 hours |
|
||||
| 6x - 14.4x | Budget will exhaust in 2-5 days | High | Page on-call, respond in 30 min |
|
||||
| > 14.4x | Budget will exhaust in < 2 days | Critical | Immediate page, incident declared |
|
||||
| > 100x | Active major outage | SLA breach imminent | All-hands incident response |
|
||||
|
||||
### Dashboard Design for SLA Tracking
|
||||
|
||||
Every SLA-tracked service should have a dashboard with these panels:
|
||||
|
||||
**Row 1 - Current Status:**
|
||||
- Current availability (real-time, rolling 5-minute window)
|
||||
- Current error rate (real-time)
|
||||
- Current p99 latency (real-time)
|
||||
|
||||
**Row 2 - Budget Status:**
|
||||
- Error budget remaining (% of monthly budget, gauge visualization)
|
||||
- Budget consumption timeline (line chart, actual vs expected burn)
|
||||
- Budget burn rate (current 1h, 6h, and 3d burn rates)
|
||||
|
||||
**Row 3 - Historical Context:**
|
||||
- 30-day availability trend (daily granularity)
|
||||
- SLA compliance status for current and previous 3 months
|
||||
- Incident markers overlaid on availability timeline
|
||||
|
||||
**Row 4 - Dependencies:**
|
||||
- Upstream dependency availability (services this service depends on)
|
||||
- Downstream impact scope (services that depend on this service)
|
||||
- Composite SLA calculation for customer-facing products
|
||||
|
||||
### Alert Fatigue Prevention
|
||||
|
||||
Alert fatigue is the primary reason SLA monitoring fails in practice. Mitigation strategies:
|
||||
|
||||
1. **Require dual-window confirmation.** Never page on a single short window. Always require both the short window (for speed) and long window (for persistence) to fire simultaneously.
|
||||
|
||||
2. **Separate page-worthy from ticket-worthy.** Only two conditions should wake someone up: >14.4x burn rate sustained, or >6x burn rate sustained. Everything else is a ticket.
|
||||
|
||||
3. **Deduplicate aggressively.** If the same service triggers both a latency and error rate alert for the same underlying issue, group them into a single notification.
|
||||
|
||||
4. **Auto-resolve.** Alerts must auto-resolve when the burn rate drops below threshold. Never leave stale alerts open.
|
||||
|
||||
5. **Review alert quality monthly.** Track the ratio of actionable alerts to total alerts. Target >80% actionable rate. If an alert fires and no human action is needed, tune or remove it.
|
||||
|
||||
6. **Escalation, not repetition.** If an alert is not acknowledged within the response window, escalate to the next tier. Do not re-send the same alert every 5 minutes.
|
||||
|
||||
### Practical Monitoring Stack
|
||||
|
||||
| Layer | Tool Category | Purpose |
|
||||
|---|---|---|
|
||||
| Collection | Prometheus, OpenTelemetry, StatsD | Gather SLI metrics from services |
|
||||
| Storage | Prometheus TSDB, Thanos, Mimir | Retain metrics for SLO window + 90 days |
|
||||
| Calculation | Prometheus recording rules, Sloth | Pre-compute burn rates and budget consumption |
|
||||
| Alerting | Alertmanager, PagerDuty, OpsGenie | Route alerts by severity and schedule |
|
||||
| Visualization | Grafana, Datadog | Dashboards for real-time and historical SLA views |
|
||||
| Reporting | Custom scripts, SLO generators | Monthly SLA compliance reports for customers |
|
||||
|
||||
**Retention requirement:** SLI data must be retained for at least the SLA reporting period (typically monthly or quarterly) plus a 90-day dispute window. Annual SLA reviews require 12 months of data at daily granularity minimum.
|
||||
|
||||
---
|
||||
|
||||
*Last updated: February 2026*
|
||||
*For use with: incident-commander skill*
|
||||
*Maintainer: Engineering Team*
|
||||
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Incident Timeline Builder
|
||||
|
||||
Builds structured incident timelines with automatic phase detection, gap analysis,
|
||||
communication template generation, and response metrics calculation. Produces
|
||||
professional reports suitable for post-incident review and stakeholder briefing.
|
||||
|
||||
Usage:
|
||||
python incident_timeline_builder.py incident_data.json
|
||||
python incident_timeline_builder.py incident_data.json --format json
|
||||
python incident_timeline_builder.py incident_data.json --format markdown
|
||||
cat incident_data.json | python incident_timeline_builder.py --format text
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Configuration Constants
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Canonical timestamp format for all event timestamps (UTC, trailing "Z").
ISO_FORMAT = "%Y-%m-%dT%H:%M:%SZ"

# Recognized event types for timeline entries; IncidentEvent lower-cases and
# strips incoming types so they can be matched against this vocabulary.
EVENT_TYPES = [
    "detection", "declaration", "escalation", "investigation",
    "mitigation", "communication", "resolution", "action_item",
]

# Severity taxonomy: lower rank means more severe (SEV1 = most critical).
SEVERITY_LEVELS = {
    "SEV1": {"label": "Critical", "rank": 1},
    "SEV2": {"label": "Major", "rank": 2},
    "SEV3": {"label": "Minor", "rank": 3},
    "SEV4": {"label": "Low", "rank": 4},
}

# Ordered incident lifecycle phases. An event whose type appears in a phase's
# trigger_types anchors that phase during phase detection. Order matters: it
# reflects the expected progression Detection -> ... -> Resolution.
PHASE_DEFINITIONS = [
    {"name": "Detection", "trigger_types": ["detection"],
     "description": "Issue detected via monitoring, alerting, or user report."},
    {"name": "Triage", "trigger_types": ["declaration", "escalation"],
     "description": "Incident declared, severity assessed, commander assigned."},
    {"name": "Investigation", "trigger_types": ["investigation"],
     "description": "Root cause analysis and impact assessment underway."},
    {"name": "Mitigation", "trigger_types": ["mitigation"],
     "description": "Active work to reduce or eliminate customer impact."},
    {"name": "Resolution", "trigger_types": ["resolution"],
     "description": "Service restored to normal operating parameters."},
]

# Stretches of the timeline with no logged events longer than this many
# minutes are reported as gaps.
GAP_THRESHOLD_MINUTES = 15

# Event types that count as command decision points in the analysis
# (see IncidentEvent.is_decision_point).
DECISION_EVENT_TYPES = {"escalation", "mitigation", "declaration", "resolution"}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Data Model Classes
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class IncidentEvent:
    """A single entry in the incident timeline.

    Wraps one raw event record (timestamp, type, actor, description,
    metadata), keeping both the raw timestamp string and its parsed form,
    and normalizing the event type for reliable matching.
    """

    def __init__(self, data: Dict[str, Any]):
        raw_ts = data.get("timestamp", "")
        # Keep the raw string for round-tripping alongside the parsed value.
        self.timestamp_raw: str = raw_ts
        self.timestamp: Optional[datetime] = _parse_timestamp(raw_ts)
        # Normalize case and surrounding whitespace so type comparisons
        # against EVENT_TYPES / DECISION_EVENT_TYPES are reliable.
        self.type: str = data.get("type", "unknown").lower().strip()
        self.actor: str = data.get("actor", "unknown")
        self.description: str = data.get("description", "")
        self.metadata: Dict[str, Any] = data.get("metadata", {})

    def to_dict(self) -> Dict[str, Any]:
        """Serialize back to a plain dict; metadata appears only when non-empty."""
        serialized: Dict[str, Any] = {
            "timestamp": self.timestamp_raw,
            "type": self.type,
            "actor": self.actor,
            "description": self.description,
        }
        if self.metadata:
            serialized["metadata"] = self.metadata
        return serialized

    @property
    def is_decision_point(self) -> bool:
        """True when this event's type marks a command decision."""
        return self.type in DECISION_EVENT_TYPES
|
||||
|
||||
|
||||
class IncidentPhase:
    """One detected phase (Detection, Triage, ...) of the incident lifecycle."""

    def __init__(self, name: str, description: str):
        self.name: str = name
        self.description: str = description
        # Boundaries and member events are filled in during timeline analysis.
        self.start_time: Optional[datetime] = None
        self.end_time: Optional[datetime] = None
        self.events: List[IncidentEvent] = []

    @property
    def duration_minutes(self) -> Optional[float]:
        """Phase length in minutes, or None while either boundary is unset."""
        if self.start_time is None or self.end_time is None:
            return None
        return (self.end_time - self.start_time).total_seconds() / 60.0

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the phase summary (ISO timestamps, duration rounded to 0.1 min)."""
        def _fmt(ts: Optional[datetime]) -> Optional[str]:
            # Unset boundaries serialize as None rather than a formatted string.
            return ts.strftime(ISO_FORMAT) if ts else None

        minutes = self.duration_minutes
        return {
            "name": self.name,
            "description": self.description,
            "start_time": _fmt(self.start_time),
            "end_time": _fmt(self.end_time),
            "duration_minutes": round(minutes, 1) if minutes is not None else None,
            "event_count": len(self.events),
        }
|
||||
|
||||
|
||||
class CommunicationTemplate:
    """One ready-to-send message targeted at a specific audience."""

    def __init__(self, template_type: str, audience: str, subject: str, body: str):
        self.template_type = template_type
        self.audience = audience
        self.subject = subject
        self.body = body

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a plain dict for JSON output."""
        return {
            "template_type": self.template_type,
            "audience": self.audience,
            "subject": self.subject,
            "body": self.body,
        }
class TimelineGap:
    """A stretch of the timeline with no recorded events (a logging blind spot)."""

    def __init__(self, start: datetime, end: datetime, duration_minutes: float):
        self.start = start
        self.end = end
        self.duration_minutes = duration_minutes

    def to_dict(self) -> Dict[str, Any]:
        """Serialize for JSON output; duration rounded to one decimal place."""
        return {
            "start": self.start.strftime(ISO_FORMAT),
            "end": self.end.strftime(ISO_FORMAT),
            "duration_minutes": round(self.duration_minutes, 1),
        }
class TimelineAnalysis:
    """Holds the complete analysis result for an incident timeline."""

    def __init__(self):
        # Incident metadata (populated by parse_incident_data).
        self.incident_id: str = ""
        self.incident_title: str = ""
        self.severity: str = ""
        self.status: str = ""
        self.commander: str = ""
        self.service: str = ""
        self.affected_services: List[str] = []
        self.declared_at: Optional[datetime] = None
        self.resolved_at: Optional[datetime] = None
        # Chronologically sorted events (populated by parse_incident_data).
        self.events: List[IncidentEvent] = []
        # Derived artifacts, each filled in by its pipeline stage.
        self.phases: List[IncidentPhase] = []
        self.gaps: List[TimelineGap] = []
        self.decision_points: List[IncidentEvent] = []
        self.metrics: Dict[str, Any] = {}
        self.communications: List[CommunicationTemplate] = []
        # Non-fatal problems encountered while parsing (bad timestamps, etc.).
        self.errors: List[str] = []
# ---------------------------------------------------------------------------
# Timestamp Helpers
# ---------------------------------------------------------------------------

def _parse_timestamp(raw: str) -> Optional[datetime]:
|
||||
"""Parse an ISO-8601 timestamp string into a datetime object."""
|
||||
if not raw:
|
||||
return None
|
||||
cleaned = raw.replace("Z", "+00:00") if raw.endswith("Z") else raw
|
||||
try:
|
||||
return datetime.fromisoformat(cleaned).replace(tzinfo=None)
|
||||
except (ValueError, AttributeError):
|
||||
pass
|
||||
try:
|
||||
return datetime.strptime(raw, ISO_FORMAT)
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
|
||||
def _fmt_duration(minutes: Optional[float]) -> str:
|
||||
"""Format a duration in minutes as a human-readable string."""
|
||||
if minutes is None:
|
||||
return "N/A"
|
||||
if minutes < 1:
|
||||
return f"{minutes * 60:.0f}s"
|
||||
if minutes < 60:
|
||||
return f"{minutes:.0f}m"
|
||||
hours, remaining = int(minutes // 60), int(minutes % 60)
|
||||
return f"{hours}h" if remaining == 0 else f"{hours}h {remaining}m"
|
||||
|
||||
|
||||
def _fmt_ts(dt: Optional[datetime]) -> str:
|
||||
"""Format a datetime as HH:MM:SS for display."""
|
||||
return dt.strftime("%H:%M:%S") if dt else "??:??:??"
|
||||
|
||||
|
||||
def _sev_label(sev: str) -> str:
    """Map a severity code (e.g. "SEV1") to its human-readable label.

    Falls back to the code itself when the severity is unknown.
    """
    info = SEVERITY_LEVELS.get(sev, {})
    return info.get("label", sev)
# ---------------------------------------------------------------------------
# Core Analysis Functions
# ---------------------------------------------------------------------------

def parse_incident_data(data: Dict[str, Any]) -> TimelineAnalysis:
    """Parse raw incident JSON into a TimelineAnalysis with populated fields.

    Events whose timestamps cannot be parsed are dropped (and recorded in
    ``errors``); the surviving events are sorted chronologically.
    """
    analysis = TimelineAnalysis()
    meta = data.get("incident", {})
    analysis.incident_id = meta.get("id", "UNKNOWN")
    analysis.incident_title = meta.get("title", "Untitled Incident")
    analysis.severity = meta.get("severity", "UNKNOWN").upper()
    analysis.status = meta.get("status", "unknown").lower()
    analysis.commander = meta.get("commander", "Unassigned")
    analysis.service = meta.get("service", "unknown")
    analysis.affected_services = meta.get("affected_services", [])
    analysis.declared_at = _parse_timestamp(meta.get("declared_at", ""))
    analysis.resolved_at = _parse_timestamp(meta.get("resolved_at", ""))

    raw_events = data.get("events", [])
    if not raw_events:
        analysis.errors.append("No events found in incident data.")
        return analysis

    for entry in raw_events:
        parsed = IncidentEvent(entry)
        if parsed.timestamp is not None:
            analysis.events.append(parsed)
        else:
            analysis.errors.append(
                f"Skipping event with unparseable timestamp: {entry.get('timestamp', '')}")

    analysis.events.sort(key=lambda ev: ev.timestamp)  # type: ignore[arg-type]
    return analysis
def detect_phases(analysis: TimelineAnalysis) -> None:
    """Detect incident lifecycle phases from the ordered event stream.

    Walks events in time order; the first event whose type triggers a
    not-yet-seen phase opens that phase and closes the previous one. Every
    event is attributed to whichever phase is open when it occurs. The final
    phase is closed at resolution time (or at the last event when unresolved).
    """
    if not analysis.events:
        return

    # Map each trigger event type to the phase definition it opens.
    triggers: Dict[str, Dict[str, str]] = {}
    for definition in PHASE_DEFINITIONS:
        for trig in definition["trigger_types"]:
            triggers[trig] = {"name": definition["name"],
                              "description": definition["description"]}

    seen: Dict[str, IncidentPhase] = {}
    ordered_names: List[str] = []
    open_phase: Optional[IncidentPhase] = None

    for event in analysis.events:
        info = triggers.get(event.type)
        # Only the FIRST trigger for a given phase opens it.
        if info is not None and info["name"] not in seen:
            if open_phase is not None:
                open_phase.end_time = event.timestamp
            new_phase = IncidentPhase(info["name"], info["description"])
            new_phase.start_time = event.timestamp
            seen[info["name"]] = new_phase
            ordered_names.append(info["name"])
            open_phase = new_phase
        if open_phase is not None:
            open_phase.events.append(event)

    if open_phase is not None:
        open_phase.end_time = analysis.resolved_at or analysis.events[-1].timestamp

    analysis.phases = [seen[name] for name in ordered_names]
def detect_gaps(analysis: TimelineAnalysis) -> None:
    """Identify gaps longer than GAP_THRESHOLD_MINUTES between consecutive events."""
    for earlier, later in zip(analysis.events, analysis.events[1:]):
        if earlier.timestamp is None or later.timestamp is None:
            continue
        minutes_apart = (later.timestamp - earlier.timestamp).total_seconds() / 60.0
        if minutes_apart >= GAP_THRESHOLD_MINUTES:
            analysis.gaps.append(
                TimelineGap(start=earlier.timestamp, end=later.timestamp,
                            duration_minutes=minutes_apart))
def identify_decision_points(analysis: TimelineAnalysis) -> None:
    """Extract key decision-point events from the timeline, in event order."""
    analysis.decision_points = [
        event for event in analysis.events if event.is_decision_point
    ]
def calculate_metrics(analysis: TimelineAnalysis) -> None:
    """Calculate incident response metrics: MTTD, MTTR, phase durations.

    Results land in ``analysis.metrics``. MTTD is measured from the first
    event to the first 'detection' event; MTTR from first detection to
    resolution. Any metric whose inputs are missing is set to None.
    """
    def span_minutes(start, end):
        # Rounded minute delta, or None when either endpoint is unknown.
        if start is None or end is None:
            return None
        return round((end - start).total_seconds() / 60.0, 1)

    metrics: Dict[str, Any] = {}
    detections = [ev for ev in analysis.events if ev.type == "detection"]
    first_detection = detections[0].timestamp if detections else None
    first_event_ts = analysis.events[0].timestamp if analysis.events else None

    # MTTD: first event to first detection.
    metrics["mttd_minutes"] = span_minutes(first_event_ts, first_detection)
    # MTTR: detection to resolution.
    metrics["mttr_minutes"] = span_minutes(first_detection, analysis.resolved_at)
    # Total duration: declaration to resolution.
    metrics["total_duration_minutes"] = span_minutes(analysis.declared_at,
                                                     analysis.resolved_at)

    # Per-phase durations.
    phase_durations: Dict[str, Any] = {}
    for phase in analysis.phases:
        dur = phase.duration_minutes
        phase_durations[phase.name] = None if dur is None else round(dur, 1)
    metrics["phase_durations"] = phase_durations

    # Event counts by type.
    counts: Dict[str, int] = {}
    for ev in analysis.events:
        counts[ev.type] = counts.get(ev.type, 0) + 1
    metrics["event_counts_by_type"] = counts

    # Gap statistics.
    gap_minutes = [gap.duration_minutes for gap in analysis.gaps]
    metrics["gap_count"] = len(gap_minutes)
    metrics["longest_gap_minutes"] = round(max(gap_minutes), 1) if gap_minutes else 0
    metrics["total_gap_minutes"] = round(sum(gap_minutes), 1) if gap_minutes else 0

    metrics["total_events"] = len(analysis.events)
    metrics["decision_point_count"] = len(analysis.decision_points)
    metrics["phase_count"] = len(analysis.phases)
    analysis.metrics = metrics
# ---------------------------------------------------------------------------
# Communication Template Generation
# ---------------------------------------------------------------------------

def generate_communications(analysis: TimelineAnalysis) -> None:
    """Generate four communication templates based on incident data.

    Appends to ``analysis.communications`` (in order): an internal stakeholder
    notification, an external status-page update, an executive summary, and an
    external customer notification. Wording for the status page and customer
    message depends on whether ``analysis.status`` is "resolved". Must run
    after calculate_metrics(), since durations are read from the metrics dict.
    """
    sev, sl = analysis.severity, _sev_label(analysis.severity)
    title, svc = analysis.incident_title, analysis.service
    affected = ", ".join(analysis.affected_services) or "none identified"
    cmd, iid = analysis.commander, analysis.incident_id
    # "TBD" placeholders keep templates usable while the incident is live.
    decl = analysis.declared_at.strftime("%Y-%m-%d %H:%M UTC") if analysis.declared_at else "TBD"
    resv = analysis.resolved_at.strftime("%Y-%m-%d %H:%M UTC") if analysis.resolved_at else "TBD"
    dur = _fmt_duration(analysis.metrics.get("total_duration_minutes"))
    resolved = analysis.status == "resolved"

    # 1 -- Initial stakeholder notification
    analysis.communications.append(CommunicationTemplate(
        "initial_notification", "internal", f"[{sev}] Incident Declared: {title}",
        f"An incident has been declared for {svc}.\n\n"
        f"Incident ID: {iid}\nSeverity: {sev} ({sl})\nCommander: {cmd}\n"
        f"Declared at: {decl}\nAffected services: {affected}\n\n"
        f"The incident team is actively investigating. Updates will follow.",
    ))

    # 2 -- Status page update
    if resolved:
        sp_subj = f"[Resolved] {title}"
        sp_body = (f"The incident affecting {svc} has been resolved.\n\n"
                   f"Duration: {dur}\nAll affected services ({affected}) are restored. "
                   f"A post-incident review will be published within 48 hours.")
    else:
        sp_subj = f"[Investigating] {title}"
        sp_body = (f"We are investigating degraded performance in {svc}. "
                   f"Affected services: {affected}.\n\n"
                   f"Our team is working to identify the root cause. Updates every 30 minutes.")
    analysis.communications.append(CommunicationTemplate(
        "status_page", "external", sp_subj, sp_body))

    # 3 -- Executive summary
    phase_lines = "\n".join(
        f" - {p.name}: {_fmt_duration(p.duration_minutes)}" for p in analysis.phases
    ) or " No phase data available."
    mttd = _fmt_duration(analysis.metrics.get("mttd_minutes"))
    mttr = _fmt_duration(analysis.metrics.get("mttr_minutes"))
    analysis.communications.append(CommunicationTemplate(
        "executive_summary", "executive", f"Executive Summary: {iid} - {title}",
        f"Incident: {iid} - {title}\nSeverity: {sev} ({sl})\n"
        f"Service: {svc}\nCommander: {cmd}\nStatus: {analysis.status.capitalize()}\n"
        f"Declared: {decl}\nResolved: {resv}\nDuration: {dur}\n\n"
        f"Key Metrics:\n - MTTD: {mttd}\n - MTTR: {mttr}\n"
        f" - Timeline Gaps: {analysis.metrics.get('gap_count', 0)}\n\n"
        f"Phase Breakdown:\n{phase_lines}\n\nAffected Services: {affected}",
    ))

    # 4 -- Customer notification
    if resolved:
        cust_body = (f"We experienced an issue affecting {svc} starting at {decl}.\n\n"
                     f"The issue was resolved at {resv} (duration: {dur}). "
                     f"We apologize for any inconvenience and are reviewing to prevent recurrence.")
    else:
        cust_body = (f"We are experiencing an issue affecting {svc} starting at {decl}.\n\n"
                     f"Our engineering team is actively working to resolve this. "
                     f"We will provide updates as the situation develops. We apologize for the inconvenience.")
    analysis.communications.append(CommunicationTemplate(
        "customer_notification", "external", f"Service Update: {title}", cust_body))
# ---------------------------------------------------------------------------
# Main Analysis Orchestrator
# ---------------------------------------------------------------------------

def build_timeline(data: Dict[str, Any]) -> TimelineAnalysis:
    """Run the full timeline analysis pipeline on raw incident data.

    Stages: parse -> phase detection -> gap detection -> decision points ->
    metrics -> communications. Aborts early when parsing produced errors
    and no usable events.
    """
    analysis = parse_incident_data(data)
    if analysis.errors and not analysis.events:
        return analysis
    for stage in (detect_phases, detect_gaps, identify_decision_points,
                  calculate_metrics, generate_communications):
        stage(analysis)
    return analysis
# ---------------------------------------------------------------------------
# Output Formatters
# ---------------------------------------------------------------------------

def format_text_output(analysis: TimelineAnalysis) -> str:
    """Format the analysis as a human-readable text report.

    Sections: summary, key metrics, phases, chronological timeline (decision
    points starred), gap warnings, decision points, and generated
    communications. Parse warnings are printed first; when there are no
    events, only the header and warnings are emitted.
    """
    L: List[str] = []
    w = 64  # banner width

    L.append("=" * w)
    L.append("INCIDENT TIMELINE REPORT")
    L.append("=" * w)
    L.append("")

    if analysis.errors:
        for err in analysis.errors:
            L.append(f" WARNING: {err}")
        L.append("")
    # Nothing to report beyond warnings when no events parsed.
    if not analysis.events:
        return "\n".join(L)

    # Summary
    L.append("INCIDENT SUMMARY")
    L.append("-" * 32)
    L.append(f" ID: {analysis.incident_id}")
    L.append(f" Title: {analysis.incident_title}")
    L.append(f" Severity: {analysis.severity}")
    L.append(f" Status: {analysis.status.capitalize()}")
    L.append(f" Commander: {analysis.commander}")
    L.append(f" Service: {analysis.service}")
    if analysis.affected_services:
        L.append(f" Affected: {', '.join(analysis.affected_services)}")
    L.append(f" Duration: {_fmt_duration(analysis.metrics.get('total_duration_minutes'))}")
    L.append("")

    # Key metrics
    L.append("KEY METRICS")
    L.append("-" * 32)
    L.append(f" MTTD (Mean Time to Detect): {_fmt_duration(analysis.metrics.get('mttd_minutes'))}")
    L.append(f" MTTR (Mean Time to Resolve): {_fmt_duration(analysis.metrics.get('mttr_minutes'))}")
    L.append(f" Total Events: {analysis.metrics.get('total_events', 0)}")
    L.append(f" Decision Points: {analysis.metrics.get('decision_point_count', 0)}")
    L.append(f" Timeline Gaps (>{GAP_THRESHOLD_MINUTES}m): {analysis.metrics.get('gap_count', 0)}")
    L.append("")

    # Phases
    L.append("INCIDENT PHASES")
    L.append("-" * 32)
    if analysis.phases:
        for p in analysis.phases:
            L.append(f" [{_fmt_ts(p.start_time)} - {_fmt_ts(p.end_time)}] {p.name} ({_fmt_duration(p.duration_minutes)})")
            L.append(f" {p.description}")
            L.append(f" Events: {len(p.events)}")
    else:
        L.append(" No phases detected.")
    L.append("")

    # Chronological timeline
    L.append("CHRONOLOGICAL TIMELINE")
    L.append("-" * 32)
    for e in analysis.events:
        # '*' marks key decision points in the left gutter.
        marker = "*" if e.is_decision_point else " "
        L.append(f" {_fmt_ts(e.timestamp)} {marker} [{e.type.upper():13s}] {e.actor}")
        L.append(f" {e.description}")
        L.append("")
    L.append(" (* = key decision point)")
    L.append("")

    # Gap warnings
    if analysis.gaps:
        L.append("GAP ANALYSIS")
        L.append("-" * 32)
        for g in analysis.gaps:
            L.append(f" WARNING: {_fmt_duration(g.duration_minutes)} gap between {_fmt_ts(g.start)} and {_fmt_ts(g.end)}")
        L.append("")

    # Decision points
    if analysis.decision_points:
        L.append("KEY DECISION POINTS")
        L.append("-" * 32)
        for dp in analysis.decision_points:
            L.append(f" {_fmt_ts(dp.timestamp)} [{dp.type.upper()}] {dp.description}")
        L.append("")

    # Communications
    if analysis.communications:
        L.append("GENERATED COMMUNICATIONS")
        L.append("-" * 32)
        for c in analysis.communications:
            L.append(f" Type: {c.template_type}")
            L.append(f" Audience: {c.audience}")
            L.append(f" Subject: {c.subject}")
            L.append(" ---")
            for bl in c.body.split("\n"):
                L.append(f" {bl}")
            L.append("")

    L.append("=" * w)
    L.append("END OF REPORT")
    L.append("=" * w)
    return "\n".join(L)
def format_json_output(analysis: TimelineAnalysis) -> Dict[str, Any]:
    """Format the analysis as a structured JSON-serializable dictionary."""
    declared = analysis.declared_at.strftime(ISO_FORMAT) if analysis.declared_at else None
    resolved = analysis.resolved_at.strftime(ISO_FORMAT) if analysis.resolved_at else None
    incident_block: Dict[str, Any] = {
        "id": analysis.incident_id,
        "title": analysis.incident_title,
        "severity": analysis.severity,
        "status": analysis.status,
        "commander": analysis.commander,
        "service": analysis.service,
        "affected_services": analysis.affected_services,
        "declared_at": declared,
        "resolved_at": resolved,
    }
    return {
        "incident": incident_block,
        "timeline": [ev.to_dict() for ev in analysis.events],
        "phases": [ph.to_dict() for ph in analysis.phases],
        "gaps": [gap.to_dict() for gap in analysis.gaps],
        "decision_points": [ev.to_dict() for ev in analysis.decision_points],
        "metrics": analysis.metrics,
        "communications": [c.to_dict() for c in analysis.communications],
        "errors": analysis.errors if analysis.errors else [],
    }
def format_markdown_output(analysis: TimelineAnalysis) -> str:
    """Format the analysis as a professional Markdown report.

    Sections: summary table, key metrics, phases table (with an ASCII bar
    chart of phase durations), chronological timeline, gap analysis,
    decision points, generated communications, and an event-type breakdown.
    Parse warnings are rendered as a leading blockquote; when there are no
    events, only the title and warnings are emitted.
    """
    L: List[str] = []

    L.append(f"# Incident Timeline Report: {analysis.incident_id}")
    L.append("")

    if analysis.errors:
        L.append("> **Warnings:**")
        for err in analysis.errors:
            L.append(f"> - {err}")
        L.append("")
    # Nothing to report beyond warnings when no events parsed.
    if not analysis.events:
        return "\n".join(L)

    # Summary table
    L.append("## Incident Summary")
    L.append("")
    L.append("| Field | Value |")
    L.append("|-------|-------|")
    L.append(f"| **ID** | {analysis.incident_id} |")
    L.append(f"| **Title** | {analysis.incident_title} |")
    L.append(f"| **Severity** | {analysis.severity} ({_sev_label(analysis.severity)}) |")
    L.append(f"| **Status** | {analysis.status.capitalize()} |")
    L.append(f"| **Commander** | {analysis.commander} |")
    L.append(f"| **Service** | {analysis.service} |")
    if analysis.affected_services:
        L.append(f"| **Affected Services** | {', '.join(analysis.affected_services)} |")
    L.append(f"| **Duration** | {_fmt_duration(analysis.metrics.get('total_duration_minutes'))} |")
    L.append("")

    # Key metrics
    L.append("## Key Metrics")
    L.append("")
    L.append(f"- **MTTD (Mean Time to Detect):** {_fmt_duration(analysis.metrics.get('mttd_minutes'))}")
    L.append(f"- **MTTR (Mean Time to Resolve):** {_fmt_duration(analysis.metrics.get('mttr_minutes'))}")
    L.append(f"- **Total Events:** {analysis.metrics.get('total_events', 0)}")
    L.append(f"- **Decision Points:** {analysis.metrics.get('decision_point_count', 0)}")
    L.append(f"- **Timeline Gaps (>{GAP_THRESHOLD_MINUTES}m):** {analysis.metrics.get('gap_count', 0)}")
    if analysis.metrics.get("longest_gap_minutes", 0) > 0:
        L.append(f"- **Longest Gap:** {_fmt_duration(analysis.metrics.get('longest_gap_minutes'))}")
    L.append("")

    # Phases table
    L.append("## Incident Phases")
    L.append("")
    if analysis.phases:
        L.append("| Phase | Start | End | Duration | Events |")
        L.append("|-------|-------|-----|----------|--------|")
        for p in analysis.phases:
            L.append(f"| {p.name} | {_fmt_ts(p.start_time)} | {_fmt_ts(p.end_time)} | {_fmt_duration(p.duration_minutes)} | {len(p.events)} |")
        L.append("")
        # ASCII bar chart (bars scaled to 40 chars against the longest phase)
        max_dur = max((p.duration_minutes for p in analysis.phases if p.duration_minutes), default=0)
        if max_dur and max_dur > 0:
            L.append("### Phase Duration Distribution")
            L.append("")
            L.append("```")
            for p in analysis.phases:
                d = p.duration_minutes or 0
                bar = "#" * int((d / max_dur) * 40)
                L.append(f" {p.name:15s} |{bar} {_fmt_duration(d)}")
            L.append("```")
            L.append("")
    else:
        L.append("No phases detected.")
        L.append("")

    # Chronological timeline
    L.append("## Chronological Timeline")
    L.append("")
    for e in analysis.events:
        dm = " **[KEY DECISION]**" if e.is_decision_point else ""
        L.append(f"- `{_fmt_ts(e.timestamp)}` **{e.type.upper()}** ({e.actor}){dm}")
        L.append(f" - {e.description}")
    L.append("")

    # Gap analysis
    if analysis.gaps:
        L.append("## Gap Analysis")
        L.append("")
        L.append(f"> {len(analysis.gaps)} gap(s) of >{GAP_THRESHOLD_MINUTES} minutes detected. "
                 f"These may represent blind spots where important activity was not recorded.")
        L.append("")
        for g in analysis.gaps:
            L.append(f"- **{_fmt_duration(g.duration_minutes)}** gap from `{_fmt_ts(g.start)}` to `{_fmt_ts(g.end)}`")
        L.append("")

    # Decision points
    if analysis.decision_points:
        L.append("## Key Decision Points")
        L.append("")
        for dp in analysis.decision_points:
            L.append(f"1. `{_fmt_ts(dp.timestamp)}` **{dp.type.upper()}** - {dp.description}")
        L.append("")

    # Communications
    if analysis.communications:
        L.append("## Generated Communications")
        L.append("")
        for c in analysis.communications:
            L.append(f"### {c.template_type.replace('_', ' ').title()} ({c.audience})")
            L.append("")
            L.append(f"**Subject:** {c.subject}")
            L.append("")
            for bl in c.body.split("\n"):
                L.append(bl)
            L.append("")
            L.append("---")
            L.append("")

    # Event type breakdown
    tc = analysis.metrics.get("event_counts_by_type", {})
    if tc:
        L.append("## Event Type Breakdown")
        L.append("")
        L.append("| Type | Count |")
        L.append("|------|-------|")
        # Most frequent event types first.
        for etype, count in sorted(tc.items(), key=lambda x: -x[1]):
            L.append(f"| {etype} | {count} |")
        L.append("")

    L.append("---")
    L.append(f"*Report generated for incident {analysis.incident_id}. All timestamps in UTC.*")
    return "\n".join(L)
# ---------------------------------------------------------------------------
# CLI Interface
# ---------------------------------------------------------------------------

def main() -> int:
    """Main CLI entry point.

    Reads incident JSON from a file argument (or stdin when omitted), runs
    the full analysis pipeline, and prints the report in the requested
    format.

    Returns:
        Process exit code: 0 on success, 1 on any input or runtime error.
    """
    parser = argparse.ArgumentParser(
        description="Build structured incident timelines with phase detection and communication templates."
    )
    parser.add_argument(
        "data_file", nargs="?", default=None,
        help="JSON file with incident data (reads stdin if omitted)",
    )
    parser.add_argument(
        "--format", choices=["text", "json", "markdown"], default="text",
        help="Output format (default: text)",
    )
    args = parser.parse_args()

    try:
        if args.data_file:
            try:
                # Explicit encoding: JSON is UTF-8 by convention; do not rely
                # on the platform default (e.g. cp1252 on Windows).
                with open(args.data_file, "r", encoding="utf-8") as f:
                    raw_data = json.load(f)
            except FileNotFoundError:
                print(f"Error: File '{args.data_file}' not found.", file=sys.stderr)
                return 1
            except json.JSONDecodeError as e:
                print(f"Error: Invalid JSON in '{args.data_file}': {e}", file=sys.stderr)
                return 1
        else:
            # Refuse to block on an interactive terminal waiting for EOF.
            if sys.stdin.isatty():
                print("Error: No input file specified and stdin is a terminal. "
                      "Provide a file argument or pipe JSON to stdin.", file=sys.stderr)
                return 1
            try:
                raw_data = json.load(sys.stdin)
            except json.JSONDecodeError as e:
                print(f"Error: Invalid JSON on stdin: {e}", file=sys.stderr)
                return 1

        # Minimal schema validation before running the pipeline.
        if not isinstance(raw_data, dict):
            print("Error: Input must be a JSON object.", file=sys.stderr)
            return 1
        if "incident" not in raw_data and "events" not in raw_data:
            print("Error: Input must contain at least 'incident' or 'events' keys.", file=sys.stderr)
            return 1

        analysis = build_timeline(raw_data)

        if args.format == "json":
            print(json.dumps(format_json_output(analysis), indent=2))
        elif args.format == "markdown":
            print(format_markdown_output(analysis))
        else:
            print(format_text_output(analysis))
        return 0

    except Exception as e:
        # Top-level boundary: report and signal failure via exit code.
        print(f"Error: {e}", file=sys.stderr)
        return 1
if __name__ == "__main__":
    # Script entry point: exit with main()'s return code.
    raise SystemExit(main())
# ===========================================================================
# postmortem_generator.py (second script in this bundle)
# ===========================================================================
#!/usr/bin/env python3
"""
Postmortem Generator - Generate structured postmortem reports with 5-Whys analysis.

Produces comprehensive incident postmortem documents from structured JSON input,
including root cause analysis, contributing factor classification, action item
validation, MTTD/MTTR metrics, and customer impact summaries.

Usage:
    python postmortem_generator.py incident_data.json
    python postmortem_generator.py incident_data.json --format markdown
    python postmortem_generator.py incident_data.json --format json
    cat incident_data.json | python postmortem_generator.py

Input:
    JSON object with keys: incident, timeline, resolution, action_items, participants.
    See SKILL.md for the full input schema.
"""

import argparse
import json
import sys
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional, Tuple

# ---------- Constants and Configuration ----------

VERSION = "1.0.0"
# Lower number = more severe; used for sorting/comparing severities.
SEVERITY_ORDER = {"SEV0": 0, "SEV1": 1, "SEV2": 2, "SEV3": 3, "SEV4": 4}
# Closed set of contributing-factor categories recognized by the classifier.
FACTOR_CATEGORIES = ("process", "tooling", "human", "environment", "external")
# Closed set of action-item types.
ACTION_TYPES = ("detection", "prevention", "mitigation", "process")
# Lower number = higher priority; used for sorting action items.
PRIORITY_ORDER = {"P0": 0, "P1": 1, "P2": 2, "P3": 3, "P4": 4}
# Target hours from resolution to a published postmortem.
POSTMORTEM_TARGET_HOURS = 72

# Industry benchmarks for incident response (minutes, except postmortem)
BENCHMARKS = {
    "SEV0": {"mttd": 5, "mttr": 60, "mitigate": 30, "declare": 5},
    "SEV1": {"mttd": 10, "mttr": 120, "mitigate": 60, "declare": 10},
    "SEV2": {"mttd": 30, "mttr": 480, "mitigate": 120, "declare": 30},
    "SEV3": {"mttd": 60, "mttr": 1440, "mitigate": 240, "declare": 60},
    "SEV4": {"mttd": 120, "mttr": 2880, "mitigate": 480, "declare": 120},
}

# Maps a factor category to the action type that best addresses it.
CAT_TO_ACTION = {"process": "process", "tooling": "detection", "human": "prevention",
                 "environment": "mitigation", "external": "prevention"}
# Relative weight of each category when scoring contributing factors.
CAT_WEIGHT = {"process": 1.0, "tooling": 0.9, "human": 0.8, "environment": 0.7, "external": 0.6}

# Keywords used to classify contributing factors into categories
FACTOR_KEYWORDS = {
    "process": ["process", "procedure", "workflow", "review", "approval", "checklist",
                "runbook", "documentation", "policy", "standard", "protocol", "canary",
                "deployment", "rollback", "change management"],
    "tooling": ["tool", "monitor", "alert", "threshold", "automation", "test", "pipeline",
                "ci/cd", "observability", "dashboard", "logging", "infrastructure",
                "configuration", "config"],
    "human": ["training", "knowledge", "experience", "communication", "handoff", "fatigue",
              "oversight", "mistake", "error", "misunderstand", "assumption", "awareness"],
    "environment": ["load", "traffic", "scale", "capacity", "resource", "network", "hardware",
                    "region", "latency", "timeout", "connection", "performance", "spike"],
    "external": ["vendor", "third-party", "upstream", "downstream", "provider", "api",
                 "dependency", "partner", "dns", "cdn", "certificate"],
}

# 5-Whys templates per category (each list is 5 why->answer steps)
WHY_TEMPLATES = {
    "process": [
        "Why did this process gap exist? -> The existing process did not account for this scenario.",
        "Why was the scenario not accounted for? -> It was not identified during the last process review.",
        "Why was the process review incomplete? -> Reviews focus on known failure modes, not emerging risks.",
        "Why are emerging risks not surfaced? -> No systematic mechanism to capture lessons from near-misses.",
        "Why is there no near-miss capture mechanism? -> Incident learning is ad-hoc rather than systematic."],
    "tooling": [
        "Why did the tooling fail to catch this? -> The relevant metric was not monitored or the threshold was misconfigured.",
        "Why was the threshold misconfigured? -> It was set during initial deployment and never revisited.",
        "Why was it never revisited? -> There is no scheduled review of monitoring configurations.",
        "Why is there no scheduled review? -> Monitoring ownership is diffuse across teams.",
        "Why is ownership diffuse? -> No clear operational runbook assigns monitoring review responsibilities."],
    "human": [
        "Why did the human factor contribute? -> The individual lacked context needed to prevent the issue.",
        "Why was context lacking? -> Knowledge was siloed and not documented accessibly.",
        "Why was knowledge siloed? -> No structured onboarding or knowledge-sharing process for this area.",
        "Why is there no knowledge-sharing process? -> Team capacity has been focused on feature delivery.",
        "Why is capacity skewed toward features? -> Operational excellence is not weighted equally in planning."],
    "environment": [
        "Why did the environment cause this failure? -> System capacity was insufficient for the load pattern.",
        "Why was capacity insufficient? -> Load projections did not account for this traffic pattern.",
        "Why were projections inaccurate? -> Load testing does not replicate production-scale variability.",
        "Why doesn't load testing replicate production? -> Test environments lack realistic traffic generators.",
        "Why are traffic generators missing? -> Investment in production-like test infrastructure was deferred."],
    "external": [
        "Why did the external factor cause an incident? -> The system had a hard dependency with no fallback.",
        "Why was there no fallback? -> The integration was assumed to be highly available.",
        "Why was high availability assumed? -> SLA review of the external dependency was not performed.",
        "Why was SLA review skipped? -> No standard checklist for evaluating third-party dependencies.",
        "Why is there no evaluation checklist? -> Vendor management practices are informal and undocumented."],
}

# Recommended systemic improvements keyed by dominant factor theme.
THEME_RECS = {
    "process": ["Establish a quarterly process review cadence covering change management and deployment procedures.",
                "Implement a near-miss tracking system to surface latent risks before they become incidents.",
                "Create pre-deployment checklists that require sign-off from the service owner."],
    "tooling": ["Schedule quarterly reviews of alerting thresholds and monitoring coverage.",
                "Assign explicit monitoring ownership per service in operational runbooks.",
                "Invest in synthetic monitoring and canary analysis for critical paths."],
    "human": ["Build structured onboarding that covers incident-prone areas and past postmortems.",
              "Implement blameless knowledge-sharing sessions after each incident.",
              "Balance operational excellence work alongside feature delivery in sprint planning."],
    "environment": ["Conduct periodic capacity planning reviews using production traffic replays.",
                    "Invest in production-like load-testing infrastructure with realistic traffic profiles.",
                    "Implement auto-scaling policies with validated upper-bound thresholds."],
    "external": ["Perform formal SLA reviews for all third-party dependencies annually.",
                 "Implement circuit breakers and fallbacks for external service integrations.",
                 "Maintain a dependency registry with risk ratings and contingency plans."],
}

# Suggested action text per action type — presumably used when no matching
# action item was filed for a category (NOTE(review): confirm against the
# consumer of this table further down the file).
MISSING_ACTION_TEMPLATES = {
    "process": "Create or update runbook/checklist to prevent recurrence of this process gap",
    "detection": "Add monitoring and alerting to detect this class of issue earlier",
    "mitigation": "Implement auto-scaling or circuit-breaker to reduce blast radius",
    "prevention": "Add automated safeguards (canary deploy, load test gate) to prevent recurrence",
}
# ---------- Data Model Classes ----------

class IncidentData:
    """Incident metadata extracted from the raw input payload.

    Every field falls back to a safe placeholder when absent, so the
    formatters never have to handle missing keys. Severity is normalized
    to upper case (e.g. "sev1" -> "SEV1").
    """

    def __init__(self, data: Dict[str, Any]) -> None:
        get = data.get
        self.id: str = get("id", "UNKNOWN")
        self.title: str = get("title", "Untitled Incident")
        # Normalize so benchmark lookups keyed by "SEV1".."SEV4" work.
        self.severity: str = get("severity", "SEV3").upper()
        self.commander: str = get("commander", "Unassigned")
        self.service: str = get("service", "unknown-service")
        self.affected_services: List[str] = get("affected_services", [])

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the incident metadata back to a plain dictionary."""
        return {
            "id": self.id,
            "title": self.title,
            "severity": self.severity,
            "commander": self.commander,
            "service": self.service,
            "affected_services": self.affected_services,
        }
|
||||
|
||||
|
||||
class TimelineMetrics:
    """Timing metrics (MTTD, MTTR, ...) derived from raw timeline stamps.

    Each timestamp is parsed independently; a missing or unparseable stamp
    simply yields None for every metric that depends on it.
    """

    def __init__(self, timeline: Dict[str, str], severity: str) -> None:
        self.severity = severity
        # One attribute per timeline event, parsed to an aware datetime or None.
        for field in ("issue_started", "detected_at", "declared_at",
                      "mitigated_at", "resolved_at", "postmortem_at"):
            setattr(self, field, self._parse(timeline.get(field)))

    @staticmethod
    def _parse(ts: Optional[str]) -> Optional[datetime]:
        """Parse an ISO-8601-ish timestamp; naive values are assumed UTC."""
        if ts is None:
            return None
        for fmt in ("%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%dT%H:%M:%S%z", "%Y-%m-%dT%H:%M:%S"):
            try:
                parsed = datetime.strptime(ts, fmt)
            except ValueError:
                continue
            if parsed.tzinfo is None:
                parsed = parsed.replace(tzinfo=timezone.utc)
            return parsed
        return None

    def _delta_min(self, start: Optional[datetime], end: Optional[datetime]) -> Optional[float]:
        """Minutes between two instants (rounded to 0.1), or None if either is missing."""
        if start is None or end is None:
            return None
        return round((end - start).total_seconds() / 60.0, 1)

    @property
    def mttd(self) -> Optional[float]:
        """Mean time to detect: issue start -> detection, in minutes."""
        return self._delta_min(self.issue_started, self.detected_at)

    @property
    def mttr(self) -> Optional[float]:
        """Mean time to resolve: detection -> resolution, in minutes."""
        return self._delta_min(self.detected_at, self.resolved_at)

    @property
    def time_to_mitigate(self) -> Optional[float]:
        """Detection -> mitigation, in minutes."""
        return self._delta_min(self.detected_at, self.mitigated_at)

    @property
    def time_to_declare(self) -> Optional[float]:
        """Detection -> formal declaration, in minutes."""
        return self._delta_min(self.detected_at, self.declared_at)

    @property
    def postmortem_timeliness_hours(self) -> Optional[float]:
        """Resolution -> postmortem meeting, in hours (rounded to 0.1)."""
        minutes = self._delta_min(self.resolved_at, self.postmortem_at)
        if minutes is None:
            return None
        return round(minutes / 60.0, 1)

    @property
    def postmortem_on_time(self) -> Optional[bool]:
        """Whether the postmortem met the org-wide timeliness target."""
        hours = self.postmortem_timeliness_hours
        if hours is None:
            return None
        return hours <= POSTMORTEM_TARGET_HOURS

    def benchmark_comparison(self) -> Dict[str, Dict[str, Any]]:
        """Compare each available metric against the severity's benchmark.

        Metrics that could not be computed (missing timestamps) are omitted.
        Unknown severities fall back to the SEV3 benchmark row.
        """
        bench = BENCHMARKS.get(self.severity, BENCHMARKS["SEV3"])
        comparison: Dict[str, Dict[str, Any]] = {}
        metric_rows = (("mttd", self.mttd, bench["mttd"]),
                       ("mttr", self.mttr, bench["mttr"]),
                       ("time_to_mitigate", self.time_to_mitigate, bench["mitigate"]),
                       ("time_to_declare", self.time_to_declare, bench["declare"]))
        for name, actual, target in metric_rows:
            if actual is None:
                continue
            comparison[name] = {
                "actual_minutes": actual,
                "benchmark_minutes": target,
                "met_benchmark": actual <= target,
                "delta_minutes": round(actual - target, 1),
            }
        hours = self.postmortem_timeliness_hours
        if hours is not None:
            comparison["postmortem_timeliness"] = {
                "actual_hours": hours,
                "target_hours": POSTMORTEM_TARGET_HOURS,
                "met_target": self.postmortem_on_time,
                "delta_hours": round(hours - POSTMORTEM_TARGET_HOURS, 1),
            }
        return comparison

    def to_dict(self) -> Dict[str, Any]:
        """Serialize all metrics plus the benchmark comparison."""
        return {
            "mttd_minutes": self.mttd,
            "mttr_minutes": self.mttr,
            "time_to_mitigate_minutes": self.time_to_mitigate,
            "time_to_declare_minutes": self.time_to_declare,
            "postmortem_timeliness_hours": self.postmortem_timeliness_hours,
            "postmortem_on_time": self.postmortem_on_time,
            "benchmarks": self.benchmark_comparison(),
        }
|
||||
|
||||
|
||||
class ContributingFactor:
    """One contributing factor, keyword-classified into a category.

    The weight decays with list position (earlier factors count more,
    floored at 0.3) and is scaled by the per-category weight table.
    """

    def __init__(self, description: str, index: int) -> None:
        self.description = description
        self.index = index
        self.category = self._classify()
        positional = max(1.0 - index * 0.15, 0.3)
        self.weight = round(positional * CAT_WEIGHT.get(self.category, 0.8), 2)
        self.mapped_action_type = CAT_TO_ACTION.get(self.category, "process")

    def _classify(self) -> str:
        """Pick the category with the most keyword hits; default to 'process'."""
        text = self.description.lower()
        best_cat, best_hits = "process", 0
        for cat, keywords in FACTOR_KEYWORDS.items():
            hits = sum(1 for kw in keywords if kw in text)
            # Strict '>' keeps the first category on ties, matching dict order.
            if hits > best_hits:
                best_cat, best_hits = cat, hits
        return best_cat

    def to_dict(self) -> Dict[str, Any]:
        """Serialize for JSON/report output."""
        return {
            "description": self.description,
            "category": self.category,
            "weight": self.weight,
            "mapped_action_type": self.mapped_action_type,
        }
|
||||
|
||||
|
||||
class FiveWhysAnalysis:
    """A 5-Whys chain for one contributing factor.

    The chain starts with the factor itself and continues with the
    category-specific why-template; the systemic theme is simply the
    factor's category.
    """

    def __init__(self, factor: ContributingFactor) -> None:
        self.factor = factor
        self.systemic_theme: str = factor.category
        template = WHY_TEMPLATES.get(factor.category, WHY_TEMPLATES["process"])
        self.chain: List[str] = [f"Why? {factor.description}", *template]

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the chain for JSON/report output."""
        return {
            "factor": self.factor.description,
            "category": self.factor.category,
            "chain": self.chain,
            "systemic_theme": self.systemic_theme,
        }
|
||||
|
||||
|
||||
class ActionItem:
    """An action item validated for completeness and scored for quality.

    Validation and scoring run once, at construction; results are exposed
    via ``validation_issues``, ``quality_score``, and the properties below.
    """

    def __init__(self, data: Dict[str, Any]) -> None:
        self.title: str = data.get("title", "")
        self.owner: str = data.get("owner", "")
        self.priority: str = data.get("priority", "P3")
        self.deadline: str = data.get("deadline", "")
        self.type: str = data.get("type", "process")
        self.status: str = data.get("status", "open")
        self.validation_issues: List[str] = []
        self.quality_score: int = 0
        self._validate()

    def _validate(self) -> None:
        """Collect completeness/validity issues and compute the quality score."""
        issues: List[str] = []
        if not self.title:
            issues.append("Missing title")
        if not self.owner:
            issues.append("Missing owner")
        if not self.deadline:
            issues.append("Missing deadline")
        if self.priority not in PRIORITY_ORDER:
            issues.append(f"Invalid priority: {self.priority}")
        if self.type not in ACTION_TYPES:
            issues.append(f"Invalid type: {self.type}")
        self.validation_issues = issues
        self.quality_score = self._score_quality()

    def _score_quality(self) -> int:
        """Score 0-100: specific, measurable, achievable."""
        score = 0
        if len(self.title) > 10:
            score += 20          # non-trivial title
        if self.owner:
            score += 20          # has an owner
        if self.deadline:
            score += 20          # has a deadline
        if self.priority in PRIORITY_ORDER:
            score += 10
        if self.type in ACTION_TYPES:
            score += 10
        lowered = self.title.lower()
        measurable_markers = ["%", "threshold", "within", "before",
                              "after", "less than", "greater than"]
        if any(marker in lowered for marker in measurable_markers):
            score += 10          # measurable wording
        if len(self.title.split()) >= 5:
            score += 10          # descriptive enough
        return min(score, 100)

    @property
    def is_valid(self) -> bool:
        """True when no validation issues were found."""
        return not self.validation_issues

    @property
    def is_past_deadline(self) -> bool:
        """True for an *open* item whose YYYY-MM-DD deadline has passed (UTC)."""
        if self.status != "open" or not self.deadline:
            return False
        try:
            due = datetime.strptime(self.deadline, "%Y-%m-%d").replace(tzinfo=timezone.utc)
        except ValueError:
            # Malformed deadline: treat as not overdue rather than crash.
            return False
        return datetime.now(timezone.utc) > due

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the item, including derived validation state."""
        return {
            "title": self.title,
            "owner": self.owner,
            "priority": self.priority,
            "deadline": self.deadline,
            "type": self.type,
            "status": self.status,
            "is_valid": self.is_valid,
            "validation_issues": self.validation_issues,
            "quality_score": self.quality_score,
            "is_past_deadline": self.is_past_deadline,
        }
|
||||
|
||||
|
||||
class PostmortemReport:
    """Complete postmortem document assembled from all analysis components.

    Construction is eager: parsing, factor classification, 5-Whys expansion,
    and coverage-gap analysis all happen in ``__init__`` so the formatters
    can read plain attributes afterwards.
    """

    def __init__(self, raw: Dict[str, Any]) -> None:
        # Keep the raw payload for debugging/round-tripping.
        self.raw = raw
        self.incident = IncidentData(raw.get("incident", {}))
        # Benchmark selection inside TimelineMetrics depends on severity.
        self.timeline = TimelineMetrics(raw.get("timeline", {}), self.incident.severity)
        self.resolution: Dict[str, Any] = raw.get("resolution", {})
        self.participants: List[Dict[str, str]] = raw.get("participants", [])
        # Derived analysis
        # The enumerate index drives positional weighting (earlier factors weigh more).
        self.contributing_factors = [ContributingFactor(f, i)
                                     for i, f in enumerate(self.resolution.get("contributing_factors", []))]
        self.five_whys = [FiveWhysAnalysis(f) for f in self.contributing_factors]
        self.action_items = [ActionItem(a) for a in raw.get("action_items", [])]
        self.factor_distribution = self._compute_factor_distribution()
        self.coverage_gaps = self._find_coverage_gaps()
        self.suggested_actions = self._suggest_missing_actions()
        self.theme_recommendations = self._build_theme_recommendations()

    def _compute_factor_distribution(self) -> Dict[str, float]:
        """Return each factor category's share of total weight, in percent."""
        dist: Dict[str, float] = {c: 0.0 for c in FACTOR_CATEGORIES}
        # 'or 1.0' guards the division when there are no factors at all.
        total = sum(f.weight for f in self.contributing_factors) or 1.0
        for f in self.contributing_factors:
            dist[f.category] += f.weight
        return {k: round(v / total * 100, 1) for k, v in dist.items()}

    def _find_coverage_gaps(self) -> List[str]:
        """List factor categories whose mapped action type has no action item."""
        factor_cats = {f.category for f in self.contributing_factors}
        action_types = {a.type for a in self.action_items}
        gaps = []
        for cat in factor_cats:
            expected = CAT_TO_ACTION.get(cat)
            if expected and expected not in action_types:
                gaps.append(f"No '{expected}' action item to address '{cat}' contributing factor")
        return gaps

    def _suggest_missing_actions(self) -> List[Dict[str, str]]:
        """Propose a templated action item for each coverage gap.

        NOTE(review): the gap detection here mirrors _find_coverage_gaps();
        the two could share a helper.
        """
        factor_cats = {f.category for f in self.contributing_factors}
        action_types = {a.type for a in self.action_items}
        suggestions = []
        for cat in factor_cats:
            expected = CAT_TO_ACTION.get(cat)
            if expected and expected not in action_types:
                suggestions.append({
                    "type": expected,
                    "suggestion": MISSING_ACTION_TEMPLATES.get(expected, "Add an action item for this gap"),
                    "reason": f"No action item addresses the '{cat}' contributing factor"})
        return suggestions

    def _build_theme_recommendations(self) -> Dict[str, List[str]]:
        """Map each distinct systemic theme to its canned recommendation list."""
        seen: Dict[str, List[str]] = {}
        for a in self.five_whys:
            # First occurrence wins; themes repeat when factors share a category.
            if a.systemic_theme not in seen:
                seen[a.systemic_theme] = THEME_RECS.get(a.systemic_theme, [])
        return seen

    def customer_impact_summary(self) -> Dict[str, Any]:
        """Summarize customer impact and decide whether comms are required.

        Thresholds: comms required above 1,000 users, $10k revenue, or any
        data loss; impact severity tiers at 10,000/$50k (high) and
        1,000/$5k (medium).
        """
        impact = self.resolution.get("customer_impact", {})
        affected = impact.get("affected_users", 0)
        failed_tx = impact.get("failed_transactions", 0)
        revenue = impact.get("revenue_impact_usd", 0)
        data_loss = impact.get("data_loss", False)
        comm_required = affected > 1000 or data_loss or revenue > 10000
        sev = "high" if (affected > 10000 or revenue > 50000) else (
            "medium" if (affected > 1000 or revenue > 5000) else "low")
        return {"affected_users": affected, "failed_transactions": failed_tx,
                "revenue_impact_usd": revenue, "data_loss": data_loss,
                "data_integrity": "compromised" if data_loss else "intact",
                "customer_communication_required": comm_required, "impact_severity": sev}

    def executive_summary(self) -> str:
        """Build a short narrative paragraph for leadership consumption."""
        mttr = self.timeline.mttr
        ci = self.customer_impact_summary()
        mttr_str = f"{mttr:.0f} minutes" if mttr is not None else "unknown duration"
        parts = [
            f"On {self._fmt_date(self.timeline.issue_started)}, a {self.incident.severity} "
            f"incident (\"{self.incident.title}\") impacted the {self.incident.service} service.",
            f"The root cause was identified as: {self.resolution.get('root_cause', 'Unknown root cause')}.",
            f"The incident was resolved in {mttr_str}, affecting approximately "
            f"{ci['affected_users']:,} users with an estimated revenue impact of ${ci['revenue_impact_usd']:,.2f}.",
            "Data loss was confirmed; affected customers must be notified." if ci["data_loss"]
            else "No data loss occurred during this incident."]
        return " ".join(parts)

    @staticmethod
    def _fmt_date(dt: Optional[datetime]) -> str:
        """Human-readable UTC date, tolerating a missing timestamp."""
        return dt.strftime("%Y-%m-%d at %H:%M UTC") if dt else "an unknown date"

    def overdue_p1_items(self) -> List[Dict[str, str]]:
        """Open P0/P1 action items whose deadline has already passed."""
        return [{"title": a.title, "owner": a.owner, "deadline": a.deadline}
                for a in self.action_items if a.priority in ("P0", "P1") and a.is_past_deadline]

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the full report (used by the JSON formatter)."""
        return {
            "version": VERSION, "incident": self.incident.to_dict(),
            "executive_summary": self.executive_summary(),
            "timeline_metrics": self.timeline.to_dict(),
            "customer_impact": self.customer_impact_summary(),
            "root_cause": self.resolution.get("root_cause", ""),
            "contributing_factors": [f.to_dict() for f in self.contributing_factors],
            "factor_distribution": self.factor_distribution,
            "five_whys_analysis": [a.to_dict() for a in self.five_whys],
            "theme_recommendations": self.theme_recommendations,
            "mitigation_steps": self.resolution.get("mitigation_steps", []),
            "permanent_fix": self.resolution.get("permanent_fix", ""),
            "action_items": [a.to_dict() for a in self.action_items],
            "action_item_coverage_gaps": self.coverage_gaps,
            "suggested_actions": self.suggested_actions,
            "overdue_p1_items": self.overdue_p1_items(),
            "participants": self.participants}
|
||||
|
||||
|
||||
# ---------- Core Analysis Helpers ----------
|
||||
|
||||
def _bar(pct: float, width: int = 30) -> str:
|
||||
"""Render a text-based horizontal bar chart segment."""
|
||||
filled = int(round(pct / 100 * width))
|
||||
return "[" + "#" * filled + "." * (width - filled) + "]"
|
||||
|
||||
|
||||
def _generate_lessons(report: PostmortemReport) -> List[str]:
    """Derive lessons-learned bullet points from the report's analysis.

    Checks, in order: detection latency vs. benchmark, a dominant factor
    category (>= 50% of weight), action-item coverage gaps, low average
    action-item quality (< 70), and postmortem timeliness. Falls back to a
    positive note when none of the checks fire.
    """
    lessons: List[str] = []
    bench = BENCHMARKS.get(report.incident.severity, BENCHMARKS["SEV3"])

    detection = report.timeline.mttd
    if detection is not None and detection > bench["mttd"]:
        lessons.append(
            f"Detection took {detection:.0f} minutes, exceeding the {bench['mttd']}-minute "
            f"benchmark for {report.incident.severity}. Invest in earlier detection mechanisms.")

    distribution = report.factor_distribution
    dominant = max(distribution, key=distribution.get)
    if distribution[dominant] >= 50:
        lessons.append(
            f"The '{dominant}' category accounts for {distribution[dominant]:.0f}% of contributing factors. "
            f"Targeted improvements in this area will yield the highest return.")

    gap_count = len(report.coverage_gaps)
    if gap_count:
        lessons.append(
            f"There are {gap_count} action item coverage gap(s). "
            "Ensure every contributing factor category has a corresponding remediation action.")

    items = report.action_items
    avg_quality = sum(a.quality_score for a in items) / len(items) if items else 0
    if avg_quality < 70:
        lessons.append(
            f"Average action item quality score is {avg_quality:.0f}/100. "
            "Make action items more specific with measurable targets and clear ownership.")

    # 'is False' distinguishes a late postmortem from one not yet held (None).
    if report.timeline.postmortem_on_time is False:
        hours = report.timeline.postmortem_timeliness_hours
        lessons.append(
            f"Postmortem was held {hours:.0f} hours after resolution, exceeding the "
            f"{POSTMORTEM_TARGET_HOURS}-hour target. Schedule postmortems sooner to capture context.")

    return lessons or ["This incident was handled within benchmarks. Continue reinforcing "
                       "current practices and share this postmortem for organizational learning."]
|
||||
|
||||
|
||||
# ---------- Output Formatters ----------
|
||||
|
||||
def format_text(report: PostmortemReport) -> str:
    """Format the postmortem as plain text.

    Builds the document line-by-line into ``L`` and joins once at the end.
    Section order: header, executive summary, timeline metrics, customer
    impact, root cause, 5-Whys, mitigation, action items, participants,
    lessons learned.
    """
    L: List[str] = []
    W = 72  # ruler width for section banners

    def h1(title: str) -> None:
        """Emit a full-width '=' banner section header."""
        L.append(""); L.append("=" * W); L.append(f" {title}"); L.append("=" * W)

    def h2(title: str) -> None:
        """Emit a lightweight '---' subsection header."""
        L.append(""); L.append(f"--- {title} ---")

    inc = report.incident
    h1(f"POSTMORTEM: {inc.title}")
    L.append(f" ID: {inc.id} | Severity: {inc.severity} | Service: {inc.service}")
    L.append(f" Commander: {inc.commander}")
    if inc.affected_services:
        L.append(f" Affected services: {', '.join(inc.affected_services)}")
    # Executive Summary
    h1("EXECUTIVE SUMMARY")
    L.append("")
    # Re-wrap the summary one sentence per line, restoring trailing periods
    # lost by the '. ' split.
    for sentence in report.executive_summary().split(". "):
        s = sentence.strip()
        if s and not s.endswith("."): s += "."
        if s: L.append(f" {s}")
    # Timeline Metrics
    h1("TIMELINE METRICS")
    tm = report.timeline
    L.append("")
    for label, val, unit in [("MTTD (Time to Detect)", tm.mttd, "min"),
                             ("MTTR (Time to Resolve)", tm.mttr, "min"),
                             ("Time to Mitigate", tm.time_to_mitigate, "min"),
                             ("Time to Declare", tm.time_to_declare, "min"),
                             ("Postmortem Timeliness", tm.postmortem_timeliness_hours, "hrs")]:
        L.append(f" {label:<30s} {f'{val:.1f} {unit}' if val is not None else 'N/A'}")
    h2("Benchmark Comparison")
    # Minute-based metrics and the hour-based postmortem entry use
    # different key sets, hence the two branches.
    for name, d in tm.benchmark_comparison().items():
        if "actual_minutes" in d:
            st = "PASS" if d["met_benchmark"] else "FAIL"
            L.append(f" {name:<25s} actual={d['actual_minutes']}min benchmark={d['benchmark_minutes']}min [{st}]")
        elif "actual_hours" in d:
            st = "PASS" if d["met_target"] else "FAIL"
            L.append(f" {name:<25s} actual={d['actual_hours']}hrs target={d['target_hours']}hrs [{st}]")
    # Customer Impact
    h1("CUSTOMER IMPACT")
    ci = report.customer_impact_summary()
    L.append("")
    L.append(f" Affected users: {ci['affected_users']:,}")
    L.append(f" Failed transactions: {ci['failed_transactions']:,}")
    L.append(f" Revenue impact: ${ci['revenue_impact_usd']:,.2f}")
    L.append(f" Data integrity: {ci['data_integrity']}")
    L.append(f" Impact severity: {ci['impact_severity']}")
    L.append(f" Comms required: {'Yes' if ci['customer_communication_required'] else 'No'}")
    # Root Cause
    h1("ROOT CAUSE ANALYSIS")
    L.append("")
    L.append(f" {report.resolution.get('root_cause', 'Unknown')}")
    h2("Contributing Factors")
    for f in report.contributing_factors:
        L.append(f" [{f.category.upper():<12s} w={f.weight:.2f}] {f.description}")
    h2("Factor Distribution")
    # Highest-share categories first; zero-share categories are hidden.
    for cat, pct in sorted(report.factor_distribution.items(), key=lambda x: -x[1]):
        if pct > 0:
            L.append(f" {cat:<14s} {pct:5.1f}% {_bar(pct)}")
    # 5-Whys
    h1("5-WHYS ANALYSIS")
    for analysis in report.five_whys:
        L.append("")
        L.append(f" Factor: {analysis.factor.description}")
        L.append(f" Theme: {analysis.systemic_theme}")
        for i, step in enumerate(analysis.chain):
            L.append(f" {i}. {step}")
    h2("Theme-Based Recommendations")
    for theme, recs in report.theme_recommendations.items():
        L.append(f" [{theme.upper()}]")
        for rec in recs:
            L.append(f" - {rec}")
    # Mitigation & Fix
    h1("MITIGATION AND RESOLUTION")
    h2("Mitigation Steps Taken")
    for step in report.resolution.get("mitigation_steps", []):
        L.append(f" - {step}")
    h2("Permanent Fix")
    L.append(f" {report.resolution.get('permanent_fix', 'TBD')}")
    # Action Items
    h1("ACTION ITEMS")
    L.append("")
    hdr = f" {'Priority':<10s} {'Type':<14s} {'Owner':<25s} {'Deadline':<12s} {'Quality':<8s} Title"
    L.append(hdr)
    L.append(" " + "-" * (len(hdr) - 2))
    # Sort by priority rank; unknown priorities sink to the bottom (rank 99).
    for a in sorted(report.action_items, key=lambda x: PRIORITY_ORDER.get(x.priority, 99)):
        flag = " *OVERDUE*" if a.is_past_deadline else ""
        L.append(f" {a.priority:<10s} {a.type:<14s} {a.owner:<25s} {a.deadline:<12s} "
                 f"{a.quality_score:<8d} {a.title}{flag}")
    if report.coverage_gaps:
        h2("Coverage Gaps")
        for gap in report.coverage_gaps:
            L.append(f" WARNING: {gap}")
    if report.suggested_actions:
        h2("Suggested Additional Actions")
        for s in report.suggested_actions:
            L.append(f" [{s['type'].upper()}] {s['suggestion']}")
            L.append(f" Reason: {s['reason']}")
    overdue = report.overdue_p1_items()
    if overdue:
        h2("Overdue P0/P1 Items")
        for item in overdue:
            L.append(f" OVERDUE: {item['title']} (owner: {item['owner']}, deadline: {item['deadline']})")
    # Participants
    h1("PARTICIPANTS")
    L.append("")
    for p in report.participants:
        L.append(f" {p.get('name', 'Unknown'):<25s} {p.get('role', '')}")
    # Lessons Learned
    h1("LESSONS LEARNED")
    L.append("")
    for i, lesson in enumerate(_generate_lessons(report), 1):
        L.append(f" {i}. {lesson}")
    L.append("")
    L.append("=" * W)
    L.append(f" Generated by postmortem_generator v{VERSION}")
    L.append("=" * W)
    L.append("")
    return "\n".join(L)
|
||||
|
||||
|
||||
def format_json(report: PostmortemReport) -> str:
    """Serialize the full report, plus derived lessons, as pretty-printed JSON.

    ``default=str`` covers non-JSON-native values such as datetimes.
    """
    payload = {**report.to_dict(), "lessons_learned": _generate_lessons(report)}
    return json.dumps(payload, indent=2, default=str)
|
||||
|
||||
|
||||
def format_markdown(report: PostmortemReport) -> str:
    """Format the postmortem as a Markdown document.

    Mirrors the section order of ``format_text`` but uses Markdown tables,
    headings, and blockquotes. Lines are accumulated in ``L`` and joined
    once at the end.
    """
    L: List[str] = []
    inc = report.incident
    L.append(f"# Postmortem: {inc.title}")
    L.append("")
    # Metadata table
    L.append("| Field | Value |")
    L.append("|-------|-------|")
    L.append(f"| **ID** | {inc.id} |")
    L.append(f"| **Severity** | {inc.severity} |")
    L.append(f"| **Service** | {inc.service} |")
    L.append(f"| **Commander** | {inc.commander} |")
    if inc.affected_services:
        L.append(f"| **Affected Services** | {', '.join(inc.affected_services)} |")
    L.append("")
    # Executive Summary
    L.append("## Executive Summary\n")
    L.append(report.executive_summary())
    L.append("")
    # Timeline Metrics
    L.append("## Timeline Metrics\n")
    L.append("| Metric | Value | Benchmark | Status |")
    L.append("|--------|-------|-----------|--------|")
    labels = {"mttd": "MTTD (Time to Detect)", "mttr": "MTTR (Time to Resolve)",
              "time_to_mitigate": "Time to Mitigate", "time_to_declare": "Time to Declare",
              "postmortem_timeliness": "Postmortem Timeliness"}
    # Minute-based metrics and the hour-based postmortem entry carry
    # different key sets, hence the two branches.
    # NOTE(review): benchmark_comparison() is recomputed on every loop pass;
    # hoisting it out would avoid redundant work.
    for key, label in labels.items():
        b = report.timeline.benchmark_comparison().get(key)
        if b and "actual_minutes" in b:
            st = "PASS" if b["met_benchmark"] else "FAIL"
            L.append(f"| {label} | {b['actual_minutes']} min | {b['benchmark_minutes']} min | {st} |")
        elif b and "actual_hours" in b:
            st = "PASS" if b["met_target"] else "FAIL"
            L.append(f"| {label} | {b['actual_hours']} hrs | {b['target_hours']} hrs | {st} |")
    L.append("")
    # Customer Impact
    L.append("## Customer Impact\n")
    ci = report.customer_impact_summary()
    L.append(f"- **Affected users:** {ci['affected_users']:,}")
    L.append(f"- **Failed transactions:** {ci['failed_transactions']:,}")
    L.append(f"- **Revenue impact:** ${ci['revenue_impact_usd']:,.2f}")
    L.append(f"- **Data integrity:** {ci['data_integrity']}")
    L.append(f"- **Impact severity:** {ci['impact_severity']}")
    L.append(f"- **Customer communication required:** {'Yes' if ci['customer_communication_required'] else 'No'}")
    L.append("")
    # Root Cause Analysis
    L.append("## Root Cause Analysis\n")
    L.append(f"**Root cause:** {report.resolution.get('root_cause', 'Unknown')}")
    L.append("")
    L.append("### Contributing Factors\n")
    L.append("| # | Category | Weight | Description |")
    L.append("|---|----------|--------|-------------|")
    for i, f in enumerate(report.contributing_factors, 1):
        L.append(f"| {i} | {f.category} | {f.weight:.2f} | {f.description} |")
    L.append("")
    L.append("### Factor Distribution\n")
    # Fenced code block preserves the fixed-width ASCII bar chart.
    L.append("```")
    for cat, pct in sorted(report.factor_distribution.items(), key=lambda x: -x[1]):
        if pct > 0:
            L.append(f" {cat:<14s} {pct:5.1f}% {_bar(pct, 25)}")
    L.append("```")
    L.append("")
    # 5-Whys
    L.append("## 5-Whys Analysis\n")
    for analysis in report.five_whys:
        L.append(f"### Factor: {analysis.factor.description}")
        L.append(f"**Systemic theme:** {analysis.systemic_theme}\n")
        for i, step in enumerate(analysis.chain):
            L.append(f"{i}. {step}")
        L.append("")
    L.append("### Theme-Based Recommendations\n")
    for theme, recs in report.theme_recommendations.items():
        L.append(f"**{theme.capitalize()}:**")
        for rec in recs:
            L.append(f"- {rec}")
        L.append("")
    # Mitigation
    L.append("## Mitigation and Resolution\n")
    L.append("### Mitigation Steps Taken\n")
    for step in report.resolution.get("mitigation_steps", []):
        L.append(f"- {step}")
    L.append("")
    L.append("### Permanent Fix\n")
    L.append(report.resolution.get("permanent_fix", "TBD"))
    L.append("")
    # Action Items
    L.append("## Action Items\n")
    L.append("| Priority | Type | Owner | Deadline | Quality | Title |")
    L.append("|----------|------|-------|----------|---------|-------|")
    # Sort by priority rank; unknown priorities sink to the bottom (rank 99).
    for a in sorted(report.action_items, key=lambda x: PRIORITY_ORDER.get(x.priority, 99)):
        flag = " **OVERDUE**" if a.is_past_deadline else ""
        L.append(f"| {a.priority} | {a.type} | {a.owner} | {a.deadline} | {a.quality_score}/100 | {a.title}{flag} |")
    L.append("")
    if report.coverage_gaps:
        L.append("### Coverage Gaps\n")
        for gap in report.coverage_gaps:
            L.append(f"> **WARNING:** {gap}")
        L.append("")
    if report.suggested_actions:
        L.append("### Suggested Additional Actions\n")
        for s in report.suggested_actions:
            L.append(f"- **[{s['type'].upper()}]** {s['suggestion']}")
            L.append(f"  - _Reason: {s['reason']}_")
        L.append("")
    overdue = report.overdue_p1_items()
    if overdue:
        L.append("### Overdue P0/P1 Items\n")
        for item in overdue:
            L.append(f"- **{item['title']}** (owner: {item['owner']}, deadline: {item['deadline']})")
        L.append("")
    # Participants
    L.append("## Participants\n")
    L.append("| Name | Role |")
    L.append("|------|------|")
    for p in report.participants:
        L.append(f"| {p.get('name', 'Unknown')} | {p.get('role', '')} |")
    L.append("")
    # Lessons Learned
    L.append("## Lessons Learned\n")
    for i, lesson in enumerate(_generate_lessons(report), 1):
        L.append(f"{i}. {lesson}")
    L.append("")
    L.append("---")
    L.append(f"_Generated by postmortem_generator v{VERSION}_")
    L.append("")
    return "\n".join(L)
|
||||
|
||||
|
||||
# ---------- Input Loading ----------
|
||||
|
||||
def load_input(filepath: Optional[str]) -> Dict[str, Any]:
    """Load incident data from a file path or stdin.

    Args:
        filepath: Path to a JSON file, or None to read from stdin.

    Returns:
        The parsed JSON payload as a dictionary.

    Exits with status 1 (after printing to stderr) on a missing or
    unreadable file, invalid JSON, or an interactive stdin with no
    piped data.
    """
    if filepath:
        try:
            with open(filepath, "r", encoding="utf-8") as fh:
                return json.load(fh)
        except FileNotFoundError:
            print(f"Error: File not found: {filepath}", file=sys.stderr)
            sys.exit(1)
        except OSError as exc:
            # Previously uncaught: permission denied, is-a-directory, etc.
            # surfaced as a raw traceback instead of a clean CLI error.
            print(f"Error: Cannot read {filepath}: {exc}", file=sys.stderr)
            sys.exit(1)
        except json.JSONDecodeError as exc:
            print(f"Error: Invalid JSON in {filepath}: {exc}", file=sys.stderr)
            sys.exit(1)
    else:
        if sys.stdin.isatty():
            # Interactive terminal with no piped data: bail with usage help.
            print("Error: No input file specified and no data on stdin.", file=sys.stderr)
            print("Usage: postmortem_generator.py [data_file] or pipe JSON via stdin.", file=sys.stderr)
            sys.exit(1)
        try:
            return json.load(sys.stdin)
        except json.JSONDecodeError as exc:
            print(f"Error: Invalid JSON on stdin: {exc}", file=sys.stderr)
            sys.exit(1)
|
||||
|
||||
|
||||
def validate_input(data: Dict[str, Any]) -> List[str]:
    """Return a list of validation warnings (non-fatal).

    Checks for the four required top-level sections, the core timeline
    timestamps, a root cause, and at least one contributing factor.
    """
    warnings: List[str] = []
    for section in ("incident", "timeline", "resolution", "action_items"):
        if section not in data:
            warnings.append(f"Missing '{section}' section")
    timeline = data.get("timeline", {})
    warnings.extend(
        f"Missing timeline field: {field}"
        for field in ("issue_started", "detected_at", "mitigated_at", "resolved_at")
        if field not in timeline)
    resolution = data.get("resolution", {})
    if "root_cause" not in resolution:
        warnings.append("Missing 'root_cause' in resolution")
    if not resolution.get("contributing_factors"):
        warnings.append("No contributing factors provided")
    return warnings
|
||||
|
||||
|
||||
# ---------- CLI Entry Point ----------
|
||||
|
||||
def main() -> None:
    """CLI entry point: parse args, load and validate input, print the report."""
    parser = argparse.ArgumentParser(
        description="Generate structured postmortem reports with 5-Whys analysis.",
        epilog="Reads JSON from a file or stdin. Outputs text, JSON, or markdown.")
    parser.add_argument("data_file", nargs="?", default=None,
                        help="JSON file with incident + resolution data (reads stdin if omitted)")
    parser.add_argument("--format", choices=["text", "json", "markdown"], default="text",
                        dest="output_format", help="Output format (default: text)")
    args = parser.parse_args()

    raw = load_input(args.data_file)
    # Validation is advisory: warn on stderr but still generate the report.
    for warning in validate_input(raw):
        print(f"Warning: {warning}", file=sys.stderr)

    formatter = {"text": format_text,
                 "json": format_json,
                 "markdown": format_markdown}[args.output_format]
    print(formatter(PostmortemReport(raw)))


if __name__ == "__main__":
    main()
|
||||
1228
engineering-team/incident-commander/scripts/severity_classifier.py
Normal file
1228
engineering-team/incident-commander/scripts/severity_classifier.py
Normal file
File diff suppressed because it is too large
Load Diff
416
engineering/api-design-reviewer/SKILL.md
Normal file
416
engineering/api-design-reviewer/SKILL.md
Normal file
@@ -0,0 +1,416 @@
|
||||
# API Design Reviewer
|
||||
|
||||
**Tier:** POWERFUL
|
||||
**Category:** Engineering / Architecture
|
||||
**Maintainer:** Claude Skills Team
|
||||
|
||||
## Overview
|
||||
|
||||
The API Design Reviewer skill provides comprehensive analysis and review of API designs, focusing on REST conventions, best practices, and industry standards. This skill helps engineering teams build consistent, maintainable, and well-designed APIs through automated linting, breaking change detection, and design scorecards.
|
||||
|
||||
## Core Capabilities
|
||||
|
||||
### 1. API Linting and Convention Analysis
|
||||
- **Resource Naming Conventions**: Enforces kebab-case for resources, camelCase for fields
|
||||
- **HTTP Method Usage**: Validates proper use of GET, POST, PUT, PATCH, DELETE
|
||||
- **URL Structure**: Analyzes endpoint patterns for consistency and RESTful design
|
||||
- **Status Code Compliance**: Ensures appropriate HTTP status codes are used
|
||||
- **Error Response Formats**: Validates consistent error response structures
|
||||
- **Documentation Coverage**: Checks for missing descriptions and documentation gaps
|
||||
|
||||
### 2. Breaking Change Detection
|
||||
- **Endpoint Removal**: Detects removed or deprecated endpoints
|
||||
- **Response Shape Changes**: Identifies modifications to response structures
|
||||
- **Field Removal**: Tracks removed or renamed fields in API responses
|
||||
- **Type Changes**: Catches field type modifications that could break clients
|
||||
- **Required Field Additions**: Flags new required fields that could break existing integrations
|
||||
- **Status Code Changes**: Detects changes to expected status codes
|
||||
|
||||
### 3. API Design Scoring and Assessment
|
||||
- **Consistency Analysis** (30%): Evaluates naming conventions, response patterns, and structural consistency
|
||||
- **Documentation Quality** (20%): Assesses completeness and clarity of API documentation
|
||||
- **Security Implementation** (20%): Reviews authentication, authorization, and security headers
|
||||
- **Usability Design** (15%): Analyzes ease of use, discoverability, and developer experience
|
||||
- **Performance Patterns** (15%): Evaluates caching, pagination, and efficiency patterns
|
||||
|
||||
## REST Design Principles
|
||||
|
||||
### Resource Naming Conventions
|
||||
```
|
||||
✅ Good Examples:
|
||||
- /api/v1/users
|
||||
- /api/v1/user-profiles
|
||||
- /api/v1/orders/123/line-items
|
||||
|
||||
❌ Bad Examples:
|
||||
- /api/v1/getUsers
|
||||
- /api/v1/user_profiles
|
||||
- /api/v1/orders/123/lineItems
|
||||
```
|
||||
|
||||
### HTTP Method Usage
|
||||
- **GET**: Retrieve resources (safe, idempotent)
|
||||
- **POST**: Create new resources (not idempotent)
|
||||
- **PUT**: Replace entire resources (idempotent)
|
||||
- **PATCH**: Partial resource updates (not necessarily idempotent)
|
||||
- **DELETE**: Remove resources (idempotent)
|
||||
|
||||
### URL Structure Best Practices
|
||||
```
|
||||
Collection Resources: /api/v1/users
|
||||
Individual Resources: /api/v1/users/123
|
||||
Nested Resources: /api/v1/users/123/orders
|
||||
Actions: /api/v1/users/123/activate (POST)
|
||||
Filtering: /api/v1/users?status=active&role=admin
|
||||
```
|
||||
|
||||
## Versioning Strategies
|
||||
|
||||
### 1. URL Versioning (Recommended)
|
||||
```
|
||||
/api/v1/users
|
||||
/api/v2/users
|
||||
```
|
||||
**Pros**: Clear, explicit, easy to route
|
||||
**Cons**: URL proliferation, caching complexity
|
||||
|
||||
### 2. Header Versioning
|
||||
```
|
||||
GET /api/users
|
||||
Accept: application/vnd.api+json;version=1
|
||||
```
|
||||
**Pros**: Clean URLs, content negotiation
|
||||
**Cons**: Less visible, harder to test manually
|
||||
|
||||
### 3. Media Type Versioning
|
||||
```
|
||||
GET /api/users
|
||||
Accept: application/vnd.myapi.v1+json
|
||||
```
|
||||
**Pros**: RESTful, supports multiple representations
|
||||
**Cons**: Complex, harder to implement
|
||||
|
||||
### 4. Query Parameter Versioning
|
||||
```
|
||||
/api/users?version=1
|
||||
```
|
||||
**Pros**: Simple to implement
|
||||
**Cons**: Not RESTful, can be ignored
|
||||
|
||||
## Pagination Patterns
|
||||
|
||||
### Offset-Based Pagination
|
||||
```json
|
||||
{
|
||||
"data": [...],
|
||||
"pagination": {
|
||||
"offset": 20,
|
||||
"limit": 10,
|
||||
"total": 150,
|
||||
"hasMore": true
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Cursor-Based Pagination
|
||||
```json
|
||||
{
|
||||
"data": [...],
|
||||
"pagination": {
|
||||
"nextCursor": "eyJpZCI6MTIzfQ==",
|
||||
"hasMore": true
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Page-Based Pagination
|
||||
```json
|
||||
{
|
||||
"data": [...],
|
||||
"pagination": {
|
||||
"page": 3,
|
||||
"pageSize": 10,
|
||||
"totalPages": 15,
|
||||
"totalItems": 150
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Error Response Formats
|
||||
|
||||
### Standard Error Structure
|
||||
```json
|
||||
{
|
||||
"error": {
|
||||
"code": "VALIDATION_ERROR",
|
||||
"message": "The request contains invalid parameters",
|
||||
"details": [
|
||||
{
|
||||
"field": "email",
|
||||
"code": "INVALID_FORMAT",
|
||||
"message": "Email address is not valid"
|
||||
}
|
||||
],
|
||||
"requestId": "req-123456",
|
||||
"timestamp": "2024-02-16T13:00:00Z"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### HTTP Status Code Usage
|
||||
- **400 Bad Request**: Invalid request syntax or parameters
|
||||
- **401 Unauthorized**: Authentication required
|
||||
- **403 Forbidden**: Access denied (authenticated but not authorized)
|
||||
- **404 Not Found**: Resource not found
|
||||
- **409 Conflict**: Resource conflict (duplicate, version mismatch)
|
||||
- **422 Unprocessable Entity**: Valid syntax but semantic errors
|
||||
- **429 Too Many Requests**: Rate limit exceeded
|
||||
- **500 Internal Server Error**: Unexpected server error
|
||||
|
||||
## Authentication and Authorization Patterns
|
||||
|
||||
### Bearer Token Authentication
|
||||
```
|
||||
Authorization: Bearer <token>
|
||||
```
|
||||
|
||||
### API Key Authentication
|
||||
```
|
||||
X-API-Key: <api-key>
|
||||
Authorization: Api-Key <api-key>
|
||||
```
|
||||
|
||||
### OAuth 2.0 Flow
|
||||
```
|
||||
Authorization: Bearer <oauth-access-token>
|
||||
```
|
||||
|
||||
### Role-Based Access Control (RBAC)
|
||||
```json
|
||||
{
|
||||
"user": {
|
||||
"id": "123",
|
||||
"roles": ["admin", "editor"],
|
||||
"permissions": ["read:users", "write:orders"]
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Rate Limiting Implementation
|
||||
|
||||
### Headers
|
||||
```
|
||||
X-RateLimit-Limit: 1000
|
||||
X-RateLimit-Remaining: 999
|
||||
X-RateLimit-Reset: 1640995200
|
||||
```
|
||||
|
||||
### Response on Limit Exceeded
|
||||
```json
|
||||
{
|
||||
"error": {
|
||||
"code": "RATE_LIMIT_EXCEEDED",
|
||||
"message": "Too many requests",
|
||||
"retryAfter": 3600
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## HATEOAS (Hypermedia as the Engine of Application State)
|
||||
|
||||
### Example Implementation
|
||||
```json
|
||||
{
|
||||
"id": "123",
|
||||
"name": "John Doe",
|
||||
"email": "john@example.com",
|
||||
"_links": {
|
||||
"self": { "href": "/api/v1/users/123" },
|
||||
"orders": { "href": "/api/v1/users/123/orders" },
|
||||
"profile": { "href": "/api/v1/users/123/profile" },
|
||||
"deactivate": {
|
||||
"href": "/api/v1/users/123/deactivate",
|
||||
"method": "POST"
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Idempotency
|
||||
|
||||
### Idempotent Methods
|
||||
- **GET**: Always safe and idempotent
|
||||
- **PUT**: Should be idempotent (replace entire resource)
|
||||
- **DELETE**: Should be idempotent (same result)
|
||||
- **PATCH**: May or may not be idempotent
|
||||
|
||||
### Idempotency Keys
|
||||
```
|
||||
POST /api/v1/payments
|
||||
Idempotency-Key: 123e4567-e89b-12d3-a456-426614174000
|
||||
```
|
||||
|
||||
## Backward Compatibility Guidelines
|
||||
|
||||
### Safe Changes (Non-Breaking)
|
||||
- Adding optional fields to requests
|
||||
- Adding fields to responses
|
||||
- Adding new endpoints
|
||||
- Making required fields optional
|
||||
- Adding new enum values (with graceful handling)
|
||||
|
||||
### Breaking Changes (Require Version Bump)
|
||||
- Removing fields from responses
|
||||
- Making optional fields required
|
||||
- Changing field types
|
||||
- Removing endpoints
|
||||
- Changing URL structures
|
||||
- Modifying error response formats
|
||||
|
||||
## OpenAPI/Swagger Validation
|
||||
|
||||
### Required Components
|
||||
- **API Information**: Title, description, version
|
||||
- **Server Information**: Base URLs and descriptions
|
||||
- **Path Definitions**: All endpoints with methods
|
||||
- **Parameter Definitions**: Query, path, header parameters
|
||||
- **Request/Response Schemas**: Complete data models
|
||||
- **Security Definitions**: Authentication schemes
|
||||
- **Error Responses**: Standard error formats
|
||||
|
||||
### Best Practices
|
||||
- Use consistent naming conventions
|
||||
- Provide detailed descriptions for all components
|
||||
- Include examples for complex objects
|
||||
- Define reusable components and schemas
|
||||
- Validate against OpenAPI specification
|
||||
|
||||
## Performance Considerations
|
||||
|
||||
### Caching Strategies
|
||||
```
|
||||
Cache-Control: public, max-age=3600
|
||||
ETag: "123456789"
|
||||
Last-Modified: Wed, 21 Oct 2015 07:28:00 GMT
|
||||
```
|
||||
|
||||
### Efficient Data Transfer
|
||||
- Use appropriate HTTP methods
|
||||
- Implement field selection (`?fields=id,name,email`)
|
||||
- Support compression (gzip)
|
||||
- Implement efficient pagination
|
||||
- Use ETags for conditional requests
|
||||
|
||||
### Resource Optimization
|
||||
- Avoid N+1 queries
|
||||
- Implement batch operations
|
||||
- Use async processing for heavy operations
|
||||
- Support partial updates (PATCH)
|
||||
|
||||
## Security Best Practices
|
||||
|
||||
### Input Validation
|
||||
- Validate all input parameters
|
||||
- Sanitize user data
|
||||
- Use parameterized queries
|
||||
- Implement request size limits
|
||||
|
||||
### Authentication Security
|
||||
- Use HTTPS everywhere
|
||||
- Implement secure token storage
|
||||
- Support token expiration and refresh
|
||||
- Use strong authentication mechanisms
|
||||
|
||||
### Authorization Controls
|
||||
- Implement principle of least privilege
|
||||
- Use resource-based permissions
|
||||
- Support fine-grained access control
|
||||
- Audit access patterns
|
||||
|
||||
## Tools and Scripts
|
||||
|
||||
### api_linter.py
|
||||
Analyzes API specifications for compliance with REST conventions and best practices.
|
||||
|
||||
**Features:**
|
||||
- OpenAPI/Swagger spec validation
|
||||
- Naming convention checks
|
||||
- HTTP method usage validation
|
||||
- Error format consistency
|
||||
- Documentation completeness analysis
|
||||
|
||||
### breaking_change_detector.py
|
||||
Compares API specification versions to identify breaking changes.
|
||||
|
||||
**Features:**
|
||||
- Endpoint comparison
|
||||
- Schema change detection
|
||||
- Field removal/modification tracking
|
||||
- Migration guide generation
|
||||
- Impact severity assessment
|
||||
|
||||
### api_scorecard.py
|
||||
Provides comprehensive scoring of API design quality.
|
||||
|
||||
**Features:**
|
||||
- Multi-dimensional scoring
|
||||
- Detailed improvement recommendations
|
||||
- Letter grade assessment (A-F)
|
||||
- Benchmark comparisons
|
||||
- Progress tracking
|
||||
|
||||
## Integration Examples
|
||||
|
||||
### CI/CD Integration
|
||||
```yaml
|
||||
- name: API Linting
|
||||
run: python scripts/api_linter.py openapi.json
|
||||
|
||||
- name: Breaking Change Detection
|
||||
run: python scripts/breaking_change_detector.py openapi-v1.json openapi-v2.json
|
||||
|
||||
- name: API Scorecard
|
||||
run: python scripts/api_scorecard.py openapi.json
|
||||
```
|
||||
|
||||
### Pre-commit Hooks
|
||||
```bash
|
||||
#!/bin/bash
|
||||
python engineering/api-design-reviewer/scripts/api_linter.py api/openapi.json
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "API linting failed. Please fix the issues before committing."
|
||||
exit 1
|
||||
fi
|
||||
```
|
||||
|
||||
## Best Practices Summary
|
||||
|
||||
1. **Consistency First**: Maintain consistent naming, response formats, and patterns
|
||||
2. **Documentation**: Provide comprehensive, up-to-date API documentation
|
||||
3. **Versioning**: Plan for evolution with clear versioning strategies
|
||||
4. **Error Handling**: Implement consistent, informative error responses
|
||||
5. **Security**: Build security into every layer of the API
|
||||
6. **Performance**: Design for scale and efficiency from the start
|
||||
7. **Backward Compatibility**: Minimize breaking changes and provide migration paths
|
||||
8. **Testing**: Implement comprehensive testing including contract testing
|
||||
9. **Monitoring**: Add observability for API usage and performance
|
||||
10. **Developer Experience**: Prioritize ease of use and clear documentation
|
||||
|
||||
## Common Anti-Patterns to Avoid
|
||||
|
||||
1. **Verb-based URLs**: Use nouns for resources, not actions
|
||||
2. **Inconsistent Response Formats**: Maintain standard response structures
|
||||
3. **Over-nesting**: Avoid deeply nested resource hierarchies
|
||||
4. **Ignoring HTTP Status Codes**: Use appropriate status codes for different scenarios
|
||||
5. **Poor Error Messages**: Provide actionable, specific error information
|
||||
6. **Missing Pagination**: Always paginate list endpoints
|
||||
7. **No Versioning Strategy**: Plan for API evolution from day one
|
||||
8. **Exposing Internal Structure**: Design APIs for external consumption, not internal convenience
|
||||
9. **Missing Rate Limiting**: Protect your API from abuse and overload
|
||||
10. **Inadequate Testing**: Test all aspects including error cases and edge conditions
|
||||
|
||||
## Conclusion
|
||||
|
||||
The API Design Reviewer skill provides a comprehensive framework for building, reviewing, and maintaining high-quality REST APIs. By following these guidelines and using the provided tools, development teams can create APIs that are consistent, well-documented, secure, and maintainable.
|
||||
|
||||
Regular use of the linting, breaking change detection, and scoring tools ensures continuous improvement and helps maintain API quality throughout the development lifecycle.
|
||||
680
engineering/api-design-reviewer/references/api_antipatterns.md
Normal file
680
engineering/api-design-reviewer/references/api_antipatterns.md
Normal file
@@ -0,0 +1,680 @@
|
||||
# Common API Anti-Patterns and How to Avoid Them
|
||||
|
||||
## Introduction
|
||||
|
||||
This document outlines common anti-patterns in REST API design that can lead to poor developer experience, maintenance nightmares, and scalability issues. Each anti-pattern is accompanied by examples and recommended solutions.
|
||||
|
||||
## 1. Verb-Based URLs (The RPC Trap)
|
||||
|
||||
### Anti-Pattern
|
||||
Using verbs in URLs instead of treating endpoints as resources.
|
||||
|
||||
```
|
||||
❌ Bad Examples:
|
||||
POST /api/getUsers
|
||||
POST /api/createUser
|
||||
GET /api/deleteUser/123
|
||||
POST /api/updateUserPassword
|
||||
GET /api/calculateOrderTotal/456
|
||||
```
|
||||
|
||||
### Why It's Bad
|
||||
- Violates REST principles
|
||||
- Makes the API feel like RPC instead of REST
|
||||
- HTTP methods lose their semantic meaning
|
||||
- Reduces cacheability
|
||||
- Harder to understand resource relationships
|
||||
|
||||
### Solution
|
||||
```
|
||||
✅ Good Examples:
|
||||
GET /api/users # Get users
|
||||
POST /api/users # Create user
|
||||
DELETE /api/users/123 # Delete user
|
||||
PATCH /api/users/123/password # Update password
|
||||
GET /api/orders/456/total # Get order total
|
||||
```
|
||||
|
||||
## 2. Inconsistent Naming Conventions
|
||||
|
||||
### Anti-Pattern
|
||||
Mixed naming conventions across the API.
|
||||
|
||||
```json
|
||||
❌ Bad Examples:
|
||||
{
|
||||
"user_id": 123, // snake_case
|
||||
"firstName": "John", // camelCase
|
||||
"last-name": "Doe", // kebab-case
|
||||
"EMAIL": "john@example.com", // UPPER_CASE
|
||||
"IsActive": true // PascalCase
|
||||
}
|
||||
```
|
||||
|
||||
### Why It's Bad
|
||||
- Confuses developers
|
||||
- Increases cognitive load
|
||||
- Makes code generation difficult
|
||||
- Reduces API adoption
|
||||
|
||||
### Solution
|
||||
```json
|
||||
✅ Choose one convention and stick to it (camelCase recommended):
|
||||
{
|
||||
"userId": 123,
|
||||
"firstName": "John",
|
||||
"lastName": "Doe",
|
||||
"email": "john@example.com",
|
||||
"isActive": true
|
||||
}
|
||||
```
|
||||
|
||||
## 3. Ignoring HTTP Status Codes
|
||||
|
||||
### Anti-Pattern
|
||||
Always returning HTTP 200 regardless of the actual result.
|
||||
|
||||
```json
|
||||
❌ Bad Example:
|
||||
HTTP/1.1 200 OK
|
||||
{
|
||||
"status": "error",
|
||||
"code": 404,
|
||||
"message": "User not found"
|
||||
}
|
||||
```
|
||||
|
||||
### Why It's Bad
|
||||
- Breaks HTTP semantics
|
||||
- Prevents proper error handling by clients
|
||||
- Breaks caching and proxies
|
||||
- Makes monitoring and debugging harder
|
||||
|
||||
### Solution
|
||||
```json
|
||||
✅ Good Example:
|
||||
HTTP/1.1 404 Not Found
|
||||
{
|
||||
"error": {
|
||||
"code": "USER_NOT_FOUND",
|
||||
"message": "User with ID 123 not found",
|
||||
"requestId": "req-abc123"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## 4. Overly Complex Nested Resources
|
||||
|
||||
### Anti-Pattern
|
||||
Creating deeply nested URL structures that are hard to navigate.
|
||||
|
||||
```
|
||||
❌ Bad Example:
|
||||
/companies/123/departments/456/teams/789/members/012/projects/345/tasks/678/comments/901
|
||||
```
|
||||
|
||||
### Why It's Bad
|
||||
- URLs become unwieldy
|
||||
- Creates tight coupling between resources
|
||||
- Makes independent resource access difficult
|
||||
- Complicates authorization logic
|
||||
|
||||
### Solution
|
||||
```
|
||||
✅ Good Examples:
|
||||
/tasks/678 # Direct access to task
|
||||
/tasks/678/comments # Task comments
|
||||
/users/012/tasks # User's tasks
|
||||
/projects/345?team=789 # Project filtering
|
||||
```
|
||||
|
||||
## 5. Inconsistent Error Response Formats
|
||||
|
||||
### Anti-Pattern
|
||||
Different error response structures across endpoints.
|
||||
|
||||
```json
|
||||
❌ Bad Examples:
|
||||
# Endpoint 1
|
||||
{"error": "Invalid email"}
|
||||
|
||||
# Endpoint 2
|
||||
{"success": false, "msg": "User not found", "code": 404}
|
||||
|
||||
# Endpoint 3
|
||||
{"errors": [{"field": "name", "message": "Required"}]}
|
||||
```
|
||||
|
||||
### Why It's Bad
|
||||
- Makes error handling complex for clients
|
||||
- Reduces code reusability
|
||||
- Poor developer experience
|
||||
|
||||
### Solution
|
||||
```json
|
||||
✅ Standardized Error Format:
|
||||
{
|
||||
"error": {
|
||||
"code": "VALIDATION_ERROR",
|
||||
"message": "The request contains invalid data",
|
||||
"details": [
|
||||
{
|
||||
"field": "email",
|
||||
"code": "INVALID_FORMAT",
|
||||
"message": "Email address is not valid"
|
||||
}
|
||||
],
|
||||
"requestId": "req-123456",
|
||||
"timestamp": "2024-02-16T13:00:00Z"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## 6. Missing or Poor Pagination
|
||||
|
||||
### Anti-Pattern
|
||||
Returning all results in a single response or inconsistent pagination.
|
||||
|
||||
```json
|
||||
❌ Bad Examples:
|
||||
# No pagination (returns 10,000 records)
|
||||
GET /api/users
|
||||
|
||||
# Inconsistent pagination parameters
|
||||
GET /api/users?page=1&size=10
|
||||
GET /api/orders?offset=0&limit=20
|
||||
GET /api/products?start=0&count=50
|
||||
```
|
||||
|
||||
### Why It's Bad
|
||||
- Can cause performance issues
|
||||
- May overwhelm clients
|
||||
- Inconsistent pagination parameters confuse developers
|
||||
- No way to estimate total results
|
||||
|
||||
### Solution
|
||||
```json
|
||||
✅ Good Example:
|
||||
GET /api/users?page=1&pageSize=10
|
||||
|
||||
{
|
||||
"data": [...],
|
||||
"pagination": {
|
||||
"page": 1,
|
||||
"pageSize": 10,
|
||||
"total": 150,
|
||||
"totalPages": 15,
|
||||
"hasNext": true,
|
||||
"hasPrev": false
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## 7. Exposing Internal Implementation Details
|
||||
|
||||
### Anti-Pattern
|
||||
URLs and field names that reflect database structure or internal architecture.
|
||||
|
||||
```
|
||||
❌ Bad Examples:
|
||||
/api/user_table/123
|
||||
/api/db_orders
|
||||
/api/legacy_customer_data
|
||||
/api/temp_migration_users
|
||||
|
||||
Response fields:
|
||||
{
|
||||
"user_id_pk": 123,
|
||||
"internal_ref_code": "usr_abc",
|
||||
"db_created_timestamp": 1645123456
|
||||
}
|
||||
```
|
||||
|
||||
### Why It's Bad
|
||||
- Couples API to internal implementation
|
||||
- Makes refactoring difficult
|
||||
- Exposes unnecessary technical details
|
||||
- Reduces API longevity
|
||||
|
||||
### Solution
|
||||
```
|
||||
✅ Good Examples:
|
||||
/api/users/123
|
||||
/api/orders
|
||||
/api/customers
|
||||
|
||||
Response fields:
|
||||
{
|
||||
"id": 123,
|
||||
"referenceCode": "usr_abc",
|
||||
"createdAt": "2024-02-16T13:00:00Z"
|
||||
}
|
||||
```
|
||||
|
||||
## 8. Overloading Single Endpoint
|
||||
|
||||
### Anti-Pattern
|
||||
Using one endpoint for multiple unrelated operations based on request parameters.
|
||||
|
||||
```
|
||||
❌ Bad Example:
|
||||
POST /api/user-actions
|
||||
{
|
||||
"action": "create_user",
|
||||
"userData": {...}
|
||||
}
|
||||
|
||||
POST /api/user-actions
|
||||
{
|
||||
"action": "delete_user",
|
||||
"userId": 123
|
||||
}
|
||||
|
||||
POST /api/user-actions
|
||||
{
|
||||
"action": "send_email",
|
||||
"userId": 123,
|
||||
"emailType": "welcome"
|
||||
}
|
||||
```
|
||||
|
||||
### Why It's Bad
|
||||
- Breaks REST principles
|
||||
- Makes documentation complex
|
||||
- Complicates client implementation
|
||||
- Reduces discoverability
|
||||
|
||||
### Solution
|
||||
```
|
||||
✅ Good Examples:
|
||||
POST /api/users # Create user
|
||||
DELETE /api/users/123 # Delete user
|
||||
POST /api/users/123/emails # Send email to user
|
||||
```
|
||||
|
||||
## 9. Lack of Versioning Strategy
|
||||
|
||||
### Anti-Pattern
|
||||
Making breaking changes without version management.
|
||||
|
||||
```
|
||||
❌ Bad Examples:
|
||||
# Original API
|
||||
{
|
||||
"name": "John Doe",
|
||||
"age": 30
|
||||
}
|
||||
|
||||
# Later (breaking change with no versioning)
|
||||
{
|
||||
"firstName": "John",
|
||||
"lastName": "Doe",
|
||||
"birthDate": "1994-02-16"
|
||||
}
|
||||
```
|
||||
|
||||
### Why It's Bad
|
||||
- Breaks existing clients
|
||||
- Forces all clients to update simultaneously
|
||||
- No graceful migration path
|
||||
- Reduces API stability
|
||||
|
||||
### Solution
|
||||
```
|
||||
✅ Good Examples:
|
||||
# Version 1
|
||||
GET /api/v1/users/123
|
||||
{
|
||||
"name": "John Doe",
|
||||
"age": 30
|
||||
}
|
||||
|
||||
# Version 2 (with both versions supported)
|
||||
GET /api/v2/users/123
|
||||
{
|
||||
"firstName": "John",
|
||||
"lastName": "Doe",
|
||||
"birthDate": "1994-02-16",
|
||||
"age": 30 // Backwards compatibility
|
||||
}
|
||||
```
|
||||
|
||||
## 10. Poor Error Messages
|
||||
|
||||
### Anti-Pattern
|
||||
Vague, unhelpful, or technical error messages.
|
||||
|
||||
```json
|
||||
❌ Bad Examples:
|
||||
{"error": "Something went wrong"}
|
||||
{"error": "Invalid input"}
|
||||
{"error": "SQL constraint violation: FK_user_profile_id"}
|
||||
{"error": "NullPointerException at line 247"}
|
||||
```
|
||||
|
||||
### Why It's Bad
|
||||
- Doesn't help developers fix issues
|
||||
- Increases support burden
|
||||
- Poor developer experience
|
||||
- May expose sensitive information
|
||||
|
||||
### Solution
|
||||
```json
|
||||
✅ Good Examples:
|
||||
{
|
||||
"error": {
|
||||
"code": "VALIDATION_ERROR",
|
||||
"message": "The email address is required and must be in a valid format",
|
||||
"details": [
|
||||
{
|
||||
"field": "email",
|
||||
"code": "REQUIRED",
|
||||
"message": "Email address is required"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## 11. Ignoring Content Negotiation
|
||||
|
||||
### Anti-Pattern
|
||||
Hard-coding response format without considering client preferences.
|
||||
|
||||
```
|
||||
❌ Bad Example:
|
||||
# Always returns JSON regardless of Accept header
|
||||
GET /api/users/123
|
||||
Accept: application/xml
|
||||
# Returns JSON anyway
|
||||
```
|
||||
|
||||
### Why It's Bad
|
||||
- Reduces API flexibility
|
||||
- Ignores HTTP standards
|
||||
- Makes integration harder for diverse clients
|
||||
|
||||
### Solution
|
||||
```
|
||||
✅ Good Example:
|
||||
GET /api/users/123
|
||||
Accept: application/xml
|
||||
|
||||
HTTP/1.1 200 OK
|
||||
Content-Type: application/xml
|
||||
|
||||
<?xml version="1.0"?>
|
||||
<user>
|
||||
<id>123</id>
|
||||
<name>John Doe</name>
|
||||
</user>
|
||||
```
|
||||
|
||||
## 12. Stateful API Design
|
||||
|
||||
### Anti-Pattern
|
||||
Maintaining session state on the server between requests.
|
||||
|
||||
```
|
||||
❌ Bad Example:
|
||||
# Step 1: Initialize session
|
||||
POST /api/session/init
|
||||
|
||||
# Step 2: Set context (requires step 1)
|
||||
POST /api/session/set-user/123
|
||||
|
||||
# Step 3: Get data (requires steps 1 & 2)
|
||||
GET /api/session/user-data
|
||||
```
|
||||
|
||||
### Why It's Bad
|
||||
- Breaks REST statelessness principle
|
||||
- Reduces scalability
|
||||
- Makes caching difficult
|
||||
- Complicates error recovery
|
||||
|
||||
### Solution
|
||||
```
|
||||
✅ Good Example:
|
||||
# Self-contained requests
|
||||
GET /api/users/123/data
|
||||
Authorization: Bearer jwt-token-with-context
|
||||
```
|
||||
|
||||
## 13. Inconsistent HTTP Method Usage
|
||||
|
||||
### Anti-Pattern
|
||||
Using HTTP methods inappropriately or inconsistently.
|
||||
|
||||
```
|
||||
❌ Bad Examples:
|
||||
GET /api/users/123/delete # DELETE operation with GET
|
||||
POST /api/users/123/get # GET operation with POST
|
||||
PUT /api/users # Creating with PUT on collection
|
||||
GET /api/users/search # Search with side effects
|
||||
```
|
||||
|
||||
### Why It's Bad
|
||||
- Violates HTTP semantics
|
||||
- Breaks caching and idempotency expectations
|
||||
- Confuses developers and tools
|
||||
|
||||
### Solution
|
||||
```
|
||||
✅ Good Examples:
|
||||
DELETE /api/users/123 # Delete with DELETE
|
||||
GET /api/users/123 # Get with GET
|
||||
POST /api/users # Create on collection
|
||||
GET /api/users?q=search # Safe search with GET
|
||||
```
|
||||
|
||||
## 14. Missing Rate Limiting Information
|
||||
|
||||
### Anti-Pattern
|
||||
Not providing rate limiting information to clients.
|
||||
|
||||
```
|
||||
❌ Bad Example:
|
||||
HTTP/1.1 429 Too Many Requests
|
||||
{
|
||||
"error": "Rate limit exceeded"
|
||||
}
|
||||
```
|
||||
|
||||
### Why It's Bad
|
||||
- Clients don't know when to retry
|
||||
- No information about current limits
|
||||
- Difficult to implement proper backoff strategies
|
||||
|
||||
### Solution
|
||||
```
|
||||
✅ Good Example:
|
||||
HTTP/1.1 429 Too Many Requests
|
||||
X-RateLimit-Limit: 1000
|
||||
X-RateLimit-Remaining: 0
|
||||
X-RateLimit-Reset: 1640995200
|
||||
Retry-After: 3600
|
||||
|
||||
{
|
||||
"error": {
|
||||
"code": "RATE_LIMIT_EXCEEDED",
|
||||
"message": "API rate limit exceeded",
|
||||
"retryAfter": 3600
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## 15. Chatty API Design
|
||||
|
||||
### Anti-Pattern
|
||||
Requiring multiple API calls to accomplish common tasks.
|
||||
|
||||
```
|
||||
❌ Bad Example:
|
||||
# Get user profile requires 4 API calls
|
||||
GET /api/users/123 # Basic info
|
||||
GET /api/users/123/profile # Profile details
|
||||
GET /api/users/123/settings # User settings
|
||||
GET /api/users/123/stats # User statistics
|
||||
```
|
||||
|
||||
### Why It's Bad
|
||||
- Increases latency
|
||||
- Creates network overhead
|
||||
- Makes mobile apps inefficient
|
||||
- Complicates client implementation
|
||||
|
||||
### Solution
|
||||
```
|
||||
✅ Good Examples:
|
||||
# Single call with expansion
|
||||
GET /api/users/123?include=profile,settings,stats
|
||||
|
||||
# Or provide composite endpoints
|
||||
GET /api/users/123/dashboard
|
||||
|
||||
# Or batch operations
|
||||
POST /api/batch
|
||||
{
|
||||
"requests": [
|
||||
{"method": "GET", "url": "/users/123"},
|
||||
{"method": "GET", "url": "/users/123/profile"}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
## 16. No Input Validation
|
||||
|
||||
### Anti-Pattern
|
||||
Accepting and processing invalid input without proper validation.
|
||||
|
||||
```json
|
||||
❌ Bad Example:
|
||||
POST /api/users
|
||||
{
|
||||
"email": "not-an-email",
|
||||
"age": -5,
|
||||
"name": ""
|
||||
}
|
||||
|
||||
# API processes this and fails later or stores invalid data
|
||||
```
|
||||
|
||||
### Why It's Bad
|
||||
- Leads to data corruption
|
||||
- Security vulnerabilities
|
||||
- Difficult to debug issues
|
||||
- Poor user experience
|
||||
|
||||
### Solution
|
||||
```json
|
||||
✅ Good Example:
|
||||
POST /api/users
|
||||
{
|
||||
"email": "not-an-email",
|
||||
"age": -5,
|
||||
"name": ""
|
||||
}
|
||||
|
||||
HTTP/1.1 400 Bad Request
|
||||
{
|
||||
"error": {
|
||||
"code": "VALIDATION_ERROR",
|
||||
"message": "The request contains invalid data",
|
||||
"details": [
|
||||
{
|
||||
"field": "email",
|
||||
"code": "INVALID_FORMAT",
|
||||
"message": "Email must be a valid email address"
|
||||
},
|
||||
{
|
||||
"field": "age",
|
||||
"code": "INVALID_RANGE",
|
||||
"message": "Age must be between 0 and 150"
|
||||
},
|
||||
{
|
||||
"field": "name",
|
||||
"code": "REQUIRED",
|
||||
"message": "Name is required and cannot be empty"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## 17. Synchronous Long-Running Operations
|
||||
|
||||
### Anti-Pattern
|
||||
Blocking the client with long-running operations in synchronous endpoints.
|
||||
|
||||
```
|
||||
❌ Bad Example:
|
||||
POST /api/reports/generate
|
||||
# Client waits 30 seconds for response
|
||||
```
|
||||
|
||||
### Why It's Bad
|
||||
- Poor user experience
|
||||
- Timeouts and connection issues
|
||||
- Resource waste on client and server
|
||||
- Doesn't scale well
|
||||
|
||||
### Solution
|
||||
```
|
||||
✅ Good Example:
|
||||
# Async pattern
|
||||
POST /api/reports
|
||||
HTTP/1.1 202 Accepted
|
||||
Location: /api/reports/job-123
|
||||
{
|
||||
"jobId": "job-123",
|
||||
"status": "processing",
|
||||
"estimatedCompletion": "2024-02-16T13:05:00Z"
|
||||
}
|
||||
|
||||
# Check status
|
||||
GET /api/reports/job-123
|
||||
{
|
||||
"jobId": "job-123",
|
||||
"status": "completed",
|
||||
"result": "/api/reports/download/report-456"
|
||||
}
|
||||
```
|
||||
|
||||
## Prevention Strategies
|
||||
|
||||
### 1. API Design Reviews
|
||||
- Implement mandatory design reviews
|
||||
- Use checklists based on these anti-patterns
|
||||
- Include multiple stakeholders
|
||||
|
||||
### 2. API Style Guides
|
||||
- Create and enforce API style guides
|
||||
- Use linting tools for consistency
|
||||
- Regular training for development teams
|
||||
|
||||
### 3. Automated Testing
|
||||
- Test for common anti-patterns
|
||||
- Include contract testing
|
||||
- Monitor API usage patterns
|
||||
|
||||
### 4. Documentation Standards
|
||||
- Require comprehensive API documentation
|
||||
- Include examples and error scenarios
|
||||
- Keep documentation up-to-date
|
||||
|
||||
### 5. Client Feedback
|
||||
- Regularly collect feedback from API consumers
|
||||
- Monitor API usage analytics
|
||||
- Conduct developer experience surveys
|
||||
|
||||
## Conclusion
|
||||
|
||||
Avoiding these anti-patterns requires:
|
||||
- Understanding REST principles
|
||||
- Consistent design standards
|
||||
- Regular review and refactoring
|
||||
- Focus on developer experience
|
||||
- Proper tooling and automation
|
||||
|
||||
Remember: A well-designed API is an asset that grows in value over time, while a poorly designed API becomes a liability that hampers development and adoption.
|
||||
487
engineering/api-design-reviewer/references/rest_design_rules.md
Normal file
487
engineering/api-design-reviewer/references/rest_design_rules.md
Normal file
@@ -0,0 +1,487 @@
|
||||
# REST API Design Rules Reference
|
||||
|
||||
## Core Principles
|
||||
|
||||
### 1. Resources, Not Actions
|
||||
REST APIs should focus on **resources** (nouns) rather than **actions** (verbs). The HTTP methods provide the actions.
|
||||
|
||||
```
|
||||
✅ Good:
|
||||
GET /users # Get all users
|
||||
GET /users/123 # Get user 123
|
||||
POST /users # Create new user
|
||||
PUT /users/123 # Update user 123
|
||||
DELETE /users/123 # Delete user 123
|
||||
|
||||
❌ Bad:
|
||||
POST /getUsers
|
||||
POST /createUser
|
||||
POST /updateUser/123
|
||||
POST /deleteUser/123
|
||||
```
|
||||
|
||||
### 2. Hierarchical Resource Structure
|
||||
Use hierarchical URLs to represent resource relationships:
|
||||
|
||||
```
|
||||
/users/123/orders/456/items/789
|
||||
```
|
||||
|
||||
But avoid excessive nesting (max 3-4 levels):
|
||||
|
||||
```
|
||||
❌ Too deep: /companies/123/departments/456/teams/789/members/012/tasks/345
|
||||
✅ Better: /tasks/345?member=012&team=789
|
||||
```
|
||||
|
||||
## Resource Naming Conventions
|
||||
|
||||
### URLs Should Use Kebab-Case
|
||||
```
|
||||
✅ Good:
|
||||
/user-profiles
|
||||
/order-items
|
||||
/shipping-addresses
|
||||
|
||||
❌ Bad:
|
||||
/userProfiles
|
||||
/user_profiles
|
||||
/orderItems
|
||||
```
|
||||
|
||||
### Collections vs Individual Resources
|
||||
```
|
||||
Collection: /users
|
||||
Individual: /users/123
|
||||
Sub-resource: /users/123/orders
|
||||
```
|
||||
|
||||
### Pluralization Rules
|
||||
- Use **plural nouns** for collections: `/users`, `/orders`
|
||||
- Use **singular nouns** for single resources: `/user-profile`, `/current-session`
|
||||
- Be consistent throughout your API
|
||||
|
||||
## HTTP Methods Usage
|
||||
|
||||
### GET - Safe and Idempotent
|
||||
- **Purpose**: Retrieve data
|
||||
- **Safe**: No side effects
|
||||
- **Idempotent**: Multiple calls return same result
|
||||
- **Request Body**: Should not have one
|
||||
- **Cacheable**: Yes
|
||||
|
||||
```
|
||||
GET /users/123
|
||||
GET /users?status=active&limit=10
|
||||
```
|
||||
|
||||
### POST - Not Idempotent
|
||||
- **Purpose**: Create resources, non-idempotent operations
|
||||
- **Safe**: No
|
||||
- **Idempotent**: No
|
||||
- **Request Body**: Usually required
|
||||
- **Cacheable**: Generally no
|
||||
|
||||
```
|
||||
POST /users # Create new user
|
||||
POST /users/123/activate # Activate user (action)
|
||||
```
|
||||
|
||||
### PUT - Idempotent
|
||||
- **Purpose**: Create or completely replace a resource
|
||||
- **Safe**: No
|
||||
- **Idempotent**: Yes
|
||||
- **Request Body**: Required (complete resource)
|
||||
- **Cacheable**: No
|
||||
|
||||
```
|
||||
PUT /users/123 # Replace entire user resource
|
||||
```
|
||||
|
||||
### PATCH - Partial Update
|
||||
- **Purpose**: Partially update a resource
|
||||
- **Safe**: No
|
||||
- **Idempotent**: Not necessarily
|
||||
- **Request Body**: Required (partial resource)
|
||||
- **Cacheable**: No
|
||||
|
||||
```
|
||||
PATCH /users/123 # Update only specified fields
|
||||
```
|
||||
|
||||
### DELETE - Idempotent
|
||||
- **Purpose**: Remove a resource
|
||||
- **Safe**: No
|
||||
- **Idempotent**: Yes (same result if called multiple times)
|
||||
- **Request Body**: Usually not needed
|
||||
- **Cacheable**: No
|
||||
|
||||
```
|
||||
DELETE /users/123
|
||||
```
|
||||
|
||||
## Status Codes
|
||||
|
||||
### Success Codes (2xx)
|
||||
- **200 OK**: Standard success response
|
||||
- **201 Created**: Resource created successfully (POST)
|
||||
- **202 Accepted**: Request accepted for processing (async)
|
||||
- **204 No Content**: Success with no response body (DELETE, PUT)
|
||||
|
||||
### Redirection Codes (3xx)
|
||||
- **301 Moved Permanently**: Resource permanently moved
|
||||
- **302 Found**: Temporary redirect
|
||||
- **304 Not Modified**: Use cached version
|
||||
|
||||
### Client Error Codes (4xx)
|
||||
- **400 Bad Request**: Invalid request syntax or data
|
||||
- **401 Unauthorized**: Authentication required
|
||||
- **403 Forbidden**: Access denied (user authenticated but not authorized)
|
||||
- **404 Not Found**: Resource not found
|
||||
- **405 Method Not Allowed**: HTTP method not supported
|
||||
- **409 Conflict**: Resource conflict (duplicates, version mismatch)
|
||||
- **422 Unprocessable Entity**: Valid syntax but semantic errors
|
||||
- **429 Too Many Requests**: Rate limit exceeded
|
||||
|
||||
### Server Error Codes (5xx)
|
||||
- **500 Internal Server Error**: Unexpected server error
|
||||
- **502 Bad Gateway**: Invalid response from upstream server
|
||||
- **503 Service Unavailable**: Server temporarily unavailable
|
||||
- **504 Gateway Timeout**: Upstream server timeout
|
||||
|
||||
## URL Design Patterns
|
||||
|
||||
### Query Parameters for Filtering
|
||||
```
|
||||
GET /users?status=active
|
||||
GET /users?role=admin&department=engineering
|
||||
GET /orders?created_after=2024-01-01&status=pending
|
||||
```
|
||||
|
||||
### Pagination Parameters
|
||||
```
|
||||
# Offset-based
|
||||
GET /users?offset=20&limit=10
|
||||
|
||||
# Cursor-based
|
||||
GET /users?cursor=eyJpZCI6MTIzfQ&limit=10
|
||||
|
||||
# Page-based
|
||||
GET /users?page=3&page_size=10
|
||||
```
|
||||
|
||||
### Sorting Parameters
|
||||
```
|
||||
GET /users?sort=created_at # Ascending
|
||||
GET /users?sort=-created_at # Descending (prefix with -)
|
||||
GET /users?sort=last_name,first_name # Multiple fields
|
||||
```
|
||||
|
||||
### Field Selection
|
||||
```
|
||||
GET /users?fields=id,name,email
|
||||
GET /users/123?include=orders,profile
|
||||
GET /users/123?exclude=internal_notes
|
||||
```
|
||||
|
||||
### Search Parameters
|
||||
```
|
||||
GET /users?q=john
|
||||
GET /products?search=laptop&category=electronics
|
||||
```
|
||||
|
||||
## Response Format Standards
|
||||
|
||||
### Consistent Response Structure
|
||||
```json
|
||||
{
|
||||
"data": {
|
||||
"id": 123,
|
||||
"name": "John Doe",
|
||||
"email": "john@example.com"
|
||||
},
|
||||
"meta": {
|
||||
"timestamp": "2024-02-16T13:00:00Z",
|
||||
"version": "1.0"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Collection Responses
|
||||
```json
|
||||
{
|
||||
"data": [
|
||||
{"id": 1, "name": "Item 1"},
|
||||
{"id": 2, "name": "Item 2"}
|
||||
],
|
||||
"pagination": {
|
||||
"total": 150,
|
||||
"page": 1,
|
||||
"pageSize": 10,
|
||||
"totalPages": 15,
|
||||
"hasNext": true,
|
||||
"hasPrev": false
|
||||
},
|
||||
"meta": {
|
||||
"timestamp": "2024-02-16T13:00:00Z"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Error Response Format
|
||||
```json
|
||||
{
|
||||
"error": {
|
||||
"code": "VALIDATION_ERROR",
|
||||
"message": "The request contains invalid parameters",
|
||||
"details": [
|
||||
{
|
||||
"field": "email",
|
||||
"code": "INVALID_FORMAT",
|
||||
"message": "Email address is not valid"
|
||||
}
|
||||
],
|
||||
"requestId": "req-123456",
|
||||
"timestamp": "2024-02-16T13:00:00Z"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Field Naming Conventions
|
||||
|
||||
### Use camelCase for JSON Fields
|
||||
```json
|
||||
✅ Good:
|
||||
{
|
||||
"firstName": "John",
|
||||
"lastName": "Doe",
|
||||
"createdAt": "2024-02-16T13:00:00Z",
|
||||
"isActive": true
|
||||
}
|
||||
|
||||
❌ Bad:
|
||||
{
|
||||
"first_name": "John",
|
||||
"LastName": "Doe",
|
||||
"created-at": "2024-02-16T13:00:00Z"
|
||||
}
|
||||
```
|
||||
|
||||
### Boolean Fields
|
||||
Use positive, clear names with "is", "has", "can", or "should" prefixes:
|
||||
|
||||
```json
|
||||
✅ Good:
|
||||
{
|
||||
"isActive": true,
|
||||
"hasPermission": false,
|
||||
"canEdit": true,
|
||||
"shouldNotify": false
|
||||
}
|
||||
|
||||
❌ Bad:
|
||||
{
|
||||
"active": true,
|
||||
"disabled": false, // Double negative
|
||||
"permission": false // Unclear meaning
|
||||
}
|
||||
```
|
||||
|
||||
### Date/Time Fields
|
||||
- Use ISO 8601 format: `2024-02-16T13:00:00Z`
|
||||
- Include timezone information
|
||||
- Use consistent field naming:
|
||||
|
||||
```json
|
||||
{
|
||||
"createdAt": "2024-02-16T13:00:00Z",
|
||||
"updatedAt": "2024-02-16T13:30:00Z",
|
||||
"deletedAt": null,
|
||||
"publishedAt": "2024-02-16T14:00:00Z"
|
||||
}
|
||||
```
|
||||
|
||||
## Content Negotiation
|
||||
|
||||
### Accept Headers
|
||||
```
|
||||
Accept: application/json
|
||||
Accept: application/xml
|
||||
Accept: application/json; version=1
|
||||
```
|
||||
|
||||
### Content-Type Headers
|
||||
```
|
||||
Content-Type: application/json
|
||||
Content-Type: application/json; charset=utf-8
|
||||
Content-Type: multipart/form-data
|
||||
```
|
||||
|
||||
### Versioning via Headers
|
||||
```
|
||||
Accept: application/vnd.myapi.v1+json
|
||||
API-Version: 1.0
|
||||
```
|
||||
|
||||
## Caching Guidelines
|
||||
|
||||
### Cache-Control Headers
|
||||
```
|
||||
Cache-Control: public, max-age=3600 # Cache for 1 hour
|
||||
Cache-Control: no-store                  # Don't cache at all
|
||||
Cache-Control: no-cache, must-revalidate # Always validate
|
||||
```
|
||||
|
||||
### ETags for Conditional Requests
|
||||
```
|
||||
HTTP/1.1 200 OK
|
||||
ETag: "123456789"
|
||||
Last-Modified: Wed, 21 Oct 2015 07:28:00 GMT
|
||||
|
||||
# Client subsequent request:
|
||||
If-None-Match: "123456789"
|
||||
If-Modified-Since: Wed, 21 Oct 2015 07:28:00 GMT
|
||||
```
|
||||
|
||||
## Security Headers
|
||||
|
||||
### Authentication
|
||||
```
|
||||
Authorization: Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9...
|
||||
Authorization: Basic dXNlcjpwYXNzd29yZA==
|
||||
Authorization: Api-Key abc123def456
|
||||
```
|
||||
|
||||
### CORS Headers
|
||||
```
|
||||
Access-Control-Allow-Origin: https://example.com
|
||||
Access-Control-Allow-Methods: GET, POST, PUT, DELETE
|
||||
Access-Control-Allow-Headers: Content-Type, Authorization
|
||||
```
|
||||
|
||||
## Rate Limiting
|
||||
|
||||
### Rate Limit Headers
|
||||
```
|
||||
X-RateLimit-Limit: 1000
|
||||
X-RateLimit-Remaining: 999
|
||||
X-RateLimit-Reset: 1640995200
|
||||
X-RateLimit-Window: 3600
|
||||
```
|
||||
|
||||
### Rate Limit Exceeded Response
|
||||
```json
|
||||
HTTP/1.1 429 Too Many Requests
|
||||
Retry-After: 3600
|
||||
|
||||
{
|
||||
"error": {
|
||||
"code": "RATE_LIMIT_EXCEEDED",
|
||||
"message": "API rate limit exceeded",
|
||||
"details": {
|
||||
"limit": 1000,
|
||||
"window": "1 hour",
|
||||
"retryAfter": 3600
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Hypermedia (HATEOAS)
|
||||
|
||||
### Links in Responses
|
||||
```json
|
||||
{
|
||||
"id": 123,
|
||||
"name": "John Doe",
|
||||
"email": "john@example.com",
|
||||
"_links": {
|
||||
"self": {
|
||||
"href": "/users/123"
|
||||
},
|
||||
"orders": {
|
||||
"href": "/users/123/orders"
|
||||
},
|
||||
"edit": {
|
||||
"href": "/users/123",
|
||||
"method": "PUT"
|
||||
},
|
||||
"delete": {
|
||||
"href": "/users/123",
|
||||
"method": "DELETE"
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Link Relations
|
||||
- **self**: Link to the resource itself
|
||||
- **edit**: Link to edit the resource
|
||||
- **delete**: Link to delete the resource
|
||||
- **related**: Link to related resources
|
||||
- **next/prev**: Pagination links
|
||||
|
||||
## Common Anti-Patterns to Avoid
|
||||
|
||||
### 1. Verbs in URLs
|
||||
```
|
||||
❌ Bad: /api/getUser/123
|
||||
✅ Good: GET /api/users/123
|
||||
```
|
||||
|
||||
### 2. Inconsistent Naming
|
||||
```
|
||||
❌ Bad: /user-profiles and /userAddresses
|
||||
✅ Good: /user-profiles and /user-addresses
|
||||
```
|
||||
|
||||
### 3. Deep Nesting
|
||||
```
|
||||
❌ Bad: /companies/123/departments/456/teams/789/members/012
|
||||
✅ Good: /team-members/012?team=789
|
||||
```
|
||||
|
||||
### 4. Ignoring HTTP Status Codes
|
||||
```
|
||||
❌ Bad: Always return 200 with error info in body
|
||||
✅ Good: Use appropriate status codes (404, 400, 500, etc.)
|
||||
```
|
||||
|
||||
### 5. Exposing Internal Structure
|
||||
```
|
||||
❌ Bad: /api/database_table_users
|
||||
✅ Good: /api/users
|
||||
```
|
||||
|
||||
### 6. No Versioning Strategy
|
||||
```
|
||||
❌ Bad: Breaking changes without version management
|
||||
✅ Good: /api/v1/users or Accept: application/vnd.api+json;version=1
|
||||
```
|
||||
|
||||
### 7. Inconsistent Error Responses
|
||||
```
|
||||
❌ Bad: Different error formats for different endpoints
|
||||
✅ Good: Standardized error response structure
|
||||
```
|
||||
|
||||
## Best Practices Summary
|
||||
|
||||
1. **Use nouns for resources, not verbs**
|
||||
2. **Leverage HTTP methods correctly**
|
||||
3. **Maintain consistent naming conventions**
|
||||
4. **Implement proper error handling**
|
||||
5. **Use appropriate HTTP status codes**
|
||||
6. **Design for cacheability**
|
||||
7. **Implement security from the start**
|
||||
8. **Plan for versioning**
|
||||
9. **Provide comprehensive documentation**
|
||||
10. **Follow HATEOAS principles when applicable**
|
||||
|
||||
## Further Reading
|
||||
|
||||
- [RFC 7231 - HTTP/1.1 Semantics and Content](https://tools.ietf.org/html/rfc7231)
|
||||
- [RFC 6570 - URI Template](https://tools.ietf.org/html/rfc6570)
|
||||
- [OpenAPI Specification](https://swagger.io/specification/)
|
||||
- [REST API Design Best Practices](https://www.restapitutorial.com/)
|
||||
- [HTTP Status Code Definitions](https://httpstatuses.com/)
|
||||
914
engineering/api-design-reviewer/scripts/api_linter.py
Normal file
914
engineering/api-design-reviewer/scripts/api_linter.py
Normal file
@@ -0,0 +1,914 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
API Linter - Analyzes OpenAPI/Swagger specifications for REST conventions and best practices.
|
||||
|
||||
This script validates API designs against established conventions including:
|
||||
- Resource naming conventions (kebab-case resources, camelCase fields)
|
||||
- HTTP method usage patterns
|
||||
- URL structure consistency
|
||||
- Error response format standards
|
||||
- Documentation completeness
|
||||
- Pagination patterns
|
||||
- Versioning compliance
|
||||
|
||||
Supports both OpenAPI JSON specifications and raw endpoint definition JSON.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
from typing import Any, Dict, List, Tuple, Optional, Set
|
||||
from urllib.parse import urlparse
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
|
||||
@dataclass
class LintIssue:
    """A single finding produced by the API linter.

    Plain data record: the reporting code groups and renders these, so
    the class carries no behavior of its own.
    """
    severity: str  # one of 'error', 'warning', 'info'
    category: str  # check family, e.g. 'naming', 'structure', 'responses'
    message: str  # human-readable description of the finding
    path: str  # JSON-pointer-style location within the specification
    suggestion: str = ""  # optional remediation hint shown to the user
    line_number: Optional[int] = None  # source line, when it is known
|
||||
|
||||
|
||||
@dataclass
class LintReport:
    """Aggregated linting results for one API specification.

    Collects every LintIssue found, tracks endpoint statistics, and
    derives a 0-100 quality score from weighted issue counts.
    """
    issues: List[LintIssue] = field(default_factory=list)
    total_endpoints: int = 0
    endpoints_with_issues: int = 0
    score: float = 0.0

    def add_issue(self, issue: LintIssue) -> None:
        """Record a single finding."""
        self.issues.append(issue)

    def get_issues_by_severity(self) -> Dict[str, List[LintIssue]]:
        """Bucket issues into 'error', 'warning' and 'info' lists.

        Issues carrying any other severity value are silently dropped,
        mirroring the fixed set of severities the linter emits.
        """
        buckets: Dict[str, List[LintIssue]] = {'error': [], 'warning': [], 'info': []}
        for found in self.issues:
            bucket = buckets.get(found.severity)
            if bucket is not None:
                bucket.append(found)
        return buckets

    def calculate_score(self) -> float:
        """Compute, store and return the overall API quality score (0-100).

        Errors cost 10 points, warnings 3 and infos 1; the combined
        penalty is averaged over the endpoint count so that larger APIs
        are not punished merely for having more endpoints.
        """
        if self.total_endpoints == 0:
            # Nothing to judge: report a perfect score without touching
            # ``self.score`` (matches the established behavior).
            return 100.0

        weights = {'error': 10, 'warning': 3, 'info': 1}
        penalty = sum(weights.get(issue.severity, 0) for issue in self.issues)

        # Normalize per endpoint so scores are comparable across API sizes.
        self.score = max(0.0, 100.0 - penalty / self.total_endpoints)
        return self.score
|
||||
|
||||
|
||||
class APILinter:
|
||||
"""Main API linting engine."""
|
||||
|
||||
def __init__(self):
    """Set up an empty report and the lookup tables used by the checks."""
    self.report = LintReport()
    self.openapi_spec: Optional[Dict] = None
    self.raw_endpoints: Optional[Dict] = None

    # Naming-convention matchers, compiled once and reused per segment/field.
    self.kebab_case_pattern = re.compile(r'^[a-z]+(?:-[a-z0-9]+)*$')
    self.camel_case_pattern = re.compile(r'^[a-z][a-zA-Z0-9]*$')
    self.snake_case_pattern = re.compile(r'^[a-z]+(?:_[a-z0-9]+)*$')
    self.pascal_case_pattern = re.compile(r'^[A-Z][a-zA-Z0-9]*$')

    # HTTP verbs the linter recognizes as operations.
    self.http_methods = {'GET', 'POST', 'PUT', 'PATCH', 'DELETE', 'HEAD', 'OPTIONS'}

    # Status codes conventionally paired with each verb.
    self.standard_status_codes = {
        'GET': {200, 304, 404},
        'POST': {200, 201, 400, 409, 422},
        'PUT': {200, 204, 400, 404, 409},
        'PATCH': {200, 204, 400, 404, 409},
        'DELETE': {200, 204, 404},
        'HEAD': {200, 404},
        'OPTIONS': {200},
    }

    # Error codes most well-designed APIs are expected to document.
    self.common_error_codes = {400, 401, 403, 404, 405, 409, 422, 429, 500, 502, 503}
|
||||
|
||||
def lint_openapi_spec(self, spec: Dict[str, Any]) -> LintReport:
    """Run every OpenAPI check against *spec* and return a fresh report.

    Each validator appends its findings to ``self.report``; the order
    mirrors the document layout (root -> info -> servers -> paths ->
    components -> security). The score is computed last.
    """
    self.openapi_spec = spec
    self.report = LintReport()

    validators = (
        self._validate_openapi_structure,   # basic document shape
        self._validate_info_section,        # title/version/description
        self._validate_servers_section,     # base URLs
        self._validate_paths_section,       # main linting logic
        self._validate_components_section,  # shared schemas
        self._validate_security_section,    # auth schemes
    )
    for validate in validators:
        validate()

    self.report.calculate_score()
    return self.report
|
||||
|
||||
def lint_raw_endpoints(self, endpoints: Dict[str, Any]) -> LintReport:
    """Lint a raw (non-OpenAPI) endpoint-definition document."""
    self.raw_endpoints = endpoints
    self.report = LintReport()

    # Document-shape check first, then the per-endpoint rules.
    self._validate_raw_endpoint_structure()

    # NOTE(review): nothing in this method sets report.total_endpoints;
    # unless _lint_raw_endpoint does, calculate_score() will always see
    # zero endpoints and report a perfect 100.0 — worth confirming.
    for raw_path, raw_definition in endpoints.get('endpoints', {}).items():
        self._lint_raw_endpoint(raw_path, raw_definition)

    self.report.calculate_score()
    return self.report
|
||||
|
||||
def _validate_openapi_structure(self) -> None:
    """Check that the spec carries the top-level fields OpenAPI requires.

    Emits one 'error' issue per missing root field ('openapi', 'info',
    'paths'); never raises.
    """
    # Loop variable renamed from `field` — the original shadowed the
    # `dataclasses.field` import used elsewhere in this module.
    for required in ('openapi', 'info', 'paths'):
        if required not in self.openapi_spec:
            self.report.add_issue(LintIssue(
                severity='error',
                category='structure',
                message=f"Missing required field: {required}",
                path=f"/{required}",
                suggestion=f"Add the '{required}' field to the root of your OpenAPI specification"
            ))
|
||||
|
||||
def _validate_info_section(self) -> None:
    """Validate the info section: required/recommended fields and version format.

    Missing required fields ('title', 'version') are errors; missing
    recommended fields ('description', 'contact') are warnings; a
    non-semver version string is a warning.
    """
    if 'info' not in self.openapi_spec:
        # A missing info section is already reported by
        # _validate_openapi_structure; nothing more to check here.
        return

    info = self.openapi_spec['info']

    # Loop variables renamed from `field` — the original shadowed the
    # `dataclasses.field` import used elsewhere in this module.
    for required in ('title', 'version'):
        if required not in info:
            self.report.add_issue(LintIssue(
                severity='error',
                category='documentation',
                message=f"Missing required info field: {required}",
                path=f"/info/{required}",
                suggestion=f"Add a '{required}' field to the info section"
            ))

    for recommended in ('description', 'contact'):
        if recommended not in info:
            self.report.add_issue(LintIssue(
                severity='warning',
                category='documentation',
                message=f"Missing recommended info field: {recommended}",
                path=f"/info/{recommended}",
                suggestion=f"Consider adding a '{recommended}' field to improve API documentation"
            ))

    # The version should look like semantic versioning ('1.2.3', '2.0.0-beta').
    if 'version' in info:
        version = info['version']
        if not re.match(r'^\d+\.\d+(\.\d+)?(-\w+)?$', version):
            self.report.add_issue(LintIssue(
                severity='warning',
                category='versioning',
                message=f"Version format '{version}' doesn't follow semantic versioning",
                path="/info/version",
                suggestion="Use semantic versioning format (e.g., '1.0.0', '2.1.3-beta')"
            ))
|
||||
|
||||
def _validate_servers_section(self) -> None:
    """Warn when the servers section is absent, empty, or not a list."""
    if 'servers' not in self.openapi_spec:
        self.report.add_issue(LintIssue(
            severity='warning',
            category='configuration',
            message="Missing servers section",
            path="/servers",
            suggestion="Add a servers section to specify API base URLs"
        ))
        return

    servers = self.openapi_spec['servers']
    # Anything other than a non-empty list is treated as "empty".
    if not isinstance(servers, list) or not servers:
        self.report.add_issue(LintIssue(
            severity='warning',
            category='configuration',
            message="Empty servers section",
            path="/servers",
            suggestion="Add at least one server URL"
        ))
|
||||
|
||||
def _validate_paths_section(self) -> None:
    """Walk every path/operation pair, recording endpoint statistics.

    Fills in ``report.total_endpoints`` (count of method+path pairs) and
    ``report.endpoints_with_issues`` (paths with at least one finding).
    """
    if 'paths' not in self.openapi_spec:
        # Missing section is already reported by _validate_openapi_structure.
        return

    paths = self.openapi_spec['paths']
    if not paths:
        self.report.add_issue(LintIssue(
            severity='error',
            category='structure',
            message="No paths defined in API specification",
            path="/paths",
            suggestion="Define at least one API endpoint"
        ))
        return

    operation_count = 0
    flagged_paths = set()

    for route, route_item in paths.items():
        # Skip malformed path items (e.g. $ref strings or nulls).
        if not isinstance(route_item, dict):
            continue

        if self._validate_path_structure(route):
            flagged_paths.add(route)

        for verb, operation in route_item.items():
            verb_upper = verb.upper()
            if verb_upper not in self.http_methods:
                continue  # vendor extensions, 'parameters', etc.
            operation_count += 1
            if self._validate_operation(route, verb_upper, operation):
                flagged_paths.add(route)

    self.report.total_endpoints = operation_count
    self.report.endpoints_with_issues = len(flagged_paths)
|
||||
|
||||
def _validate_path_structure(self, path: str) -> bool:
    """Validate REST path structure and naming conventions.

    Checks: leading slash, empty segments, kebab-case resource names,
    camelCase/kebab-case template parameters, verbs in URLs, nesting
    depth, and duplicate version segments. Returns True when at least
    one issue was recorded for *path*.
    """
    has_issues = False

    # Paths must be rooted.
    if not path.startswith('/'):
        self.report.add_issue(LintIssue(
            severity='error',
            category='url_structure',
            message=f"Path must start with '/' character: {path}",
            path=f"/paths/{path}",
            suggestion=f"Change '{path}' to '/{path.lstrip('/')}'"
        ))
        has_issues = True

    segments = [seg for seg in path.split('/') if seg]

    # Double slashes produce empty segments.
    if '//' in path:
        self.report.add_issue(LintIssue(
            severity='error',
            category='url_structure',
            message=f"Path contains empty segments: {path}",
            path=f"/paths/{path}",
            suggestion="Remove double slashes from the path"
        ))
        has_issues = True

    for segment in segments:
        # Template parameters like {userId} get their own naming rule.
        if segment.startswith('{') and segment.endswith('}'):
            param_name = segment[1:-1]
            if not self.camel_case_pattern.match(param_name) and not self.kebab_case_pattern.match(param_name):
                self.report.add_issue(LintIssue(
                    severity='warning',
                    category='naming',
                    message=f"Path parameter '{param_name}' should use camelCase or kebab-case",
                    path=f"/paths/{path}",
                    suggestion="Use camelCase (e.g., 'userId') or kebab-case (e.g., 'user-id')"
                ))
                has_issues = True
            continue

        # Resource names should be kebab-case; version tags ('v1', 'v2', ...)
        # are exempt.
        if not self.kebab_case_pattern.match(segment) and not re.match(r'^v\d+$', segment):
            self.report.add_issue(LintIssue(
                severity='warning',
                category='naming',
                message=f"Resource segment '{segment}' should use kebab-case",
                path=f"/paths/{path}",
                suggestion=f"Use kebab-case for '{segment}' (e.g., 'user-profiles', 'order-items')"
            ))
            has_issues = True

        # Verbs in URLs are a REST anti-pattern; the HTTP method is the verb.
        if segment.lower() in {'get', 'post', 'put', 'delete', 'create', 'update', 'remove', 'add'}:
            self.report.add_issue(LintIssue(
                severity='warning',
                category='rest_conventions',
                message=f"Avoid verbs in URLs: '{segment}' in {path}",
                path=f"/paths/{path}",
                suggestion="Use HTTP methods instead of verbs in URLs. Use nouns for resources."
            ))
            has_issues = True

    # Over-nested hierarchies are hard to consume; suggest flattening.
    if len(segments) > 6:
        self.report.add_issue(LintIssue(
            severity='warning',
            category='url_structure',
            message=f"Path has excessive nesting ({len(segments)} levels): {path}",
            path=f"/paths/{path}",
            suggestion="Consider flattening the resource hierarchy or using query parameters"
        ))
        has_issues = True

    # Fix: the previous implementation only checked for duplicates when a
    # literal 'v1'..'v9' segment was present, so duplicated 'v10'+ version
    # segments slipped through. Scan the segments directly instead.
    version_segments = [seg for seg in segments if re.match(r'^v\d+$', seg)]
    if len(version_segments) > 1:
        self.report.add_issue(LintIssue(
            severity='error',
            category='versioning',
            message=f"Multiple version segments in path: {path}",
            path=f"/paths/{path}",
            suggestion="Use only one version segment per path"
        ))
        has_issues = True

    return has_issues
|
||||
|
||||
def _validate_operation(self, path: str, method: str, operation: Dict[str, Any]) -> bool:
    """Validate one method+path operation.

    Checks documentation completeness (responses/summary/description),
    then delegates to the method-usage, response, parameter and
    request-body validators. Returns True if any issue was recorded.
    """
    has_issues = False
    operation_path = f"/paths/{path}/{method.lower()}"

    # A responses section is mandatory in OpenAPI.
    if 'responses' not in operation:
        self.report.add_issue(LintIssue(
            severity='error',
            category='structure',
            message=f"Missing responses section for {method} {path}",
            path=f"{operation_path}/responses",
            suggestion="Define expected responses for this operation"
        ))
        has_issues = True

    # Documentation fields carry graded severities: summary is a warning,
    # description only an info-level nudge.
    if 'summary' not in operation:
        self.report.add_issue(LintIssue(
            severity='warning',
            category='documentation',
            message=f"Missing summary for {method} {path}",
            path=f"{operation_path}/summary",
            suggestion="Add a brief summary describing what this operation does"
        ))
        has_issues = True

    if 'description' not in operation:
        self.report.add_issue(LintIssue(
            severity='info',
            category='documentation',
            message=f"Missing description for {method} {path}",
            path=f"{operation_path}/description",
            suggestion="Add a detailed description for better API documentation"
        ))
        has_issues = True

    # Detail checks; each returns a truthy value when it flagged something.
    if self._validate_http_method_usage(path, method, operation):
        has_issues = True
    if 'responses' in operation and self._validate_responses(path, method, operation['responses']):
        has_issues = True
    if 'parameters' in operation and self._validate_parameters(path, method, operation['parameters']):
        has_issues = True
    if 'requestBody' in operation and self._validate_request_body(path, method, operation['requestBody']):
        has_issues = True

    return has_issues
|
||||
|
||||
def _validate_http_method_usage(self, path: str, method: str, operation: Dict[str, Any]) -> bool:
    """Check that request-body usage matches HTTP method conventions."""
    has_issues = False
    has_body = 'requestBody' in operation

    # GET must be safe; a request body is almost always a design error.
    if method == 'GET' and has_body:
        self.report.add_issue(LintIssue(
            severity='error',
            category='rest_conventions',
            message=f"GET request should not have request body: {method} {path}",
            path=f"/paths/{path}/{method.lower()}/requestBody",
            suggestion="Remove requestBody from GET request or use POST if body is needed"
        ))
        has_issues = True

    # DELETE bodies are merely discouraged, not forbidden.
    if method == 'DELETE' and has_body:
        self.report.add_issue(LintIssue(
            severity='warning',
            category='rest_conventions',
            message=f"DELETE request typically should not have request body: {method} {path}",
            path=f"/paths/{path}/{method.lower()}/requestBody",
            suggestion="Consider using query parameters or path parameters instead"
        ))
        has_issues = True

    # Write methods normally carry data; action-style endpoints
    # (e.g. /users/123/activate) legitimately take no body and are
    # exempted via a substring match on the path.
    if method in ('POST', 'PUT', 'PATCH') and not has_body:
        action_words = ('activate', 'deactivate', 'reset', 'confirm')
        if not any(word in path.lower() for word in action_words):
            self.report.add_issue(LintIssue(
                severity='info',
                category='rest_conventions',
                message=f"{method} request typically should have request body: {method} {path}",
                path=f"/paths/{path}/{method.lower()}",
                suggestion=f"Consider adding requestBody for {method} operation or use GET if no data is being sent"
            ))
            has_issues = True

    return has_issues
|
||||
|
||||
    def _validate_responses(self, path: str, method: str, responses: Dict[str, Any]) -> bool:
        """Validate response definitions.

        Ensures the operation declares at least one success response and at
        least one 4xx/5xx error response, rejects non-numeric status-code
        keys (other than the OpenAPI 'default' key), and flags status codes
        that are unusual for the given HTTP method.

        Args:
            path: Endpoint path from the spec.
            method: Upper-case HTTP method name.
            responses: Responses object for this operation (keys are
                status-code strings or 'default').

        Returns:
            True if at least one issue was recorded, False otherwise.
        """
        has_issues = False

        # Check for success response
        success_codes = {'200', '201', '202', '204'}
        has_success = any(code in responses for code in success_codes)

        if not has_success:
            self.report.add_issue(LintIssue(
                severity='error',
                category='responses',
                message=f"Missing success response for {method} {path}",
                path=f"/paths/{path}/{method.lower()}/responses",
                suggestion="Define at least one success response (200, 201, 202, or 204)"
            ))
            has_issues = True

        # Check for error responses (any key starting with '4' or '5')
        has_error_responses = any(code.startswith('4') or code.startswith('5') for code in responses.keys())

        if not has_error_responses:
            self.report.add_issue(LintIssue(
                severity='warning',
                category='responses',
                message=f"Missing error responses for {method} {path}",
                path=f"/paths/{path}/{method.lower()}/responses",
                suggestion="Define common error responses (400, 404, 500, etc.)"
            ))
            has_issues = True

        # Validate individual response codes
        for status_code, response in responses.items():
            # 'default' is the OpenAPI catch-all key, not a numeric code.
            if status_code == 'default':
                continue

            try:
                code_int = int(status_code)
            except ValueError:
                self.report.add_issue(LintIssue(
                    severity='error',
                    category='responses',
                    message=f"Invalid status code '{status_code}' for {method} {path}",
                    path=f"/paths/{path}/{method.lower()}/responses/{status_code}",
                    suggestion="Use valid HTTP status codes (e.g., 200, 404, 500)"
                ))
                has_issues = True
                continue

            # Check if status code is appropriate for the method.
            # NOTE(review): only the status-code key is validated here;
            # the 'response' object itself is never inspected.
            expected_codes = self.standard_status_codes.get(method, set())
            common_codes = {400, 401, 403, 404, 429, 500}  # Always acceptable

            if expected_codes and code_int not in expected_codes and code_int not in common_codes:
                self.report.add_issue(LintIssue(
                    severity='info',
                    category='responses',
                    message=f"Uncommon status code {status_code} for {method} {path}",
                    path=f"/paths/{path}/{method.lower()}/responses/{status_code}",
                    suggestion=f"Consider using standard codes for {method}: {sorted(expected_codes)}"
                ))
                has_issues = True

        return has_issues
def _validate_parameters(self, path: str, method: str, parameters: List[Dict[str, Any]]) -> bool:
|
||||
"""Validate parameter definitions."""
|
||||
has_issues = False
|
||||
|
||||
for i, param in enumerate(parameters):
|
||||
param_path = f"/paths/{path}/{method.lower()}/parameters[{i}]"
|
||||
|
||||
# Check required fields
|
||||
if 'name' not in param:
|
||||
self.report.add_issue(LintIssue(
|
||||
severity='error',
|
||||
category='parameters',
|
||||
message=f"Parameter missing name field in {method} {path}",
|
||||
path=f"{param_path}/name",
|
||||
suggestion="Add a name field to the parameter"
|
||||
))
|
||||
has_issues = True
|
||||
continue
|
||||
|
||||
if 'in' not in param:
|
||||
self.report.add_issue(LintIssue(
|
||||
severity='error',
|
||||
category='parameters',
|
||||
message=f"Parameter '{param['name']}' missing 'in' field in {method} {path}",
|
||||
path=f"{param_path}/in",
|
||||
suggestion="Specify parameter location (query, path, header, cookie)"
|
||||
))
|
||||
has_issues = True
|
||||
|
||||
# Validate parameter naming
|
||||
param_name = param['name']
|
||||
param_location = param.get('in', '')
|
||||
|
||||
if param_location == 'query':
|
||||
# Query parameters should use camelCase or kebab-case
|
||||
if not self.camel_case_pattern.match(param_name) and not self.kebab_case_pattern.match(param_name):
|
||||
self.report.add_issue(LintIssue(
|
||||
severity='warning',
|
||||
category='naming',
|
||||
message=f"Query parameter '{param_name}' should use camelCase or kebab-case in {method} {path}",
|
||||
path=f"{param_path}/name",
|
||||
suggestion="Use camelCase (e.g., 'pageSize') or kebab-case (e.g., 'page-size')"
|
||||
))
|
||||
has_issues = True
|
||||
|
||||
elif param_location == 'path':
|
||||
# Path parameters should use camelCase or kebab-case
|
||||
if not self.camel_case_pattern.match(param_name) and not self.kebab_case_pattern.match(param_name):
|
||||
self.report.add_issue(LintIssue(
|
||||
severity='warning',
|
||||
category='naming',
|
||||
message=f"Path parameter '{param_name}' should use camelCase or kebab-case in {method} {path}",
|
||||
path=f"{param_path}/name",
|
||||
suggestion="Use camelCase (e.g., 'userId') or kebab-case (e.g., 'user-id')"
|
||||
))
|
||||
has_issues = True
|
||||
|
||||
# Path parameters must be required
|
||||
if not param.get('required', False):
|
||||
self.report.add_issue(LintIssue(
|
||||
severity='error',
|
||||
category='parameters',
|
||||
message=f"Path parameter '{param_name}' must be required in {method} {path}",
|
||||
path=f"{param_path}/required",
|
||||
suggestion="Set required: true for path parameters"
|
||||
))
|
||||
has_issues = True
|
||||
|
||||
return has_issues
|
||||
|
||||
def _validate_request_body(self, path: str, method: str, request_body: Dict[str, Any]) -> bool:
|
||||
"""Validate request body definition."""
|
||||
has_issues = False
|
||||
|
||||
if 'content' not in request_body:
|
||||
self.report.add_issue(LintIssue(
|
||||
severity='error',
|
||||
category='request_body',
|
||||
message=f"Request body missing content for {method} {path}",
|
||||
path=f"/paths/{path}/{method.lower()}/requestBody/content",
|
||||
suggestion="Define content types for the request body"
|
||||
))
|
||||
has_issues = True
|
||||
|
||||
return has_issues
|
||||
|
||||
def _validate_components_section(self) -> None:
|
||||
"""Validate the components section."""
|
||||
if 'components' not in self.openapi_spec:
|
||||
self.report.add_issue(LintIssue(
|
||||
severity='info',
|
||||
category='structure',
|
||||
message="Missing components section",
|
||||
path="/components",
|
||||
suggestion="Consider defining reusable components (schemas, responses, parameters)"
|
||||
))
|
||||
return
|
||||
|
||||
components = self.openapi_spec['components']
|
||||
|
||||
# Validate schemas
|
||||
if 'schemas' in components:
|
||||
self._validate_schemas(components['schemas'])
|
||||
|
||||
def _validate_schemas(self, schemas: Dict[str, Any]) -> None:
|
||||
"""Validate schema definitions."""
|
||||
for schema_name, schema in schemas.items():
|
||||
# Check schema naming (should be PascalCase)
|
||||
if not self.pascal_case_pattern.match(schema_name):
|
||||
self.report.add_issue(LintIssue(
|
||||
severity='warning',
|
||||
category='naming',
|
||||
message=f"Schema name '{schema_name}' should use PascalCase",
|
||||
path=f"/components/schemas/{schema_name}",
|
||||
suggestion=f"Use PascalCase for schema names (e.g., 'UserProfile', 'OrderItem')"
|
||||
))
|
||||
|
||||
# Validate schema properties
|
||||
if isinstance(schema, dict) and 'properties' in schema:
|
||||
self._validate_schema_properties(schema_name, schema['properties'])
|
||||
|
||||
def _validate_schema_properties(self, schema_name: str, properties: Dict[str, Any]) -> None:
|
||||
"""Validate schema property naming."""
|
||||
for prop_name, prop_def in properties.items():
|
||||
# Properties should use camelCase
|
||||
if not self.camel_case_pattern.match(prop_name):
|
||||
self.report.add_issue(LintIssue(
|
||||
severity='warning',
|
||||
category='naming',
|
||||
message=f"Property '{prop_name}' in schema '{schema_name}' should use camelCase",
|
||||
path=f"/components/schemas/{schema_name}/properties/{prop_name}",
|
||||
suggestion="Use camelCase for property names (e.g., 'firstName', 'createdAt')"
|
||||
))
|
||||
|
||||
def _validate_security_section(self) -> None:
|
||||
"""Validate security definitions."""
|
||||
if 'security' not in self.openapi_spec and 'components' not in self.openapi_spec:
|
||||
self.report.add_issue(LintIssue(
|
||||
severity='warning',
|
||||
category='security',
|
||||
message="No security configuration found",
|
||||
path="/security",
|
||||
suggestion="Define security schemes and apply them to operations"
|
||||
))
|
||||
|
||||
def _validate_raw_endpoint_structure(self) -> None:
|
||||
"""Validate structure of raw endpoint definitions."""
|
||||
if 'endpoints' not in self.raw_endpoints:
|
||||
self.report.add_issue(LintIssue(
|
||||
severity='error',
|
||||
category='structure',
|
||||
message="Missing 'endpoints' field in raw endpoint definition",
|
||||
path="/endpoints",
|
||||
suggestion="Provide an 'endpoints' object containing endpoint definitions"
|
||||
))
|
||||
return
|
||||
|
||||
endpoints = self.raw_endpoints['endpoints']
|
||||
self.report.total_endpoints = len(endpoints)
|
||||
|
||||
def _lint_raw_endpoint(self, path: str, endpoint_data: Dict[str, Any]) -> None:
|
||||
"""Lint individual raw endpoint definition."""
|
||||
# Validate path structure
|
||||
self._validate_path_structure(path)
|
||||
|
||||
# Check for required fields
|
||||
if 'method' not in endpoint_data:
|
||||
self.report.add_issue(LintIssue(
|
||||
severity='error',
|
||||
category='structure',
|
||||
message=f"Missing method field for endpoint {path}",
|
||||
path=f"/endpoints/{path}/method",
|
||||
suggestion="Specify HTTP method (GET, POST, PUT, PATCH, DELETE)"
|
||||
))
|
||||
return
|
||||
|
||||
method = endpoint_data['method'].upper()
|
||||
if method not in self.http_methods:
|
||||
self.report.add_issue(LintIssue(
|
||||
severity='error',
|
||||
category='structure',
|
||||
message=f"Invalid HTTP method '{method}' for endpoint {path}",
|
||||
path=f"/endpoints/{path}/method",
|
||||
suggestion=f"Use valid HTTP methods: {', '.join(sorted(self.http_methods))}"
|
||||
))
|
||||
|
||||
def generate_json_report(self) -> str:
|
||||
"""Generate JSON format report."""
|
||||
issues_by_severity = self.report.get_issues_by_severity()
|
||||
|
||||
report_data = {
|
||||
"summary": {
|
||||
"total_endpoints": self.report.total_endpoints,
|
||||
"endpoints_with_issues": self.report.endpoints_with_issues,
|
||||
"total_issues": len(self.report.issues),
|
||||
"errors": len(issues_by_severity['error']),
|
||||
"warnings": len(issues_by_severity['warning']),
|
||||
"info": len(issues_by_severity['info']),
|
||||
"score": round(self.report.score, 2)
|
||||
},
|
||||
"issues": []
|
||||
}
|
||||
|
||||
for issue in self.report.issues:
|
||||
report_data["issues"].append({
|
||||
"severity": issue.severity,
|
||||
"category": issue.category,
|
||||
"message": issue.message,
|
||||
"path": issue.path,
|
||||
"suggestion": issue.suggestion
|
||||
})
|
||||
|
||||
return json.dumps(report_data, indent=2)
|
||||
|
||||
def generate_text_report(self) -> str:
|
||||
"""Generate human-readable text report."""
|
||||
issues_by_severity = self.report.get_issues_by_severity()
|
||||
|
||||
report_lines = [
|
||||
"═══════════════════════════════════════════════════════════════",
|
||||
" API LINTING REPORT",
|
||||
"═══════════════════════════════════════════════════════════════",
|
||||
"",
|
||||
"SUMMARY:",
|
||||
f" Total Endpoints: {self.report.total_endpoints}",
|
||||
f" Endpoints with Issues: {self.report.endpoints_with_issues}",
|
||||
f" Overall Score: {self.report.score:.1f}/100.0",
|
||||
"",
|
||||
"ISSUE BREAKDOWN:",
|
||||
f" 🔴 Errors: {len(issues_by_severity['error'])}",
|
||||
f" 🟡 Warnings: {len(issues_by_severity['warning'])}",
|
||||
f" ℹ️ Info: {len(issues_by_severity['info'])}",
|
||||
"",
|
||||
]
|
||||
|
||||
if not self.report.issues:
|
||||
report_lines.extend([
|
||||
"🎉 Congratulations! No issues found in your API specification.",
|
||||
""
|
||||
])
|
||||
else:
|
||||
# Group issues by category
|
||||
issues_by_category = {}
|
||||
for issue in self.report.issues:
|
||||
if issue.category not in issues_by_category:
|
||||
issues_by_category[issue.category] = []
|
||||
issues_by_category[issue.category].append(issue)
|
||||
|
||||
for category, issues in issues_by_category.items():
|
||||
report_lines.append(f"{'═' * 60}")
|
||||
report_lines.append(f"CATEGORY: {category.upper().replace('_', ' ')}")
|
||||
report_lines.append(f"{'═' * 60}")
|
||||
|
||||
for issue in issues:
|
||||
severity_icon = {"error": "🔴", "warning": "🟡", "info": "ℹ️"}[issue.severity]
|
||||
|
||||
report_lines.extend([
|
||||
f"{severity_icon} {issue.severity.upper()}: {issue.message}",
|
||||
f" Path: {issue.path}",
|
||||
])
|
||||
|
||||
if issue.suggestion:
|
||||
report_lines.append(f" 💡 Suggestion: {issue.suggestion}")
|
||||
|
||||
report_lines.append("")
|
||||
|
||||
# Add scoring breakdown
|
||||
report_lines.extend([
|
||||
"═══════════════════════════════════════════════════════════════",
|
||||
"SCORING DETAILS:",
|
||||
"═══════════════════════════════════════════════════════════════",
|
||||
f"Base Score: 100.0",
|
||||
f"Errors Penalty: -{len(issues_by_severity['error']) * 10} (10 points per error)",
|
||||
f"Warnings Penalty: -{len(issues_by_severity['warning']) * 3} (3 points per warning)",
|
||||
f"Info Penalty: -{len(issues_by_severity['info']) * 1} (1 point per info)",
|
||||
f"Final Score: {self.report.score:.1f}/100.0",
|
||||
""
|
||||
])
|
||||
|
||||
# Add recommendations based on score
|
||||
if self.report.score >= 90:
|
||||
report_lines.append("🏆 Excellent! Your API design follows best practices.")
|
||||
elif self.report.score >= 80:
|
||||
report_lines.append("✅ Good API design with minor areas for improvement.")
|
||||
elif self.report.score >= 70:
|
||||
report_lines.append("⚠️ Fair API design. Consider addressing warnings and errors.")
|
||||
elif self.report.score >= 50:
|
||||
report_lines.append("❌ Poor API design. Multiple issues need attention.")
|
||||
else:
|
||||
report_lines.append("🚨 Critical API design issues. Immediate attention required.")
|
||||
|
||||
return "\n".join(report_lines)
|
||||
|
||||
|
||||
def main():
    """Main CLI entry point.

    Parses arguments, loads the input JSON, runs the linter in either
    OpenAPI or raw-endpoint mode, and emits the report in the requested
    format to stdout or a file.

    Returns:
        Process exit code: 1 if loading/linting failed or any error-severity
        issues were found, 0 otherwise.
    """
    parser = argparse.ArgumentParser(
        description="Analyze OpenAPI/Swagger specifications for REST conventions and best practices",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python api_linter.py openapi.json
  python api_linter.py --format json openapi.json > report.json
  python api_linter.py --raw-endpoints endpoints.json
"""
    )

    parser.add_argument(
        'input_file',
        help='Input file: OpenAPI/Swagger JSON file or raw endpoints JSON'
    )

    parser.add_argument(
        '--format',
        choices=['text', 'json'],
        default='text',
        help='Output format (default: text)'
    )

    parser.add_argument(
        '--raw-endpoints',
        action='store_true',
        help='Treat input as raw endpoint definitions instead of OpenAPI spec'
    )

    parser.add_argument(
        '--output',
        help='Output file (default: stdout)'
    )

    args = parser.parse_args()

    # Load input file
    try:
        # Fix: explicit UTF-8 — JSON is UTF-8 by spec and must not depend
        # on the platform's locale encoding.
        with open(args.input_file, 'r', encoding='utf-8') as f:
            input_data = json.load(f)
    except FileNotFoundError:
        print(f"Error: Input file '{args.input_file}' not found.", file=sys.stderr)
        return 1
    except json.JSONDecodeError as e:
        print(f"Error: Invalid JSON in '{args.input_file}': {e}", file=sys.stderr)
        return 1

    # Initialize linter and run analysis
    linter = APILinter()

    try:
        if args.raw_endpoints:
            report = linter.lint_raw_endpoints(input_data)
        else:
            report = linter.lint_openapi_spec(input_data)
    except Exception as e:
        # Broad catch is deliberate at the CLI boundary: surface the error
        # as a message and a failure exit code rather than a traceback.
        print(f"Error during linting: {e}", file=sys.stderr)
        return 1

    # Generate report
    if args.format == 'json':
        output = linter.generate_json_report()
    else:
        output = linter.generate_text_report()

    # Write output
    if args.output:
        try:
            # Fix: explicit UTF-8 — the text report contains box-drawing and
            # emoji characters that the default Windows code page (cp1252)
            # cannot encode, which previously raised UnicodeEncodeError.
            with open(args.output, 'w', encoding='utf-8') as f:
                f.write(output)
            print(f"Report written to {args.output}")
        except IOError as e:
            print(f"Error writing to '{args.output}': {e}", file=sys.stderr)
            return 1
    else:
        print(output)

    # Return appropriate exit code
    error_count = len([i for i in report.issues if i.severity == 'error'])
    return 1 if error_count > 0 else 0
|
||||
if __name__ == '__main__':
    # Propagate main()'s integer return value as the process exit code.
    sys.exit(main())
1661
engineering/api-design-reviewer/scripts/api_scorecard.py
Normal file
1661
engineering/api-design-reviewer/scripts/api_scorecard.py
Normal file
File diff suppressed because it is too large
Load Diff
1102
engineering/api-design-reviewer/scripts/breaking_change_detector.py
Normal file
1102
engineering/api-design-reviewer/scripts/breaking_change_detector.py
Normal file
File diff suppressed because it is too large
Load Diff
458
engineering/interview-system-designer/SKILL.md
Normal file
458
engineering/interview-system-designer/SKILL.md
Normal file
@@ -0,0 +1,458 @@
|
||||
---
|
||||
name: interview-system-designer
|
||||
description: This skill should be used when the user asks to "design interview processes", "create hiring pipelines", "calibrate interview loops", "generate interview questions", "design competency matrices", "analyze interviewer bias", "create scoring rubrics", "build question banks", or "optimize hiring systems". Use for designing role-specific interview loops, competency assessments, and hiring calibration systems.
|
||||
---
|
||||
|
||||
# Interview System Designer
|
||||
|
||||
Comprehensive interview system design, competency assessment, and hiring process optimization.
|
||||
|
||||
## Table of Contents
|
||||
|
||||
- [Quick Start](#quick-start)
|
||||
- [Tools Overview](#tools-overview)
|
||||
- [Interview Loop Designer](#1-interview-loop-designer)
|
||||
- [Question Bank Generator](#2-question-bank-generator)
|
||||
- [Hiring Calibrator](#3-hiring-calibrator)
|
||||
- [Interview System Workflows](#interview-system-workflows)
|
||||
- [Role-Specific Loop Design](#role-specific-loop-design)
|
||||
- [Competency Matrix Development](#competency-matrix-development)
|
||||
- [Question Bank Creation](#question-bank-creation)
|
||||
- [Bias Mitigation Framework](#bias-mitigation-framework)
|
||||
- [Hiring Bar Calibration](#hiring-bar-calibration)
|
||||
- [Competency Frameworks](#competency-frameworks)
|
||||
- [Scoring & Calibration](#scoring--calibration)
|
||||
- [Reference Documentation](#reference-documentation)
|
||||
- [Industry Standards](#industry-standards)
|
||||
|
||||
---
|
||||
|
||||
## Quick Start
|
||||
|
||||
```bash
|
||||
# Design a complete interview loop for a senior software engineer role
|
||||
python loop_designer.py --role "Senior Software Engineer" --level senior --team platform --output loops/
|
||||
|
||||
# Generate a comprehensive question bank for a product manager position
|
||||
python question_bank_generator.py --role "Product Manager" --level senior --competencies leadership,strategy,analytics --output questions/
|
||||
|
||||
# Analyze interview calibration across multiple candidates and interviewers
|
||||
python hiring_calibrator.py --input interview_data.json --output calibration_report.json --analysis-type full
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Tools Overview
|
||||
|
||||
### 1. Interview Loop Designer
|
||||
|
||||
Generates calibrated interview loops tailored to specific roles, levels, and teams.
|
||||
|
||||
**Input:** Role definition (title, level, team, competency requirements)
|
||||
**Output:** Complete interview loop with rounds, focus areas, time allocation, scorecard templates
|
||||
|
||||
**Key Features:**
|
||||
- Role-specific competency mapping
|
||||
- Level-appropriate question difficulty
|
||||
- Interviewer skill requirements
|
||||
- Time-optimized scheduling
|
||||
- Standardized scorecards
|
||||
|
||||
**Usage:**
|
||||
```bash
|
||||
# Design loop for a specific role
|
||||
python loop_designer.py --role "Staff Data Scientist" --level staff --team ml-platform
|
||||
|
||||
# Generate loop with specific focus areas
|
||||
python loop_designer.py --role "Engineering Manager" --level senior --competencies leadership,technical,strategy
|
||||
|
||||
# Create loop for multiple levels
|
||||
python loop_designer.py --role "Backend Engineer" --levels junior,mid,senior --output loops/backend/
|
||||
```
|
||||
|
||||
### 2. Question Bank Generator
|
||||
|
||||
Creates comprehensive, competency-based interview questions with detailed scoring criteria.
|
||||
|
||||
**Input:** Role requirements, competency areas, experience level
|
||||
**Output:** Structured question bank with scoring rubrics, follow-up probes, and calibration examples
|
||||
|
||||
**Key Features:**
|
||||
- Competency-based question organization
|
||||
- Level-appropriate difficulty progression
|
||||
- Behavioral and technical question types
|
||||
- Anti-bias question design
|
||||
- Calibration examples (poor/good/great answers)
|
||||
|
||||
**Usage:**
|
||||
```bash
|
||||
# Generate questions for technical competencies
|
||||
python question_bank_generator.py --role "Frontend Engineer" --competencies react,typescript,system-design
|
||||
|
||||
# Create behavioral question bank
|
||||
python question_bank_generator.py --role "Product Manager" --question-types behavioral,leadership --output pm_questions/
|
||||
|
||||
# Generate questions for all levels
|
||||
python question_bank_generator.py --role "DevOps Engineer" --levels junior,mid,senior,staff
|
||||
```
|
||||
|
||||
### 3. Hiring Calibrator
|
||||
|
||||
Analyzes interview scores to detect bias, calibration issues, and recommends improvements.
|
||||
|
||||
**Input:** Interview results data (candidate scores, interviewer feedback, demographics)
|
||||
**Output:** Calibration analysis, bias detection report, interviewer coaching recommendations
|
||||
|
||||
**Key Features:**
|
||||
- Statistical bias detection
|
||||
- Interviewer calibration analysis
|
||||
- Score distribution analysis
|
||||
- Recommendation engine
|
||||
- Trend tracking over time
|
||||
|
||||
**Usage:**
|
||||
```bash
|
||||
# Analyze calibration across all interviews
|
||||
python hiring_calibrator.py --input interview_results.json --analysis-type comprehensive
|
||||
|
||||
# Focus on specific competency areas
|
||||
python hiring_calibrator.py --input data.json --competencies technical,leadership --output bias_report.json
|
||||
|
||||
# Track calibration trends over time
|
||||
python hiring_calibrator.py --input historical_data.json --trend-analysis --period quarterly
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Interview System Workflows
|
||||
|
||||
### Role-Specific Loop Design
|
||||
|
||||
#### Software Engineering Roles
|
||||
|
||||
**Junior/Mid Software Engineer (2-4 years)**
|
||||
- **Duration:** 3-4 hours across 3-4 rounds
|
||||
- **Focus Areas:** Coding fundamentals, debugging, system understanding, growth mindset
|
||||
- **Rounds:**
|
||||
1. Technical Phone Screen (45min) - Coding fundamentals, algorithms
|
||||
2. Coding Deep Dive (60min) - Problem-solving, code quality, testing
|
||||
3. System Design Basics (45min) - Component interaction, basic scalability
|
||||
4. Behavioral & Values (30min) - Team collaboration, learning agility
|
||||
|
||||
**Senior Software Engineer (5-8 years)**
|
||||
- **Duration:** 4-5 hours across 4-5 rounds
|
||||
- **Focus Areas:** System design, technical leadership, mentoring capability, domain expertise
|
||||
- **Rounds:**
|
||||
1. Technical Phone Screen (45min) - Advanced algorithms, optimization
|
||||
2. System Design (60min) - Scalability, trade-offs, architectural decisions
|
||||
3. Coding Excellence (60min) - Code quality, testing strategies, refactoring
|
||||
4. Technical Leadership (45min) - Mentoring, technical decisions, cross-team collaboration
|
||||
5. Behavioral & Culture (30min) - Leadership examples, conflict resolution
|
||||
|
||||
**Staff+ Engineer (8+ years)**
|
||||
- **Duration:** 5-6 hours across 5-6 rounds
|
||||
- **Focus Areas:** Architectural vision, organizational impact, technical strategy, cross-functional leadership
|
||||
- **Rounds:**
|
||||
1. Technical Phone Screen (45min) - System architecture, complex problem-solving
|
||||
2. Architecture Design (90min) - Large-scale systems, technology choices, evolution patterns
|
||||
3. Technical Strategy (60min) - Technical roadmaps, technology adoption, risk assessment
|
||||
4. Leadership & Influence (60min) - Cross-team impact, technical vision, stakeholder management
|
||||
5. Coding & Best Practices (45min) - Code quality standards, development processes
|
||||
6. Cultural & Strategic Fit (30min) - Company values, strategic thinking
|
||||
|
||||
#### Product Management Roles
|
||||
|
||||
**Product Manager (3-6 years)**
|
||||
- **Duration:** 3-4 hours across 4 rounds
|
||||
- **Focus Areas:** Product sense, analytical thinking, stakeholder management, execution
|
||||
- **Rounds:**
|
||||
1. Product Sense (60min) - Feature prioritization, user empathy, market understanding
|
||||
2. Analytical Thinking (45min) - Data interpretation, metrics design, experimentation
|
||||
3. Execution & Process (45min) - Project management, cross-functional collaboration
|
||||
4. Behavioral & Leadership (30min) - Stakeholder management, conflict resolution
|
||||
|
||||
**Senior Product Manager (6-10 years)**
|
||||
- **Duration:** 4-5 hours across 4-5 rounds
|
||||
- **Focus Areas:** Product strategy, team leadership, business impact, market analysis
|
||||
- **Rounds:**
|
||||
1. Product Strategy (75min) - Market analysis, competitive positioning, roadmap planning
|
||||
2. Leadership & Influence (60min) - Team building, stakeholder management, decision-making
|
||||
3. Data & Analytics (45min) - Advanced metrics, experimentation design, business intelligence
|
||||
4. Technical Collaboration (45min) - Technical trade-offs, engineering partnership
|
||||
5. Case Study Presentation (45min) - Past impact, lessons learned, strategic thinking
|
||||
|
||||
#### Design Roles
|
||||
|
||||
**UX Designer (2-5 years)**
|
||||
- **Duration:** 3-4 hours across 3-4 rounds
|
||||
- **Focus Areas:** Design process, user research, visual design, collaboration
|
||||
- **Rounds:**
|
||||
1. Portfolio Review (60min) - Design process, problem-solving approach, visual skills
|
||||
2. Design Challenge (90min) - User-centered design, wireframing, iteration
|
||||
3. Collaboration & Process (45min) - Cross-functional work, feedback incorporation
|
||||
4. Behavioral & Values (30min) - User advocacy, creative problem-solving
|
||||
|
||||
**Senior UX Designer (5+ years)**
|
||||
- **Duration:** 4-5 hours across 4-5 rounds
|
||||
- **Focus Areas:** Design leadership, system thinking, research methodology, business impact
|
||||
- **Rounds:**
|
||||
1. Portfolio Deep Dive (75min) - Design impact, methodology, leadership examples
|
||||
2. Design System Challenge (90min) - Systems thinking, scalability, consistency
|
||||
3. Research & Strategy (60min) - User research methods, data-driven design decisions
|
||||
4. Leadership & Mentoring (45min) - Design team leadership, process improvement
|
||||
5. Business & Strategy (30min) - Design's business impact, stakeholder management
|
||||
|
||||
### Competency Matrix Development
|
||||
|
||||
#### Technical Competencies
|
||||
|
||||
**Software Engineering**
|
||||
- **Coding Proficiency:** Algorithm design, data structures, language expertise
|
||||
- **System Design:** Architecture patterns, scalability, performance optimization
|
||||
- **Testing & Quality:** Unit testing, integration testing, code review practices
|
||||
- **DevOps & Tools:** CI/CD, monitoring, debugging, development workflows
|
||||
|
||||
**Data Science & Analytics**
|
||||
- **Statistical Analysis:** Statistical methods, hypothesis testing, experimental design
|
||||
- **Machine Learning:** Algorithm selection, model evaluation, feature engineering
|
||||
- **Data Engineering:** ETL processes, data pipeline design, data quality
|
||||
- **Business Intelligence:** Metrics design, dashboard creation, stakeholder communication
|
||||
|
||||
**Product Management**
|
||||
- **Product Strategy:** Market analysis, competitive research, roadmap planning
|
||||
- **User Research:** User interviews, usability testing, persona development
|
||||
- **Data Analysis:** Metrics interpretation, A/B testing, cohort analysis
|
||||
- **Technical Understanding:** API design, database concepts, system architecture
|
||||
|
||||
#### Behavioral Competencies
|
||||
|
||||
**Leadership & Influence**
|
||||
- **Team Building:** Hiring, onboarding, team culture development
|
||||
- **Mentoring & Coaching:** Skill development, career guidance, feedback delivery
|
||||
- **Strategic Thinking:** Long-term planning, vision setting, decision-making frameworks
|
||||
- **Change Management:** Process improvement, organizational change, resistance handling
|
||||
|
||||
**Communication & Collaboration**
|
||||
- **Stakeholder Management:** Expectation setting, conflict resolution, alignment building
|
||||
- **Cross-Functional Partnership:** Engineering-Product-Design collaboration
|
||||
- **Presentation Skills:** Technical communication, executive briefings, documentation
|
||||
- **Active Listening:** Empathy, question asking, perspective taking
|
||||
|
||||
**Problem-Solving & Innovation**
|
||||
- **Analytical Thinking:** Problem decomposition, root cause analysis, hypothesis formation
|
||||
- **Creative Problem-Solving:** Alternative solution generation, constraint navigation
|
||||
- **Learning Agility:** Skill acquisition, adaptation to change, knowledge transfer
|
||||
- **Risk Assessment:** Uncertainty navigation, trade-off analysis, mitigation planning
|
||||
|
||||
### Question Bank Creation
|
||||
|
||||
#### Technical Questions by Level
|
||||
|
||||
**Junior Level Questions**
|
||||
- **Coding:** "Implement a function to find the second largest element in an array"
|
||||
- **System Design:** "How would you design a simple URL shortener for 1000 users?"
|
||||
- **Debugging:** "Walk through how you would debug a slow-loading web page"
|
||||
|
||||
**Senior Level Questions**
|
||||
- **Architecture:** "Design a real-time chat system supporting 1M concurrent users"
|
||||
- **Leadership:** "Describe how you would onboard a new team member in your area"
|
||||
- **Trade-offs:** "Compare microservices vs monolith for a rapidly scaling startup"
|
||||
|
||||
**Staff+ Level Questions**
|
||||
- **Strategy:** "How would you evaluate and introduce a new programming language to the organization?"
|
||||
- **Influence:** "Describe a time you drove technical consensus across multiple teams"
|
||||
- **Vision:** "How do you balance technical debt against feature development?"
|
||||
|
||||
#### Behavioral Questions Framework
|
||||
|
||||
**STAR Method Implementation**
|
||||
- **Situation:** Context and background of the scenario
|
||||
- **Task:** Specific challenge or goal that needed to be addressed
|
||||
- **Action:** Concrete steps taken to address the challenge
|
||||
- **Result:** Measurable outcomes and lessons learned
|
||||
|
||||
**Sample Questions:**
|
||||
- "Tell me about a time you had to influence a decision without formal authority"
|
||||
- "Describe a situation where you had to deliver difficult feedback to a colleague"
|
||||
- "Give an example of when you had to adapt your communication style for different audiences"
|
||||
- "Walk me through a time when you had to make a decision with incomplete information"
|
||||
|
||||
### Bias Mitigation Framework
|
||||
|
||||
#### Structural Bias Prevention
|
||||
|
||||
**Interview Panel Composition**
|
||||
- Diverse interviewer panels (gender, ethnicity, experience level)
|
||||
- Rotating panel assignments to prevent pattern bias
|
||||
- Anonymous resume screening for initial phone screens
|
||||
- Standardized question sets to ensure consistency
|
||||
|
||||
**Process Standardization**
|
||||
- Structured interview guides with required probing questions
|
||||
- Consistent time allocation across all candidates
|
||||
- Standardized evaluation criteria and scoring rubrics
|
||||
- Required justification for all scoring decisions
|
||||
|
||||
#### Cognitive Bias Recognition
|
||||
|
||||
**Common Interview Biases**
|
||||
- **Halo Effect:** One strong impression influences overall assessment
|
||||
- **Confirmation Bias:** Seeking information that confirms initial impressions
|
||||
- **Similarity Bias:** Favoring candidates with similar backgrounds/experiences
|
||||
- **Contrast Effect:** Comparing candidates against each other rather than standard
|
||||
- **Anchoring Bias:** Over-relying on first piece of information received
|
||||
|
||||
**Mitigation Strategies**
|
||||
- Pre-interview bias awareness training for all interviewers
|
||||
- Structured debrief sessions with independent score recording
|
||||
- Regular calibration sessions with example candidate discussions
|
||||
- Statistical monitoring of scoring patterns by interviewer and demographic
|
||||
|
||||
### Hiring Bar Calibration
|
||||
|
||||
#### Calibration Methodology
|
||||
|
||||
**Regular Calibration Sessions**
|
||||
- Monthly interviewer calibration meetings
|
||||
- Shadow interviewing for new interviewers (minimum 5 sessions)
|
||||
- Quarterly cross-team calibration reviews
|
||||
- Annual hiring bar review and adjustment process
|
||||
|
||||
**Performance Tracking**
|
||||
- New hire performance correlation with interview scores
|
||||
- Interviewer accuracy tracking (prediction vs actual performance)
|
||||
- False positive/negative analysis
|
||||
- Offer acceptance rate analysis by interviewer
|
||||
|
||||
**Feedback Loops**
|
||||
- Six-month new hire performance reviews
|
||||
- Manager feedback on interview process effectiveness
|
||||
- Candidate experience surveys and feedback integration
|
||||
- Continuous process improvement based on data analysis
|
||||
|
||||
---
|
||||
|
||||
## Competency Frameworks
|
||||
|
||||
### Engineering Competency Levels
|
||||
|
||||
#### Level 1-2: Individual Contributor (Junior/Mid)
|
||||
- **Technical Skills:** Language proficiency, testing basics, code review participation
|
||||
- **Problem Solving:** Structured approach to debugging, logical thinking
|
||||
- **Communication:** Clear status updates, effective question asking
|
||||
- **Learning:** Proactive skill development, mentorship seeking
|
||||
|
||||
#### Level 3-4: Senior Individual Contributor
|
||||
- **Technical Leadership:** Architecture decisions, code quality advocacy
|
||||
- **Mentoring:** Junior developer guidance, knowledge sharing
|
||||
- **Project Ownership:** End-to-end feature delivery, stakeholder communication
|
||||
- **Innovation:** Process improvement, technology evaluation
|
||||
|
||||
#### Level 5-6: Staff+ Engineer
|
||||
- **Organizational Impact:** Cross-team technical leadership, strategic planning
|
||||
- **Technical Vision:** Long-term architectural planning, technology roadmap
|
||||
- **People Development:** Team growth, hiring contribution, culture building
|
||||
- **External Influence:** Industry contribution, thought leadership
|
||||
|
||||
### Product Management Competency Levels
|
||||
|
||||
#### Level 1-2: Associate/Product Manager
|
||||
- **Product Execution:** Feature specification, requirements gathering
|
||||
- **User Focus:** User research participation, feedback collection
|
||||
- **Data Analysis:** Basic metrics analysis, experiment interpretation
|
||||
- **Stakeholder Management:** Cross-functional collaboration, communication
|
||||
|
||||
#### Level 3-4: Senior Product Manager
|
||||
- **Strategic Thinking:** Market analysis, competitive positioning
|
||||
- **Leadership:** Cross-functional team leadership, decision making
|
||||
- **Business Impact:** Revenue impact, market share growth
|
||||
- **Process Innovation:** Product development process improvement
|
||||
|
||||
#### Level 5-6: Principal Product Manager
|
||||
- **Vision Setting:** Product strategy, market direction
|
||||
- **Organizational Influence:** Executive communication, team building
|
||||
- **Innovation Leadership:** New market creation, disruptive thinking
|
||||
- **Talent Development:** PM team growth, hiring leadership
|
||||
|
||||
---
|
||||
|
||||
## Scoring & Calibration
|
||||
|
||||
### Scoring Rubric Framework
|
||||
|
||||
#### 4-Point Scoring Scale
|
||||
- **4 - Exceeds Expectations:** Demonstrates mastery beyond required level
|
||||
- **3 - Meets Expectations:** Solid performance meeting all requirements
|
||||
- **2 - Partially Meets:** Shows potential but has development areas
|
||||
- **1 - Does Not Meet:** Significant gaps in required competencies
|
||||
|
||||
#### Competency-Specific Scoring
|
||||
|
||||
**Technical Competencies**
|
||||
- Code Quality (4): Clean, maintainable, well-tested code with excellent documentation
|
||||
- Code Quality (3): Functional code with good structure and basic testing
|
||||
- Code Quality (2): Working code with some structural issues or missing tests
|
||||
- Code Quality (1): Non-functional or poorly structured code with significant issues
|
||||
|
||||
**Leadership Competencies**
|
||||
- Team Influence (4): Drives team success, develops others, creates lasting positive change
|
||||
- Team Influence (3): Contributes positively to team dynamics and outcomes
|
||||
- Team Influence (2): Shows leadership potential with some effective examples
|
||||
- Team Influence (1): Limited evidence of leadership ability or negative team impact
|
||||
|
||||
### Calibration Standards
|
||||
|
||||
#### Statistical Benchmarks
|
||||
- Target score distribution: 20% (4s), 40% (3s), 30% (2s), 10% (1s)
|
||||
- Interviewer consistency target: <0.5 standard deviation from team average
|
||||
- Pass rate target: 15-25% for most roles (varies by level and market conditions)
|
||||
- Time to hire target: 2-3 weeks from first interview to offer
|
||||
|
||||
#### Quality Metrics
|
||||
- New hire 6-month performance correlation: >0.6 with interview scores
|
||||
- Interviewer agreement rate: >80% within 1 point on final recommendations
|
||||
- Candidate experience satisfaction: >4.0/5.0 average rating
|
||||
- Offer acceptance rate: >85% for preferred candidates
|
||||
|
||||
---
|
||||
|
||||
## Reference Documentation
|
||||
|
||||
### Interview Templates
|
||||
- Role-specific interview guides and question banks
|
||||
- Scorecard templates for consistent evaluation
|
||||
- Debrief facilitation guides for effective team discussions
|
||||
|
||||
### Bias Mitigation Resources
|
||||
- Unconscious bias training materials and exercises
|
||||
- Structured interviewing best practices checklist
|
||||
- Demographic diversity tracking and reporting templates
|
||||
|
||||
### Calibration Tools
|
||||
- Interview performance correlation analysis templates
|
||||
- Interviewer coaching and development frameworks
|
||||
- Hiring pipeline metrics and dashboard specifications
|
||||
|
||||
---
|
||||
|
||||
## Industry Standards
|
||||
|
||||
### Best Practices Integration
|
||||
- Google's structured interviewing methodology
|
||||
- Amazon's Leadership Principles assessment framework
|
||||
- Microsoft's competency-based evaluation system
|
||||
- Netflix's culture fit assessment approach
|
||||
|
||||
### Compliance & Legal Considerations
|
||||
- EEOC compliance requirements and documentation
|
||||
- ADA accommodation procedures and guidelines
|
||||
- International hiring law considerations
|
||||
- Privacy and data protection requirements (GDPR, CCPA)
|
||||
|
||||
### Continuous Improvement Framework
|
||||
- Regular process auditing and refinement cycles
|
||||
- Industry benchmarking and comparative analysis
|
||||
- Technology integration for interview optimization
|
||||
- Candidate experience enhancement initiatives
|
||||
|
||||
This comprehensive interview system design framework provides the structure and tools necessary to build fair, effective, and scalable hiring processes that consistently identify top talent while minimizing bias and maximizing candidate experience.
|
||||
908
engineering/interview-system-designer/loop_designer.py
Normal file
908
engineering/interview-system-designer/loop_designer.py
Normal file
@@ -0,0 +1,908 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Interview Loop Designer
|
||||
|
||||
Generates calibrated interview loops tailored to specific roles, levels, and teams.
|
||||
Creates complete interview loops with rounds, focus areas, time allocation,
|
||||
interviewer skill requirements, and scorecard templates.
|
||||
|
||||
Usage:
|
||||
python loop_designer.py --role "Senior Software Engineer" --level senior --team platform
|
||||
python loop_designer.py --role "Product Manager" --level mid --competencies leadership,strategy
|
||||
python loop_designer.py --input role_definition.json --output loops/
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import argparse
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Dict, List, Optional, Any, Tuple
|
||||
from collections import defaultdict
|
||||
|
||||
|
||||
class InterviewLoopDesigner:
|
||||
"""Designs comprehensive interview loops based on role requirements."""
|
||||
|
||||
def __init__(self):
    """Load the static lookup tables used to assemble interview loops."""
    # Role -> level -> required/preferred competencies and focus areas.
    self.competency_frameworks = self._init_competency_frameworks()
    # Role -> core/optional round lists and total duration envelope.
    self.role_templates = self._init_role_templates()
    # Round type -> interviewer skill and calibration requirements.
    self.interviewer_skills = self._init_interviewer_skills()
|
||||
|
||||
def _init_competency_frameworks(self) -> Dict[str, Dict]:
    """Initialize competency frameworks for different roles.

    Returns:
        Mapping of role key -> level key -> competency spec, where each
        spec carries:
          - "required": competencies a candidate must demonstrate
          - "preferred": nice-to-have competencies
          - "focus_areas": themes the interview rounds should probe
        Not every role defines every level (e.g. "designer" stops at
        "senior"); ``_find_closest_level`` resolves missing levels.
    """
    return {
        "software_engineer": {
            "junior": {
                "required": ["coding_fundamentals", "debugging", "testing_basics", "version_control"],
                "preferred": ["system_understanding", "code_review", "collaboration"],
                "focus_areas": ["technical_execution", "learning_agility", "team_collaboration"]
            },
            "mid": {
                "required": ["advanced_coding", "system_design_basics", "testing_strategy", "debugging_complex"],
                "preferred": ["mentoring_basics", "technical_communication", "project_ownership"],
                "focus_areas": ["technical_depth", "system_thinking", "ownership"]
            },
            "senior": {
                "required": ["system_architecture", "technical_leadership", "mentoring", "cross_team_collab"],
                "preferred": ["technology_evaluation", "process_improvement", "hiring_contribution"],
                "focus_areas": ["technical_leadership", "system_architecture", "people_development"]
            },
            "staff": {
                "required": ["architectural_vision", "organizational_impact", "technical_strategy", "team_building"],
                "preferred": ["industry_influence", "innovation_leadership", "executive_communication"],
                "focus_areas": ["organizational_impact", "technical_vision", "strategic_influence"]
            },
            "principal": {
                "required": ["company_wide_impact", "technical_vision", "talent_development", "strategic_planning"],
                "preferred": ["industry_leadership", "board_communication", "market_influence"],
                "focus_areas": ["strategic_leadership", "organizational_transformation", "external_influence"]
            }
        },
        "product_manager": {
            "junior": {
                "required": ["product_execution", "user_research", "data_analysis", "stakeholder_comm"],
                "preferred": ["market_awareness", "technical_understanding", "project_management"],
                "focus_areas": ["execution_excellence", "user_focus", "analytical_thinking"]
            },
            "mid": {
                "required": ["product_strategy", "cross_functional_leadership", "metrics_design", "market_analysis"],
                "preferred": ["team_building", "technical_collaboration", "competitive_analysis"],
                "focus_areas": ["strategic_thinking", "leadership", "business_impact"]
            },
            "senior": {
                "required": ["business_strategy", "team_leadership", "p&l_ownership", "market_positioning"],
                "preferred": ["hiring_leadership", "board_communication", "partnership_development"],
                "focus_areas": ["business_leadership", "market_strategy", "organizational_impact"]
            },
            "staff": {
                "required": ["portfolio_management", "organizational_leadership", "strategic_planning", "market_creation"],
                "preferred": ["executive_presence", "investor_relations", "acquisition_strategy"],
                "focus_areas": ["strategic_leadership", "market_innovation", "organizational_transformation"]
            }
        },
        # NOTE(review): "designer" defines only junior/mid/senior; other
        # levels fall back via _find_closest_level.
        "designer": {
            "junior": {
                "required": ["design_fundamentals", "user_research", "prototyping", "design_tools"],
                "preferred": ["user_empathy", "visual_design", "collaboration"],
                "focus_areas": ["design_execution", "user_research", "creative_problem_solving"]
            },
            "mid": {
                "required": ["design_systems", "user_testing", "cross_functional_collab", "design_strategy"],
                "preferred": ["mentoring", "process_improvement", "business_understanding"],
                "focus_areas": ["design_leadership", "system_thinking", "business_impact"]
            },
            "senior": {
                "required": ["design_leadership", "team_building", "strategic_design", "stakeholder_management"],
                "preferred": ["design_culture", "hiring_leadership", "executive_communication"],
                "focus_areas": ["design_strategy", "team_leadership", "organizational_impact"]
            }
        },
        "data_scientist": {
            "junior": {
                "required": ["statistical_analysis", "python_r", "data_visualization", "sql"],
                "preferred": ["machine_learning", "business_understanding", "communication"],
                "focus_areas": ["analytical_skills", "technical_execution", "business_impact"]
            },
            "mid": {
                "required": ["advanced_ml", "experiment_design", "data_engineering", "stakeholder_comm"],
                "preferred": ["mentoring", "project_leadership", "product_collaboration"],
                "focus_areas": ["advanced_analytics", "project_leadership", "cross_functional_impact"]
            },
            "senior": {
                "required": ["data_strategy", "team_leadership", "ml_systems", "business_strategy"],
                "preferred": ["hiring_leadership", "executive_communication", "technology_evaluation"],
                "focus_areas": ["strategic_leadership", "technical_vision", "organizational_impact"]
            }
        },
        "devops_engineer": {
            "junior": {
                "required": ["infrastructure_basics", "scripting", "monitoring", "troubleshooting"],
                "preferred": ["automation", "cloud_platforms", "security_awareness"],
                "focus_areas": ["operational_excellence", "automation_mindset", "problem_solving"]
            },
            "mid": {
                "required": ["ci_cd_design", "infrastructure_as_code", "security_implementation", "performance_optimization"],
                "preferred": ["team_collaboration", "incident_management", "capacity_planning"],
                "focus_areas": ["system_reliability", "automation_leadership", "cross_team_collaboration"]
            },
            "senior": {
                "required": ["platform_architecture", "team_leadership", "security_strategy", "organizational_impact"],
                "preferred": ["hiring_contribution", "technology_evaluation", "executive_communication"],
                "focus_areas": ["platform_leadership", "strategic_thinking", "organizational_transformation"]
            }
        },
        # NOTE(review): engineering_manager has no "mid" level; "junior"
        # here means first-time manager, not junior IC.
        "engineering_manager": {
            "junior": {
                "required": ["team_leadership", "technical_background", "people_management", "project_coordination"],
                "preferred": ["hiring_experience", "performance_management", "technical_mentoring"],
                "focus_areas": ["people_leadership", "team_building", "execution_excellence"]
            },
            "senior": {
                "required": ["organizational_leadership", "strategic_planning", "talent_development", "cross_functional_leadership"],
                "preferred": ["technical_vision", "culture_building", "executive_communication"],
                "focus_areas": ["organizational_impact", "strategic_leadership", "talent_development"]
            },
            "staff": {
                "required": ["multi_team_leadership", "organizational_strategy", "executive_presence", "cultural_transformation"],
                "preferred": ["board_communication", "market_understanding", "acquisition_integration"],
                "focus_areas": ["organizational_transformation", "strategic_leadership", "cultural_evolution"]
            }
        }
    }
|
||||
|
||||
def _init_role_templates(self) -> Dict[str, Dict]:
    """Initialize role-specific interview templates.

    Returns:
        Mapping of role key -> template with:
          - "core_rounds": round types every loop for this role includes
          - "optional_rounds": round types added by level (see _design_rounds)
          - "total_duration_range": (min, max) minutes for the whole loop
          - "required_competencies": competencies the loop must cover

    NOTE(review): some core round names (e.g. "execution_process",
    "collaboration_process", "technical_assessment", "case_study",
    "statistical_thinking", "troubleshooting", "leadership_assessment",
    "technical_background", "people_management") have no entry in
    _get_round_definitions, so _design_rounds silently drops them —
    confirm whether those definitions exist elsewhere or are missing.
    """
    return {
        "software_engineer": {
            "core_rounds": ["technical_phone_screen", "coding_deep_dive", "system_design", "behavioral"],
            "optional_rounds": ["technical_leadership", "domain_expertise", "culture_fit"],
            "total_duration_range": (180, 360),  # 3-6 hours
            "required_competencies": ["coding", "problem_solving", "communication"]
        },
        "product_manager": {
            "core_rounds": ["product_sense", "analytical_thinking", "execution_process", "behavioral"],
            "optional_rounds": ["strategic_thinking", "technical_collaboration", "leadership"],
            "total_duration_range": (180, 300),  # 3-5 hours
            "required_competencies": ["product_strategy", "analytical_thinking", "stakeholder_management"]
        },
        "designer": {
            "core_rounds": ["portfolio_review", "design_challenge", "collaboration_process", "behavioral"],
            "optional_rounds": ["design_system_thinking", "research_methodology", "leadership"],
            "total_duration_range": (180, 300),  # 3-5 hours
            "required_competencies": ["design_process", "user_empathy", "visual_communication"]
        },
        "data_scientist": {
            "core_rounds": ["technical_assessment", "case_study", "statistical_thinking", "behavioral"],
            "optional_rounds": ["ml_systems", "business_strategy", "technical_leadership"],
            "total_duration_range": (210, 330),  # 3.5-5.5 hours
            "required_competencies": ["statistical_analysis", "programming", "business_acumen"]
        },
        "devops_engineer": {
            "core_rounds": ["technical_assessment", "system_design", "troubleshooting", "behavioral"],
            "optional_rounds": ["security_assessment", "automation_design", "leadership"],
            "total_duration_range": (180, 300),  # 3-5 hours
            "required_competencies": ["infrastructure", "automation", "problem_solving"]
        },
        "engineering_manager": {
            "core_rounds": ["leadership_assessment", "technical_background", "people_management", "behavioral"],
            "optional_rounds": ["strategic_thinking", "hiring_assessment", "culture_building"],
            "total_duration_range": (240, 360),  # 4-6 hours
            "required_competencies": ["people_leadership", "technical_understanding", "strategic_thinking"]
        }
    }
|
||||
|
||||
def _init_interviewer_skills(self) -> Dict[str, Dict]:
    """Initialize interviewer skill requirements for different round types.

    Returns:
        Mapping of round type -> requirement spec with:
          - "required_skills": skills the interviewer must have
          - "preferred_experience": backgrounds that improve signal quality
          - "calibration_level": "standard" or "high" — how much calibration
            the round demands before an interviewer runs it solo
    """
    return {
        "technical_phone_screen": {
            "required_skills": ["technical_assessment", "coding_evaluation"],
            "preferred_experience": ["same_domain", "senior_level"],
            "calibration_level": "standard"
        },
        "coding_deep_dive": {
            "required_skills": ["advanced_technical", "code_quality_assessment"],
            "preferred_experience": ["senior_engineer", "system_design"],
            "calibration_level": "high"
        },
        "system_design": {
            "required_skills": ["architecture_design", "scalability_assessment"],
            "preferred_experience": ["senior_architect", "large_scale_systems"],
            "calibration_level": "high"
        },
        "behavioral": {
            "required_skills": ["behavioral_interviewing", "competency_assessment"],
            "preferred_experience": ["hiring_manager", "people_leadership"],
            "calibration_level": "standard"
        },
        "technical_leadership": {
            "required_skills": ["leadership_assessment", "technical_mentoring"],
            "preferred_experience": ["engineering_manager", "tech_lead"],
            "calibration_level": "high"
        },
        "product_sense": {
            "required_skills": ["product_evaluation", "market_analysis"],
            "preferred_experience": ["product_manager", "product_leadership"],
            "calibration_level": "high"
        },
        "analytical_thinking": {
            "required_skills": ["data_analysis", "metrics_evaluation"],
            "preferred_experience": ["data_analyst", "product_manager"],
            "calibration_level": "standard"
        },
        "design_challenge": {
            "required_skills": ["design_evaluation", "user_experience"],
            "preferred_experience": ["senior_designer", "design_manager"],
            "calibration_level": "high"
        }
    }
|
||||
|
||||
def generate_interview_loop(self, role: str, level: str, team: Optional[str] = None,
                            competencies: Optional[List[str]] = None) -> Dict[str, Any]:
    """Generate a complete interview loop for the specified role and level.

    Args:
        role: Free-form role title (e.g. "Senior Software Engineer").
        level: Free-form level label (e.g. "senior", "sr", "staff").
        team: Optional team name; echoed back in the result unchanged.
        competencies: Optional extra competencies folded into each
            round's focus areas.

    Returns:
        Dict describing the full loop: rounds, suggested schedule,
        scorecard template, interviewer requirements, the competency
        framework used, and calibration notes.
    """
    # Normalize the free-form inputs into lookup keys.
    normalized_role = role.lower().replace(" ", "_").replace("-", "_")
    normalized_level = level.lower()

    # Fall back to the nearest known role/level when the exact key is missing.
    if normalized_role not in self.competency_frameworks:
        normalized_role = self._find_closest_role(normalized_role)
    if normalized_level not in self.competency_frameworks[normalized_role]:
        normalized_level = self._find_closest_level(normalized_role, normalized_level)

    requirements = self.competency_frameworks[normalized_role][normalized_level]
    template = self.role_templates.get(normalized_role, self.role_templates["software_engineer"])

    # Assemble the individual loop components.
    loop_rounds = self._design_rounds(normalized_role, normalized_level, requirements,
                                      template, competencies)
    suggested_schedule = self._create_schedule(loop_rounds)
    scorecard_template = self._generate_scorecard(normalized_role, normalized_level, requirements)
    interviewer_reqs = self._define_interviewer_requirements(loop_rounds)

    return {
        "role": role,
        "level": level,
        "team": team,
        "generated_at": datetime.now().isoformat(),
        "total_duration_minutes": sum(info["duration_minutes"] for info in loop_rounds.values()),
        "total_rounds": len(loop_rounds),
        "rounds": loop_rounds,
        "suggested_schedule": suggested_schedule,
        "scorecard_template": scorecard_template,
        "interviewer_requirements": interviewer_reqs,
        "competency_framework": requirements,
        "calibration_notes": self._generate_calibration_notes(normalized_role, normalized_level)
    }
|
||||
|
||||
def _find_closest_role(self, role_key: str) -> str:
    """Find the closest matching role template.

    Splits the normalized role key on underscores and maps the first
    token that matches a known alias onto a canonical role template key.
    Unrecognized keys fall back to "software_engineer".
    """
    aliases = {
        "engineer": "software_engineer",
        "developer": "software_engineer",
        "swe": "software_engineer",
        "backend": "software_engineer",
        "frontend": "software_engineer",
        "fullstack": "software_engineer",
        "pm": "product_manager",
        "product": "product_manager",
        "ux": "designer",
        "ui": "designer",
        "graphic": "designer",
        "data": "data_scientist",
        "analyst": "data_scientist",
        "ml": "data_scientist",
        "ops": "devops_engineer",
        "sre": "devops_engineer",
        "infrastructure": "devops_engineer",
        "manager": "engineering_manager",
        "lead": "engineering_manager"
    }

    # First matching token wins; tokens are checked in key order.
    return next(
        (aliases[token] for token in role_key.split("_") if token in aliases),
        "software_engineer",  # default fallback
    )
|
||||
|
||||
def _find_closest_level(self, role_key: str, level_key: str) -> str:
    """Find the closest matching level for the role.

    Common aliases (e.g. "sr" -> "senior", "lead" -> "senior") are
    resolved first; if the resolved level is not defined for the role,
    prefer "senior" when available, otherwise the role's first defined
    level.
    """
    aliases = {
        "entry": "junior",
        "associate": "junior",
        "jr": "junior",
        "mid": "mid",
        "middle": "mid",
        "sr": "senior",
        "senior": "senior",
        "staff": "staff",
        "principal": "principal",
        "lead": "senior",
        "manager": "senior"
    }

    available = list(self.competency_frameworks[role_key].keys())
    candidate = aliases.get(level_key, level_key)

    if candidate in available:
        return candidate
    if "senior" in available:
        return "senior"
    return available[0]
|
||||
|
||||
def _design_rounds(self, role_key: str, level_key: str, competency_req: Dict,
                   role_template: Dict, custom_competencies: Optional[List[str]]) -> Dict[str, Dict]:
    """Design the specific interview rounds based on role and level.

    Core rounds come from the role template; selected optional rounds
    are appended for senior+ and staff+ levels. Round types without a
    matching definition are skipped, which can leave gaps in the
    "order" numbering of the returned rounds.
    """
    selected = list(role_template["core_rounds"])
    optional = list(role_template["optional_rounds"])

    # Senior and above earn role-appropriate leadership/strategy rounds.
    if level_key in ("senior", "staff", "principal"):
        if role_key in ("software_engineer", "engineering_manager") and "technical_leadership" in optional:
            selected.append("technical_leadership")
        if role_key in ("product_manager", "engineering_manager") and "strategic_thinking" in optional:
            selected.append("strategic_thinking")
        if role_key == "designer" and "design_system_thinking" in optional:
            selected.append("design_system_thinking")

    # Staff+ additionally get a domain-expertise round when available.
    if level_key in ("staff", "principal") and "domain_expertise" in optional:
        selected.append("domain_expertise")

    definitions = self._get_round_definitions()
    designed: Dict[str, Dict] = {}

    for position, round_type in enumerate(selected, start=1):
        if round_type not in definitions:
            continue  # Undefined round type: skip (leaves a numbering gap).
        entry = definitions[round_type].copy()
        entry["order"] = position
        entry["focus_areas"] = self._customize_focus_areas(round_type, competency_req, custom_competencies)
        designed[f"round_{position}_{round_type}"] = entry

    return designed
|
||||
|
||||
def _get_round_definitions(self) -> Dict[str, Dict]:
    """Get predefined round definitions with standard durations and formats.

    Returns:
        Mapping of round type -> definition with display "name",
        "duration_minutes", "format", "objectives", "question_types",
        and "evaluation_criteria". _design_rounds copies these entries
        and adds per-loop "order" and "focus_areas" keys.

    NOTE(review): several core rounds referenced by _init_role_templates
    (e.g. "execution_process", "technical_assessment", "case_study",
    "troubleshooting", "leadership_assessment") are not defined here and
    are silently dropped by _design_rounds — confirm intent.
    """
    return {
        "technical_phone_screen": {
            "name": "Technical Phone Screen",
            "duration_minutes": 45,
            "format": "virtual",
            "objectives": ["Assess coding fundamentals", "Evaluate problem-solving approach", "Screen for basic technical competency"],
            "question_types": ["coding_problems", "technical_concepts", "experience_questions"],
            "evaluation_criteria": ["technical_accuracy", "problem_solving_process", "communication_clarity"]
        },
        "coding_deep_dive": {
            "name": "Coding Deep Dive",
            "duration_minutes": 75,
            "format": "in_person_or_virtual",
            "objectives": ["Evaluate coding skills in depth", "Assess code quality and testing", "Review debugging approach"],
            "question_types": ["complex_coding_problems", "code_review", "testing_strategy"],
            "evaluation_criteria": ["code_quality", "testing_approach", "debugging_skills", "optimization_thinking"]
        },
        "system_design": {
            "name": "System Design",
            "duration_minutes": 75,
            "format": "collaborative_whiteboard",
            "objectives": ["Assess architectural thinking", "Evaluate scalability considerations", "Review trade-off analysis"],
            "question_types": ["system_architecture", "scalability_design", "trade_off_analysis"],
            "evaluation_criteria": ["architectural_thinking", "scalability_awareness", "trade_off_reasoning"]
        },
        "behavioral": {
            "name": "Behavioral Interview",
            "duration_minutes": 45,
            "format": "conversational",
            "objectives": ["Assess cultural fit", "Evaluate past experiences", "Review leadership examples"],
            "question_types": ["star_method_questions", "situational_scenarios", "values_alignment"],
            "evaluation_criteria": ["communication_skills", "leadership_examples", "cultural_alignment"]
        },
        "technical_leadership": {
            "name": "Technical Leadership",
            "duration_minutes": 60,
            "format": "discussion_based",
            "objectives": ["Evaluate mentoring capability", "Assess technical decision making", "Review cross-team collaboration"],
            "question_types": ["leadership_scenarios", "technical_decisions", "mentoring_examples"],
            "evaluation_criteria": ["leadership_potential", "technical_judgment", "influence_skills"]
        },
        "product_sense": {
            "name": "Product Sense",
            "duration_minutes": 75,
            "format": "case_study",
            "objectives": ["Assess product intuition", "Evaluate user empathy", "Review market understanding"],
            "question_types": ["product_scenarios", "feature_prioritization", "user_journey_analysis"],
            "evaluation_criteria": ["product_intuition", "user_empathy", "analytical_thinking"]
        },
        "analytical_thinking": {
            "name": "Analytical Thinking",
            "duration_minutes": 60,
            "format": "data_analysis",
            "objectives": ["Evaluate data interpretation", "Assess metric design", "Review experiment planning"],
            "question_types": ["data_interpretation", "metric_design", "experiment_analysis"],
            "evaluation_criteria": ["analytical_rigor", "metric_intuition", "experimental_thinking"]
        },
        "design_challenge": {
            "name": "Design Challenge",
            "duration_minutes": 90,
            "format": "hands_on_design",
            "objectives": ["Assess design process", "Evaluate user-centered thinking", "Review iteration approach"],
            "question_types": ["design_problems", "user_research", "design_critique"],
            "evaluation_criteria": ["design_process", "user_focus", "visual_communication"]
        },
        "portfolio_review": {
            "name": "Portfolio Review",
            "duration_minutes": 75,
            "format": "presentation_discussion",
            "objectives": ["Review past work", "Assess design thinking", "Evaluate impact measurement"],
            "question_types": ["portfolio_walkthrough", "design_decisions", "impact_stories"],
            "evaluation_criteria": ["design_quality", "process_thinking", "business_impact"]
        }
    }
|
||||
|
||||
def _customize_focus_areas(self, round_type: str, competency_req: Dict,
                           custom_competencies: Optional[List[str]]) -> List[str]:
    """Customize focus areas based on role competency requirements.

    Starts from the round type's default focus areas, appends any
    caller-supplied competencies not already present, then fills from
    the role/level focus areas. The result is capped at five entries.
    """
    defaults_by_round = {
        "technical_phone_screen": ["coding_fundamentals", "problem_solving"],
        "coding_deep_dive": ["technical_execution", "code_quality"],
        "system_design": ["system_thinking", "architectural_reasoning"],
        "behavioral": ["cultural_fit", "communication", "teamwork"],
        "technical_leadership": ["leadership", "mentoring", "influence"],
        "product_sense": ["product_intuition", "user_empathy"],
        "analytical_thinking": ["data_analysis", "metric_design"],
        "design_challenge": ["design_process", "user_focus"]
    }

    selected = list(defaults_by_round.get(round_type, []))

    # Caller-supplied competencies come ahead of the role-level areas.
    if custom_competencies:
        selected += [c for c in custom_competencies if c not in selected]

    # Fill remaining slots from the role/level focus areas.
    selected += [a for a in competency_req.get("focus_areas", []) if a not in selected]

    return selected[:5]  # Limit to top 5 focus areas
|
||||
|
||||
def _create_schedule(self, rounds: Dict[str, Dict]) -> Dict[str, Any]:
|
||||
"""Create a suggested interview schedule."""
|
||||
sorted_rounds = sorted(rounds.items(), key=lambda x: x[1]["order"])
|
||||
|
||||
# Calculate optimal scheduling
|
||||
total_duration = sum(round_info["duration_minutes"] for _, round_info in sorted_rounds)
|
||||
|
||||
if total_duration <= 240: # 4 hours or less - single day
|
||||
schedule_type = "single_day"
|
||||
day_structure = self._create_single_day_schedule(sorted_rounds)
|
||||
else: # Multi-day schedule
|
||||
schedule_type = "multi_day"
|
||||
day_structure = self._create_multi_day_schedule(sorted_rounds)
|
||||
|
||||
return {
|
||||
"type": schedule_type,
|
||||
"total_duration_minutes": total_duration,
|
||||
"recommended_breaks": self._calculate_breaks(total_duration),
|
||||
"day_structure": day_structure,
|
||||
"logistics_notes": self._generate_logistics_notes(sorted_rounds)
|
||||
}
|
||||
|
||||
def _create_single_day_schedule(self, rounds: List[Tuple[str, Dict]]) -> Dict[str, Any]:
|
||||
"""Create a single-day interview schedule."""
|
||||
start_time = datetime.strptime("09:00", "%H:%M")
|
||||
current_time = start_time
|
||||
|
||||
schedule = []
|
||||
|
||||
for round_name, round_info in rounds:
|
||||
# Add break if needed (after 90 minutes of interviews)
|
||||
if schedule and sum(item.get("duration_minutes", 0) for item in schedule if "break" not in item.get("type", "")) >= 90:
|
||||
schedule.append({
|
||||
"type": "break",
|
||||
"start_time": current_time.strftime("%H:%M"),
|
||||
"duration_minutes": 15,
|
||||
"end_time": (current_time + timedelta(minutes=15)).strftime("%H:%M")
|
||||
})
|
||||
current_time += timedelta(minutes=15)
|
||||
|
||||
# Add the interview round
|
||||
end_time = current_time + timedelta(minutes=round_info["duration_minutes"])
|
||||
schedule.append({
|
||||
"type": "interview",
|
||||
"round_name": round_name,
|
||||
"title": round_info["name"],
|
||||
"start_time": current_time.strftime("%H:%M"),
|
||||
"end_time": end_time.strftime("%H:%M"),
|
||||
"duration_minutes": round_info["duration_minutes"],
|
||||
"format": round_info["format"]
|
||||
})
|
||||
current_time = end_time
|
||||
|
||||
return {
|
||||
"day_1": {
|
||||
"date": "TBD",
|
||||
"start_time": start_time.strftime("%H:%M"),
|
||||
"end_time": current_time.strftime("%H:%M"),
|
||||
"rounds": schedule
|
||||
}
|
||||
}
|
||||
|
||||
def _create_multi_day_schedule(self, rounds: List[Tuple[str, Dict]]) -> Dict[str, Any]:
|
||||
"""Create a multi-day interview schedule."""
|
||||
# Split rounds across days (max 4 hours per day)
|
||||
max_daily_minutes = 240
|
||||
days = {}
|
||||
current_day = 1
|
||||
current_day_duration = 0
|
||||
current_day_rounds = []
|
||||
|
||||
for round_name, round_info in rounds:
|
||||
duration = round_info["duration_minutes"] + 15 # Add buffer time
|
||||
|
||||
if current_day_duration + duration > max_daily_minutes and current_day_rounds:
|
||||
# Finalize current day
|
||||
days[f"day_{current_day}"] = self._finalize_day_schedule(current_day_rounds)
|
||||
current_day += 1
|
||||
current_day_duration = 0
|
||||
current_day_rounds = []
|
||||
|
||||
current_day_rounds.append((round_name, round_info))
|
||||
current_day_duration += duration
|
||||
|
||||
# Finalize last day
|
||||
if current_day_rounds:
|
||||
days[f"day_{current_day}"] = self._finalize_day_schedule(current_day_rounds)
|
||||
|
||||
return days
|
||||
|
||||
def _finalize_day_schedule(self, day_rounds: List[Tuple[str, Dict]]) -> Dict[str, Any]:
|
||||
"""Finalize the schedule for a specific day."""
|
||||
start_time = datetime.strptime("09:00", "%H:%M")
|
||||
current_time = start_time
|
||||
schedule = []
|
||||
|
||||
for round_name, round_info in day_rounds:
|
||||
end_time = current_time + timedelta(minutes=round_info["duration_minutes"])
|
||||
schedule.append({
|
||||
"type": "interview",
|
||||
"round_name": round_name,
|
||||
"title": round_info["name"],
|
||||
"start_time": current_time.strftime("%H:%M"),
|
||||
"end_time": end_time.strftime("%H:%M"),
|
||||
"duration_minutes": round_info["duration_minutes"],
|
||||
"format": round_info["format"]
|
||||
})
|
||||
current_time = end_time + timedelta(minutes=15) # 15-min buffer
|
||||
|
||||
return {
|
||||
"date": "TBD",
|
||||
"start_time": start_time.strftime("%H:%M"),
|
||||
"end_time": (current_time - timedelta(minutes=15)).strftime("%H:%M"),
|
||||
"rounds": schedule
|
||||
}
|
||||
|
||||
def _calculate_breaks(self, total_duration: int) -> List[Dict[str, Any]]:
|
||||
"""Calculate recommended breaks based on total duration."""
|
||||
breaks = []
|
||||
|
||||
if total_duration >= 120: # 2+ hours
|
||||
breaks.append({"type": "short_break", "duration": 15, "after_minutes": 90})
|
||||
|
||||
if total_duration >= 240: # 4+ hours
|
||||
breaks.append({"type": "lunch_break", "duration": 60, "after_minutes": 180})
|
||||
|
||||
if total_duration >= 360: # 6+ hours
|
||||
breaks.append({"type": "short_break", "duration": 15, "after_minutes": 300})
|
||||
|
||||
return breaks
|
||||
|
||||
def _generate_scorecard(self, role_key: str, level_key: str, competency_req: Dict) -> Dict[str, Any]:
|
||||
"""Generate a scorecard template for the interview loop."""
|
||||
scoring_dimensions = []
|
||||
|
||||
# Add competency-based scoring dimensions
|
||||
for competency in competency_req["required"]:
|
||||
scoring_dimensions.append({
|
||||
"dimension": competency,
|
||||
"weight": "high",
|
||||
"scale": "1-4",
|
||||
"description": f"Assessment of {competency.replace('_', ' ')} competency"
|
||||
})
|
||||
|
||||
for competency in competency_req.get("preferred", []):
|
||||
scoring_dimensions.append({
|
||||
"dimension": competency,
|
||||
"weight": "medium",
|
||||
"scale": "1-4",
|
||||
"description": f"Assessment of {competency.replace('_', ' ')} competency"
|
||||
})
|
||||
|
||||
# Add standard dimensions
|
||||
standard_dimensions = [
|
||||
{"dimension": "communication", "weight": "high", "scale": "1-4"},
|
||||
{"dimension": "cultural_fit", "weight": "medium", "scale": "1-4"},
|
||||
{"dimension": "learning_agility", "weight": "medium", "scale": "1-4"}
|
||||
]
|
||||
|
||||
scoring_dimensions.extend(standard_dimensions)
|
||||
|
||||
return {
|
||||
"scoring_scale": {
|
||||
"4": "Exceeds Expectations - Demonstrates mastery beyond required level",
|
||||
"3": "Meets Expectations - Solid performance meeting all requirements",
|
||||
"2": "Partially Meets - Shows potential but has development areas",
|
||||
"1": "Does Not Meet - Significant gaps in required competencies"
|
||||
},
|
||||
"dimensions": scoring_dimensions,
|
||||
"overall_recommendation": {
|
||||
"options": ["Strong Hire", "Hire", "No Hire", "Strong No Hire"],
|
||||
"criteria": "Based on weighted average and minimum thresholds"
|
||||
},
|
||||
"calibration_notes": {
|
||||
"required": True,
|
||||
"min_length": 100,
|
||||
"sections": ["strengths", "areas_for_development", "specific_examples"]
|
||||
}
|
||||
}
|
||||
|
||||
def _define_interviewer_requirements(self, rounds: Dict[str, Dict]) -> Dict[str, Dict]:
|
||||
"""Define interviewer skill requirements for each round."""
|
||||
requirements = {}
|
||||
|
||||
for round_name, round_info in rounds.items():
|
||||
round_type = round_name.split("_", 2)[-1] # Extract round type
|
||||
|
||||
if round_type in self.interviewer_skills:
|
||||
skill_req = self.interviewer_skills[round_type].copy()
|
||||
skill_req["suggested_interviewers"] = self._suggest_interviewer_profiles(round_type)
|
||||
requirements[round_name] = skill_req
|
||||
else:
|
||||
# Default requirements
|
||||
requirements[round_name] = {
|
||||
"required_skills": ["interviewing_basics", "evaluation_skills"],
|
||||
"preferred_experience": ["relevant_domain"],
|
||||
"calibration_level": "standard",
|
||||
"suggested_interviewers": ["experienced_interviewer"]
|
||||
}
|
||||
|
||||
return requirements
|
||||
|
||||
def _suggest_interviewer_profiles(self, round_type: str) -> List[str]:
|
||||
"""Suggest specific interviewer profiles for different round types."""
|
||||
profile_mapping = {
|
||||
"technical_phone_screen": ["senior_engineer", "tech_lead"],
|
||||
"coding_deep_dive": ["senior_engineer", "staff_engineer"],
|
||||
"system_design": ["senior_architect", "staff_engineer"],
|
||||
"behavioral": ["hiring_manager", "people_manager"],
|
||||
"technical_leadership": ["engineering_manager", "senior_staff"],
|
||||
"product_sense": ["senior_pm", "product_leader"],
|
||||
"analytical_thinking": ["senior_analyst", "data_scientist"],
|
||||
"design_challenge": ["senior_designer", "design_manager"]
|
||||
}
|
||||
|
||||
return profile_mapping.get(round_type, ["experienced_interviewer"])
|
||||
|
||||
def _generate_calibration_notes(self, role_key: str, level_key: str) -> Dict[str, Any]:
|
||||
"""Generate calibration notes and best practices."""
|
||||
return {
|
||||
"hiring_bar_notes": f"Calibrated for {level_key} level {role_key.replace('_', ' ')} role",
|
||||
"common_pitfalls": [
|
||||
"Avoid comparing candidates to each other rather than to the role standard",
|
||||
"Don't let one strong/weak area overshadow overall assessment",
|
||||
"Ensure consistent application of evaluation criteria"
|
||||
],
|
||||
"calibration_checkpoints": [
|
||||
"Review score distribution after every 5 candidates",
|
||||
"Conduct monthly interviewer calibration sessions",
|
||||
"Track correlation with 6-month performance reviews"
|
||||
],
|
||||
"escalation_criteria": [
|
||||
"Any candidate receiving all 4s or all 1s",
|
||||
"Significant disagreement between interviewers (>1.5 point spread)",
|
||||
"Unusual circumstances or accommodations needed"
|
||||
]
|
||||
}
|
||||
|
||||
def _generate_logistics_notes(self, rounds: List[Tuple[str, Dict]]) -> List[str]:
|
||||
"""Generate logistics and coordination notes."""
|
||||
notes = [
|
||||
"Coordinate interviewer availability before scheduling",
|
||||
"Ensure all interviewers have access to job description and competency requirements",
|
||||
"Prepare interview rooms/virtual links for all rounds",
|
||||
"Share candidate resume and application with all interviewers"
|
||||
]
|
||||
|
||||
# Add format-specific notes
|
||||
formats_used = {round_info["format"] for _, round_info in rounds}
|
||||
|
||||
if "virtual" in formats_used:
|
||||
notes.append("Test video conferencing setup before virtual interviews")
|
||||
notes.append("Share virtual meeting links with candidate 24 hours in advance")
|
||||
|
||||
if "collaborative_whiteboard" in formats_used:
|
||||
notes.append("Prepare whiteboard or collaborative online tool for design sessions")
|
||||
|
||||
if "hands_on_design" in formats_used:
|
||||
notes.append("Provide design tools access or ensure candidate can screen share their preferred tools")
|
||||
|
||||
return notes
|
||||
|
||||
|
||||
def format_human_readable(loop_data: Dict[str, Any]) -> str:
|
||||
"""Format the interview loop data in a human-readable format."""
|
||||
output = []
|
||||
|
||||
# Header
|
||||
output.append(f"Interview Loop Design for {loop_data['role']} ({loop_data['level'].title()} Level)")
|
||||
output.append("=" * 60)
|
||||
|
||||
if loop_data.get('team'):
|
||||
output.append(f"Team: {loop_data['team']}")
|
||||
|
||||
output.append(f"Generated: {loop_data['generated_at']}")
|
||||
output.append(f"Total Duration: {loop_data['total_duration_minutes']} minutes ({loop_data['total_duration_minutes']//60}h {loop_data['total_duration_minutes']%60}m)")
|
||||
output.append(f"Total Rounds: {loop_data['total_rounds']}")
|
||||
output.append("")
|
||||
|
||||
# Interview Rounds
|
||||
output.append("INTERVIEW ROUNDS")
|
||||
output.append("-" * 40)
|
||||
|
||||
sorted_rounds = sorted(loop_data['rounds'].items(), key=lambda x: x[1]['order'])
|
||||
for round_name, round_info in sorted_rounds:
|
||||
output.append(f"\nRound {round_info['order']}: {round_info['name']}")
|
||||
output.append(f"Duration: {round_info['duration_minutes']} minutes")
|
||||
output.append(f"Format: {round_info['format'].replace('_', ' ').title()}")
|
||||
|
||||
output.append("Objectives:")
|
||||
for obj in round_info['objectives']:
|
||||
output.append(f" • {obj}")
|
||||
|
||||
output.append("Focus Areas:")
|
||||
for area in round_info['focus_areas']:
|
||||
output.append(f" • {area.replace('_', ' ').title()}")
|
||||
|
||||
# Suggested Schedule
|
||||
output.append("\nSUGGESTED SCHEDULE")
|
||||
output.append("-" * 40)
|
||||
|
||||
schedule = loop_data['suggested_schedule']
|
||||
output.append(f"Schedule Type: {schedule['type'].replace('_', ' ').title()}")
|
||||
|
||||
for day_name, day_info in schedule['day_structure'].items():
|
||||
output.append(f"\n{day_name.replace('_', ' ').title()}:")
|
||||
output.append(f"Time: {day_info['start_time']} - {day_info['end_time']}")
|
||||
|
||||
for item in day_info['rounds']:
|
||||
if item['type'] == 'interview':
|
||||
output.append(f" {item['start_time']}-{item['end_time']}: {item['title']} ({item['duration_minutes']}min)")
|
||||
else:
|
||||
output.append(f" {item['start_time']}-{item['end_time']}: {item['type'].title()} ({item['duration_minutes']}min)")
|
||||
|
||||
# Interviewer Requirements
|
||||
output.append("\nINTERVIEWER REQUIREMENTS")
|
||||
output.append("-" * 40)
|
||||
|
||||
for round_name, requirements in loop_data['interviewer_requirements'].items():
|
||||
round_display = round_name.split("_", 2)[-1].replace("_", " ").title()
|
||||
output.append(f"\n{round_display}:")
|
||||
output.append(f"Required Skills: {', '.join(requirements['required_skills'])}")
|
||||
output.append(f"Suggested Interviewers: {', '.join(requirements['suggested_interviewers'])}")
|
||||
output.append(f"Calibration Level: {requirements['calibration_level'].title()}")
|
||||
|
||||
# Scorecard Overview
|
||||
output.append("\nSCORECARD TEMPLATE")
|
||||
output.append("-" * 40)
|
||||
|
||||
scorecard = loop_data['scorecard_template']
|
||||
output.append("Scoring Scale:")
|
||||
for score, description in scorecard['scoring_scale'].items():
|
||||
output.append(f" {score}: {description}")
|
||||
|
||||
output.append("\nEvaluation Dimensions:")
|
||||
for dim in scorecard['dimensions']:
|
||||
output.append(f" • {dim['dimension'].replace('_', ' ').title()} (Weight: {dim['weight']})")
|
||||
|
||||
# Calibration Notes
|
||||
output.append("\nCALIBRATION NOTES")
|
||||
output.append("-" * 40)
|
||||
|
||||
calibration = loop_data['calibration_notes']
|
||||
output.append(f"Hiring Bar: {calibration['hiring_bar_notes']}")
|
||||
|
||||
output.append("\nCommon Pitfalls:")
|
||||
for pitfall in calibration['common_pitfalls']:
|
||||
output.append(f" • {pitfall}")
|
||||
|
||||
return "\n".join(output)
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Generate calibrated interview loops for specific roles and levels")
|
||||
parser.add_argument("--role", type=str, help="Job role title (e.g., 'Senior Software Engineer')")
|
||||
parser.add_argument("--level", type=str, help="Experience level (junior, mid, senior, staff, principal)")
|
||||
parser.add_argument("--team", type=str, help="Team or department (optional)")
|
||||
parser.add_argument("--competencies", type=str, help="Comma-separated list of specific competencies to focus on")
|
||||
parser.add_argument("--input", type=str, help="Input JSON file with role definition")
|
||||
parser.add_argument("--output", type=str, help="Output directory or file path")
|
||||
parser.add_argument("--format", choices=["json", "text", "both"], default="both", help="Output format")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
designer = InterviewLoopDesigner()
|
||||
|
||||
# Handle input
|
||||
if args.input:
|
||||
try:
|
||||
with open(args.input, 'r') as f:
|
||||
role_data = json.load(f)
|
||||
role = role_data.get('role') or role_data.get('title', '')
|
||||
level = role_data.get('level', 'senior')
|
||||
team = role_data.get('team')
|
||||
competencies = role_data.get('competencies')
|
||||
except Exception as e:
|
||||
print(f"Error reading input file: {e}")
|
||||
sys.exit(1)
|
||||
else:
|
||||
if not args.role or not args.level:
|
||||
print("Error: --role and --level are required when not using --input")
|
||||
sys.exit(1)
|
||||
|
||||
role = args.role
|
||||
level = args.level
|
||||
team = args.team
|
||||
competencies = args.competencies.split(',') if args.competencies else None
|
||||
|
||||
# Generate interview loop
|
||||
try:
|
||||
loop_data = designer.generate_interview_loop(role, level, team, competencies)
|
||||
|
||||
# Handle output
|
||||
if args.output:
|
||||
output_path = args.output
|
||||
if os.path.isdir(output_path):
|
||||
safe_role = "".join(c for c in role.lower() if c.isalnum() or c in (' ', '-', '_')).replace(' ', '_')
|
||||
base_filename = f"{safe_role}_{level}_interview_loop"
|
||||
json_path = os.path.join(output_path, f"{base_filename}.json")
|
||||
text_path = os.path.join(output_path, f"{base_filename}.txt")
|
||||
else:
|
||||
# Use provided path as base
|
||||
json_path = output_path if output_path.endswith('.json') else f"{output_path}.json"
|
||||
text_path = output_path.replace('.json', '.txt') if output_path.endswith('.json') else f"{output_path}.txt"
|
||||
else:
|
||||
safe_role = "".join(c for c in role.lower() if c.isalnum() or c in (' ', '-', '_')).replace(' ', '_')
|
||||
base_filename = f"{safe_role}_{level}_interview_loop"
|
||||
json_path = f"{base_filename}.json"
|
||||
text_path = f"{base_filename}.txt"
|
||||
|
||||
# Write outputs
|
||||
if args.format in ["json", "both"]:
|
||||
with open(json_path, 'w') as f:
|
||||
json.dump(loop_data, f, indent=2, default=str)
|
||||
print(f"JSON output written to: {json_path}")
|
||||
|
||||
if args.format in ["text", "both"]:
|
||||
with open(text_path, 'w') as f:
|
||||
f.write(format_human_readable(loop_data))
|
||||
print(f"Text output written to: {text_path}")
|
||||
|
||||
# Always print summary to stdout
|
||||
print("\nInterview Loop Summary:")
|
||||
print(f"Role: {loop_data['role']} ({loop_data['level'].title()})")
|
||||
print(f"Total Duration: {loop_data['total_duration_minutes']} minutes")
|
||||
print(f"Number of Rounds: {loop_data['total_rounds']}")
|
||||
print(f"Schedule Type: {loop_data['suggested_schedule']['type'].replace('_', ' ').title()}")
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error generating interview loop: {e}")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
308
engineering/tech-debt-tracker/README.md
Normal file
308
engineering/tech-debt-tracker/README.md
Normal file
@@ -0,0 +1,308 @@
|
||||
# Tech Debt Tracker
|
||||
|
||||
A comprehensive technical debt management system that helps engineering teams identify, prioritize, and track technical debt across codebases. This skill provides three interconnected tools for a complete debt management workflow.
|
||||
|
||||
## Overview
|
||||
|
||||
Technical debt is like financial debt - it compounds over time and reduces team velocity if not managed systematically. This skill provides:
|
||||
|
||||
- **Automated Debt Detection**: Scan codebases to identify various types of technical debt
|
||||
- **Intelligent Prioritization**: Use proven frameworks to prioritize debt based on business impact
|
||||
- **Trend Analysis**: Track debt evolution over time with executive-friendly dashboards
|
||||
|
||||
## Tools
|
||||
|
||||
### 1. Debt Scanner (`debt_scanner.py`)
|
||||
|
||||
Scans codebases to automatically detect technical debt signals using AST parsing for Python and regex patterns for other languages.
|
||||
|
||||
**Features:**
|
||||
- Detects 15+ types of technical debt (large functions, complexity, duplicates, security issues, etc.)
|
||||
- Multi-language support (Python, JavaScript, Java, C#, Go, etc.)
|
||||
- Configurable thresholds and rules
|
||||
- Dual output: JSON for tools, human-readable for reports
|
||||
|
||||
**Usage:**
|
||||
```bash
|
||||
# Basic scan
|
||||
python scripts/debt_scanner.py /path/to/codebase
|
||||
|
||||
# With custom config and output
|
||||
python scripts/debt_scanner.py /path/to/codebase --config config.json --output report.json
|
||||
|
||||
# Different output formats
|
||||
python scripts/debt_scanner.py /path/to/codebase --format both
|
||||
```
|
||||
|
||||
### 2. Debt Prioritizer (`debt_prioritizer.py`)
|
||||
|
||||
Takes debt inventory and creates prioritized backlog using proven prioritization frameworks.
|
||||
|
||||
**Features:**
|
||||
- Multiple prioritization frameworks (Cost of Delay, WSJF, RICE)
|
||||
- Business impact analysis with ROI calculations
|
||||
- Sprint allocation recommendations
|
||||
- Effort estimation with risk adjustment
|
||||
- Executive and engineering reports
|
||||
|
||||
**Usage:**
|
||||
```bash
|
||||
# Basic prioritization
|
||||
python scripts/debt_prioritizer.py debt_inventory.json
|
||||
|
||||
# Custom framework and team size
|
||||
python scripts/debt_prioritizer.py inventory.json --framework wsjf --team-size 8
|
||||
|
||||
# Sprint capacity planning
|
||||
python scripts/debt_prioritizer.py inventory.json --sprint-capacity 80 --output backlog.json
|
||||
```
|
||||
|
||||
### 3. Debt Dashboard (`debt_dashboard.py`)
|
||||
|
||||
Analyzes historical debt data to provide trend analysis, health scoring, and executive reporting.
|
||||
|
||||
**Features:**
|
||||
- Health score trending over time
|
||||
- Debt velocity analysis (accumulation vs resolution)
|
||||
- Executive summary with business impact
|
||||
- Forecasting based on current trends
|
||||
- Strategic recommendations
|
||||
|
||||
**Usage:**
|
||||
```bash
|
||||
# Single directory of scans
|
||||
python scripts/debt_dashboard.py --input-dir ./debt_scans/
|
||||
|
||||
# Multiple specific files
|
||||
python scripts/debt_dashboard.py scan1.json scan2.json scan3.json
|
||||
|
||||
# Custom analysis period
|
||||
python scripts/debt_dashboard.py data.json --period quarterly --team-size 6
|
||||
```
|
||||
|
||||
## Quick Start
|
||||
|
||||
### 1. Scan Your Codebase
|
||||
|
||||
```bash
|
||||
# Scan your project
|
||||
python scripts/debt_scanner.py ~/my-project --output initial_scan.json
|
||||
|
||||
# Review the results
|
||||
python scripts/debt_scanner.py ~/my-project --format text
|
||||
```
|
||||
|
||||
### 2. Prioritize Your Debt
|
||||
|
||||
```bash
|
||||
# Create prioritized backlog
|
||||
python scripts/debt_prioritizer.py initial_scan.json --output backlog.json
|
||||
|
||||
# View sprint recommendations
|
||||
python scripts/debt_prioritizer.py initial_scan.json --format text
|
||||
```
|
||||
|
||||
### 3. Track Over Time
|
||||
|
||||
```bash
|
||||
# After multiple scans, analyze trends
|
||||
python scripts/debt_dashboard.py scan1.json scan2.json scan3.json --output dashboard.json
|
||||
|
||||
# Generate executive report
|
||||
python scripts/debt_dashboard.py --input-dir ./scans/ --format text
|
||||
```
|
||||
|
||||
## Configuration
|
||||
|
||||
### Scanner Configuration
|
||||
|
||||
Create `config.json` to customize detection rules:
|
||||
|
||||
```json
|
||||
{
|
||||
"max_function_length": 50,
|
||||
"max_complexity": 10,
|
||||
"max_nesting_depth": 4,
|
||||
"ignore_patterns": ["*.test.js", "build/", "node_modules/"],
|
||||
"file_extensions": {
|
||||
"python": [".py"],
|
||||
"javascript": [".js", ".jsx", ".ts", ".tsx"]
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Team Configuration
|
||||
|
||||
Adjust tools for your team size and sprint capacity:
|
||||
|
||||
```bash
|
||||
# 8-person team with 2-week sprints
|
||||
python scripts/debt_prioritizer.py inventory.json --team-size 8 --sprint-capacity 160
|
||||
```
|
||||
|
||||
## Sample Data
|
||||
|
||||
The `assets/` directory contains sample data for testing:
|
||||
|
||||
- `sample_codebase/`: Example codebase with various debt types
|
||||
- `sample_debt_inventory.json`: Example debt inventory
|
||||
- `historical_debt_*.json`: Sample historical data for trending
|
||||
|
||||
Try the tools on sample data:
|
||||
|
||||
```bash
|
||||
# Test scanner
|
||||
python scripts/debt_scanner.py assets/sample_codebase
|
||||
|
||||
# Test prioritizer
|
||||
python scripts/debt_prioritizer.py assets/sample_debt_inventory.json
|
||||
|
||||
# Test dashboard
|
||||
python scripts/debt_dashboard.py assets/historical_debt_*.json
|
||||
```
|
||||
|
||||
## Understanding the Output
|
||||
|
||||
### Health Score (0-100)
|
||||
|
||||
- **85-100**: Excellent - Minimal debt, sustainable practices
|
||||
- **70-84**: Good - Manageable debt level, some attention needed
|
||||
- **55-69**: Fair - Debt accumulating, requires focused effort
|
||||
- **40-54**: Poor - High debt level, impacts productivity
|
||||
- **0-39**: Critical - Immediate action required
|
||||
|
||||
### Priority Levels
|
||||
|
||||
- **Critical**: Security issues, blocking problems (fix immediately)
|
||||
- **High**: Significant impact on quality or velocity (next sprint)
|
||||
- **Medium**: Moderate impact, plan for upcoming work (next quarter)
|
||||
- **Low**: Minor issues, fix opportunistically (when convenient)
|
||||
|
||||
### Debt Categories
|
||||
|
||||
- **Code Quality**: Large functions, complexity, duplicates
|
||||
- **Architecture**: Design issues, coupling problems
|
||||
- **Security**: Vulnerabilities, hardcoded secrets
|
||||
- **Testing**: Missing tests, poor coverage
|
||||
- **Documentation**: Missing or outdated docs
|
||||
- **Dependencies**: Outdated packages, license issues
|
||||
|
||||
## Integration with Development Workflow
|
||||
|
||||
### CI/CD Integration
|
||||
|
||||
Add debt scanning to your CI pipeline:
|
||||
|
||||
```bash
|
||||
# In your CI script
|
||||
python scripts/debt_scanner.py . --output ci_scan.json
|
||||
# Compare with baseline, fail build if critical issues found
|
||||
```
|
||||
|
||||
### Sprint Planning
|
||||
|
||||
1. **Weekly**: Run scanner to detect new debt
|
||||
2. **Sprint Planning**: Use prioritizer for debt story sizing
|
||||
3. **Monthly**: Generate dashboard for trend analysis
|
||||
4. **Quarterly**: Executive review with strategic recommendations
|
||||
|
||||
### Code Review Integration
|
||||
|
||||
Use scanner output to focus code reviews:
|
||||
|
||||
```bash
|
||||
# Scan PR branch
|
||||
python scripts/debt_scanner.py . --output pr_scan.json
|
||||
|
||||
# Compare with main branch baseline
|
||||
# Focus review on areas with new debt
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
### Debt Management Strategy
|
||||
|
||||
1. **Prevention**: Use scanner in CI to catch debt early
|
||||
2. **Prioritization**: Always use business impact for priority
|
||||
3. **Allocation**: Reserve 15-20% sprint capacity for debt work
|
||||
4. **Measurement**: Track health score and velocity impact
|
||||
5. **Communication**: Use dashboard reports for stakeholders
|
||||
|
||||
### Common Pitfalls to Avoid
|
||||
|
||||
- **Analysis Paralysis**: Don't spend too long on perfect prioritization
|
||||
- **Technical Focus Only**: Always consider business impact
|
||||
- **Inconsistent Application**: Ensure all teams use same approach
|
||||
- **Ignoring Trends**: Pay attention to debt accumulation rate
|
||||
- **All-or-Nothing**: Incremental debt reduction is better than none
|
||||
|
||||
### Success Metrics
|
||||
|
||||
- **Health Score Improvement**: Target 5+ point quarterly improvement
|
||||
- **Velocity Impact**: Keep debt velocity impact below 20%
|
||||
- **Team Satisfaction**: Survey developers on code quality satisfaction
|
||||
- **Incident Reduction**: Track correlation between debt and production issues
|
||||
|
||||
## Advanced Usage
|
||||
|
||||
### Custom Debt Types
|
||||
|
||||
Extend the scanner for organization-specific debt patterns:
|
||||
|
||||
1. Add patterns to `config.json`
|
||||
2. Modify detection logic in scanner
|
||||
3. Update categorization in prioritizer
|
||||
|
||||
### Integration with External Tools
|
||||
|
||||
- **Jira/GitHub**: Import debt items as tickets
|
||||
- **SonarQube**: Combine with static analysis metrics
|
||||
- **APM Tools**: Correlate debt with performance metrics
|
||||
- **Chat Systems**: Send debt alerts to team channels
|
||||
|
||||
### Automated Reporting
|
||||
|
||||
Set up automated debt reporting:
|
||||
|
||||
```bash
|
||||
#!/bin/bash
|
||||
# Daily debt monitoring script
|
||||
python scripts/debt_scanner.py . --output daily_scan.json
|
||||
python scripts/debt_dashboard.py daily_scan.json --output daily_report.json
|
||||
# Send report to stakeholders
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Common Issues
|
||||
|
||||
**Scanner not finding files**: Check `ignore_patterns` in config
|
||||
**Prioritizer giving unexpected results**: Verify business impact scoring
|
||||
**Dashboard shows flat trends**: Need more historical data points
|
||||
|
||||
### Performance Tips
|
||||
|
||||
- Use `.gitignore` patterns to exclude irrelevant files
|
||||
- Limit scan depth for large monorepos
|
||||
- Run dashboard analysis on subset for faster iteration
|
||||
|
||||
### Getting Help
|
||||
|
||||
1. Check the `references/` directory for detailed documentation
|
||||
2. Review sample data and expected outputs
|
||||
3. Examine the tool source code for customization ideas
|
||||
|
||||
## Contributing
|
||||
|
||||
This skill is designed to be customized for your organization's needs:
|
||||
|
||||
1. **Add Detection Rules**: Extend scanner patterns for your tech stack
|
||||
2. **Custom Prioritization**: Modify scoring algorithms for your business context
|
||||
3. **New Report Formats**: Add output formats for your stakeholders
|
||||
4. **Integration Hooks**: Add connectors to your existing tools
|
||||
|
||||
The codebase is designed with extensibility in mind - each tool is modular and can be enhanced independently.
|
||||
|
||||
---
|
||||
|
||||
**Remember**: Technical debt management is a journey, not a destination. These tools help you make informed decisions about balancing new feature development with technical excellence. Start small, measure impact, and iterate based on what works for your team.
|
||||
New file (570 lines): engineering/tech-debt-tracker/SKILL.md
|
||||
# Tech Debt Tracker
|
||||
|
||||
**Tier**: POWERFUL 🔥
|
||||
**Category**: Engineering Process Automation
|
||||
**Expertise**: Code Quality, Technical Debt Management, Software Engineering
|
||||
|
||||
## Overview
|
||||
|
||||
Tech debt is one of the most insidious challenges in software development - it compounds over time, slowing down development velocity, increasing maintenance costs, and reducing code quality. This skill provides a comprehensive framework for identifying, analyzing, prioritizing, and tracking technical debt across codebases.
|
||||
|
||||
Tech debt isn't just about messy code - it encompasses architectural shortcuts, missing tests, outdated dependencies, documentation gaps, and infrastructure compromises. Like financial debt, it accrues "interest" through increased development time, higher bug rates, and reduced team velocity.
|
||||
|
||||
## What This Skill Provides
|
||||
|
||||
This skill offers three interconnected tools that form a complete tech debt management system:
|
||||
|
||||
1. **Debt Scanner** - Automatically identifies tech debt signals in your codebase
|
||||
2. **Debt Prioritizer** - Analyzes and prioritizes debt items using cost-of-delay frameworks
|
||||
3. **Debt Dashboard** - Tracks debt trends over time and provides executive reporting
|
||||
|
||||
Together, these tools enable engineering teams to make data-driven decisions about tech debt, balancing new feature development with maintenance work.
|
||||
|
||||
## Technical Debt Classification Framework
|
||||
|
||||
### 1. Code Debt
|
||||
Code-level issues that make the codebase harder to understand, modify, and maintain.
|
||||
|
||||
**Indicators:**
|
||||
- Long functions (>50 lines for complex logic, >20 for simple operations)
|
||||
- Deep nesting (>4 levels of indentation)
|
||||
- High cyclomatic complexity (>10)
|
||||
- Duplicate code patterns (>3 similar blocks)
|
||||
- Missing or inadequate error handling
|
||||
- Poor variable/function naming
|
||||
- Magic numbers and hardcoded values
|
||||
- Commented-out code blocks
|
||||
|
||||
**Impact:**
|
||||
- Increased debugging time
|
||||
- Higher defect rates
|
||||
- Slower feature development
|
||||
- Knowledge silos (only original author understands the code)
|
||||
|
||||
**Detection Methods:**
|
||||
- AST parsing for structural analysis
|
||||
- Pattern matching for common anti-patterns
|
||||
- Complexity metrics calculation
|
||||
- Duplicate code detection algorithms
|
||||
|
||||
### 2. Architecture Debt
|
||||
High-level design decisions that seemed reasonable at the time but now limit scalability or maintainability.
|
||||
|
||||
**Indicators:**
|
||||
- Monolithic components that should be modular
|
||||
- Circular dependencies between modules
|
||||
- Violation of separation of concerns
|
||||
- Inconsistent data flow patterns
|
||||
- Over-engineering or under-engineering for current scale
|
||||
- Tightly coupled components
|
||||
- Missing abstraction layers
|
||||
|
||||
**Impact:**
|
||||
- Difficult to scale individual components
|
||||
- Cascading changes required for simple modifications
|
||||
- Testing becomes complex and brittle
|
||||
- Onboarding new team members takes longer
|
||||
|
||||
**Detection Methods:**
|
||||
- Dependency analysis
|
||||
- Module coupling metrics
|
||||
- Component size analysis
|
||||
- Interface consistency checks
|
||||
|
||||
### 3. Test Debt
|
||||
Inadequate or missing test coverage, poor test quality, and testing infrastructure issues.
|
||||
|
||||
**Indicators:**
|
||||
- Low test coverage (<80% for critical paths)
|
||||
- Missing unit tests for complex logic
|
||||
- No integration tests for key workflows
|
||||
- Flaky tests that pass/fail intermittently
|
||||
- Slow test execution (>10 minutes for unit tests)
|
||||
- Tests that don't test meaningful behavior
|
||||
- Missing test data management strategy
|
||||
|
||||
**Impact:**
|
||||
- Fear of refactoring ("don't touch it, it works")
|
||||
- Regression bugs in production
|
||||
- Slow feedback cycles during development
|
||||
- Difficulty validating complex business logic
|
||||
|
||||
**Detection Methods:**
|
||||
- Coverage report analysis
|
||||
- Test execution time monitoring
|
||||
- Test failure pattern analysis
|
||||
- Test code quality assessment
|
||||
|
||||
### 4. Documentation Debt
|
||||
Missing, outdated, or poor-quality documentation that makes the system harder to understand and maintain.
|
||||
|
||||
**Indicators:**
|
||||
- Missing API documentation
|
||||
- Outdated README files
|
||||
- No architectural decision records (ADRs)
|
||||
- Missing code comments for complex algorithms
|
||||
- No onboarding documentation for new team members
|
||||
- Inconsistent documentation formats
|
||||
- Documentation that contradicts actual implementation
|
||||
|
||||
**Impact:**
|
||||
- Increased onboarding time for new team members
|
||||
- Knowledge loss when team members leave
|
||||
- Miscommunication between teams
|
||||
- Repeated questions in team channels
|
||||
|
||||
**Detection Methods:**
|
||||
- Documentation coverage analysis
|
||||
- Freshness checking (last modified dates)
|
||||
- Link validation
|
||||
- Comment density analysis
|
||||
|
||||
### 5. Dependency Debt
|
||||
Issues related to external libraries, frameworks, and system dependencies.
|
||||
|
||||
**Indicators:**
|
||||
- Outdated packages with known security vulnerabilities
|
||||
- Dependencies with incompatible licenses
|
||||
- Unused dependencies bloating the build
|
||||
- Version conflicts between packages
|
||||
- Deprecated APIs still in use
|
||||
- Heavy dependencies for simple tasks
|
||||
- Missing dependency pinning
|
||||
|
||||
**Impact:**
|
||||
- Security vulnerabilities
|
||||
- Build instability
|
||||
- Longer build times
|
||||
- Legal compliance issues
|
||||
- Difficulty upgrading core frameworks
|
||||
|
||||
**Detection Methods:**
|
||||
- Vulnerability scanning
|
||||
- License compliance checking
|
||||
- Usage analysis
|
||||
- Version compatibility checking
|
||||
|
||||
### 6. Infrastructure Debt
|
||||
Operations and deployment-related technical debt.
|
||||
|
||||
**Indicators:**
|
||||
- Manual deployment processes
|
||||
- Missing monitoring and alerting
|
||||
- Inadequate logging
|
||||
- No disaster recovery plan
|
||||
- Inconsistent environments (dev/staging/prod)
|
||||
- Missing CI/CD pipelines
|
||||
- Infrastructure as code gaps
|
||||
|
||||
**Impact:**
|
||||
- Deployment risks and downtime
|
||||
- Difficult troubleshooting
|
||||
- Inconsistent behavior across environments
|
||||
- Manual work that should be automated
|
||||
|
||||
**Detection Methods:**
|
||||
- Infrastructure audit checklists
|
||||
- Configuration drift detection
|
||||
- Monitoring coverage analysis
|
||||
- Deployment process documentation review
|
||||
|
||||
## Severity Scoring Framework
|
||||
|
||||
Each piece of tech debt is scored on multiple dimensions to determine overall severity:
|
||||
|
||||
### Impact Assessment (1-10 scale)
|
||||
|
||||
**Development Velocity Impact**
|
||||
- 1-2: Negligible impact on development speed
|
||||
- 3-4: Minor slowdown, workarounds available
|
||||
- 5-6: Moderate impact, affects some features
|
||||
- 7-8: Significant slowdown, affects most work
|
||||
- 9-10: Critical blocker, prevents new development
|
||||
|
||||
**Quality Impact**
|
||||
- 1-2: No impact on defect rates
|
||||
- 3-4: Minor increase in minor bugs
|
||||
- 5-6: Moderate increase in defects
|
||||
- 7-8: Regular production issues
|
||||
- 9-10: Critical reliability problems
|
||||
|
||||
**Team Productivity Impact**
|
||||
- 1-2: No impact on team morale or efficiency
|
||||
- 3-4: Occasional frustration
|
||||
- 5-6: Regular complaints from developers
|
||||
- 7-8: Team actively avoiding the area
|
||||
- 9-10: Causing developer turnover
|
||||
|
||||
**Business Impact**
|
||||
- 1-2: No customer-facing impact
|
||||
- 3-4: Minor UX degradation
|
||||
- 5-6: Moderate performance impact
|
||||
- 7-8: Customer complaints or churn
|
||||
- 9-10: Revenue-impacting issues
|
||||
|
||||
### Effort Assessment
|
||||
|
||||
**Size (Story Points or Hours)**
|
||||
- XS (1-4 hours): Simple refactor or documentation update
|
||||
- S (1-2 days): Minor architectural change
|
||||
- M (3-5 days): Moderate refactoring effort
|
||||
- L (1-2 weeks): Major component restructuring
|
||||
- XL (3+ weeks): System-wide architectural changes
|
||||
|
||||
**Risk Level**
|
||||
- Low: Well-understood change with clear scope
|
||||
- Medium: Some unknowns but manageable
|
||||
- High: Significant unknowns, potential for scope creep
|
||||
|
||||
**Skill Requirements**
|
||||
- Junior: Can be handled by any team member
|
||||
- Mid: Requires experienced developer
|
||||
- Senior: Needs architectural expertise
|
||||
- Expert: Requires deep system knowledge
|
||||
|
||||
## Interest Rate Calculation
|
||||
|
||||
Technical debt accrues "interest" - the additional cost of leaving it unfixed. This interest rate helps prioritize which debt to pay down first.
|
||||
|
||||
### Interest Rate Formula
|
||||
|
||||
```
|
||||
Interest Rate = (Impact Score × Frequency of Encounter) / Time Period
|
||||
```
|
||||
|
||||
Where:
|
||||
- **Impact Score**: Average severity score (1-10)
|
||||
- **Frequency of Encounter**: How often developers interact with this code
|
||||
- **Time Period**: Usually measured per sprint or month
|
||||
|
||||
### Cost of Delay Calculation
|
||||
|
||||
```
|
||||
Cost of Delay = Interest Rate × Time Until Fix × Team Size Multiplier
|
||||
```
|
||||
|
||||
### Example Calculation
|
||||
|
||||
**Scenario**: Legacy authentication module with poor error handling
|
||||
|
||||
- Impact Score: 7 (causes regular production issues)
|
||||
- Frequency: 15 encounters per sprint (3 developers × 5 times each)
|
||||
- Team Size: 8 developers
|
||||
- Current sprint: 1, planned fix: sprint 4
|
||||
|
||||
```
Interest Rate = (7 × 15) / 1 sprint = 105 points per sprint
Cost of Delay = 105 points/sprint × 3 sprints × 1.2 = 378 total cost points
```
|
||||
|
||||
This debt item should be prioritized over lower-cost items.
|
||||
|
||||
## Debt Inventory Management
|
||||
|
||||
### Data Structure
|
||||
|
||||
Each debt item is tracked with the following attributes:
|
||||
|
||||
```json
|
||||
{
|
||||
"id": "DEBT-2024-001",
|
||||
"title": "Legacy user authentication module",
|
||||
"category": "code",
|
||||
"subcategory": "error_handling",
|
||||
"location": "src/auth/legacy_auth.py:45-120",
|
||||
"description": "Authentication error handling uses generic exceptions",
|
||||
"impact": {
|
||||
"velocity": 7,
|
||||
"quality": 8,
|
||||
"productivity": 6,
|
||||
"business": 5
|
||||
},
|
||||
"effort": {
|
||||
"size": "M",
|
||||
"risk": "medium",
|
||||
"skill_required": "mid"
|
||||
},
|
||||
"interest_rate": 105,
|
||||
"cost_of_delay": 378,
|
||||
"priority": "high",
|
||||
"created_date": "2024-01-15",
|
||||
"last_updated": "2024-01-20",
|
||||
"assigned_to": null,
|
||||
"status": "identified",
|
||||
"tags": ["security", "user-experience", "maintainability"]
|
||||
}
|
||||
```
|
||||
|
||||
### Status Lifecycle
|
||||
|
||||
1. **Identified** - Debt detected but not yet analyzed
|
||||
2. **Analyzed** - Impact and effort assessed
|
||||
3. **Prioritized** - Added to backlog with priority
|
||||
4. **Planned** - Assigned to specific sprint/release
|
||||
5. **In Progress** - Actively being worked on
|
||||
6. **Review** - Implementation complete, under review
|
||||
7. **Done** - Debt resolved and verified
|
||||
8. **Won't Fix** - Consciously decided not to address
|
||||
|
||||
## Prioritization Frameworks
|
||||
|
||||
### 1. Cost-of-Delay vs Effort Matrix
|
||||
|
||||
Plot debt items on a 2D matrix:
|
||||
- X-axis: Effort (XS to XL)
|
||||
- Y-axis: Cost of Delay (calculated value)
|
||||
|
||||
**Priority Quadrants:**
|
||||
- High Cost, Low Effort: **Immediate** (quick wins)
|
||||
- High Cost, High Effort: **Planned** (major initiatives)
|
||||
- Low Cost, Low Effort: **Opportunistic** (during related work)
|
||||
- Low Cost, High Effort: **Backlog** (consider for future)
|
||||
|
||||
### 2. Weighted Shortest Job First (WSJF)
|
||||
|
||||
```
|
||||
WSJF Score = (Business Value + Time Criticality + Risk Reduction) / Effort
|
||||
```
|
||||
|
||||
Where each component is scored 1-10:
|
||||
- **Business Value**: Direct impact on customer value
|
||||
- **Time Criticality**: How much value decreases over time
|
||||
- **Risk Reduction**: How much risk is mitigated by fixing this debt
|
||||
|
||||
### 3. Technical Debt Quadrant
|
||||
|
||||
Based on Martin Fowler's framework:
|
||||
|
||||
**Quadrant 1: Reckless & Deliberate**
|
||||
- "We don't have time for design"
|
||||
- Highest priority for remediation
|
||||
|
||||
**Quadrant 2: Prudent & Deliberate**
|
||||
- "We must ship now and deal with consequences"
|
||||
- Schedule for near-term resolution
|
||||
|
||||
**Quadrant 3: Reckless & Inadvertent**
|
||||
- "What's layering?"
|
||||
- Focus on education and process improvement
|
||||
|
||||
**Quadrant 4: Prudent & Inadvertent**
|
||||
- "Now we know how we should have done it"
|
||||
- Normal part of learning, lowest priority
|
||||
|
||||
## Refactoring Strategies
|
||||
|
||||
### 1. Strangler Fig Pattern
|
||||
Gradually replace old system by building new functionality around it.
|
||||
|
||||
**When to use:**
|
||||
- Large, monolithic systems
|
||||
- High-risk changes to critical paths
|
||||
- Long-term architectural migrations
|
||||
|
||||
**Implementation:**
|
||||
1. Identify boundaries for extraction
|
||||
2. Create abstraction layer
|
||||
3. Route new features to new implementation
|
||||
4. Gradually migrate existing features
|
||||
5. Remove old implementation
|
||||
|
||||
### 2. Branch by Abstraction
|
||||
Create abstraction layer to allow parallel implementations.
|
||||
|
||||
**When to use:**
|
||||
- Need to support old and new systems simultaneously
|
||||
- High-risk changes with rollback requirements
|
||||
- A/B testing infrastructure changes
|
||||
|
||||
**Implementation:**
|
||||
1. Create abstraction interface
|
||||
2. Implement abstraction for current system
|
||||
3. Replace direct calls with abstraction calls
|
||||
4. Implement new version behind same abstraction
|
||||
5. Switch implementations via configuration
|
||||
6. Remove old implementation
|
||||
|
||||
### 3. Feature Toggles
|
||||
Use configuration flags to control code execution.
|
||||
|
||||
**When to use:**
|
||||
- Gradual rollout of refactored components
|
||||
- Risk mitigation during large changes
|
||||
- Experimental refactoring approaches
|
||||
|
||||
**Implementation:**
|
||||
1. Identify decision points in code
|
||||
2. Add toggle checks at decision points
|
||||
3. Implement both old and new paths
|
||||
4. Test both paths thoroughly
|
||||
5. Gradually move toggle to new implementation
|
||||
6. Remove old path and toggle
|
||||
|
||||
### 4. Parallel Run
|
||||
Run old and new implementations simultaneously to verify correctness.
|
||||
|
||||
**When to use:**
|
||||
- Critical business logic changes
|
||||
- Data processing pipeline changes
|
||||
- Algorithm improvements
|
||||
|
||||
**Implementation:**
|
||||
1. Implement new version alongside old
|
||||
2. Run both versions with same inputs
|
||||
3. Compare outputs and log discrepancies
|
||||
4. Investigate and fix discrepancies
|
||||
5. Build confidence through parallel execution
|
||||
6. Switch to new implementation
|
||||
7. Remove old implementation
|
||||
|
||||
## Sprint Allocation Recommendations
|
||||
|
||||
### Debt-to-Feature Ratio
|
||||
|
||||
Maintain healthy balance between new features and debt reduction:
|
||||
|
||||
**Team Velocity < 70% of capacity:**
|
||||
- 60% tech debt, 40% features
|
||||
- Focus on removing major blockers
|
||||
|
||||
**Team Velocity 70-85% of capacity:**
|
||||
- 30% tech debt, 70% features
|
||||
- Balanced maintenance approach
|
||||
|
||||
**Team Velocity > 85% of capacity:**
|
||||
- 15% tech debt, 85% features
|
||||
- Opportunistic debt reduction only
|
||||
|
||||
### Sprint Planning Integration
|
||||
|
||||
**Story Point Allocation:**
|
||||
- Reserve 20% of sprint capacity for tech debt
|
||||
- Prioritize debt items with highest interest rates
|
||||
- Include "debt tax" in feature estimates when working in high-debt areas
|
||||
|
||||
**Debt Budget Tracking:**
|
||||
- Track debt points completed per sprint
|
||||
- Monitor debt interest rate trend
|
||||
- Alert when debt accumulation exceeds team's paydown rate
|
||||
|
||||
### Quarterly Planning
|
||||
|
||||
**Debt Initiatives:**
|
||||
- Identify 1-2 major debt themes per quarter
|
||||
- Allocate dedicated sprints for large-scale refactoring
|
||||
- Plan debt work around major feature releases
|
||||
|
||||
**Success Metrics:**
|
||||
- Debt interest rate reduction
|
||||
- Developer velocity improvements
|
||||
- Defect rate reduction
|
||||
- Code review cycle time improvement
|
||||
|
||||
## Stakeholder Reporting
|
||||
|
||||
### Executive Dashboard
|
||||
|
||||
**Key Metrics:**
|
||||
- Overall tech debt health score (0-100)
|
||||
- Debt trend direction (improving/declining)
|
||||
- Cost of delayed fixes (in development days)
|
||||
- High-risk debt items count
|
||||
|
||||
**Monthly Report Structure:**
|
||||
1. **Executive Summary** (3 bullet points)
|
||||
2. **Health Score Trend** (6-month view)
|
||||
3. **Top 3 Risk Items** (business impact focus)
|
||||
4. **Investment Recommendation** (resource allocation)
|
||||
5. **Success Stories** (debt reduced last month)
|
||||
|
||||
### Engineering Team Dashboard
|
||||
|
||||
**Daily Metrics:**
|
||||
- New debt items identified
|
||||
- Debt items resolved
|
||||
- Interest rate by team/component
|
||||
- Debt hotspots (most problematic areas)
|
||||
|
||||
**Sprint Reviews:**
|
||||
- Debt points completed vs. planned
|
||||
- Velocity impact from debt work
|
||||
- Newly discovered debt during feature work
|
||||
- Team sentiment on code quality
|
||||
|
||||
### Product Manager Reports
|
||||
|
||||
**Feature Impact Analysis:**
|
||||
- How debt affects feature development time
|
||||
- Quality risk assessment for upcoming features
|
||||
- Debt that blocks planned features
|
||||
- Recommendations for feature sequence planning
|
||||
|
||||
**Customer Impact Translation:**
|
||||
- Debt that affects performance
|
||||
- Debt that increases bug rates
|
||||
- Debt that limits feature flexibility
|
||||
- Investment required to maintain current quality
|
||||
|
||||
## Implementation Roadmap
|
||||
|
||||
### Phase 1: Foundation (Weeks 1-2)
|
||||
1. Set up debt scanning infrastructure
|
||||
2. Establish debt taxonomy and scoring criteria
|
||||
3. Scan initial codebase and create baseline inventory
|
||||
4. Train team on debt identification and reporting
|
||||
|
||||
### Phase 2: Process Integration (Weeks 3-4)
|
||||
1. Integrate debt tracking into sprint planning
|
||||
2. Establish debt budgets and allocation rules
|
||||
3. Create stakeholder reporting templates
|
||||
4. Set up automated debt scanning in CI/CD
|
||||
|
||||
### Phase 3: Optimization (Weeks 5-6)
|
||||
1. Refine scoring algorithms based on team feedback
|
||||
2. Implement trend analysis and predictive metrics
|
||||
3. Create specialized debt reduction initiatives
|
||||
4. Establish cross-team debt coordination processes
|
||||
|
||||
### Phase 4: Maturity (Ongoing)
|
||||
1. Continuous improvement of detection algorithms
|
||||
2. Advanced analytics and prediction models
|
||||
3. Integration with planning and project management tools
|
||||
4. Organization-wide debt management best practices
|
||||
|
||||
## Success Criteria
|
||||
|
||||
**Quantitative Metrics:**
|
||||
- 25% reduction in debt interest rate within 6 months
|
||||
- 15% improvement in development velocity
|
||||
- 30% reduction in production defects
|
||||
- 20% faster code review cycles
|
||||
|
||||
**Qualitative Metrics:**
|
||||
- Improved developer satisfaction scores
|
||||
- Reduced context switching during feature development
|
||||
- Faster onboarding for new team members
|
||||
- Better predictability in feature delivery timelines
|
||||
|
||||
## Common Pitfalls and How to Avoid Them
|
||||
|
||||
### 1. Analysis Paralysis
|
||||
**Problem**: Spending too much time analyzing debt instead of fixing it.
|
||||
**Solution**: Set time limits for analysis, use "good enough" scoring for most items.
|
||||
|
||||
### 2. Perfectionism
|
||||
**Problem**: Trying to eliminate all debt instead of managing it.
|
||||
**Solution**: Focus on high-impact debt, accept that some debt is acceptable.
|
||||
|
||||
### 3. Ignoring Business Context
|
||||
**Problem**: Prioritizing technical elegance over business value.
|
||||
**Solution**: Always tie debt work to business outcomes and customer impact.
|
||||
|
||||
### 4. Inconsistent Application
|
||||
**Problem**: Some teams adopt practices while others ignore them.
|
||||
**Solution**: Make debt tracking part of standard development workflow.
|
||||
|
||||
### 5. Tool Over-Engineering
|
||||
**Problem**: Building complex debt management systems that nobody uses.
|
||||
**Solution**: Start simple, iterate based on actual usage patterns.
|
||||
|
||||
Technical debt management is not just about writing better code - it's about creating sustainable development practices that balance short-term delivery pressure with long-term system health. Use these tools and frameworks to make informed decisions about when and how to invest in debt reduction.
|
||||
New file (268 lines): sample debt scanner output, baseline scan (JSON)
|
||||
{
|
||||
"scan_metadata": {
|
||||
"directory": "/project/src",
|
||||
"scan_date": "2024-01-15T09:00:00",
|
||||
"scanner_version": "1.0.0"
|
||||
},
|
||||
"summary": {
|
||||
"total_files_scanned": 25,
|
||||
"total_lines_scanned": 12543,
|
||||
"total_debt_items": 28,
|
||||
"health_score": 68.5,
|
||||
"debt_density": 1.12
|
||||
},
|
||||
"debt_items": [
|
||||
{
|
||||
"id": "DEBT-0001",
|
||||
"type": "large_function",
|
||||
"description": "create_user function in user_service.py is 89 lines long",
|
||||
"file_path": "src/user_service.py",
|
||||
"severity": "high",
|
||||
"detected_date": "2024-01-15T09:00:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0002",
|
||||
"type": "duplicate_code",
|
||||
"description": "Password validation logic duplicated in 3 locations",
|
||||
"file_path": "src/user_service.py",
|
||||
"severity": "medium",
|
||||
"detected_date": "2024-01-15T09:00:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0003",
|
||||
"type": "security_risk",
|
||||
"description": "Hardcoded API key in payment_processor.py",
|
||||
"file_path": "src/payment_processor.py",
|
||||
"severity": "critical",
|
||||
"detected_date": "2024-01-15T09:00:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0004",
|
||||
"type": "high_complexity",
|
||||
"description": "process_payment function has cyclomatic complexity of 24",
|
||||
"file_path": "src/payment_processor.py",
|
||||
"severity": "high",
|
||||
"detected_date": "2024-01-15T09:00:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0005",
|
||||
"type": "missing_docstring",
|
||||
"description": "PaymentProcessor class missing docstring",
|
||||
"file_path": "src/payment_processor.py",
|
||||
"severity": "low",
|
||||
"detected_date": "2024-01-15T09:00:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0006",
|
||||
"type": "todo_comment",
|
||||
"description": "TODO: Move this to configuration file",
|
||||
"file_path": "src/user_service.py",
|
||||
"severity": "low",
|
||||
"detected_date": "2024-01-15T09:00:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0007",
|
||||
"type": "empty_catch_blocks",
|
||||
"description": "Empty catch block in update_user method",
|
||||
"file_path": "src/user_service.py",
|
||||
"severity": "medium",
|
||||
"detected_date": "2024-01-15T09:00:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0008",
|
||||
"type": "magic_numbers",
|
||||
"description": "Magic number 1800 used for lock timeout",
|
||||
"file_path": "src/user_service.py",
|
||||
"severity": "low",
|
||||
"detected_date": "2024-01-15T09:00:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0009",
|
||||
"type": "deep_nesting",
|
||||
"description": "Deep nesting detected: 6 levels in preferences handling",
|
||||
"file_path": "src/frontend.js",
|
||||
"severity": "medium",
|
||||
"detected_date": "2024-01-15T09:00:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0010",
|
||||
"type": "long_line",
|
||||
"description": "Line too long: 156 characters",
|
||||
"file_path": "src/frontend.js",
|
||||
"severity": "low",
|
||||
"detected_date": "2024-01-15T09:00:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0011",
|
||||
"type": "commented_code",
|
||||
"description": "Dead code left in comments",
|
||||
"file_path": "src/frontend.js",
|
||||
"severity": "low",
|
||||
"detected_date": "2024-01-15T09:00:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0012",
|
||||
"type": "global_variables",
|
||||
"description": "Global variable userCache should be encapsulated",
|
||||
"file_path": "src/frontend.js",
|
||||
"severity": "medium",
|
||||
"detected_date": "2024-01-15T09:00:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0013",
|
||||
"type": "synchronous_ajax",
|
||||
"description": "Synchronous AJAX call blocks UI thread",
|
||||
"file_path": "src/frontend.js",
|
||||
"severity": "high",
|
||||
"detected_date": "2024-01-15T09:00:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0014",
|
||||
"type": "hardcoded_values",
|
||||
"description": "Tax rates hardcoded in payment processing logic",
|
||||
"file_path": "src/payment_processor.py",
|
||||
"severity": "medium",
|
||||
"detected_date": "2024-01-15T09:00:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0015",
|
||||
"type": "no_error_handling",
|
||||
"description": "API calls without proper error handling",
|
||||
"file_path": "src/payment_processor.py",
|
||||
"severity": "high",
|
||||
"detected_date": "2024-01-15T09:00:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0016",
|
||||
"type": "inefficient_algorithm",
|
||||
"description": "O(n) user search could be optimized with indexing",
|
||||
"file_path": "src/user_service.py",
|
||||
"severity": "medium",
|
||||
"detected_date": "2024-01-15T09:00:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0017",
|
||||
"type": "memory_leak_risk",
|
||||
"description": "Event listeners attached without cleanup",
|
||||
"file_path": "src/frontend.js",
|
||||
"severity": "medium",
|
||||
"detected_date": "2024-01-15T09:00:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0018",
|
||||
"type": "sql_injection_risk",
|
||||
"description": "Potential SQL injection in user query",
|
||||
"file_path": "src/database.py",
|
||||
"severity": "critical",
|
||||
"detected_date": "2024-01-15T09:00:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0019",
|
||||
"type": "outdated_dependency",
|
||||
"description": "jQuery version 2.1.4 has known security vulnerabilities",
|
||||
"file_path": "package.json",
|
||||
"severity": "high",
|
||||
"detected_date": "2024-01-15T09:00:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0020",
|
||||
"type": "test_debt",
|
||||
"description": "No unit tests for critical payment processing logic",
|
||||
"file_path": "src/payment_processor.py",
|
||||
"severity": "high",
|
||||
"detected_date": "2024-01-15T09:00:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0021",
|
||||
"type": "large_class",
|
||||
"description": "UserService class has 15 methods",
|
||||
"file_path": "src/user_service.py",
|
||||
"severity": "medium",
|
||||
"detected_date": "2024-01-15T09:00:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0022",
|
||||
"type": "unused_imports",
|
||||
"description": "Unused import: sys",
|
||||
"file_path": "src/utils.py",
|
||||
"severity": "low",
|
||||
"detected_date": "2024-01-15T09:00:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0023",
|
||||
"type": "missing_type_hints",
|
||||
"description": "Function get_user_score missing type hints",
|
||||
"file_path": "src/user_service.py",
|
||||
"severity": "low",
|
||||
"detected_date": "2024-01-15T09:00:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0024",
|
||||
"type": "circular_dependency",
|
||||
"description": "Circular import between user_service and auth_service",
|
||||
"file_path": "src/user_service.py",
|
||||
"severity": "high",
|
||||
"detected_date": "2024-01-15T09:00:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0025",
|
||||
"type": "inconsistent_naming",
|
||||
"description": "Variable name userID should be user_id",
|
||||
"file_path": "src/auth.py",
|
||||
"severity": "low",
|
||||
"detected_date": "2024-01-15T09:00:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0026",
|
||||
"type": "broad_exception",
|
||||
"description": "Catching generic Exception instead of specific types",
|
||||
"file_path": "src/database.py",
|
||||
"severity": "medium",
|
||||
"detected_date": "2024-01-15T09:00:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0027",
|
||||
"type": "deprecated_api",
|
||||
"description": "Using deprecated datetime.utcnow() method",
|
||||
"file_path": "src/utils.py",
|
||||
"severity": "low",
|
||||
"detected_date": "2024-01-15T09:00:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0028",
|
||||
"type": "logging_issue",
|
||||
"description": "Using print() instead of proper logging",
|
||||
"file_path": "src/payment_processor.py",
|
||||
"severity": "low",
|
||||
"detected_date": "2024-01-15T09:00:00",
|
||||
"status": "identified"
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,214 @@
|
||||
{
|
||||
"scan_metadata": {
|
||||
"directory": "/project/src",
|
||||
"scan_date": "2024-02-01T14:30:00",
|
||||
"scanner_version": "1.0.0"
|
||||
},
|
||||
"summary": {
|
||||
"total_files_scanned": 27,
|
||||
"total_lines_scanned": 13421,
|
||||
"total_debt_items": 22,
|
||||
"health_score": 74.2,
|
||||
"debt_density": 0.81
|
||||
},
|
||||
"debt_items": [
|
||||
{
|
||||
"id": "DEBT-0001",
|
||||
"type": "large_function",
|
||||
"description": "create_user function in user_service.py is 89 lines long",
|
||||
"file_path": "src/user_service.py",
|
||||
"severity": "high",
|
||||
"detected_date": "2024-01-15T09:00:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0002",
|
||||
"type": "duplicate_code",
|
||||
"description": "Password validation logic duplicated in 3 locations",
|
||||
"file_path": "src/user_service.py",
|
||||
"severity": "medium",
|
||||
"detected_date": "2024-01-15T09:00:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0004",
|
||||
"type": "high_complexity",
|
||||
"description": "process_payment function has cyclomatic complexity of 24",
|
||||
"file_path": "src/payment_processor.py",
|
||||
"severity": "high",
|
||||
"detected_date": "2024-01-15T09:00:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0005",
|
||||
"type": "missing_docstring",
|
||||
"description": "PaymentProcessor class missing docstring",
|
||||
"file_path": "src/payment_processor.py",
|
||||
"severity": "low",
|
||||
"detected_date": "2024-01-15T09:00:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0007",
|
||||
"type": "empty_catch_blocks",
|
||||
"description": "Empty catch block in update_user method",
|
||||
"file_path": "src/user_service.py",
|
||||
"severity": "medium",
|
||||
"detected_date": "2024-01-15T09:00:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0009",
|
||||
"type": "deep_nesting",
|
||||
"description": "Deep nesting detected: 6 levels in preferences handling",
|
||||
"file_path": "src/frontend.js",
|
||||
"severity": "medium",
|
||||
"detected_date": "2024-01-15T09:00:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0010",
|
||||
"type": "long_line",
|
||||
"description": "Line too long: 156 characters",
|
||||
"file_path": "src/frontend.js",
|
||||
"severity": "low",
|
||||
"detected_date": "2024-01-15T09:00:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0011",
|
||||
"type": "commented_code",
|
||||
"description": "Dead code left in comments",
|
||||
"file_path": "src/frontend.js",
|
||||
"severity": "low",
|
||||
"detected_date": "2024-01-15T09:00:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0012",
|
||||
"type": "global_variables",
|
||||
"description": "Global variable userCache should be encapsulated",
|
||||
"file_path": "src/frontend.js",
|
||||
"severity": "medium",
|
||||
"detected_date": "2024-01-15T09:00:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0013",
|
||||
"type": "synchronous_ajax",
|
||||
"description": "Synchronous AJAX call blocks UI thread",
|
||||
"file_path": "src/frontend.js",
|
||||
"severity": "high",
|
||||
"detected_date": "2024-01-15T09:00:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0014",
|
||||
"type": "hardcoded_values",
|
||||
"description": "Tax rates hardcoded in payment processing logic",
|
||||
"file_path": "src/payment_processor.py",
|
||||
"severity": "medium",
|
||||
"detected_date": "2024-01-15T09:00:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0015",
|
||||
"type": "no_error_handling",
|
||||
"description": "API calls without proper error handling",
|
||||
"file_path": "src/payment_processor.py",
|
||||
"severity": "high",
|
||||
"detected_date": "2024-01-15T09:00:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0016",
|
||||
"type": "inefficient_algorithm",
|
||||
"description": "O(n) user search could be optimized with indexing",
|
||||
"file_path": "src/user_service.py",
|
||||
"severity": "medium",
|
||||
"detected_date": "2024-01-15T09:00:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0017",
|
||||
"type": "memory_leak_risk",
|
||||
"description": "Event listeners attached without cleanup",
|
||||
"file_path": "src/frontend.js",
|
||||
"severity": "medium",
|
||||
"detected_date": "2024-01-15T09:00:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0021",
|
||||
"type": "large_class",
|
||||
"description": "UserService class has 15 methods",
|
||||
"file_path": "src/user_service.py",
|
||||
"severity": "medium",
|
||||
"detected_date": "2024-01-15T09:00:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0024",
|
||||
"type": "circular_dependency",
|
||||
"description": "Circular import between user_service and auth_service",
|
||||
"file_path": "src/user_service.py",
|
||||
"severity": "high",
|
||||
"detected_date": "2024-01-15T09:00:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0026",
|
||||
"type": "broad_exception",
|
||||
"description": "Catching generic Exception instead of specific types",
|
||||
"file_path": "src/database.py",
|
||||
"severity": "medium",
|
||||
"detected_date": "2024-01-15T09:00:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0029",
|
||||
"type": "missing_validation",
|
||||
"description": "New API endpoint missing input validation",
|
||||
"file_path": "src/api.py",
|
||||
"severity": "high",
|
||||
"detected_date": "2024-02-01T14:30:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0030",
|
||||
"type": "performance_issue",
|
||||
"description": "N+1 query detected in user listing",
|
||||
"file_path": "src/user_service.py",
|
||||
"severity": "medium",
|
||||
"detected_date": "2024-02-01T14:30:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0031",
|
||||
"type": "css_debt",
|
||||
"description": "Inline styles should be moved to CSS files",
|
||||
"file_path": "templates/user_profile.html",
|
||||
"severity": "low",
|
||||
"detected_date": "2024-02-01T14:30:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0032",
|
||||
"type": "accessibility_issue",
|
||||
"description": "Missing alt text for images",
|
||||
"file_path": "templates/dashboard.html",
|
||||
"severity": "medium",
|
||||
"detected_date": "2024-02-01T14:30:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0033",
|
||||
"type": "configuration_debt",
|
||||
"description": "Environment-specific config hardcoded in application",
|
||||
"file_path": "src/config.py",
|
||||
"severity": "medium",
|
||||
"detected_date": "2024-02-01T14:30:00",
|
||||
"status": "identified"
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,395 @@
|
||||
// Frontend JavaScript with various technical debt examples
|
||||
|
||||
// TODO: Move configuration to separate file
const API_BASE_URL = "https://api.example.com";
const API_KEY = "abc123def456"; // FIXME: Should be in environment

// Global variables - should be encapsulated
// NOTE(review): userCache and authToken are read/written by UserInterface and
// by the DOMContentLoaded login handler below; any module split must keep
// them shared.
var userCache = {};
var authToken = null;
var currentUser = null;

// HACK: Polyfill for older browsers - should use proper build system
// Adds String.prototype.includes only when the host does not provide it.
if (!String.prototype.includes) {
    String.prototype.includes = function(search) {
        return this.indexOf(search) !== -1;
    };
}
|
||||
|
||||
/**
 * Monolithic UI controller: renders dashboards by string-concatenating HTML,
 * fetches data over synchronous XHR, and wires DOM event handlers inline.
 * NOTE(review): depends on the module-level globals userCache, authToken and
 * API_BASE_URL.
 */
class UserInterface {
    constructor() {
        this.components = {};
        this.eventHandlers = [];

        // Long parameter list in constructor
        this.init(document, window, localStorage, sessionStorage, navigator, history, location);
    }

    // Function with too many parameters
    /**
     * Stores the browser host objects on the instance, then applies the
     * persisted theme preference (if any) from localStorage to document.body.
     * NOTE(review): re-parses the same localStorage 'user' entry on every
     * nested check instead of parsing once.
     */
    init(doc, win, localStorage, sessionStorage, nav, hist, loc) {
        this.document = doc;
        this.window = win;
        this.localStorage = localStorage;
        this.sessionStorage = sessionStorage;
        this.navigator = nav;
        this.history = hist;
        this.location = loc;

        // Deep nesting example
        if (this.localStorage) {
            if (this.localStorage.getItem('user')) {
                if (JSON.parse(this.localStorage.getItem('user'))) {
                    if (JSON.parse(this.localStorage.getItem('user')).preferences) {
                        if (JSON.parse(this.localStorage.getItem('user')).preferences.theme) {
                            if (JSON.parse(this.localStorage.getItem('user')).preferences.theme === 'dark') {
                                document.body.classList.add('dark-theme');
                            } else if (JSON.parse(this.localStorage.getItem('user')).preferences.theme === 'light') {
                                document.body.classList.add('light-theme');
                            } else {
                                document.body.classList.add('default-theme');
                            }
                        }
                    }
                }
            }
        }
    }

    // Large function that does too many things
    /**
     * Builds the user dashboard HTML and injects it into #main-content.
     * Sections are opt-in via the boolean include* flags; includeSettings and
     * includeHelp are accepted but never read in the body.
     * Side effects: replaces #main-content's innerHTML and attaches a click
     * handler to every '.action-button' (no cleanup of prior handlers).
     * NOTE(review): user-supplied fields (name, avatar, activity text) are
     * concatenated into HTML without escaping — XSS risk if not sanitized
     * upstream; confirm with callers.
     */
    renderUserDashboard(userId, includeStats, includeRecent, includeNotifications, includeSettings, includeHelp) {
        let user = this.getUser(userId);

        if (!user) {
            console.log("User not found"); // Should use proper logging
            return;
        }

        let html = '<div class="dashboard">';

        // Inline HTML generation - should use templates
        html += '<header class="dashboard-header">';
        html += '<h1>Welcome, ' + user.name + '</h1>';
        html += '<div class="user-avatar">';
        html += '<img src="' + user.avatar + '" alt="Avatar" />';
        html += '</div>';
        html += '</header>';

        // Repeated validation pattern
        if (includeStats && includeStats === true) {
            html += '<section class="stats">';
            html += '<h2>Your Statistics</h2>';

            // Magic numbers everywhere
            if (user.loginCount > 100) {
                html += '<div class="stat-item">Frequent User (100+ logins)</div>';
            } else if (user.loginCount > 50) {
                html += '<div class="stat-item">Regular User (50+ logins)</div>';
            } else if (user.loginCount > 10) {
                html += '<div class="stat-item">Casual User (10+ logins)</div>';
            } else {
                html += '<div class="stat-item">New User</div>';
            }

            html += '</section>';
        }

        if (includeRecent && includeRecent === true) {
            html += '<section class="recent">';
            html += '<h2>Recent Activity</h2>';

            // No error handling for API calls
            let recentActivity = this.fetchRecentActivity(userId);

            if (recentActivity && recentActivity.length > 0) {
                html += '<ul class="activity-list">';
                for (let i = 0; i < recentActivity.length; i++) {
                    let activity = recentActivity[i];
                    html += '<li class="activity-item">';
                    html += '<span class="activity-type">' + activity.type + '</span>';
                    html += '<span class="activity-description">' + activity.description + '</span>';
                    html += '<span class="activity-time">' + this.formatTime(activity.timestamp) + '</span>';
                    html += '</li>';
                }
                html += '</ul>';
            } else {
                html += '<p>No recent activity</p>';
            }

            html += '</section>';
        }

        if (includeNotifications && includeNotifications === true) {
            html += '<section class="notifications">';
            html += '<h2>Notifications</h2>';

            let notifications = this.getNotifications(userId);

            // Duplicate HTML generation pattern
            if (notifications && notifications.length > 0) {
                html += '<ul class="notification-list">';
                for (let i = 0; i < notifications.length; i++) {
                    let notification = notifications[i];
                    html += '<li class="notification-item">';
                    html += '<span class="notification-title">' + notification.title + '</span>';
                    html += '<span class="notification-message">' + notification.message + '</span>';
                    html += '<span class="notification-time">' + this.formatTime(notification.timestamp) + '</span>';
                    html += '</li>';
                }
                html += '</ul>';
            } else {
                html += '<p>No notifications</p>';
            }

            html += '</section>';
        }

        html += '</div>';

        // Direct DOM manipulation without cleanup
        document.getElementById('main-content').innerHTML = html;

        // Event handler attachment without cleanup
        let buttons = document.querySelectorAll('.action-button');
        for (let i = 0; i < buttons.length; i++) {
            buttons[i].addEventListener('click', function(event) {
                // Nested event handlers - memory leak risk
                let buttonType = event.target.getAttribute('data-type');
                if (buttonType === 'edit') {
                    // Inline event handling - should be separate methods
                    let modal = document.createElement('div');
                    modal.className = 'modal';
                    modal.innerHTML = '<div class="modal-content"><h3>Edit Profile</h3><button onclick="closeModal()">Close</button></div>';
                    document.body.appendChild(modal);
                } else if (buttonType === 'delete') {
                    if (confirm('Are you sure?')) { // Using confirm - poor UX
                        // No error handling
                        fetch(API_BASE_URL + '/users/' + userId, {
                            method: 'DELETE',
                            headers: {'Authorization': 'Bearer ' + authToken}
                        });
                    }
                } else if (buttonType === 'share') {
                    // Hardcoded share logic
                    if (navigator.share) {
                        navigator.share({
                            title: 'Check out my profile',
                            url: window.location.href
                        });
                    } else {
                        // Fallback for browsers without Web Share API
                        let shareUrl = 'https://twitter.com/intent/tweet?url=' + encodeURIComponent(window.location.href);
                        window.open(shareUrl, '_blank');
                    }
                }
            });
        }
    }

    // Duplicate code - similar to above but for admin dashboard
    /**
     * Builds the admin dashboard (header + system statistics section) and
     * injects it into #main-content. Stats come from getSystemStats().
     */
    renderAdminDashboard(adminId) {
        let admin = this.getUser(adminId);

        if (!admin) {
            console.log("Admin not found");
            return;
        }

        let html = '<div class="admin-dashboard">';

        html += '<header class="dashboard-header">';
        html += '<h1>Admin Panel - Welcome, ' + admin.name + '</h1>';
        html += '<div class="user-avatar">';
        html += '<img src="' + admin.avatar + '" alt="Avatar" />';
        html += '</div>';
        html += '</header>';

        // Same pattern repeated
        html += '<section class="admin-stats">';
        html += '<h2>System Statistics</h2>';

        let stats = this.getSystemStats();
        if (stats) {
            html += '<div class="stat-grid">';
            html += '<div class="stat-item">Total Users: ' + stats.totalUsers + '</div>';
            html += '<div class="stat-item">Active Users: ' + stats.activeUsers + '</div>';
            html += '<div class="stat-item">New Today: ' + stats.newToday + '</div>';
            html += '</div>';
        }

        html += '</section>';
        html += '</div>';

        document.getElementById('main-content').innerHTML = html;
    }

    /**
     * Returns the user object for userId via the global userCache, falling
     * back to a blocking (synchronous) GET. Returns null on non-200.
     * NOTE(review): cache entries never expire and are never invalidated.
     */
    getUser(userId) {
        // Check cache first - but cache never expires
        if (userCache[userId]) {
            return userCache[userId];
        }

        // Synchronous AJAX - blocks UI
        let xhr = new XMLHttpRequest();
        xhr.open('GET', API_BASE_URL + '/users/' + userId, false);
        xhr.setRequestHeader('Authorization', 'Bearer ' + authToken);
        xhr.send();

        if (xhr.status === 200) {
            let user = JSON.parse(xhr.responseText);
            userCache[userId] = user;
            return user;
        } else {
            // Generic error handling
            console.error('Failed to fetch user');
            return null;
        }
    }

    /**
     * Synchronously fetches the user's recent activity list.
     * Returns [] on non-200 responses and on any thrown error.
     */
    fetchRecentActivity(userId) {
        // Another synchronous call
        try {
            let xhr = new XMLHttpRequest();
            xhr.open('GET', API_BASE_URL + '/users/' + userId + '/activity', false);
            xhr.setRequestHeader('Authorization', 'Bearer ' + authToken);
            xhr.send();

            if (xhr.status === 200) {
                return JSON.parse(xhr.responseText);
            } else {
                return [];
            }
        } catch (error) {
            // Swallowing errors
            return [];
        }
    }

    /**
     * Synchronously fetches the user's notifications; [] on non-200.
     * NOTE(review): unlike fetchRecentActivity, a thrown error here is NOT
     * caught and will propagate to the caller.
     */
    getNotifications(userId) {
        // Yet another sync call - should be async
        let xhr = new XMLHttpRequest();
        xhr.open('GET', API_BASE_URL + '/users/' + userId + '/notifications', false);
        xhr.setRequestHeader('Authorization', 'Bearer ' + authToken);
        xhr.send();

        if (xhr.status === 200) {
            return JSON.parse(xhr.responseText);
        } else {
            return [];
        }
    }

    /**
     * Formats a timestamp as M/D/YYYY.
     * BUG(review): Date.prototype.getMonth() is zero-based, so January
     * renders as "0/..." — should be date.getMonth() + 1.
     */
    formatTime(timestamp) {
        // Basic time formatting - should use proper library
        let date = new Date(timestamp);
        return date.getMonth() + '/' + date.getDate() + '/' + date.getFullYear();
    }

    // XXX: This method is never used
    /** Formats amount with a $/€ prefix for USD/EUR, otherwise a suffix. */
    formatCurrency(amount, currency) {
        if (currency === 'USD') {
            return '$' + amount.toFixed(2);
        } else if (currency === 'EUR') {
            return '€' + amount.toFixed(2);
        } else {
            return amount.toFixed(2) + ' ' + currency;
        }
    }

    /** Returns fixed placeholder system stats (not fetched from any API). */
    getSystemStats() {
        // Hardcoded test data - should come from API
        return {
            totalUsers: 12534,
            activeUsers: 8765,
            newToday: 23
        };
    }
}
|
||||
|
||||
// Global functions - should be methods or modules
|
||||
/**
 * Removes the currently open '.modal' element, if any.
 * Fix: document.querySelector returns null when no modal exists, and the
 * original unconditional null.remove() threw a TypeError (e.g. on a double
 * click of the Close button). Now a no-op when no modal is present.
 */
function closeModal() {
    let modal = document.querySelector('.modal');
    if (modal) {
        modal.remove();
    }
}
|
||||
|
||||
/**
 * Loose email shape check: exactly one '@', no whitespace, and at least one
 * '.' in the domain part. Not RFC-complete by design.
 */
function validateEmail(email) {
    const EMAIL_SHAPE = /^[^\s@]+@[^\s@]+\.[^\s@]+$/;
    return EMAIL_SHAPE.test(email);
}
|
||||
|
||||
/**
 * Mirrors the backend password rules: minimum 8 chars, at least one
 * uppercase letter, one lowercase letter, and one digit.
 * Returns true only when every rule passes.
 */
function validatePassword(password) {
    const rules = [
        (p) => p.length >= 8,
        (p) => /[A-Z]/.test(p),
        (p) => /[a-z]/.test(p),
        (p) => /\d/.test(p),
    ];
    return rules.every((rule) => rule(password));
}
|
||||
|
||||
// jQuery-style utility - reinventing the wheel
|
||||
// jQuery-style shorthand: first element matching selector, or null.
function $(selector) {
    return document.querySelector(selector);
}

// Shorthand for querySelectorAll: static NodeList of all matches (may be empty).
function $all(selector) {
    return document.querySelectorAll(selector);
}
|
||||
|
||||
// Global event handlers - should be encapsulated
|
||||
// Global event handlers - should be encapsulated
// Bootstraps the UI on page load and installs a delegated click handler that
// implements the login flow for '.login-button' clicks.
document.addEventListener('DOMContentLoaded', function() {
    // Inline anonymous function
    let ui = new UserInterface();

    // Event delegation would be better
    document.body.addEventListener('click', function(event) {
        if (event.target.classList.contains('login-button')) {
            // Inline login logic
            let username = $('#username').value;
            let password = $('#password').value;

            if (!username || !password) {
                alert('Please enter username and password'); // Poor UX
                return;
            }

            // No CSRF protection
            // POSTs credentials, then persists the returned token/user into
            // the module globals and localStorage before reloading the page.
            fetch(API_BASE_URL + '/auth/login', {
                method: 'POST',
                headers: {'Content-Type': 'application/json'},
                body: JSON.stringify({username: username, password: password})
            })
            .then(response => response.json())
            .then(data => {
                if (data.success) {
                    authToken = data.token;
                    currentUser = data.user;
                    localStorage.setItem('authToken', authToken); // Storing sensitive data
                    localStorage.setItem('currentUser', JSON.stringify(currentUser));
                    window.location.reload(); // Poor navigation
                } else {
                    alert('Login failed: ' + data.error);
                }
            })
            .catch(error => {
                console.error('Login error:', error);
                alert('Login failed');
            });
        }
    });
});
|
||||
|
||||
// // Old code left as comments - should be removed
|
||||
// function oldRenderFunction() {
|
||||
// var html = '<div>Old implementation</div>';
|
||||
// document.body.innerHTML = html;
|
||||
// }
|
||||
|
||||
// Commented out feature - should be removed or implemented
|
||||
// function darkModeToggle() {
|
||||
// if (document.body.classList.contains('dark-theme')) {
|
||||
// document.body.classList.remove('dark-theme');
|
||||
// document.body.classList.add('light-theme');
|
||||
// } else {
|
||||
// document.body.classList.remove('light-theme');
|
||||
// document.body.classList.add('dark-theme');
|
||||
// }
|
||||
// }
|
||||
@@ -0,0 +1,315 @@
|
||||
"""
|
||||
Payment processing module - contains various technical debt examples
|
||||
"""
|
||||
|
||||
import json
|
||||
import time
|
||||
import requests
|
||||
from decimal import Decimal
|
||||
from typing import Dict, Any
|
||||
|
||||
|
||||
class PaymentProcessor:
    """Routes card/PayPal payments to Stripe, Square, or PayPal HTTP APIs.

    Holds provider API keys (currently hardcoded) and implements payment
    processing, a stubbed refund/lookup, and basic card validation.
    """

    def __init__(self):
        # TODO: These should come from environment or config
        self.stripe_key = "sk_test_1234567890"
        self.paypal_key = "paypal_secret_key_here"
        self.square_key = "square_api_key"

    def process_payment(self, amount, currency, payment_method, customer_data, billing_address, shipping_address, items, discount_code, tax_rate, processing_fee, metadata):
        """
        Process a payment - this function is too large and complex.

        Validates inputs, computes tax and discount, then routes to the
        provider given by payment_method ("credit_card" via stripe/square,
        or "paypal").

        Returns a dict: {"success": True, "transaction": {...}} on success,
        or {"success": False, "error": "<message>"} on any failure.

        NOTE(review): shipping_address and metadata are accepted but never
        used in this body. tax_rate is only tested for truthiness — the
        actual rates applied are the hardcoded per-state/VAT constants below.
        A "credit_card" payment with an unrecognized provider falls through
        and implicitly returns None rather than an error dict.
        """

        # Input validation - should be extracted to separate function
        if not amount or amount <= 0:
            return {"success": False, "error": "Invalid amount"}

        if not currency:
            return {"success": False, "error": "Currency required"}

        if currency not in ["USD", "EUR", "GBP", "CAD", "AUD"]:  # Hardcoded list
            return {"success": False, "error": "Unsupported currency"}

        if not payment_method:
            return {"success": False, "error": "Payment method required"}

        if not customer_data or "email" not in customer_data:
            return {"success": False, "error": "Customer email required"}

        # Tax calculation - complex business logic that should be separate service
        # NOTE(review): for USD, tax is applied only when billing_address
        # carries a "state" key; CAD/AUD get no tax at all.
        tax_amount = 0
        if tax_rate:
            if currency == "USD":
                # US tax logic - hardcoded rules
                if billing_address and "state" in billing_address:
                    state = billing_address["state"]
                    if state == "CA":
                        tax_amount = amount * 0.08  # California tax
                    elif state == "NY":
                        tax_amount = amount * 0.085  # New York tax
                    elif state == "TX":
                        tax_amount = amount * 0.0625  # Texas tax
                    elif state == "FL":
                        tax_amount = amount * 0.06  # Florida tax
                    else:
                        tax_amount = amount * 0.05  # Default tax
            elif currency == "EUR":
                # EU VAT logic - also hardcoded
                tax_amount = amount * 0.20  # 20% VAT
            elif currency == "GBP":
                tax_amount = amount * 0.20  # UK VAT

        # Discount calculation - another complex block
        discount_amount = 0
        if discount_code:
            # FIXME: This should query a discount service
            if discount_code == "SAVE10":
                discount_amount = amount * 0.10
            elif discount_code == "SAVE20":
                discount_amount = amount * 0.20
            elif discount_code == "NEWUSER":
                discount_amount = min(50, amount * 0.25)  # Max $50 discount
            elif discount_code == "LOYALTY":
                # Complex loyalty discount logic
                customer_tier = customer_data.get("tier", "bronze")
                if customer_tier == "gold":
                    discount_amount = amount * 0.15
                elif customer_tier == "silver":
                    discount_amount = amount * 0.10
                elif customer_tier == "bronze":
                    discount_amount = amount * 0.05

        # Calculate final amount
        final_amount = amount - discount_amount + tax_amount + processing_fee

        # Payment method routing - should use strategy pattern
        if payment_method["type"] == "credit_card":
            # Credit card processing
            if payment_method["provider"] == "stripe":
                try:
                    # Stripe API call - no retry logic
                    response = requests.post(
                        "https://api.stripe.com/v1/charges",
                        headers={"Authorization": f"Bearer {self.stripe_key}"},
                        data={
                            "amount": int(final_amount * 100),  # Convert to cents
                            "currency": currency.lower(),
                            "source": payment_method["token"],
                            "description": f"Payment for {len(items)} items"
                        }
                    )

                    if response.status_code == 200:
                        stripe_response = response.json()
                        # Store transaction - should be in database
                        transaction = {
                            "id": stripe_response["id"],
                            "amount": final_amount,
                            "currency": currency,
                            "status": "completed",
                            "timestamp": time.time(),
                            "provider": "stripe",
                            "customer": customer_data["email"],
                            "items": items,
                            "tax_amount": tax_amount,
                            "discount_amount": discount_amount
                        }

                        # Send confirmation email - inline instead of separate service
                        self.send_payment_confirmation_email(customer_data["email"], transaction)

                        return {"success": True, "transaction": transaction}
                    else:
                        return {"success": False, "error": "Stripe payment failed"}

                except Exception as e:
                    # Broad exception handling - should be more specific
                    print(f"Stripe error: {e}")  # Should use proper logging
                    return {"success": False, "error": "Payment processing error"}

            elif payment_method["provider"] == "square":
                # Square processing - duplicate code structure
                try:
                    response = requests.post(
                        "https://connect.squareup.com/v2/payments",
                        headers={"Authorization": f"Bearer {self.square_key}"},
                        json={
                            "source_id": payment_method["token"],
                            "amount_money": {
                                "amount": int(final_amount * 100),
                                "currency": currency
                            }
                        }
                    )

                    if response.status_code == 200:
                        square_response = response.json()
                        transaction = {
                            "id": square_response["payment"]["id"],
                            "amount": final_amount,
                            "currency": currency,
                            "status": "completed",
                            "timestamp": time.time(),
                            "provider": "square",
                            "customer": customer_data["email"],
                            "items": items,
                            "tax_amount": tax_amount,
                            "discount_amount": discount_amount
                        }

                        self.send_payment_confirmation_email(customer_data["email"], transaction)

                        return {"success": True, "transaction": transaction}
                    else:
                        return {"success": False, "error": "Square payment failed"}

                except Exception as e:
                    print(f"Square error: {e}")
                    return {"success": False, "error": "Payment processing error"}

        elif payment_method["type"] == "paypal":
            # PayPal processing - more duplicate code
            try:
                response = requests.post(
                    "https://api.paypal.com/v2/checkout/orders",
                    headers={"Authorization": f"Bearer {self.paypal_key}"},
                    json={
                        "intent": "CAPTURE",
                        "purchase_units": [{
                            "amount": {
                                "currency_code": currency,
                                "value": str(final_amount)
                            }
                        }]
                    }
                )

                # PayPal's order-create endpoint returns 201 Created, not 200.
                if response.status_code == 201:
                    paypal_response = response.json()
                    transaction = {
                        "id": paypal_response["id"],
                        "amount": final_amount,
                        "currency": currency,
                        "status": "completed",
                        "timestamp": time.time(),
                        "provider": "paypal",
                        "customer": customer_data["email"],
                        "items": items,
                        "tax_amount": tax_amount,
                        "discount_amount": discount_amount
                    }

                    self.send_payment_confirmation_email(customer_data["email"], transaction)

                    return {"success": True, "transaction": transaction}
                else:
                    return {"success": False, "error": "PayPal payment failed"}

            except Exception as e:
                print(f"PayPal error: {e}")
                return {"success": False, "error": "Payment processing error"}

        else:
            return {"success": False, "error": "Unsupported payment method"}

    def send_payment_confirmation_email(self, email, transaction):
        """Log a confirmation "email" to stdout (no real email is sent)."""
        # Email sending logic - should be separate service
        # HACK: Using print instead of actual email service
        print(f"Sending confirmation email to {email}")
        print(f"Transaction ID: {transaction['id']}")
        print(f"Amount: {transaction['currency']} {transaction['amount']}")

        # TODO: Implement actual email sending
        pass

    def refund_payment(self, transaction_id, amount=None):
        """Stub refund: logs intent and always reports success.

        amount=None means a full refund; a value means a partial refund.
        """
        # Refund logic - incomplete implementation
        # TODO: Implement refund for different providers
        print(f"Refunding transaction {transaction_id}")
        if amount:
            print(f"Partial refund: {amount}")
        else:
            print("Full refund")

        # XXX: This doesn't actually process the refund
        return {"success": True, "message": "Refund initiated"}

    def get_transaction(self, transaction_id):
        """Stub lookup: echoes the id with status "unknown"."""
        # Should query database, but we don't have one
        # FIXME: Implement actual transaction lookup
        return {"id": transaction_id, "status": "unknown"}

    def validate_credit_card(self, card_number, expiry_month, expiry_year, cvv):
        """Return True iff the card passes Luhn, expiry, and CVV-length checks.

        NOTE(review): the Luhn pass silently skips non-digit characters in
        card_number but the initial length check counts them, so a card with
        separators (spaces/dashes) may be mis-measured.
        """
        # Basic card validation - should use proper validation library
        if not card_number or len(card_number) < 13 or len(card_number) > 19:
            return False

        # Luhn algorithm check - reimplemented poorly
        digits = [int(d) for d in card_number if d.isdigit()]
        checksum = 0
        for i, digit in enumerate(reversed(digits)):
            if i % 2 == 1:
                digit *= 2
                if digit > 9:
                    digit -= 9
            checksum += digit

        if checksum % 10 != 0:
            return False

        # Expiry validation
        if expiry_month < 1 or expiry_month > 12:
            return False

        current_year = int(time.strftime("%Y"))
        current_month = int(time.strftime("%m"))

        if expiry_year < current_year:
            return False
        elif expiry_year == current_year and expiry_month < current_month:
            return False

        # CVV validation
        if not cvv or len(cvv) < 3 or len(cvv) > 4:
            return False

        return True
|
||||
|
||||
|
||||
# Module-level functions that should be in class or separate module
|
||||
def calculate_processing_fee(amount, provider):
    """Calculate the provider's processing fee for *amount*.

    Rates are hardcoded per provider as (percentage, flat) pairs; an
    unrecognized provider incurs no fee (returns 0).
    """
    fee_schedule = {
        "stripe": (0.029, 0.30),  # Stripe rates
        "paypal": (0.031, 0.30),  # PayPal rates
        "square": (0.026, 0.10),  # Square rates
    }
    schedule = fee_schedule.get(provider)
    if schedule is None:
        return 0
    percentage, flat_fee = schedule
    return amount * percentage + flat_fee
|
||||
|
||||
|
||||
def format_currency(amount, currency):
    """Format *amount* for display in *currency* (basic implementation).

    Known currencies (USD/EUR/GBP) get a symbol prefix; anything else is
    rendered as "<CODE> <amount>". Should use proper internationalization.
    """
    known_symbols = {"USD": "$", "EUR": "€", "GBP": "£"}
    symbol = known_symbols.get(currency)
    if symbol is not None:
        return f"{symbol}{amount:.2f}"
    return f"{currency} {amount:.2f}"
|
||||
|
||||
|
||||
# Global state - anti-pattern
|
||||
# Module-level singleton holder for the shared PaymentProcessor.
payment_processor_instance = None


def get_payment_processor():
    """Return the lazily-created module-wide PaymentProcessor singleton.

    NOTE(review): no locking around the None check, so concurrent first
    calls from multiple threads could each construct an instance.
    """
    global payment_processor_instance
    if payment_processor_instance is None:
        payment_processor_instance = PaymentProcessor()
    return payment_processor_instance
|
||||
@@ -0,0 +1,276 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
User service module with various tech debt examples
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import time
|
||||
import re
|
||||
from typing import Dict, List, Any, Optional
|
||||
|
||||
# TODO: Move this to configuration file
|
||||
DATABASE_URL = "postgresql://user:password123@localhost:5432/mydb"
|
||||
API_KEY = "sk-1234567890abcdef" # FIXME: This should be in environment variables
|
||||
|
||||
|
||||
class UserService:
    """In-memory user store used as a technical-debt fixture.

    Records live in a plain dict keyed by a hash-derived string id; no
    real database is wired up (``db_connection`` stays None).  The
    anti-patterns in this class are intentional examples tracked by the
    accompanying debt inventory.
    """

    def __init__(self):
        # user_id -> user record dict; see create_user for the schema.
        self.users = {}
        # Placeholder cache dict (never read or written in this class).
        self.cache = {}
        # HACK: Using dict for now, should be proper database connection
        self.db_connection = None

    def create_user(self, name, email, password, age, phone, address, city, state, zip_code, country, preferences, notifications, billing_info):
        """Validate the inputs and store a new user record.

        Returns the new user id (a string) on success, or None when any
        required field is missing or fails validation.  Validation
        failures are printed, never raised.
        """
        # Function with too many parameters - should use User dataclass
        if not name:
            return None
        if not email:
            return None
        if not password:
            return None
        if not age:
            return None
        if not phone:
            return None
        if not address:
            return None
        if not city:
            return None
        if not state:
            return None
        if not zip_code:
            return None
        if not country:
            return None

        # Duplicate validation logic - should be extracted
        if age < 13:
            print("User must be at least 13 years old")
            return None
        if age > 150:
            print("Invalid age")
            return None

        # More validation
        if not self.validate_email(email):
            print("Invalid email format")
            return None

        # Password validation - duplicated elsewhere
        if len(password) < 8:
            print("Password too short")
            return None
        if not re.search(r"[A-Z]", password):
            print("Password must contain uppercase letter")
            return None
        if not re.search(r"[a-z]", password):
            print("Password must contain lowercase letter")
            return None
        if not re.search(r"\d", password):
            print("Password must contain digit")
            return None

        # Deep nesting example
        if preferences:
            if 'notifications' in preferences:
                if preferences['notifications']:
                    if 'email' in preferences['notifications']:
                        if preferences['notifications']['email']:
                            if 'frequency' in preferences['notifications']['email']:
                                if preferences['notifications']['email']['frequency'] == 'daily':
                                    print("Daily email notifications enabled")
                                elif preferences['notifications']['email']['frequency'] == 'weekly':
                                    print("Weekly email notifications enabled")
                                else:
                                    print("Invalid notification frequency")

        # TODO: Implement proper user ID generation
        # NOTE: hash() is salted per interpreter run, so these ids are not
        # stable across processes.
        user_id = str(hash(email))  # XXX: This is terrible for production

        # Magic numbers everywhere
        password_hash = hashlib.sha256((password + "salt123").encode()).hexdigest()

        user_data = {
            "id": user_id,
            "name": name,
            "email": email,
            "password_hash": password_hash,
            "age": age,
            "phone": phone,
            "address": address,
            "city": city,
            "state": state,
            "zip_code": zip_code,
            "country": country,
            "preferences": preferences,
            "notifications": notifications,
            "billing_info": billing_info,
            "created_at": time.time(),
            "updated_at": time.time(),
            "last_login": None,
            "login_count": 0,
            "is_active": True,
            "is_verified": False,
            "verification_token": None,
            "reset_token": None,
            "failed_login_attempts": 0,
            "locked_until": None,
            "subscription_level": "free",
            "credits": 100
        }

        self.users[user_id] = user_data
        return user_id

    def validate_email(self, email):
        """Very loose email check: non-empty and contains '@' and '.'."""
        # Duplicate validation logic - should be in utils
        if not email:
            return False
        if "@" not in email:
            return False
        if "." not in email:
            return False
        return True

    def authenticate_user(self, email, password):
        """Return the user id for a matching email/password pair, else None.

        On success, updates login bookkeeping on the matched record; after
        5 consecutive failures records a 30-minute lockout timestamp.
        NOTE: ``locked_until`` is written here but never checked, so a
        "locked" account can still authenticate.
        """
        # More duplicate validation
        if not email:
            return None
        if not password:
            return None

        # Linear search through users - O(n) complexity
        for user_id, user_data in self.users.items():
            if user_data["email"] == email:
                # Same password hashing logic duplicated
                password_hash = hashlib.sha256((password + "salt123").encode()).hexdigest()
                if user_data["password_hash"] == password_hash:
                    # Update login stats
                    user_data["last_login"] = time.time()
                    user_data["login_count"] += 1
                    user_data["failed_login_attempts"] = 0
                    return user_id
                else:
                    # Failed login handling
                    user_data["failed_login_attempts"] += 1
                    if user_data["failed_login_attempts"] >= 5:  # Magic number
                        user_data["locked_until"] = time.time() + 1800  # 30 minutes
                    return None
        return None

    def get_user(self, user_id):
        """Return the raw user record; raises KeyError for unknown ids."""
        # No error handling
        return self.users[user_id]

    def update_user(self, user_id, updates):
        """Apply *updates* onto a user record after partial validation.

        Returns True on success, False on a validation failure.  NOTE:
        the bare except swallows a failed lookup, so an unknown user_id
        leaves ``user`` unbound and the assignment loop below raises
        NameError instead.
        """
        try:
            # Empty catch block - bad practice
            user = self.users[user_id]
        except:
            pass

        # More validation duplication
        if "age" in updates:
            if updates["age"] < 13:
                print("User must be at least 13 years old")
                return False
            if updates["age"] > 150:
                print("Invalid age")
                return False

        if "email" in updates:
            if not self.validate_email(updates["email"]):
                print("Invalid email format")
                return False

        # Direct dictionary manipulation without validation
        for key, value in updates.items():
            user[key] = value

        user["updated_at"] = time.time()
        return True

    def delete_user(self, user_id):
        """Hard-delete the record; raises KeyError for unknown ids."""
        # print("Deleting user", user_id) # Commented out code
        # TODO: Implement soft delete instead
        del self.users[user_id]

    def search_users(self, query):
        """Return records whose name or email contains *query*
        (case-insensitive), or whose phone contains it (case-sensitive).
        """
        results = []
        # Inefficient search algorithm - O(n*m)
        for user_id, user_data in self.users.items():
            if query.lower() in user_data["name"].lower():
                results.append(user_data)
            elif query.lower() in user_data["email"].lower():
                results.append(user_data)
            elif query in user_data.get("phone", ""):
                results.append(user_data)
        return results

    def export_users(self):
        """Serialize every record — password hashes included — to JSON."""
        # Security risk - no access control
        return json.dumps(self.users, indent=2)

    def import_users(self, json_data):
        """Merge records parsed from *json_data* into the store, unvalidated."""
        # No validation of imported data
        imported_users = json.loads(json_data)
        self.users.update(imported_users)

    # def old_create_user(self, name, email):
    #     # Old implementation kept as comment
    #     return {"name": name, "email": email}

    def calculate_user_score(self, user_id):
        """Compute an engagement score from login count, subscription
        level, and age; raises KeyError for unknown ids.
        """
        user = self.users[user_id]
        score = 0

        # Complex scoring logic with magic numbers
        if user["login_count"] > 10:
            score += 50
        elif user["login_count"] > 5:
            score += 30
        elif user["login_count"] > 1:
            score += 10

        if user["subscription_level"] == "premium":
            score += 100
        elif user["subscription_level"] == "pro":
            score += 75
        elif user["subscription_level"] == "basic":
            score += 25

        # Age-based scoring with arbitrary rules
        if user["age"] >= 18 and user["age"] <= 65:
            score += 20
        elif user["age"] > 65:
            score += 10

        return score
|
||||
|
||||
|
||||
# Global variable - should be encapsulated
# Eager module-level singleton, constructed at import time.
user_service_instance = UserService()


def get_user_service():
    """Return the module-level UserService singleton created above."""
    return user_service_instance
|
||||
|
||||
|
||||
# Utility function that should be in separate module
|
||||
def hash_password(password, salt="salt123"):
    """Return the SHA-256 hex digest of ``password + salt``.

    The default salt is a fixed string shared by every caller.
    """
    # Hardcoded salt - security issue
    digest = hashlib.sha256()
    digest.update((password + salt).encode())
    return digest.hexdigest()
|
||||
|
||||
|
||||
# Another utility function with duplicate logic
|
||||
def validate_password(password):
    """Check password complexity rules in order.

    Returns ``(True, "Valid password")`` when every rule passes, else
    ``(False, message)`` for the first rule that fails.
    """
    # Ordered rule table: (predicate, failure message).
    rules = [
        (lambda p: len(p) >= 8, "Password too short"),
        (lambda p: re.search(r"[A-Z]", p), "Password must contain uppercase letter"),
        (lambda p: re.search(r"[a-z]", p), "Password must contain lowercase letter"),
        (lambda p: re.search(r"\d", p), "Password must contain digit"),
    ]
    for passes, message in rules:
        if not passes(password):
            return False, message
    return True, "Valid password"
|
||||
285
engineering/tech-debt-tracker/assets/sample_debt_inventory.json
Normal file
285
engineering/tech-debt-tracker/assets/sample_debt_inventory.json
Normal file
@@ -0,0 +1,285 @@
|
||||
[
|
||||
{
|
||||
"id": "DEBT-0001",
|
||||
"type": "large_function",
|
||||
"description": "create_user function in user_service.py is 89 lines long",
|
||||
"file_path": "src/user_service.py",
|
||||
"line_number": 13,
|
||||
"severity": "high",
|
||||
"metadata": {
|
||||
"function_name": "create_user",
|
||||
"length": 89,
|
||||
"recommended_max": 50
|
||||
},
|
||||
"detected_date": "2024-02-10T10:30:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0002",
|
||||
"type": "duplicate_code",
|
||||
"description": "Password validation logic duplicated in 3 locations",
|
||||
"file_path": "src/user_service.py",
|
||||
"line_number": 45,
|
||||
"severity": "medium",
|
||||
"metadata": {
|
||||
"duplicate_count": 3,
|
||||
"other_files": ["src/auth.py", "src/frontend.js"]
|
||||
},
|
||||
"detected_date": "2024-02-10T10:30:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0003",
|
||||
"type": "security_risk",
|
||||
"description": "Hardcoded API key in payment_processor.py",
|
||||
"file_path": "src/payment_processor.py",
|
||||
"line_number": 10,
|
||||
"severity": "critical",
|
||||
"metadata": {
|
||||
"security_issue": "hardcoded_credentials",
|
||||
"exposure_risk": "high"
|
||||
},
|
||||
"detected_date": "2024-02-10T10:30:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0004",
|
||||
"type": "high_complexity",
|
||||
"description": "process_payment function has cyclomatic complexity of 24",
|
||||
"file_path": "src/payment_processor.py",
|
||||
"line_number": 19,
|
||||
"severity": "high",
|
||||
"metadata": {
|
||||
"function_name": "process_payment",
|
||||
"complexity": 24,
|
||||
"recommended_max": 10
|
||||
},
|
||||
"detected_date": "2024-02-10T10:30:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0005",
|
||||
"type": "missing_docstring",
|
||||
"description": "PaymentProcessor class missing docstring",
|
||||
"file_path": "src/payment_processor.py",
|
||||
"line_number": 8,
|
||||
"severity": "low",
|
||||
"metadata": {
|
||||
"class_name": "PaymentProcessor"
|
||||
},
|
||||
"detected_date": "2024-02-10T10:30:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0006",
|
||||
"type": "todo_comment",
|
||||
"description": "TODO: Move this to configuration file",
|
||||
"file_path": "src/user_service.py",
|
||||
"line_number": 8,
|
||||
"severity": "low",
|
||||
"metadata": {
|
||||
"comment": "TODO: Move this to configuration file"
|
||||
},
|
||||
"detected_date": "2024-02-10T10:30:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0007",
|
||||
"type": "empty_catch_blocks",
|
||||
"description": "Empty catch block in update_user method",
|
||||
"file_path": "src/user_service.py",
|
||||
"line_number": 156,
|
||||
"severity": "medium",
|
||||
"metadata": {
|
||||
"method_name": "update_user",
|
||||
"exception_type": "generic"
|
||||
},
|
||||
"detected_date": "2024-02-10T10:30:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0008",
|
||||
"type": "magic_numbers",
|
||||
"description": "Magic number 1800 used for lock timeout",
|
||||
"file_path": "src/user_service.py",
|
||||
"line_number": 98,
|
||||
"severity": "low",
|
||||
"metadata": {
|
||||
"value": 1800,
|
||||
"context": "account_lockout_duration"
|
||||
},
|
||||
"detected_date": "2024-02-10T10:30:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0009",
|
||||
"type": "deep_nesting",
|
||||
"description": "Deep nesting detected: 6 levels in preferences handling",
|
||||
"file_path": "src/frontend.js",
|
||||
"line_number": 32,
|
||||
"severity": "medium",
|
||||
"metadata": {
|
||||
"nesting_level": 6,
|
||||
"recommended_max": 4
|
||||
},
|
||||
"detected_date": "2024-02-10T10:30:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0010",
|
||||
"type": "long_line",
|
||||
"description": "Line too long: 156 characters",
|
||||
"file_path": "src/frontend.js",
|
||||
"line_number": 127,
|
||||
"severity": "low",
|
||||
"metadata": {
|
||||
"length": 156,
|
||||
"recommended_max": 120
|
||||
},
|
||||
"detected_date": "2024-02-10T10:30:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0011",
|
||||
"type": "commented_code",
|
||||
"description": "Dead code left in comments",
|
||||
"file_path": "src/frontend.js",
|
||||
"line_number": 285,
|
||||
"severity": "low",
|
||||
"metadata": {
|
||||
"lines_of_commented_code": 8
|
||||
},
|
||||
"detected_date": "2024-02-10T10:30:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0012",
|
||||
"type": "global_variables",
|
||||
"description": "Global variable userCache should be encapsulated",
|
||||
"file_path": "src/frontend.js",
|
||||
"line_number": 7,
|
||||
"severity": "medium",
|
||||
"metadata": {
|
||||
"variable_name": "userCache",
|
||||
"scope": "global"
|
||||
},
|
||||
"detected_date": "2024-02-10T10:30:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0013",
|
||||
"type": "synchronous_ajax",
|
||||
"description": "Synchronous AJAX call blocks UI thread",
|
||||
"file_path": "src/frontend.js",
|
||||
"line_number": 189,
|
||||
"severity": "high",
|
||||
"metadata": {
|
||||
"method": "XMLHttpRequest",
|
||||
"async": false
|
||||
},
|
||||
"detected_date": "2024-02-10T10:30:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0014",
|
||||
"type": "hardcoded_values",
|
||||
"description": "Tax rates hardcoded in payment processing logic",
|
||||
"file_path": "src/payment_processor.py",
|
||||
"line_number": 45,
|
||||
"severity": "medium",
|
||||
"metadata": {
|
||||
"values": ["0.08", "0.085", "0.0625", "0.06"],
|
||||
"context": "tax_calculation"
|
||||
},
|
||||
"detected_date": "2024-02-10T10:30:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0015",
|
||||
"type": "no_error_handling",
|
||||
"description": "API calls without proper error handling",
|
||||
"file_path": "src/payment_processor.py",
|
||||
"line_number": 78,
|
||||
"severity": "high",
|
||||
"metadata": {
|
||||
"api_endpoint": "stripe",
|
||||
"error_scenarios": ["network_failure", "invalid_response"]
|
||||
},
|
||||
"detected_date": "2024-02-10T10:30:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0016",
|
||||
"type": "inefficient_algorithm",
|
||||
"description": "O(n) user search could be optimized with indexing",
|
||||
"file_path": "src/user_service.py",
|
||||
"line_number": 178,
|
||||
"severity": "medium",
|
||||
"metadata": {
|
||||
"current_complexity": "O(n)",
|
||||
"recommended_complexity": "O(log n)",
|
||||
"method_name": "search_users"
|
||||
},
|
||||
"detected_date": "2024-02-10T10:30:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0017",
|
||||
"type": "memory_leak_risk",
|
||||
"description": "Event listeners attached without cleanup",
|
||||
"file_path": "src/frontend.js",
|
||||
"line_number": 145,
|
||||
"severity": "medium",
|
||||
"metadata": {
|
||||
"event_type": "click",
|
||||
"cleanup_missing": true
|
||||
},
|
||||
"detected_date": "2024-02-10T10:30:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0018",
|
||||
"type": "sql_injection_risk",
|
||||
"description": "Potential SQL injection in user query",
|
||||
"file_path": "src/database.py",
|
||||
"line_number": 25,
|
||||
"severity": "critical",
|
||||
"metadata": {
|
||||
"query_type": "dynamic",
|
||||
"user_input": "unsanitized"
|
||||
},
|
||||
"detected_date": "2024-02-10T10:30:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0019",
|
||||
"type": "outdated_dependency",
|
||||
"description": "jQuery version 2.1.4 has known security vulnerabilities",
|
||||
"file_path": "package.json",
|
||||
"line_number": 15,
|
||||
"severity": "high",
|
||||
"metadata": {
|
||||
"package": "jquery",
|
||||
"current_version": "2.1.4",
|
||||
"latest_version": "3.6.4",
|
||||
"vulnerabilities": ["CVE-2020-11022", "CVE-2020-11023"]
|
||||
},
|
||||
"detected_date": "2024-02-10T10:30:00",
|
||||
"status": "identified"
|
||||
},
|
||||
{
|
||||
"id": "DEBT-0020",
|
||||
"type": "test_debt",
|
||||
"description": "No unit tests for critical payment processing logic",
|
||||
"file_path": "src/payment_processor.py",
|
||||
"line_number": 19,
|
||||
"severity": "high",
|
||||
"metadata": {
|
||||
"coverage": 0,
|
||||
"critical_paths": ["process_payment", "refund_payment"],
|
||||
"risk_level": "high"
|
||||
},
|
||||
"detected_date": "2024-02-10T10:30:00",
|
||||
"status": "identified"
|
||||
}
|
||||
]
|
||||
@@ -0,0 +1,221 @@
|
||||
{
|
||||
"metadata": {
|
||||
"generated_date": "2026-02-16T12:59:34.530390",
|
||||
"analysis_period": "monthly",
|
||||
"snapshots_analyzed": 2,
|
||||
"date_range": {
|
||||
"start": "2024-01-15T09:00:00",
|
||||
"end": "2024-02-01T14:30:00"
|
||||
},
|
||||
"team_size": 5
|
||||
},
|
||||
"executive_summary": {
|
||||
"overall_status": "excellent",
|
||||
"health_score": 87.3,
|
||||
"status_message": "Code quality is excellent with minimal technical debt.",
|
||||
"key_insights": [
|
||||
"Good progress on debt reduction"
|
||||
],
|
||||
"total_debt_items": 22,
|
||||
"estimated_effort_hours": 193.5,
|
||||
"high_priority_items": 6,
|
||||
"velocity_impact_percent": 12.3
|
||||
},
|
||||
"current_health": {
|
||||
"overall_score": 87.3,
|
||||
"debt_density": 0.81,
|
||||
"velocity_impact": 12.3,
|
||||
"quality_score": 81.8,
|
||||
"maintainability_score": 72.7,
|
||||
"technical_risk_score": 38.2,
|
||||
"date": "2024-02-01T14:30:00"
|
||||
},
|
||||
"trend_analysis": {
|
||||
"overall_score": {
|
||||
"metric_name": "overall_score",
|
||||
"trend_direction": "improving",
|
||||
"change_rate": 3.7,
|
||||
"correlation_strength": 0.0,
|
||||
"forecast_next_period": 91.0,
|
||||
"confidence_interval": [
|
||||
91.0,
|
||||
91.0
|
||||
]
|
||||
},
|
||||
"debt_density": {
|
||||
"metric_name": "debt_density",
|
||||
"trend_direction": "improving",
|
||||
"change_rate": -0.31,
|
||||
"correlation_strength": 0.0,
|
||||
"forecast_next_period": 0.5,
|
||||
"confidence_interval": [
|
||||
0.5,
|
||||
0.5
|
||||
]
|
||||
},
|
||||
"velocity_impact": {
|
||||
"metric_name": "velocity_impact",
|
||||
"trend_direction": "improving",
|
||||
"change_rate": -2.9,
|
||||
"correlation_strength": 0.0,
|
||||
"forecast_next_period": 9.4,
|
||||
"confidence_interval": [
|
||||
9.4,
|
||||
9.4
|
||||
]
|
||||
},
|
||||
"quality_score": {
|
||||
"metric_name": "quality_score",
|
||||
"trend_direction": "declining",
|
||||
"change_rate": -3.9,
|
||||
"correlation_strength": 0.0,
|
||||
"forecast_next_period": 77.9,
|
||||
"confidence_interval": [
|
||||
77.9,
|
||||
77.9
|
||||
]
|
||||
},
|
||||
"technical_risk_score": {
|
||||
"metric_name": "technical_risk_score",
|
||||
"trend_direction": "improving",
|
||||
"change_rate": -47.5,
|
||||
"correlation_strength": 0.0,
|
||||
"forecast_next_period": -9.3,
|
||||
"confidence_interval": [
|
||||
-9.3,
|
||||
-9.3
|
||||
]
|
||||
}
|
||||
},
|
||||
"debt_velocity": [
|
||||
{
|
||||
"period": "2024-01-15 to 2024-02-01",
|
||||
"new_debt_items": 0,
|
||||
"resolved_debt_items": 6,
|
||||
"net_change": -6,
|
||||
"velocity_ratio": 10.0,
|
||||
"effort_hours_added": 0,
|
||||
"effort_hours_resolved": 77.0,
|
||||
"net_effort_change": -77.0
|
||||
}
|
||||
],
|
||||
"forecasts": {
|
||||
"health_score_3_months": 98.4,
|
||||
"health_score_6_months": 100,
|
||||
"debt_count_3_months": 4,
|
||||
"debt_count_6_months": 0,
|
||||
"risk_score_3_months": 0
|
||||
},
|
||||
"recommendations": [
|
||||
{
|
||||
"priority": "medium",
|
||||
"category": "focus_area",
|
||||
"title": "Focus on Other Debt",
|
||||
"description": "Other represents the largest debt category (16 items). Consider targeted initiatives.",
|
||||
"impact": "medium",
|
||||
"effort": "medium"
|
||||
}
|
||||
],
|
||||
"visualizations": {
|
||||
"health_timeline": [
|
||||
{
|
||||
"date": "2024-01-15",
|
||||
"overall_score": 83.6,
|
||||
"quality_score": 85.7,
|
||||
"technical_risk": 85.7
|
||||
},
|
||||
{
|
||||
"date": "2024-02-01",
|
||||
"overall_score": 87.3,
|
||||
"quality_score": 81.8,
|
||||
"technical_risk": 38.2
|
||||
}
|
||||
],
|
||||
"debt_accumulation": [
|
||||
{
|
||||
"date": "2024-01-15",
|
||||
"total_debt": 28,
|
||||
"high_priority": 9,
|
||||
"security_debt": 5
|
||||
},
|
||||
{
|
||||
"date": "2024-02-01",
|
||||
"total_debt": 22,
|
||||
"high_priority": 6,
|
||||
"security_debt": 2
|
||||
}
|
||||
],
|
||||
"category_distribution": [
|
||||
{
|
||||
"category": "code_quality",
|
||||
"count": 5
|
||||
},
|
||||
{
|
||||
"category": "other",
|
||||
"count": 16
|
||||
},
|
||||
{
|
||||
"category": "maintenance",
|
||||
"count": 1
|
||||
}
|
||||
],
|
||||
"debt_velocity": [
|
||||
{
|
||||
"period": "2024-01-15 to 2024-02-01",
|
||||
"new_items": 0,
|
||||
"resolved_items": 6,
|
||||
"net_change": -6,
|
||||
"velocity_ratio": 10.0
|
||||
}
|
||||
],
|
||||
"effort_trend": [
|
||||
{
|
||||
"date": "2024-01-15",
|
||||
"total_effort": 270.5
|
||||
},
|
||||
{
|
||||
"date": "2024-02-01",
|
||||
"total_effort": 193.5
|
||||
}
|
||||
]
|
||||
},
|
||||
"detailed_metrics": {
|
||||
"debt_breakdown": {
|
||||
"large_function": 1,
|
||||
"duplicate_code": 1,
|
||||
"high_complexity": 1,
|
||||
"missing_docstring": 1,
|
||||
"empty_catch_blocks": 1,
|
||||
"deep_nesting": 1,
|
||||
"long_line": 1,
|
||||
"commented_code": 1,
|
||||
"global_variables": 1,
|
||||
"synchronous_ajax": 1,
|
||||
"hardcoded_values": 1,
|
||||
"no_error_handling": 1,
|
||||
"inefficient_algorithm": 1,
|
||||
"memory_leak_risk": 1,
|
||||
"large_class": 1,
|
||||
"circular_dependency": 1,
|
||||
"broad_exception": 1,
|
||||
"missing_validation": 1,
|
||||
"performance_issue": 1,
|
||||
"css_debt": 1,
|
||||
"accessibility_issue": 1,
|
||||
"configuration_debt": 1
|
||||
},
|
||||
"severity_breakdown": {
|
||||
"high": 6,
|
||||
"medium": 12,
|
||||
"low": 4
|
||||
},
|
||||
"category_breakdown": {
|
||||
"code_quality": 5,
|
||||
"other": 16,
|
||||
"maintenance": 1
|
||||
},
|
||||
"files_analyzed": 27,
|
||||
"debt_density": 0.8148148148148148,
|
||||
"average_effort_per_item": 8.795454545454545
|
||||
}
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,261 @@
|
||||
# Technical Debt Classification Taxonomy
|
||||
|
||||
## Overview
|
||||
|
||||
This document provides a comprehensive taxonomy for classifying technical debt across different dimensions. Consistent classification is essential for tracking, prioritizing, and managing technical debt effectively across teams and projects.
|
||||
|
||||
## Primary Categories
|
||||
|
||||
### 1. Code Debt
|
||||
|
||||
**Definition**: Issues at the code level that make software harder to understand, modify, or maintain.
|
||||
|
||||
**Subcategories**:
|
||||
- **Structural Issues**
|
||||
- `large_function`: Functions exceeding recommended size limits
|
||||
- `high_complexity`: High cyclomatic complexity (>10)
|
||||
- `deep_nesting`: Excessive indentation levels (>4)
|
||||
- `long_parameter_list`: Too many function parameters (>5)
|
||||
- `data_clumps`: Related data that should be grouped together
|
||||
|
||||
- **Naming and Documentation**
|
||||
- `poor_naming`: Unclear or misleading variable/function names
|
||||
- `missing_docstring`: Functions/classes without documentation
|
||||
- `magic_numbers`: Hardcoded numeric values without explanation
|
||||
- `commented_code`: Dead code left in comments
|
||||
|
||||
- **Duplication and Patterns**
|
||||
- `duplicate_code`: Identical or similar code blocks
|
||||
- `copy_paste_programming`: Evidence of code duplication
|
||||
- `inconsistent_patterns`: Mixed coding styles within codebase
|
||||
|
||||
- **Error Handling**
|
||||
- `empty_catch_blocks`: Exception handling without proper action
|
||||
- `generic_exceptions`: Catching overly broad exception types
|
||||
- `missing_error_handling`: No error handling for failure scenarios
|
||||
|
||||
**Severity Indicators**:
|
||||
- **Critical**: Security vulnerabilities, syntax errors
|
||||
- **High**: Functions >100 lines, complexity >20
|
||||
- **Medium**: Functions 50-100 lines, complexity 10-20
|
||||
- **Low**: Minor style issues, short functions with minor problems
|
||||
|
||||
### 2. Architecture Debt
|
||||
|
||||
**Definition**: High-level design decisions that limit system flexibility, scalability, or maintainability.
|
||||
|
||||
**Subcategories**:
|
||||
- **Structural Issues**
|
||||
- `monolithic_design`: Components that should be separated
|
||||
- `circular_dependencies`: Modules depending on each other cyclically
|
||||
- `god_object`: Classes/modules with too many responsibilities
|
||||
- `inappropriate_intimacy`: Excessive coupling between modules
|
||||
|
||||
- **Layer Violations**
|
||||
- `abstraction_inversion`: Lower-level modules depending on higher-level ones
|
||||
- `leaky_abstractions`: Implementation details exposed through interfaces
|
||||
- `broken_hierarchy`: Inheritance relationships that don't make sense
|
||||
|
||||
- **Scalability Issues**
|
||||
- `performance_bottlenecks`: Known architectural performance limitations
|
||||
- `resource_contention`: Shared resources creating bottlenecks
|
||||
- `single_point_failure`: Critical components without redundancy
|
||||
|
||||
**Impact Assessment**:
|
||||
- **High Impact**: Affects system scalability, blocks major features
|
||||
- **Medium Impact**: Makes changes more difficult, affects team productivity
|
||||
- **Low Impact**: Minor architectural inconsistencies
|
||||
|
||||
### 3. Test Debt
|
||||
|
||||
**Definition**: Inadequate testing infrastructure, coverage, or quality that increases risk and slows development.
|
||||
|
||||
**Subcategories**:
|
||||
- **Coverage Issues**
|
||||
- `low_coverage`: Test coverage below team standards (<80%)
|
||||
- `missing_unit_tests`: No tests for critical business logic
|
||||
- `missing_integration_tests`: No tests for component interactions
|
||||
- `missing_end_to_end_tests`: No full system workflow validation
|
||||
|
||||
- **Test Quality**
|
||||
- `flaky_tests`: Tests that pass/fail inconsistently
|
||||
- `slow_tests`: Test suite taking too long to execute
|
||||
- `brittle_tests`: Tests that break with minor code changes
|
||||
- `unclear_test_intent`: Tests without clear purpose or documentation
|
||||
|
||||
- **Infrastructure**
|
||||
- `manual_testing_only`: No automated testing processes
|
||||
- `missing_test_data`: No proper test data management
|
||||
- `environment_dependencies`: Tests requiring specific environments
|
||||
|
||||
**Priority Matrix**:
|
||||
- **Critical Path Coverage**: High priority for business-critical features
|
||||
- **Regression Risk**: High priority for frequently changed code
|
||||
- **Development Velocity**: Medium priority for developer productivity
|
||||
- **Documentation Value**: Low priority for test clarity improvements
|
||||
|
||||
### 4. Documentation Debt
|
||||
|
||||
**Definition**: Missing, outdated, or poor-quality documentation that hinders understanding and maintenance.
|
||||
|
||||
**Subcategories**:
|
||||
- **API Documentation**
|
||||
- `missing_api_docs`: No documentation for public APIs
|
||||
- `outdated_api_docs`: Documentation doesn't match implementation
|
||||
- `incomplete_examples`: No usage examples for complex APIs
|
||||
|
||||
- **Code Documentation**
|
||||
- `missing_comments`: Complex algorithms without explanation
|
||||
- `outdated_comments`: Comments contradicting current implementation
|
||||
- `redundant_comments`: Comments that just restate the code
|
||||
|
||||
- **System Documentation**
|
||||
- `missing_architecture_docs`: No high-level system design documentation
|
||||
- `missing_deployment_docs`: No deployment or operations guide
|
||||
- `missing_onboarding_docs`: No guide for new team members
|
||||
|
||||
**Freshness Assessment**:
|
||||
- **Stale**: Documentation >6 months out of date
|
||||
- **Outdated**: Documentation 3-6 months out of date
|
||||
- **Current**: Documentation <3 months out of date
|
||||
|
||||
### 5. Dependency Debt
|
||||
|
||||
**Definition**: Issues with external libraries, frameworks, and system dependencies.
|
||||
|
||||
**Subcategories**:
|
||||
- **Version Management**
|
||||
- `outdated_dependencies`: Libraries with available updates
|
||||
- `vulnerable_dependencies`: Dependencies with known security issues
|
||||
- `deprecated_dependencies`: Dependencies no longer maintained
|
||||
- `version_conflicts`: Incompatible dependency versions
|
||||
|
||||
- **License and Compliance**
|
||||
- `license_violations`: Dependencies with incompatible licenses
|
||||
- `license_unknown`: Dependencies without clear licensing
|
||||
- `compliance_risk`: Dependencies creating legal/regulatory risks
|
||||
|
||||
- **Usage Optimization**
|
||||
- `unused_dependencies`: Dependencies included but not used
|
||||
- `oversized_dependencies`: Heavy libraries for simple functionality
|
||||
- `redundant_dependencies`: Multiple libraries solving same problem
|
||||
|
||||
**Risk Assessment**:
|
||||
- **Security Risk**: Known vulnerabilities, unmaintained dependencies
|
||||
- **Legal Risk**: License conflicts, compliance issues
|
||||
- **Technical Risk**: Breaking changes, deprecation notices
|
||||
- **Maintenance Risk**: Outdated versions, unsupported libraries
|
||||
|
||||
### 6. Infrastructure Debt
|
||||
|
||||
**Definition**: Operations, deployment, and infrastructure-related technical debt.
|
||||
|
||||
**Subcategories**:
|
||||
- **Deployment and CI/CD**
|
||||
- `manual_deployment`: No automated deployment processes
|
||||
- `missing_pipeline`: No CI/CD pipeline automation
|
||||
- `brittle_deployments`: Deployment process prone to failure
|
||||
- `environment_drift`: Inconsistencies between environments
|
||||
|
||||
- **Monitoring and Observability**
|
||||
- `missing_monitoring`: No application/system monitoring
|
||||
- `inadequate_logging`: Insufficient logging for troubleshooting
|
||||
- `missing_alerting`: No alerts for critical system conditions
|
||||
- `poor_observability`: Can't understand system behavior in production
|
||||
|
||||
- **Configuration Management**
|
||||
- `hardcoded_config`: Configuration embedded in code
|
||||
- `manual_configuration`: No automated configuration management
|
||||
- `secrets_in_code`: Sensitive information stored in code
|
||||
- `inconsistent_environments`: Dev/staging/prod differences
|
||||
|
||||
**Operational Impact**:
|
||||
- **Availability**: Affects system uptime and reliability
|
||||
- **Debuggability**: Affects ability to troubleshoot issues
|
||||
- **Scalability**: Affects ability to handle load increases
|
||||
- **Security**: Affects system security posture
|
||||
|
||||
## Severity Classification
|
||||
|
||||
### Critical (Score: 9-10)
|
||||
- Security vulnerabilities
|
||||
- Production-breaking issues
|
||||
- Legal/compliance violations
|
||||
- Blocking issues for team productivity
|
||||
|
||||
### High (Score: 7-8)
|
||||
- Significant technical risk
|
||||
- Major productivity impact
|
||||
- Customer-visible quality issues
|
||||
- Architecture limitations
|
||||
|
||||
### Medium (Score: 4-6)
|
||||
- Moderate productivity impact
|
||||
- Code quality concerns
|
||||
- Maintenance difficulties
|
||||
- Minor security concerns
|
||||
|
||||
### Low (Score: 1-3)
|
||||
- Style and convention issues
|
||||
- Documentation gaps
|
||||
- Minor optimizations
|
||||
- Cosmetic improvements
|
||||
|
||||
## Impact Dimensions
|
||||
|
||||
### Business Impact
|
||||
- **Customer Experience**: User-facing quality and performance
|
||||
- **Revenue**: Direct impact on business metrics
|
||||
- **Compliance**: Regulatory and legal requirements
|
||||
- **Market Position**: Competitive advantage considerations
|
||||
|
||||
### Technical Impact
|
||||
- **Development Velocity**: Speed of feature development
|
||||
- **Code Quality**: Maintainability and reliability
|
||||
- **System Reliability**: Uptime and performance
|
||||
- **Security Posture**: Vulnerability and risk exposure
|
||||
|
||||
### Team Impact
|
||||
- **Developer Productivity**: Individual efficiency
|
||||
- **Team Morale**: Job satisfaction and engagement
|
||||
- **Knowledge Sharing**: Team collaboration and learning
|
||||
- **Onboarding Speed**: New team member integration
|
||||
|
||||
## Effort Estimation Guidelines
|
||||
|
||||
### T-Shirt Sizing
|
||||
- **XS (1-4 hours)**: Simple fixes, documentation updates
|
||||
- **S (1-2 days)**: Minor refactoring, simple feature additions
|
||||
- **M (3-5 days)**: Moderate refactoring, component changes
|
||||
- **L (1-2 weeks)**: Major refactoring, architectural changes
|
||||
- **XL (3+ weeks)**: System-wide changes, major migrations
|
||||
|
||||
### Complexity Factors
|
||||
- **Technical Complexity**: How difficult is the change technically?
|
||||
- **Business Risk**: What's the risk if something goes wrong?
|
||||
- **Testing Requirements**: How much testing is needed?
|
||||
- **Team Knowledge**: Does the team understand this area well?
|
||||
- **Dependencies**: How many other systems/teams are involved?
|
||||
|
||||
## Usage Guidelines
|
||||
|
||||
### When Classifying Debt
|
||||
1. Start with primary category (code, architecture, test, etc.)
|
||||
2. Identify specific subcategory for precise tracking
|
||||
3. Assess severity based on business and technical impact
|
||||
4. Estimate effort using t-shirt sizing
|
||||
5. Tag with relevant impact dimensions
|
||||
|
||||
### Consistency Rules
|
||||
- Use consistent terminology across teams
|
||||
- Document custom categories for domain-specific debt
|
||||
- Regular reviews to ensure classification accuracy
|
||||
- Training for team members on taxonomy usage
|
||||
|
||||
### Review and Updates
|
||||
- Quarterly review of taxonomy relevance
|
||||
- Add new categories as patterns emerge
|
||||
- Remove unused categories to keep taxonomy lean
|
||||
- Update severity and impact criteria based on experience
|
||||
|
||||
This taxonomy should be adapted to your organization's specific context, technology stack, and business priorities. The key is consistency in application across teams and over time.
|
||||
@@ -0,0 +1,335 @@
|
||||
# Technical Debt Prioritization Framework
|
||||
|
||||
## Introduction
|
||||
|
||||
Technical debt prioritization is a critical capability that separates high-performing engineering teams from those struggling with maintenance burden. This framework provides multiple approaches to systematically prioritize technical debt based on business value, risk, effort, and strategic alignment.
|
||||
|
||||
## Core Principles
|
||||
|
||||
### 1. Business Value Alignment
|
||||
Technical debt work must connect to business outcomes. Every debt item should have a clear story about how fixing it supports business goals.
|
||||
|
||||
### 2. Evidence-Based Decisions
|
||||
Use data, not opinions, to drive prioritization. Measure impact, track trends, and validate assumptions with evidence.
|
||||
|
||||
### 3. Cost-Benefit Optimization
|
||||
Balance the cost of fixing debt against the cost of leaving it unfixed. Sometimes living with debt is the right business decision.
|
||||
|
||||
### 4. Risk Management
|
||||
Consider both the probability and impact of negative outcomes. High-probability, high-impact issues get priority.
|
||||
|
||||
### 5. Sustainable Pace
|
||||
Debt work should be sustainable over time. Avoid boom-bust cycles of neglect followed by emergency remediation.
|
||||
|
||||
## Prioritization Frameworks
|
||||
|
||||
### Framework 1: Cost of Delay (CoD)
|
||||
|
||||
**Best For**: Teams with clear business metrics and well-understood customer impact.
|
||||
|
||||
**Formula**: `Priority Score = (Business Value + Urgency + Risk Reduction) / Effort`
|
||||
|
||||
**Components**:
|
||||
|
||||
**Business Value (1-10 scale)**
|
||||
- Customer impact: How many users affected?
|
||||
- Revenue impact: Direct effect on business metrics
|
||||
- Strategic value: Alignment with business goals
|
||||
- Competitive advantage: Market positioning benefits
|
||||
|
||||
**Urgency (1-10 scale)**
|
||||
- Time sensitivity: How quickly does value decay?
|
||||
- Dependency criticality: Does this block other work?
|
||||
- Market timing: External deadlines or windows
|
||||
- Regulatory pressure: Compliance requirements
|
||||
|
||||
**Risk Reduction (1-10 scale)**
|
||||
- Security risk mitigation: Vulnerability reduction
|
||||
- Reliability improvement: Stability gains
|
||||
- Compliance risk: Regulatory issue prevention
|
||||
- Technical risk: Architectural problem prevention
|
||||
|
||||
**Effort Estimation**
|
||||
- Development time in story points or days
|
||||
- Risk multiplier for uncertainty (1.0-2.0x)
|
||||
- Skill requirements and availability
|
||||
- Cross-team coordination needs
|
||||
|
||||
**Example Calculation**:
|
||||
```
|
||||
Authentication module refactor:
|
||||
- Business Value: 8 (affects all users, blocks SSO)
|
||||
- Urgency: 7 (blocks Q2 enterprise features)
|
||||
- Risk Reduction: 9 (high security risk)
|
||||
- Total Numerator: 24
|
||||
- Effort: 3 weeks = 15 story points
|
||||
- CoD Score: 24/15 = 1.6
|
||||
```
|
||||
|
||||
### Framework 2: Weighted Shortest Job First (WSJF)
|
||||
|
||||
**Best For**: SAFe/Agile environments with portfolio-level planning.
|
||||
|
||||
**Formula**: `WSJF = (Business Value + Time Criticality + Risk Reduction) / Job Size`
|
||||
|
||||
**Scoring Guidelines**:
|
||||
|
||||
**Business Value (1-20 scale)**
|
||||
- User/business value from fixing this debt
|
||||
- Direct revenue or cost impact
|
||||
- Strategic importance to business objectives
|
||||
|
||||
**Time Criticality (1-20 scale)**
|
||||
- How user/business value declines over time
|
||||
- Dependency on other work items
|
||||
- Fixed deadlines or time-sensitive opportunities
|
||||
|
||||
**Risk Reduction/Opportunity Enablement (1-20 scale)**
|
||||
- Risk mitigation value
|
||||
- Future opportunities this enables
|
||||
- Options this preserves or creates
|
||||
|
||||
**Job Size (1-20 scale)**
|
||||
- Relative sizing compared to other debt items
|
||||
- Include uncertainty and risk factors
|
||||
- Consider dependencies and coordination overhead
|
||||
|
||||
**WSJF Bands**:
|
||||
- **Highest (WSJF > 10)**: Do immediately
|
||||
- **High (WSJF 5-10)**: Next quarter priority
|
||||
- **Medium (WSJF 2-5)**: Planned work
|
||||
- **Low (WSJF < 2)**: Backlog
|
||||
|
||||
### Framework 3: RICE (Reach, Impact, Confidence, Effort)
|
||||
|
||||
**Best For**: Product-focused teams with user-centric metrics.
|
||||
|
||||
**Formula**: `RICE Score = (Reach × Impact × Confidence) / Effort`
|
||||
|
||||
**Components**:
|
||||
|
||||
**Reach (number or percentage)**
|
||||
- How many developers/users affected per period?
|
||||
- Percentage of codebase impacted
|
||||
- Number of features that would benefit
|
||||
|
||||
**Impact (1-3 scale)**
|
||||
- 3 = Massive impact
|
||||
- 2 = High impact
|
||||
- 1 = Medium impact
|
||||
- 0.5 = Low impact
|
||||
- 0.25 = Minimal impact
|
||||
|
||||
**Confidence (percentage)**
|
||||
- How confident are you in your estimates?
|
||||
- Based on evidence, not gut feeling
|
||||
- 100% = High confidence with data
|
||||
- 80% = Medium confidence with some data
|
||||
- 50% = Low confidence, mostly assumptions
|
||||
|
||||
**Effort (story points or person-months)**
|
||||
- Total effort from all team members
|
||||
- Include design, development, testing, deployment
|
||||
- Account for coordination and communication overhead
|
||||
|
||||
**Example**:
|
||||
```
|
||||
Legacy API cleanup:
|
||||
- Reach: 5 teams × 4 developers = 20 people per quarter
|
||||
- Impact: 2 (high - significantly improves developer experience)
|
||||
- Confidence: 80% (have done similar cleanups before)
|
||||
- Effort: 8 story points
|
||||
- RICE: (20 × 2 × 0.8) / 8 = 4.0
|
||||
```
|
||||
|
||||
### Framework 4: Technical Debt Quadrants
|
||||
|
||||
**Best For**: Teams needing to understand debt context and strategy.
|
||||
|
||||
Based on Martin Fowler's framework, categorize debt into quadrants:
|
||||
|
||||
**Quadrant 1: Reckless & Deliberate**
|
||||
- "We don't have time for design"
|
||||
- **Strategy**: Immediate remediation
|
||||
- **Priority**: Highest - created knowingly with poor justification
|
||||
|
||||
**Quadrant 2: Prudent & Deliberate**
|
||||
- "We must ship now and deal with consequences"
|
||||
- **Strategy**: Planned remediation
|
||||
- **Priority**: High - was right decision at time, now needs attention
|
||||
|
||||
**Quadrant 3: Reckless & Inadvertent**
|
||||
- "What's layering?"
|
||||
- **Strategy**: Education and process improvement
|
||||
- **Priority**: Medium - focus on preventing more
|
||||
|
||||
**Quadrant 4: Prudent & Inadvertent**
|
||||
- "Now we know how we should have done it"
|
||||
- **Strategy**: Opportunistic improvement
|
||||
- **Priority**: Low - normal part of learning
|
||||
|
||||
### Framework 5: Risk-Impact Matrix
|
||||
|
||||
**Best For**: Risk-averse organizations or regulated environments.
|
||||
|
||||
Plot debt items on 2D matrix:
|
||||
- X-axis: Likelihood of negative impact (1-5)
|
||||
- Y-axis: Severity of negative impact (1-5)
|
||||
|
||||
**Priority Quadrants**:
|
||||
- **Critical (High likelihood, High impact)**: Immediate action
|
||||
- **Important (High likelihood, Low impact OR Low likelihood, High impact)**: Planned action
|
||||
- **Monitor (Medium likelihood, Medium impact)**: Watch and assess
|
||||
- **Accept (Low likelihood, Low impact)**: Document decision to accept
|
||||
|
||||
**Impact Categories**:
|
||||
- **Security**: Data breaches, vulnerability exploitation
|
||||
- **Reliability**: System outages, data corruption
|
||||
- **Performance**: User experience degradation
|
||||
- **Compliance**: Regulatory violations, audit findings
|
||||
- **Productivity**: Team velocity reduction, developer frustration
|
||||
|
||||
## Multi-Framework Approach
|
||||
|
||||
### When to Use Multiple Frameworks
|
||||
|
||||
**Portfolio-Level Planning**:
|
||||
- Use WSJF for quarterly planning
|
||||
- Use CoD for sprint-level decisions
|
||||
- Use Risk-Impact for security review
|
||||
|
||||
**Team Maturity Progression**:
|
||||
- Start with simple Risk-Impact matrix
|
||||
- Progress to RICE as metrics improve
|
||||
- Advanced teams can use CoD effectively
|
||||
|
||||
**Context-Dependent Selection**:
|
||||
- **Regulated industries**: Risk-Impact primary, WSJF secondary
|
||||
- **Product companies**: RICE primary, CoD secondary
|
||||
- **Enterprise software**: CoD primary, WSJF secondary
|
||||
|
||||
### Combining Framework Results
|
||||
|
||||
**Weighted Scoring**:
|
||||
```
|
||||
Final Priority = 0.4 × CoD_Score + 0.3 × RICE_Score + 0.3 × Risk_Score
|
||||
```
|
||||
|
||||
**Tier-Based Approach**:
|
||||
1. Security/compliance items (Risk-Impact)
|
||||
2. High business value items (RICE/CoD)
|
||||
3. Developer productivity items (WSJF)
|
||||
4. Technical excellence items (Quadrants)
|
||||
|
||||
## Implementation Guidelines
|
||||
|
||||
### Setting Up Prioritization
|
||||
|
||||
**Step 1: Choose Primary Framework**
|
||||
- Consider team maturity, organization culture, available data
|
||||
- Start simple, evolve complexity over time
|
||||
- Ensure framework aligns with business planning cycles
|
||||
|
||||
**Step 2: Define Scoring Criteria**
|
||||
- Create rubrics for each scoring dimension
|
||||
- Use organization-specific examples
|
||||
- Train team on consistent application
|
||||
|
||||
**Step 3: Establish Review Cadence**
|
||||
- Weekly: New urgent items
|
||||
- Bi-weekly: Sprint planning integration
|
||||
- Monthly: Portfolio review and reprioritization
|
||||
- Quarterly: Framework effectiveness review
|
||||
|
||||
**Step 4: Tool Integration**
|
||||
- Use existing project management tools
|
||||
- Automate scoring where possible
|
||||
- Create dashboards for stakeholder communication
|
||||
|
||||
### Common Pitfalls
|
||||
|
||||
**Analysis Paralysis**
|
||||
- **Problem**: Spending too much time on perfect prioritization
|
||||
- **Solution**: Use "good enough" decisions, iterate quickly
|
||||
|
||||
**Ignoring Business Context**
|
||||
- **Problem**: Purely technical prioritization
|
||||
- **Solution**: Always include business stakeholder perspective
|
||||
|
||||
**Inconsistent Application**
|
||||
- **Problem**: Different teams using different approaches
|
||||
- **Solution**: Standardize framework, provide training
|
||||
|
||||
**Over-Engineering the Process**
|
||||
- **Problem**: Complex frameworks nobody uses
|
||||
- **Solution**: Start simple, add complexity only when needed
|
||||
|
||||
**Neglecting Stakeholder Buy-In**
|
||||
- **Problem**: Engineering-only prioritization decisions
|
||||
- **Solution**: Include product, business stakeholders in framework design
|
||||
|
||||
### Measuring Framework Effectiveness
|
||||
|
||||
**Leading Indicators**:
|
||||
- Framework adoption rate across teams
|
||||
- Time to prioritization decision
|
||||
- Stakeholder satisfaction with decisions
|
||||
- Consistency of scoring across team members
|
||||
|
||||
**Lagging Indicators**:
|
||||
- Debt reduction velocity
|
||||
- Business outcome improvements
|
||||
- Technical incident reduction
|
||||
- Developer satisfaction improvements
|
||||
|
||||
**Review Questions**:
|
||||
1. Are we making better debt decisions than before?
|
||||
2. Do stakeholders trust our prioritization process?
|
||||
3. Are we delivering measurable business value from debt work?
|
||||
4. Is the framework sustainable for long-term use?
|
||||
|
||||
## Stakeholder Communication
|
||||
|
||||
### For Engineering Leaders
|
||||
|
||||
**Monthly Dashboard**:
|
||||
- Debt portfolio health score
|
||||
- Priority distribution by framework
|
||||
- Progress on high-priority items
|
||||
- Framework effectiveness metrics
|
||||
|
||||
**Quarterly Business Review**:
|
||||
- Debt work business impact
|
||||
- Framework ROI analysis
|
||||
- Resource allocation recommendations
|
||||
- Strategic debt initiative proposals
|
||||
|
||||
### For Product Managers
|
||||
|
||||
**Sprint Planning Input**:
|
||||
- Debt items affecting feature velocity
|
||||
- User experience impact from debt
|
||||
- Feature delivery risk from debt
|
||||
- Opportunity cost of debt work vs features
|
||||
|
||||
**Roadmap Integration**:
|
||||
- Debt work timing with feature releases
|
||||
- Dependencies between debt work and features
|
||||
- Resource allocation for debt vs features
|
||||
- Customer impact communication
|
||||
|
||||
### For Executive Leadership
|
||||
|
||||
**Executive Summary**:
|
||||
- Overall technical health trend
|
||||
- Business risk from technical debt
|
||||
- Investment recommendations
|
||||
- Competitive implications
|
||||
|
||||
**Key Metrics**:
|
||||
- Debt-adjusted development velocity
|
||||
- Technical incident trends
|
||||
- Customer satisfaction correlations
|
||||
- Team retention and satisfaction
|
||||
|
||||
This prioritization framework should be adapted to your organization's context, but the core principles of evidence-based, business-aligned, systematic prioritization should remain constant.
|
||||
@@ -0,0 +1,418 @@
|
||||
# Stakeholder Communication Templates
|
||||
|
||||
## Introduction
|
||||
|
||||
Effective communication about technical debt is crucial for securing resources, setting expectations, and maintaining stakeholder trust. This document provides templates and guidelines for communicating technical debt status, impact, and recommendations to different stakeholder groups.
|
||||
|
||||
## Executive Summary Templates
|
||||
|
||||
### Monthly Executive Report
|
||||
|
||||
**Subject**: Technical Health Report - [Month] [Year]
|
||||
|
||||
---
|
||||
|
||||
**EXECUTIVE SUMMARY**
|
||||
|
||||
**Overall Status**: [EXCELLENT/GOOD/FAIR/POOR] - Health Score: [X]/100
|
||||
|
||||
**Key Message**: [One sentence summary of current state and trend]
|
||||
|
||||
**Immediate Actions Required**: [Yes/No] - [Brief explanation if yes]
|
||||
|
||||
---
|
||||
|
||||
**BUSINESS IMPACT**
|
||||
|
||||
• **Development Velocity**: [X]% impact on feature delivery speed
|
||||
• **Quality Risk**: [LOW/MEDIUM/HIGH] - [Brief explanation]
|
||||
• **Security Posture**: [X] critical issues, [X] high-priority issues
|
||||
• **Customer Impact**: [Direct customer-facing implications]
|
||||
|
||||
**FINANCIAL IMPLICATIONS**
|
||||
|
||||
• **Current Cost**: $[X]K monthly in reduced velocity
|
||||
• **Investment Needed**: $[X]K for critical issues (next quarter)
|
||||
• **ROI Projection**: [X]% velocity improvement, $[X]K annual savings
|
||||
• **Risk Cost**: Up to $[X]K if critical issues materialize
|
||||
|
||||
**STRATEGIC RECOMMENDATIONS**
|
||||
|
||||
1. **[Priority 1]**: [Action] - [Business justification] - [Timeline]
|
||||
2. **[Priority 2]**: [Action] - [Business justification] - [Timeline]
|
||||
3. **[Priority 3]**: [Action] - [Business justification] - [Timeline]
|
||||
|
||||
**TREND ANALYSIS**
|
||||
|
||||
• Health Score: [Previous] → [Current] ([Improving/Declining/Stable])
|
||||
• Debt Items: [Previous] → [Current] ([Net change])
|
||||
• High-Priority Issues: [Previous] → [Current]
|
||||
|
||||
---
|
||||
|
||||
**NEXT STEPS**
|
||||
|
||||
• **This Quarter**: [Key initiatives and expected outcomes]
|
||||
• **Resource Request**: [Additional resources needed, if any]
|
||||
• **Dependencies**: [External dependencies or blockers]
|
||||
|
||||
---
|
||||
|
||||
### Quarterly Board-Level Report
|
||||
|
||||
**Subject**: Technical Debt & Engineering Health - Q[X] [Year]
|
||||
|
||||
---
|
||||
|
||||
**KEY METRICS**
|
||||
|
||||
| Metric | Current | Target | Trend |
|
||||
|--------|---------|--------|--------|
|
||||
| Health Score | [X]/100 | [X]/100 | [↑/↓/→] |
|
||||
| Velocity Impact | [X]% | <[X]% | [↑/↓/→] |
|
||||
| Critical Issues | [X] | 0 | [↑/↓/→] |
|
||||
| Security Risk | [LOW/MED/HIGH] | LOW | [↑/↓/→] |
|
||||
|
||||
**STRATEGIC CONTEXT**
|
||||
|
||||
Technical debt represents deferred investment in our technology platform. Our current debt portfolio has [positive/negative/neutral] implications for:
|
||||
|
||||
• **Growth Capacity**: [Impact on ability to scale]
|
||||
• **Competitive Position**: [Impact on market responsiveness]
|
||||
• **Risk Profile**: [Impact on operational risk]
|
||||
• **Team Retention**: [Impact on engineering talent]
|
||||
|
||||
**INVESTMENT ANALYSIS**
|
||||
|
||||
• **Current Annual Cost**: $[X]M in reduced productivity
|
||||
• **Proposed Investment**: $[X]M over [timeframe]
|
||||
• **Expected ROI**: [X]% productivity improvement, $[X]M NPV
|
||||
• **Risk Mitigation**: $[X]M in avoided incident costs
|
||||
|
||||
**RECOMMENDATIONS**
|
||||
|
||||
1. **[Immediate]**: [Strategic action with business rationale]
|
||||
2. **[This Year]**: [Medium-term initiative with expected outcomes]
|
||||
3. **[Ongoing]**: [Process or cultural change needed]
|
||||
|
||||
---
|
||||
|
||||
## Product Management Templates
|
||||
|
||||
### Sprint Planning Discussion
|
||||
|
||||
**Subject**: Tech Debt Impact on Sprint [X] Planning
|
||||
|
||||
---
|
||||
|
||||
**SPRINT CAPACITY IMPACT**
|
||||
|
||||
**Affected User Stories**:
|
||||
• [Story 1]: [X] point increase due to [debt issue]
|
||||
• [Story 2]: [X]% risk of scope reduction due to [debt issue]
|
||||
• [Story 3]: Blocked by [debt issue] - requires [X] points of debt work first
|
||||
|
||||
**Recommended Debt Work This Sprint**:
|
||||
• **[Debt Item 1]** ([X] points): Unblocks [Story Y], reduces future story complexity
|
||||
• **[Debt Item 2]** ([X] points): Prevents [specific risk] in upcoming features
|
||||
|
||||
**Trade-off Analysis**:
|
||||
• **If we fix debt**: [X] points for features, [benefits for future sprints]
|
||||
• **If we don't fix debt**: [X] points for features, [accumulated costs and risks]
|
||||
|
||||
**Recommendation**: [Specific allocation suggestion with rationale]
|
||||
|
||||
---
|
||||
|
||||
### Feature Impact Assessment
|
||||
|
||||
**Subject**: Technical Debt Impact Assessment - [Feature Name]
|
||||
|
||||
---
|
||||
|
||||
**DEBT AFFECTING THIS FEATURE**
|
||||
|
||||
| Debt Item | Impact | Effort to Fix | Recommendation |
|
||||
|-----------|--------|---------------|----------------|
|
||||
| [Item 1] | [Description] | [X] points | Fix before/Work around/Accept |
|
||||
| [Item 2] | [Description] | [X] points | Fix before/Work around/Accept |
|
||||
|
||||
**DELIVERY IMPACT**
|
||||
|
||||
• **Timeline Risk**: [LOW/MEDIUM/HIGH]
|
||||
- Base estimate: [X] points
|
||||
- Debt-adjusted estimate: [X] points ([X]% increase)
|
||||
- Risk factors: [Specific risks and probabilities]
|
||||
|
||||
• **Quality Risk**: [LOW/MEDIUM/HIGH]
|
||||
- [Specific quality concerns from debt]
|
||||
- Mitigation strategies: [Options for reducing risk]
|
||||
|
||||
• **Future Feature Impact**:
|
||||
- This feature will [add to/reduce/not affect] debt burden
|
||||
- Related future features will be [easier/harder/unaffected]
|
||||
|
||||
**RECOMMENDATIONS**
|
||||
|
||||
1. **[Option 1]**: [Approach with pros/cons]
|
||||
2. **[Option 2]**: [Alternative approach with trade-offs]
|
||||
3. **Recommended**: [Chosen approach with justification]
|
||||
|
||||
---
|
||||
|
||||
## Engineering Team Templates
|
||||
|
||||
### Team Health Check
|
||||
|
||||
**Subject**: Weekly Team Health Check - [Date]
|
||||
|
||||
---
|
||||
|
||||
**DEBT BURDEN THIS WEEK**
|
||||
|
||||
• **New Debt Identified**: [X] items ([categories])
|
||||
• **Debt Resolved**: [X] items ([X] hours saved)
|
||||
• **Net Change**: [Positive/Negative] [X] items
|
||||
• **Top Pain Points**: [Developer-reported friction areas]
|
||||
|
||||
**VELOCITY IMPACT**
|
||||
|
||||
• **Stories Affected by Debt**: [X] of [Y] planned stories
|
||||
• **Estimated Overhead**: [X] hours of extra work due to debt
|
||||
• **Blocked Work**: [Any stories waiting on debt resolution]
|
||||
|
||||
**TEAM SENTIMENT**
|
||||
|
||||
• **Frustration Level**: [1-5 scale] ([trend])
|
||||
• **Confidence in Codebase**: [1-5 scale] ([trend])
|
||||
• **Top Complaints**: [Most common developer concerns]
|
||||
|
||||
**ACTIONS THIS WEEK**
|
||||
|
||||
• **Debt Work Planned**: [Specific items and assignees]
|
||||
• **Prevention Measures**: [Process improvements or reviews]
|
||||
• **Escalations**: [Issues needing management attention]
|
||||
|
||||
---
|
||||
|
||||
### Architecture Decision Record (ADR) Template
|
||||
|
||||
**Subject**: ADR-[XXX]: [Decision Title] - Technical Debt Consideration
|
||||
|
||||
---
|
||||
|
||||
**Status**: [Proposed/Accepted/Deprecated]
|
||||
**Date**: [YYYY-MM-DD]
|
||||
**Decision Makers**: [Names]
|
||||
|
||||
**CONTEXT**
|
||||
|
||||
[Background and current situation]
|
||||
|
||||
**TECHNICAL DEBT ANALYSIS**
|
||||
|
||||
• **Debt Created by This Decision**:
|
||||
- [Specific debt that will be introduced]
|
||||
- [Estimated effort to resolve later: X points]
|
||||
- [Interest rate: impact over time]
|
||||
|
||||
• **Debt Resolved by This Decision**:
|
||||
- [Existing debt this addresses]
|
||||
- [Estimated effort saved: X points]
|
||||
- [Risk reduction achieved]
|
||||
|
||||
• **Net Debt Impact**: [Positive/Negative/Neutral]
|
||||
|
||||
**DECISION**
|
||||
|
||||
[What we decided to do]
|
||||
|
||||
**RATIONALE**
|
||||
|
||||
[Why we made this decision, including debt trade-offs]
|
||||
|
||||
**DEBT MANAGEMENT PLAN**
|
||||
|
||||
• **Monitoring**: [How we'll track the debt introduced]
|
||||
• **Timeline**: [When we plan to address the debt]
|
||||
• **Success Criteria**: [How we'll know it's time to pay down the debt]
|
||||
|
||||
**CONSEQUENCES**
|
||||
|
||||
[Expected outcomes, including debt implications]
|
||||
|
||||
---
|
||||
|
||||
## Customer-Facing Templates
|
||||
|
||||
### Release Notes - Quality Improvements
|
||||
|
||||
**Subject**: Platform Stability and Performance Improvements - Release [X.Y]
|
||||
|
||||
---
|
||||
|
||||
**QUALITY IMPROVEMENTS**
|
||||
|
||||
We've invested significant effort in improving the reliability and performance of our platform. While these changes aren't feature additions, they provide important benefits:
|
||||
|
||||
**RELIABILITY ENHANCEMENTS**
|
||||
|
||||
• **Reduced Error Rates**: [X]% fewer errors in [specific area]
|
||||
• **Improved Uptime**: [X]% improvement in system availability
|
||||
• **Faster Recovery**: [X]% faster recovery from service interruptions
|
||||
|
||||
**PERFORMANCE IMPROVEMENTS**
|
||||
|
||||
• **Page Load Speed**: [X]% faster loading for [specific features]
|
||||
• **API Response Time**: [X]% improvement in response times
|
||||
• **Resource Usage**: [X]% reduction in memory/CPU usage
|
||||
|
||||
**SECURITY STRENGTHENING**
|
||||
|
||||
• **Vulnerability Resolution**: Addressed [X] security findings
|
||||
• **Authentication Improvements**: Enhanced login security and reliability
|
||||
• **Data Protection**: Improved data encryption and access controls
|
||||
|
||||
**WHAT THIS MEANS FOR YOU**
|
||||
|
||||
• **Better User Experience**: Fewer interruptions, faster responses
|
||||
• **Increased Reliability**: Less downtime, more predictable performance
|
||||
• **Enhanced Security**: Your data is better protected
|
||||
|
||||
We continue to balance new feature development with platform investments to ensure a reliable, secure, and performant experience.
|
||||
|
||||
---
|
||||
|
||||
### Service Incident Communication
|
||||
|
||||
**Subject**: Service Update - [Brief Description] - [Status]
|
||||
|
||||
---
|
||||
|
||||
**INCIDENT SUMMARY**
|
||||
|
||||
• **Impact**: [Description of customer impact]
|
||||
• **Duration**: [Start time] - [End time / Ongoing]
|
||||
• **Root Cause**: [High-level, customer-appropriate explanation]
|
||||
• **Resolution**: [What was done to fix it]
|
||||
|
||||
**TECHNICAL DEBT CONNECTION**
|
||||
|
||||
This incident was [directly caused by / contributed to by / unrelated to] technical debt in our system. Specifically:
|
||||
|
||||
• **Contributing Factors**: [How debt played a role, if any]
|
||||
• **Prevention Measures**: [Debt work planned to prevent recurrence]
|
||||
• **Timeline**: [When preventive measures will be completed]
|
||||
|
||||
**IMMEDIATE ACTIONS**
|
||||
|
||||
1. [Action 1 with timeline]
|
||||
2. [Action 2 with timeline]
|
||||
3. [Action 3 with timeline]
|
||||
|
||||
**LONG-TERM IMPROVEMENTS**
|
||||
|
||||
We're investing in [specific technical improvements] to prevent similar issues:
|
||||
|
||||
• **Infrastructure**: [Relevant infrastructure debt work]
|
||||
• **Monitoring**: [Observability improvements planned]
|
||||
• **Process**: [Development process improvements]
|
||||
|
||||
We apologize for the inconvenience and appreciate your patience as we continue to strengthen our platform.
|
||||
|
||||
---
|
||||
|
||||
## Internal Communication Templates
|
||||
|
||||
### Engineering All-Hands Presentation
|
||||
|
||||
**Slide Template: Technical Debt State of the Union**
|
||||
|
||||
---
|
||||
|
||||
**SLIDE 1: Current State**
|
||||
- Health Score: [X]/100 [Trend arrow]
|
||||
- Total Debt Items: [X] ([X]% of codebase)
|
||||
- High Priority: [X] items requiring immediate attention
|
||||
- Team Impact: [X]% velocity reduction
|
||||
|
||||
**SLIDE 2: What We've Accomplished**
|
||||
- Resolved [X] debt items ([X] hours of future work saved)
|
||||
- Improved health score by [X] points
|
||||
- Key wins: [2-3 specific examples with business impact]
|
||||
|
||||
**SLIDE 3: Current Focus Areas**
|
||||
- [Category 1]: [X] items, [business impact]
|
||||
- [Category 2]: [X] items, [business impact]
|
||||
- [Category 3]: [X] items, [business impact]
|
||||
|
||||
**SLIDE 4: Success Stories**
|
||||
- [Specific example]: [Problem] → [Solution] → [Outcome]
|
||||
- Metrics: [Before/after comparison]
|
||||
- Team feedback: [Developer quotes]
|
||||
|
||||
**SLIDE 5: Looking Forward**
|
||||
- Q[X] Goals: [Specific targets]
|
||||
- Major Initiatives: [2-3 big-picture improvements]
|
||||
- How You Can Help: [Specific asks of the team]
|
||||
|
||||
---
|
||||
|
||||
### Retrospective Templates
|
||||
|
||||
**Sprint Retrospective - Debt Focus**
|
||||
|
||||
**What Went Well**:
|
||||
• Debt work completed: [Specific items and impact]
|
||||
• Process improvements: [What worked for debt management]
|
||||
• Team collaboration: [Cross-functional debt work successes]
|
||||
|
||||
**What Didn't Go Well**:
|
||||
• Debt work challenges: [Obstacles encountered]
|
||||
• Scope creep: [Debt work that expanded beyond estimates]
|
||||
• Communication gaps: [Information that wasn't shared effectively]
|
||||
|
||||
**Action Items**:
|
||||
• **Process**: [Changes to how we handle debt work]
|
||||
• **Planning**: [Improvements to debt estimation/prioritization]
|
||||
• **Prevention**: [Changes to prevent new debt creation]
|
||||
• **Tools**: [Tooling improvements needed]
|
||||
|
||||
---
|
||||
|
||||
## Communication Best Practices
|
||||
|
||||
### Do's and Don'ts
|
||||
|
||||
**DO**:
|
||||
• Use business language, not technical jargon
|
||||
• Quantify impact with specific metrics
|
||||
• Provide clear timelines and expectations
|
||||
• Acknowledge trade-offs and constraints
|
||||
• Connect debt work to business outcomes
|
||||
• Be proactive in communication
|
||||
|
||||
**DON'T**:
|
||||
• Blame previous decisions or developers
|
||||
• Use fear-based messaging exclusively
|
||||
• Overwhelm stakeholders with technical details
|
||||
• Make promises without clear plans
|
||||
• Ignore the business context
|
||||
• Assume stakeholders understand technical implications
|
||||
|
||||
### Tailoring Messages
|
||||
|
||||
**For Executives**: Focus on business impact, ROI, and strategic implications
|
||||
**For Product**: Focus on feature impact, timeline risks, and user experience
|
||||
**For Engineering**: Focus on technical details, process improvements, and developer experience
|
||||
**For Customers**: Focus on reliability, performance, and security benefits
|
||||
|
||||
### Frequency Guidelines
|
||||
|
||||
**Real-time**: Critical security issues, production incidents
|
||||
**Weekly**: Team health checks, sprint impacts
|
||||
**Monthly**: Stakeholder updates, trend analysis
|
||||
**Quarterly**: Strategic reviews, investment planning
|
||||
**As-needed**: Major decisions, significant changes
|
||||
|
||||
These templates should be customized for your organization's communication style, stakeholder preferences, and business context.
|
||||
970
engineering/tech-debt-tracker/scripts/debt_dashboard.py
Normal file
970
engineering/tech-debt-tracker/scripts/debt_dashboard.py
Normal file
@@ -0,0 +1,970 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Tech Debt Dashboard
|
||||
|
||||
Takes historical debt inventories (multiple scans over time) and generates trend analysis,
|
||||
debt velocity (accruing vs paying down), health score, and executive summary.
|
||||
|
||||
Usage:
|
||||
python debt_dashboard.py historical_data.json
|
||||
python debt_dashboard.py data1.json data2.json data3.json
|
||||
python debt_dashboard.py --input-dir ./debt_scans/ --output dashboard_report.json
|
||||
python debt_dashboard.py historical_data.json --period quarterly --team-size 8
|
||||
"""
|
||||
|
||||
import json
|
||||
import argparse
|
||||
import sys
|
||||
import os
|
||||
from collections import defaultdict, Counter
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Any, Optional, Tuple
|
||||
from dataclasses import dataclass, asdict
|
||||
from statistics import mean, median, stdev
|
||||
import re
|
||||
|
||||
|
||||
@dataclass
class HealthMetrics:
    """Point-in-time health metrics for one reporting period.

    Attributes:
        overall_score: Composite health score on a 0-100 scale.
        debt_density: Average number of debt items per file.
        velocity_impact: Estimated development-velocity reduction, in percent.
        quality_score: Quality sub-score on a 0-100 scale.
        maintainability_score: Maintainability sub-score on a 0-100 scale.
        technical_risk_score: Technical-risk sub-score on a 0-100 scale.
    """

    overall_score: float
    debt_density: float
    velocity_impact: float
    quality_score: float
    maintainability_score: float
    technical_risk_score: float
|
||||
|
||||
|
||||
@dataclass
class TrendAnalysis:
    """Trend analysis of one debt metric across historical snapshots.

    Attributes:
        metric_name: Name of the metric being tracked.
        trend_direction: One of "improving", "declining", or "stable".
        change_rate: Rate of change per reporting period.
        correlation_strength: Correlation coefficient in the range -1 to 1.
        forecast_next_period: Forecast metric value for the next period.
        confidence_interval: Two-float confidence bounds for the forecast.
    """

    metric_name: str
    trend_direction: str
    change_rate: float
    correlation_strength: float
    forecast_next_period: float
    confidence_interval: Tuple[float, float]
|
||||
|
||||
|
||||
@dataclass
class DebtVelocity:
    """Debt creation vs. resolution velocity for one reporting period.

    Attributes:
        period: Label of the reporting period.
        new_debt_items: Debt items introduced during the period.
        resolved_debt_items: Debt items resolved during the period.
        net_change: Net change in the number of open debt items.
        velocity_ratio: Ratio resolved/new; values above 1 mean debt is
            being paid down faster than it accrues.
        effort_hours_added: Estimated effort hours of debt added.
        effort_hours_resolved: Estimated effort hours of debt resolved.
        net_effort_change: Net change in outstanding effort hours.
    """

    period: str
    new_debt_items: int
    resolved_debt_items: int
    net_change: int
    velocity_ratio: float
    effort_hours_added: float
    effort_hours_resolved: float
    net_effort_change: float
|
||||
|
||||
|
||||
class DebtDashboard:
|
||||
"""Main dashboard class for debt trend analysis and reporting."""
|
||||
|
||||
def __init__(self, team_size: int = 5):
|
||||
self.team_size = team_size
|
||||
self.historical_data = []
|
||||
self.processed_snapshots = []
|
||||
self.trend_analyses = {}
|
||||
self.health_history = []
|
||||
self.velocity_history = []
|
||||
|
||||
# Configuration for health scoring
|
||||
self.health_weights = {
|
||||
"debt_density": 0.25,
|
||||
"complexity_score": 0.20,
|
||||
"test_coverage_proxy": 0.15,
|
||||
"documentation_proxy": 0.10,
|
||||
"security_score": 0.15,
|
||||
"maintainability": 0.15
|
||||
}
|
||||
|
||||
# Thresholds for categorization
|
||||
self.thresholds = {
|
||||
"excellent": 85,
|
||||
"good": 70,
|
||||
"fair": 55,
|
||||
"poor": 40
|
||||
}
|
||||
|
||||
    def load_historical_data(self, file_paths: List[str]) -> bool:
        """Load multiple debt inventory files for historical analysis.

        Accepts three on-disk JSON formats: scanner output (dict with a
        "debt_items" key), prioritizer output (dict with
        "prioritized_backlog"), or a raw array of debt items. Each file
        becomes one snapshot; unreadable or unrecognized files are reported
        and skipped. Returns True if at least one snapshot was loaded;
        snapshots end up chronologically sorted in self.historical_data.
        """
        self.historical_data = []

        for file_path in file_paths:
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)

                # Normalize data format
                if isinstance(data, dict) and 'debt_items' in data:
                    # Scanner output format
                    snapshot = {
                        "file_path": file_path,
                        # Prefer the embedded scan date; otherwise derive one
                        # from the filename (or file mtime as a last resort).
                        "scan_date": data.get("scan_metadata", {}).get("scan_date",
                                                                       self._extract_date_from_filename(file_path)),
                        "debt_items": data["debt_items"],
                        "summary": data.get("summary", {}),
                        "file_statistics": data.get("file_statistics", {})
                    }
                elif isinstance(data, dict) and 'prioritized_backlog' in data:
                    # Prioritizer output format
                    snapshot = {
                        "file_path": file_path,
                        "scan_date": data.get("metadata", {}).get("analysis_date",
                                                                  self._extract_date_from_filename(file_path)),
                        "debt_items": data["prioritized_backlog"],
                        "summary": data.get("insights", {}),
                        "file_statistics": {}
                    }
                elif isinstance(data, list):
                    # Raw debt items array
                    snapshot = {
                        "file_path": file_path,
                        "scan_date": self._extract_date_from_filename(file_path),
                        "debt_items": data,
                        "summary": {},
                        "file_statistics": {}
                    }
                else:
                    raise ValueError(f"Unrecognized data format in {file_path}")

                self.historical_data.append(snapshot)

            # Best-effort loading: one bad file must not abort the whole run.
            except Exception as e:
                print(f"Error loading {file_path}: {e}")
                continue

        if not self.historical_data:
            print("No valid data files loaded.")
            return False

        # Sort by date
        self.historical_data.sort(key=lambda x: x["scan_date"])
        print(f"Loaded {len(self.historical_data)} historical snapshots")
        return True
def load_from_directory(self, directory_path: str, pattern: str = "*.json") -> bool:
|
||||
"""Load all JSON files from a directory."""
|
||||
directory = Path(directory_path)
|
||||
if not directory.exists():
|
||||
print(f"Directory does not exist: {directory_path}")
|
||||
return False
|
||||
|
||||
file_paths = []
|
||||
for file_path in directory.glob(pattern):
|
||||
if file_path.is_file():
|
||||
file_paths.append(str(file_path))
|
||||
|
||||
if not file_paths:
|
||||
print(f"No matching files found in {directory_path}")
|
||||
return False
|
||||
|
||||
return self.load_historical_data(file_paths)
|
||||
|
||||
def _extract_date_from_filename(self, file_path: str) -> str:
|
||||
"""Extract date from filename if possible, otherwise use current date."""
|
||||
filename = Path(file_path).name
|
||||
|
||||
# Try to find date patterns in filename
|
||||
date_patterns = [
|
||||
r"(\d{4}-\d{2}-\d{2})", # YYYY-MM-DD
|
||||
r"(\d{4}\d{2}\d{2})", # YYYYMMDD
|
||||
r"(\d{2}-\d{2}-\d{4})", # MM-DD-YYYY
|
||||
]
|
||||
|
||||
for pattern in date_patterns:
|
||||
match = re.search(pattern, filename)
|
||||
if match:
|
||||
date_str = match.group(1)
|
||||
try:
|
||||
if len(date_str) == 8: # YYYYMMDD
|
||||
date_str = f"{date_str[:4]}-{date_str[4:6]}-{date_str[6:]}"
|
||||
datetime.strptime(date_str, "%Y-%m-%d")
|
||||
return date_str + "T12:00:00"
|
||||
except ValueError:
|
||||
continue
|
||||
|
||||
# Fallback to file modification time
|
||||
try:
|
||||
mtime = os.path.getmtime(file_path)
|
||||
return datetime.fromtimestamp(mtime).isoformat()
|
||||
except:
|
||||
return datetime.now().isoformat()
|
||||
|
||||
    def generate_dashboard(self, period: str = "monthly") -> Dict[str, Any]:
        """
        Generate comprehensive debt dashboard.

        Requires load_historical_data()/load_from_directory() to have been
        called first. Mutates instance state (processed snapshots, health
        history, trend analyses, velocity history) as a side effect, then
        assembles everything into one JSON-serializable dict.

        Args:
            period: Analysis period ("weekly", "monthly", "quarterly")

        Returns:
            Dictionary containing dashboard data and analysis
        """
        print(f"Generating debt dashboard for {len(self.historical_data)} snapshots...")
        print(f"Analysis period: {period}")
        print("=" * 50)

        # Step 1: Process historical snapshots
        self._process_snapshots()

        # Step 2: Calculate health metrics for each snapshot
        self._calculate_health_metrics()

        # Step 3: Analyze trends
        self._analyze_trends(period)

        # Step 4: Calculate debt velocity
        self._calculate_debt_velocity(period)

        # Step 5: Generate forecasts
        forecasts = self._generate_forecasts()

        # Step 6: Create executive summary
        executive_summary = self._generate_executive_summary()

        # Step 7: Generate recommendations
        recommendations = self._generate_strategic_recommendations()

        # Step 8: Create visualizations data
        visualizations = self._generate_visualization_data()

        dashboard_data = {
            "metadata": {
                "generated_date": datetime.now().isoformat(),
                "analysis_period": period,
                "snapshots_analyzed": len(self.historical_data),
                "date_range": {
                    "start": self.historical_data[0]["scan_date"] if self.historical_data else None,
                    "end": self.historical_data[-1]["scan_date"] if self.historical_data else None
                },
                "team_size": self.team_size
            },
            "executive_summary": executive_summary,
            # Most recent health entry (HealthMetrics fields plus "date").
            "current_health": self.health_history[-1] if self.health_history else None,
            # Dataclasses converted to plain dicts for JSON serialization.
            "trend_analysis": {name: asdict(trend) for name, trend in self.trend_analyses.items()},
            "debt_velocity": [asdict(v) for v in self.velocity_history],
            "forecasts": forecasts,
            "recommendations": recommendations,
            "visualizations": visualizations,
            "detailed_metrics": self._get_detailed_metrics()
        }

        return dashboard_data
def _process_snapshots(self):
|
||||
"""Process raw snapshots into standardized format."""
|
||||
self.processed_snapshots = []
|
||||
|
||||
for snapshot in self.historical_data:
|
||||
processed = {
|
||||
"date": snapshot["scan_date"],
|
||||
"total_debt_items": len(snapshot["debt_items"]),
|
||||
"debt_by_type": Counter(item.get("type", "unknown") for item in snapshot["debt_items"]),
|
||||
"debt_by_severity": Counter(item.get("severity", "medium") for item in snapshot["debt_items"]),
|
||||
"debt_by_category": Counter(self._categorize_debt_item(item) for item in snapshot["debt_items"]),
|
||||
"total_files": snapshot["summary"].get("total_files_scanned",
|
||||
len(snapshot["file_statistics"])),
|
||||
"total_effort_estimate": self._calculate_total_effort(snapshot["debt_items"]),
|
||||
"high_priority_count": len([item for item in snapshot["debt_items"]
|
||||
if self._is_high_priority(item)]),
|
||||
"security_debt_count": len([item for item in snapshot["debt_items"]
|
||||
if self._is_security_related(item)]),
|
||||
"raw_data": snapshot
|
||||
}
|
||||
self.processed_snapshots.append(processed)
|
||||
|
||||
def _categorize_debt_item(self, item: Dict[str, Any]) -> str:
|
||||
"""Categorize debt item into high-level categories."""
|
||||
debt_type = item.get("type", "unknown")
|
||||
|
||||
categories = {
|
||||
"code_quality": ["large_function", "high_complexity", "duplicate_code",
|
||||
"long_line", "missing_docstring"],
|
||||
"architecture": ["architecture_debt", "large_file"],
|
||||
"security": ["security_risk", "hardcoded_secrets", "sql_injection_risk"],
|
||||
"testing": ["test_debt", "missing_tests", "low_coverage"],
|
||||
"maintenance": ["todo_comment", "commented_code"],
|
||||
"dependencies": ["dependency_debt", "outdated_packages"],
|
||||
"infrastructure": ["deployment_debt", "monitoring_gaps"],
|
||||
"documentation": ["missing_docstring", "outdated_docs"]
|
||||
}
|
||||
|
||||
for category, types in categories.items():
|
||||
if debt_type in types:
|
||||
return category
|
||||
|
||||
return "other"
|
||||
|
||||
def _calculate_total_effort(self, debt_items: List[Dict[str, Any]]) -> float:
|
||||
"""Calculate total estimated effort for debt items."""
|
||||
total_effort = 0.0
|
||||
|
||||
for item in debt_items:
|
||||
# Try to get effort from existing analysis
|
||||
if "effort_estimate" in item:
|
||||
total_effort += item["effort_estimate"].get("hours_estimate", 0)
|
||||
else:
|
||||
# Estimate based on debt type and severity
|
||||
effort = self._estimate_item_effort(item)
|
||||
total_effort += effort
|
||||
|
||||
return total_effort
|
||||
|
||||
def _estimate_item_effort(self, item: Dict[str, Any]) -> float:
|
||||
"""Estimate effort for a debt item."""
|
||||
debt_type = item.get("type", "unknown")
|
||||
severity = item.get("severity", "medium")
|
||||
|
||||
base_efforts = {
|
||||
"todo_comment": 2,
|
||||
"missing_docstring": 2,
|
||||
"long_line": 1,
|
||||
"large_function": 8,
|
||||
"high_complexity": 16,
|
||||
"duplicate_code": 12,
|
||||
"large_file": 32,
|
||||
"syntax_error": 4,
|
||||
"security_risk": 20,
|
||||
"architecture_debt": 80,
|
||||
"test_debt": 16
|
||||
}
|
||||
|
||||
base_effort = base_efforts.get(debt_type, 8)
|
||||
|
||||
severity_multipliers = {
|
||||
"low": 0.5,
|
||||
"medium": 1.0,
|
||||
"high": 1.5,
|
||||
"critical": 2.0
|
||||
}
|
||||
|
||||
return base_effort * severity_multipliers.get(severity, 1.0)
|
||||
|
||||
def _is_high_priority(self, item: Dict[str, Any]) -> bool:
|
||||
"""Determine if debt item is high priority."""
|
||||
severity = item.get("severity", "medium")
|
||||
priority_score = item.get("priority_score", 0)
|
||||
debt_type = item.get("type", "")
|
||||
|
||||
return (severity in ["high", "critical"] or
|
||||
priority_score >= 7 or
|
||||
debt_type in ["security_risk", "syntax_error", "architecture_debt"])
|
||||
|
||||
def _is_security_related(self, item: Dict[str, Any]) -> bool:
|
||||
"""Determine if debt item is security-related."""
|
||||
debt_type = item.get("type", "")
|
||||
description = item.get("description", "").lower()
|
||||
|
||||
security_types = ["security_risk", "hardcoded_secrets", "sql_injection_risk"]
|
||||
security_keywords = ["password", "token", "key", "secret", "auth", "security"]
|
||||
|
||||
return (debt_type in security_types or
|
||||
any(keyword in description for keyword in security_keywords))
|
||||
|
||||
    def _calculate_health_metrics(self):
        """Calculate health metrics for each snapshot.

        Each component converts a debt ratio into a 0-100 score via
        "100 - ratio * scale" clamped at 0; the scale factors are heuristic
        weights on how strongly that debt class hurts the score. The
        weighted overall score uses self.health_weights. Results are
        appended to self.health_history as dicts with a "date" key.
        """
        self.health_history = []

        for snapshot in self.processed_snapshots:
            # Debt density (lower is better); max(1, ...) guards empty scans.
            debt_density = snapshot["total_debt_items"] / max(1, snapshot["total_files"])
            debt_density_score = max(0, 100 - (debt_density * 20))  # Scale to 0-100

            # Complexity score (based on high complexity debt)
            complex_debt_ratio = (snapshot["debt_by_type"].get("high_complexity", 0) +
                                  snapshot["debt_by_type"].get("large_function", 0)) / max(1, snapshot["total_debt_items"])
            complexity_score = max(0, 100 - (complex_debt_ratio * 100))

            # Test coverage proxy (based on test debt)
            test_debt_ratio = snapshot["debt_by_category"].get("testing", 0) / max(1, snapshot["total_debt_items"])
            test_coverage_proxy = max(0, 100 - (test_debt_ratio * 150))

            # Documentation proxy (based on documentation debt)
            doc_debt_ratio = snapshot["debt_by_category"].get("documentation", 0) / max(1, snapshot["total_debt_items"])
            documentation_proxy = max(0, 100 - (doc_debt_ratio * 100))

            # Security score (based on security debt; penalized hardest)
            security_debt_ratio = snapshot["security_debt_count"] / max(1, snapshot["total_debt_items"])
            security_score = max(0, 100 - (security_debt_ratio * 200))

            # Maintainability (based on architecture and code quality debt)
            maint_debt_count = (snapshot["debt_by_category"].get("architecture", 0) +
                                snapshot["debt_by_category"].get("code_quality", 0))
            maint_debt_ratio = maint_debt_count / max(1, snapshot["total_debt_items"])
            maintainability = max(0, 100 - (maint_debt_ratio * 120))

            # Calculate weighted overall score
            weights = self.health_weights
            overall_score = (
                debt_density_score * weights["debt_density"] +
                complexity_score * weights["complexity_score"] +
                test_coverage_proxy * weights["test_coverage_proxy"] +
                documentation_proxy * weights["documentation_proxy"] +
                security_score * weights["security_score"] +
                maintainability * weights["maintainability"]
            )

            # Velocity impact (estimated percentage reduction in team
            # velocity; capped at 50%)
            high_impact_ratio = snapshot["high_priority_count"] / max(1, snapshot["total_debt_items"])
            velocity_impact = min(50, high_impact_ratio * 30 + debt_density * 5)

            # Technical risk (0-100, higher is more risky): driven by
            # security and architecture debt counts plus the security score.
            risk_factors = snapshot["security_debt_count"] + snapshot["debt_by_type"].get("architecture_debt", 0)
            technical_risk = min(100, risk_factors * 10 + (100 - security_score))

            health_metrics = HealthMetrics(
                overall_score=round(overall_score, 1),
                debt_density=round(debt_density, 2),
                velocity_impact=round(velocity_impact, 1),
                quality_score=round((complexity_score + maintainability) / 2, 1),
                maintainability_score=round(maintainability, 1),
                technical_risk_score=round(technical_risk, 1)
            )

            # Add timestamp and store as a plain dict for easy serialization.
            health_entry = asdict(health_metrics)
            health_entry["date"] = snapshot["date"]
            self.health_history.append(health_entry)
def _analyze_trends(self, period: str):
|
||||
"""Analyze trends in various metrics."""
|
||||
self.trend_analyses = {}
|
||||
|
||||
if len(self.health_history) < 2:
|
||||
return
|
||||
|
||||
# Define metrics to analyze
|
||||
metrics_to_analyze = [
|
||||
"overall_score",
|
||||
"debt_density",
|
||||
"velocity_impact",
|
||||
"quality_score",
|
||||
"technical_risk_score"
|
||||
]
|
||||
|
||||
for metric in metrics_to_analyze:
|
||||
values = [entry[metric] for entry in self.health_history]
|
||||
dates = [datetime.fromisoformat(entry["date"].replace('Z', '+00:00'))
|
||||
for entry in self.health_history]
|
||||
|
||||
trend = self._calculate_trend(values, dates, metric)
|
||||
self.trend_analyses[metric] = trend
|
||||
|
||||
def _calculate_trend(self, values: List[float], dates: List[datetime], metric_name: str) -> TrendAnalysis:
|
||||
"""Calculate trend analysis for a specific metric."""
|
||||
if len(values) < 2:
|
||||
return TrendAnalysis(metric_name, "stable", 0.0, 0.0, values[-1], (values[-1], values[-1]))
|
||||
|
||||
# Calculate simple linear trend
|
||||
n = len(values)
|
||||
x = list(range(n)) # Time periods as numbers
|
||||
|
||||
# Linear regression
|
||||
x_mean = mean(x)
|
||||
y_mean = mean(values)
|
||||
|
||||
numerator = sum((x[i] - x_mean) * (values[i] - y_mean) for i in range(n))
|
||||
denominator = sum((x[i] - x_mean) ** 2 for i in range(n))
|
||||
|
||||
if denominator == 0:
|
||||
slope = 0
|
||||
else:
|
||||
slope = numerator / denominator
|
||||
|
||||
# Correlation strength
|
||||
if n > 2 and len(set(values)) > 1:
|
||||
try:
|
||||
correlation = numerator / (
|
||||
(sum((x[i] - x_mean) ** 2 for i in range(n)) *
|
||||
sum((values[i] - y_mean) ** 2 for i in range(n))) ** 0.5
|
||||
)
|
||||
except ZeroDivisionError:
|
||||
correlation = 0.0
|
||||
else:
|
||||
correlation = 0.0
|
||||
|
||||
# Determine trend direction
|
||||
if abs(slope) < 0.1:
|
||||
trend_direction = "stable"
|
||||
elif slope > 0:
|
||||
if metric_name in ["overall_score", "quality_score"]:
|
||||
trend_direction = "improving" # Higher is better
|
||||
else:
|
||||
trend_direction = "declining" # Higher is worse
|
||||
else:
|
||||
if metric_name in ["overall_score", "quality_score"]:
|
||||
trend_direction = "declining"
|
||||
else:
|
||||
trend_direction = "improving"
|
||||
|
||||
# Forecast next period
|
||||
forecast = values[-1] + slope
|
||||
|
||||
# Confidence interval (simple approach)
|
||||
if n > 2:
|
||||
residuals = [values[i] - (y_mean + slope * (x[i] - x_mean)) for i in range(n)]
|
||||
std_error = (sum(r**2 for r in residuals) / (n - 2)) ** 0.5
|
||||
confidence_interval = (forecast - std_error, forecast + std_error)
|
||||
else:
|
||||
confidence_interval = (forecast, forecast)
|
||||
|
||||
return TrendAnalysis(
|
||||
metric_name=metric_name,
|
||||
trend_direction=trend_direction,
|
||||
change_rate=round(slope, 3),
|
||||
correlation_strength=round(correlation, 3),
|
||||
forecast_next_period=round(forecast, 2),
|
||||
confidence_interval=(round(confidence_interval[0], 2), round(confidence_interval[1], 2))
|
||||
)
|
||||
|
||||
def _calculate_debt_velocity(self, period: str):
|
||||
"""Calculate debt velocity between snapshots."""
|
||||
self.velocity_history = []
|
||||
|
||||
if len(self.processed_snapshots) < 2:
|
||||
return
|
||||
|
||||
for i in range(1, len(self.processed_snapshots)):
|
||||
current = self.processed_snapshots[i]
|
||||
previous = self.processed_snapshots[i-1]
|
||||
|
||||
# Track debt by unique identifiers when possible
|
||||
current_debt_ids = set()
|
||||
previous_debt_ids = set()
|
||||
|
||||
current_effort = current["total_effort_estimate"]
|
||||
previous_effort = previous["total_effort_estimate"]
|
||||
|
||||
# Simple approach: compare total counts and effort
|
||||
debt_change = current["total_debt_items"] - previous["total_debt_items"]
|
||||
effort_change = current_effort - previous_effort
|
||||
|
||||
# Estimate new vs resolved (rough approximation)
|
||||
if debt_change >= 0:
|
||||
new_debt_items = debt_change
|
||||
resolved_debt_items = 0
|
||||
else:
|
||||
new_debt_items = 0
|
||||
resolved_debt_items = abs(debt_change)
|
||||
|
||||
# Calculate velocity ratio
|
||||
if new_debt_items > 0:
|
||||
velocity_ratio = resolved_debt_items / new_debt_items
|
||||
else:
|
||||
velocity_ratio = float('inf') if resolved_debt_items > 0 else 1.0
|
||||
|
||||
velocity = DebtVelocity(
|
||||
period=f"{previous['date'][:10]} to {current['date'][:10]}",
|
||||
new_debt_items=new_debt_items,
|
||||
resolved_debt_items=resolved_debt_items,
|
||||
net_change=debt_change,
|
||||
velocity_ratio=min(10.0, velocity_ratio), # Cap at 10 for display
|
||||
effort_hours_added=max(0, effort_change),
|
||||
effort_hours_resolved=max(0, -effort_change),
|
||||
net_effort_change=effort_change
|
||||
)
|
||||
|
||||
self.velocity_history.append(velocity)
|
||||
|
||||
def _generate_forecasts(self) -> Dict[str, Any]:
|
||||
"""Generate forecasts based on trend analysis."""
|
||||
if not self.trend_analyses:
|
||||
return {}
|
||||
|
||||
forecasts = {}
|
||||
|
||||
# Overall health forecast
|
||||
health_trend = self.trend_analyses.get("overall_score")
|
||||
if health_trend:
|
||||
current_score = self.health_history[-1]["overall_score"]
|
||||
forecasts["health_score_3_months"] = max(0, min(100,
|
||||
current_score + (health_trend.change_rate * 3)))
|
||||
forecasts["health_score_6_months"] = max(0, min(100,
|
||||
current_score + (health_trend.change_rate * 6)))
|
||||
|
||||
# Debt accumulation forecast
|
||||
if self.velocity_history:
|
||||
avg_net_change = mean([v.net_change for v in self.velocity_history[-3:]]) # Last 3 periods
|
||||
current_debt = self.processed_snapshots[-1]["total_debt_items"]
|
||||
|
||||
forecasts["debt_count_3_months"] = max(0, current_debt + (avg_net_change * 3))
|
||||
forecasts["debt_count_6_months"] = max(0, current_debt + (avg_net_change * 6))
|
||||
|
||||
# Risk forecast
|
||||
risk_trend = self.trend_analyses.get("technical_risk_score")
|
||||
if risk_trend:
|
||||
current_risk = self.health_history[-1]["technical_risk_score"]
|
||||
forecasts["risk_score_3_months"] = max(0, min(100,
|
||||
current_risk + (risk_trend.change_rate * 3)))
|
||||
|
||||
return forecasts
|
||||
|
||||
def _generate_executive_summary(self) -> Dict[str, Any]:
|
||||
"""Generate executive summary of debt status."""
|
||||
if not self.health_history:
|
||||
return {}
|
||||
|
||||
current_health = self.health_history[-1]
|
||||
|
||||
# Determine overall status
|
||||
score = current_health["overall_score"]
|
||||
if score >= self.thresholds["excellent"]:
|
||||
status = "excellent"
|
||||
status_message = "Code quality is excellent with minimal technical debt."
|
||||
elif score >= self.thresholds["good"]:
|
||||
status = "good"
|
||||
status_message = "Code quality is good with manageable technical debt."
|
||||
elif score >= self.thresholds["fair"]:
|
||||
status = "fair"
|
||||
status_message = "Code quality needs attention. Technical debt is accumulating."
|
||||
else:
|
||||
status = "poor"
|
||||
status_message = "Critical: High levels of technical debt requiring immediate action."
|
||||
|
||||
# Key insights
|
||||
insights = []
|
||||
|
||||
if len(self.health_history) > 1:
|
||||
prev_health = self.health_history[-2]
|
||||
score_change = current_health["overall_score"] - prev_health["overall_score"]
|
||||
|
||||
if score_change > 5:
|
||||
insights.append("Health score improving significantly")
|
||||
elif score_change < -5:
|
||||
insights.append("Health score declining - attention needed")
|
||||
|
||||
if current_health["velocity_impact"] > 20:
|
||||
insights.append("High velocity impact detected - development speed affected")
|
||||
|
||||
if current_health["technical_risk_score"] > 70:
|
||||
insights.append("High technical risk - security and stability concerns")
|
||||
|
||||
# Debt velocity insight
|
||||
if self.velocity_history:
|
||||
recent_velocity = self.velocity_history[-1]
|
||||
if recent_velocity.velocity_ratio < 0.5:
|
||||
insights.append("Debt accumulating faster than resolution")
|
||||
elif recent_velocity.velocity_ratio > 1.5:
|
||||
insights.append("Good progress on debt reduction")
|
||||
|
||||
return {
|
||||
"overall_status": status,
|
||||
"health_score": current_health["overall_score"],
|
||||
"status_message": status_message,
|
||||
"key_insights": insights,
|
||||
"total_debt_items": self.processed_snapshots[-1]["total_debt_items"] if self.processed_snapshots else 0,
|
||||
"estimated_effort_hours": self.processed_snapshots[-1]["total_effort_estimate"] if self.processed_snapshots else 0,
|
||||
"high_priority_items": self.processed_snapshots[-1]["high_priority_count"] if self.processed_snapshots else 0,
|
||||
"velocity_impact_percent": current_health["velocity_impact"]
|
||||
}
|
||||
|
||||
def _generate_strategic_recommendations(self) -> List[Dict[str, Any]]:
|
||||
"""Generate strategic recommendations for debt management."""
|
||||
recommendations = []
|
||||
|
||||
if not self.health_history:
|
||||
return recommendations
|
||||
|
||||
current_health = self.health_history[-1]
|
||||
current_snapshot = self.processed_snapshots[-1] if self.processed_snapshots else {}
|
||||
|
||||
# Health-based recommendations
|
||||
if current_health["overall_score"] < 50:
|
||||
recommendations.append({
|
||||
"priority": "critical",
|
||||
"category": "immediate_action",
|
||||
"title": "Initiate Emergency Debt Reduction",
|
||||
"description": "Current health score is critically low. Consider dedicating 50%+ of development capacity to debt reduction.",
|
||||
"impact": "high",
|
||||
"effort": "high"
|
||||
})
|
||||
|
||||
# Velocity impact recommendations
|
||||
if current_health["velocity_impact"] > 25:
|
||||
recommendations.append({
|
||||
"priority": "high",
|
||||
"category": "productivity",
|
||||
"title": "Address Velocity Blockers",
|
||||
"description": f"Technical debt is reducing team velocity by {current_health['velocity_impact']:.1f}%. Focus on high-impact debt items first.",
|
||||
"impact": "high",
|
||||
"effort": "medium"
|
||||
})
|
||||
|
||||
# Security recommendations
|
||||
if current_health["technical_risk_score"] > 70:
|
||||
recommendations.append({
|
||||
"priority": "high",
|
||||
"category": "security",
|
||||
"title": "Security Debt Review Required",
|
||||
"description": "High technical risk score indicates security vulnerabilities. Conduct immediate security debt audit.",
|
||||
"impact": "high",
|
||||
"effort": "medium"
|
||||
})
|
||||
|
||||
# Trend-based recommendations
|
||||
health_trend = self.trend_analyses.get("overall_score")
|
||||
if health_trend and health_trend.trend_direction == "declining":
|
||||
recommendations.append({
|
||||
"priority": "medium",
|
||||
"category": "process",
|
||||
"title": "Implement Debt Prevention Measures",
|
||||
"description": "Health score is declining over time. Establish coding standards, automated quality gates, and regular debt reviews.",
|
||||
"impact": "medium",
|
||||
"effort": "medium"
|
||||
})
|
||||
|
||||
# Category-specific recommendations
|
||||
if current_snapshot:
|
||||
debt_by_category = current_snapshot["debt_by_category"]
|
||||
top_category = debt_by_category.most_common(1)[0] if debt_by_category else None
|
||||
|
||||
if top_category and top_category[1] > 10:
|
||||
category, count = top_category
|
||||
recommendations.append({
|
||||
"priority": "medium",
|
||||
"category": "focus_area",
|
||||
"title": f"Focus on {category.replace('_', ' ').title()} Debt",
|
||||
"description": f"{category.replace('_', ' ').title()} represents the largest debt category ({count} items). Consider targeted initiatives.",
|
||||
"impact": "medium",
|
||||
"effort": "medium"
|
||||
})
|
||||
|
||||
# Velocity-based recommendations
|
||||
if self.velocity_history:
|
||||
recent_velocities = self.velocity_history[-3:] if len(self.velocity_history) >= 3 else self.velocity_history
|
||||
avg_velocity_ratio = mean([v.velocity_ratio for v in recent_velocities])
|
||||
|
||||
if avg_velocity_ratio < 0.8:
|
||||
recommendations.append({
|
||||
"priority": "medium",
|
||||
"category": "capacity",
|
||||
"title": "Increase Debt Resolution Capacity",
|
||||
"description": "Debt is accumulating faster than resolution. Consider increasing debt budget or improving resolution efficiency.",
|
||||
"impact": "medium",
|
||||
"effort": "low"
|
||||
})
|
||||
|
||||
return recommendations
|
||||
|
||||
def _generate_visualization_data(self) -> Dict[str, Any]:
|
||||
"""Generate data for dashboard visualizations."""
|
||||
visualizations = {}
|
||||
|
||||
# Health score timeline
|
||||
visualizations["health_timeline"] = [
|
||||
{
|
||||
"date": entry["date"][:10], # Date only
|
||||
"overall_score": entry["overall_score"],
|
||||
"quality_score": entry["quality_score"],
|
||||
"technical_risk": entry["technical_risk_score"]
|
||||
}
|
||||
for entry in self.health_history
|
||||
]
|
||||
|
||||
# Debt accumulation trend
|
||||
visualizations["debt_accumulation"] = [
|
||||
{
|
||||
"date": snapshot["date"][:10],
|
||||
"total_debt": snapshot["total_debt_items"],
|
||||
"high_priority": snapshot["high_priority_count"],
|
||||
"security_debt": snapshot["security_debt_count"]
|
||||
}
|
||||
for snapshot in self.processed_snapshots
|
||||
]
|
||||
|
||||
# Category distribution (latest snapshot)
|
||||
if self.processed_snapshots:
|
||||
latest_categories = self.processed_snapshots[-1]["debt_by_category"]
|
||||
visualizations["category_distribution"] = [
|
||||
{"category": category, "count": count}
|
||||
for category, count in latest_categories.items()
|
||||
]
|
||||
|
||||
# Velocity chart
|
||||
visualizations["debt_velocity"] = [
|
||||
{
|
||||
"period": velocity.period,
|
||||
"new_items": velocity.new_debt_items,
|
||||
"resolved_items": velocity.resolved_debt_items,
|
||||
"net_change": velocity.net_change,
|
||||
"velocity_ratio": velocity.velocity_ratio
|
||||
}
|
||||
for velocity in self.velocity_history
|
||||
]
|
||||
|
||||
# Effort estimation trend
|
||||
visualizations["effort_trend"] = [
|
||||
{
|
||||
"date": snapshot["date"][:10],
|
||||
"total_effort": snapshot["total_effort_estimate"]
|
||||
}
|
||||
for snapshot in self.processed_snapshots
|
||||
]
|
||||
|
||||
return visualizations
|
||||
|
||||
def _get_detailed_metrics(self) -> Dict[str, Any]:
|
||||
"""Get detailed metrics for the current state."""
|
||||
if not self.processed_snapshots:
|
||||
return {}
|
||||
|
||||
current = self.processed_snapshots[-1]
|
||||
|
||||
return {
|
||||
"debt_breakdown": dict(current["debt_by_type"]),
|
||||
"severity_breakdown": dict(current["debt_by_severity"]),
|
||||
"category_breakdown": dict(current["debt_by_category"]),
|
||||
"files_analyzed": current["total_files"],
|
||||
"debt_density": current["total_debt_items"] / max(1, current["total_files"]),
|
||||
"average_effort_per_item": current["total_effort_estimate"] / max(1, current["total_debt_items"])
|
||||
}
|
||||
|
||||
|
||||
def format_dashboard_report(dashboard_data: Dict[str, Any]) -> str:
    """Format dashboard data into human-readable report.

    Expects the dict produced by DebtDashboard.generate_dashboard(); renders
    header metadata, the executive summary, current health, per-metric
    trends and up to five recommendations as plain text. Returns the report
    as a single newline-joined string.
    """
    output = []

    # Header
    output.append("=" * 60)
    output.append("TECHNICAL DEBT DASHBOARD")
    output.append("=" * 60)
    metadata = dashboard_data["metadata"]
    # Timestamp truncated to second precision (drops microseconds).
    output.append(f"Generated: {metadata['generated_date'][:19]}")
    output.append(f"Analysis Period: {metadata['analysis_period']}")
    output.append(f"Snapshots Analyzed: {metadata['snapshots_analyzed']}")
    if metadata["date_range"]["start"]:
        output.append(f"Date Range: {metadata['date_range']['start'][:10]} to {metadata['date_range']['end'][:10]}")
    output.append("")

    # Executive Summary
    exec_summary = dashboard_data["executive_summary"]
    output.append("EXECUTIVE SUMMARY")
    output.append("-" * 30)
    output.append(f"Overall Status: {exec_summary['overall_status'].upper()}")
    output.append(f"Health Score: {exec_summary['health_score']:.1f}/100")
    output.append(f"Status: {exec_summary['status_message']}")
    output.append("")
    output.append("Key Metrics:")
    output.append(f"  • Total Debt Items: {exec_summary['total_debt_items']}")
    output.append(f"  • High Priority Items: {exec_summary['high_priority_items']}")
    output.append(f"  • Estimated Effort: {exec_summary['estimated_effort_hours']:.1f} hours")
    output.append(f"  • Velocity Impact: {exec_summary['velocity_impact_percent']:.1f}%")
    output.append("")

    if exec_summary["key_insights"]:
        output.append("Key Insights:")
        for insight in exec_summary["key_insights"]:
            output.append(f"  • {insight}")
        output.append("")

    # Current Health
    if dashboard_data["current_health"]:
        health = dashboard_data["current_health"]
        output.append("CURRENT HEALTH METRICS")
        output.append("-" * 30)
        output.append(f"Overall Score: {health['overall_score']:.1f}/100")
        output.append(f"Quality Score: {health['quality_score']:.1f}/100")
        output.append(f"Maintainability: {health['maintainability_score']:.1f}/100")
        output.append(f"Technical Risk: {health['technical_risk_score']:.1f}/100")
        output.append(f"Debt Density: {health['debt_density']:.2f} items/file")
        output.append("")

    # Trend Analysis (trend entries are plain dicts after asdict()).
    trends = dashboard_data["trend_analysis"]
    if trends:
        output.append("TREND ANALYSIS")
        output.append("-" * 30)
        for metric, trend in trends.items():
            # Arrow glyph per direction; unknown directions render as "→".
            direction_symbol = {
                "improving": "↑",
                "declining": "↓",
                "stable": "→"
            }.get(trend["trend_direction"], "→")

            output.append(f"{metric.replace('_', ' ').title()}: {direction_symbol} {trend['trend_direction']}")
            output.append(f"  Change Rate: {trend['change_rate']:.3f} per period")
            output.append(f"  Forecast: {trend['forecast_next_period']:.1f}")
        output.append("")

    # Top Recommendations (at most five shown)
    recommendations = dashboard_data["recommendations"]
    if recommendations:
        output.append("TOP RECOMMENDATIONS")
        output.append("-" * 30)
        for i, rec in enumerate(recommendations[:5], 1):
            output.append(f"{i}. [{rec['priority'].upper()}] {rec['title']}")
            output.append(f"   {rec['description']}")
            output.append(f"   Impact: {rec['impact']}, Effort: {rec['effort']}")
        output.append("")

    return "\n".join(output)
def main():
    """Command-line entry point: load debt inventories and emit a dashboard.

    Exits non-zero when no input source is given, when loading fails, or
    when dashboard generation raises.
    """
    parser = argparse.ArgumentParser(description="Generate technical debt dashboard")
    parser.add_argument("files", nargs="*", help="Debt inventory files")
    parser.add_argument("--input-dir", help="Directory containing debt inventory files")
    parser.add_argument("--output", help="Output file path")
    parser.add_argument("--format", choices=["json", "text", "both"],
                        default="both", help="Output format")
    parser.add_argument("--period", choices=["weekly", "monthly", "quarterly"],
                        default="monthly", help="Analysis period")
    parser.add_argument("--team-size", type=int, default=5, help="Team size")

    args = parser.parse_args()

    board = DebtDashboard(args.team_size)

    # Exactly one data source is required: explicit files or a directory scan.
    if args.input_dir:
        loaded = board.load_from_directory(args.input_dir)
    elif args.files:
        loaded = board.load_historical_data(args.files)
    else:
        print("Error: Must specify either files or --input-dir")
        sys.exit(1)

    if not loaded:
        sys.exit(1)

    try:
        dashboard_data = board.generate_dashboard(args.period)
    except Exception as e:
        # Surface any generation failure as a message plus non-zero exit.
        print(f"Dashboard generation failed: {e}")
        sys.exit(1)

    def emit(rendered, extension, written_prefix, header):
        # Write to a suffixed file when --output was given; otherwise print
        # the rendered dashboard to stdout under a banner.
        if args.output:
            path = args.output if args.output.endswith(extension) else f"{args.output}{extension}"
            with open(path, 'w') as fh:
                fh.write(rendered)
            print(written_prefix + path)
        else:
            print(header)
            print("=" * 50)
            print(rendered)

    if args.format in ["json", "both"]:
        emit(json.dumps(dashboard_data, indent=2, default=str),
             '.json', "JSON dashboard written to: ", "JSON DASHBOARD:")

    if args.format in ["text", "both"]:
        emit(format_dashboard_report(dashboard_data),
             '.txt', "Text dashboard written to: ", "\nTEXT DASHBOARD:")
|
||||
|
||||
|
||||
# Script entry point: run the dashboard CLI when executed directly.
if __name__ == "__main__":
    main()
|
||||
857
engineering/tech-debt-tracker/scripts/debt_prioritizer.py
Normal file
857
engineering/tech-debt-tracker/scripts/debt_prioritizer.py
Normal file
@@ -0,0 +1,857 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Tech Debt Prioritizer
|
||||
|
||||
Takes a debt inventory (from scanner or manual JSON) and calculates interest rate,
|
||||
effort estimates, and produces a prioritized backlog with recommended sprint allocation.
|
||||
Uses cost-of-delay vs effort scoring and various prioritization frameworks.
|
||||
|
||||
Usage:
|
||||
python debt_prioritizer.py debt_inventory.json
|
||||
python debt_prioritizer.py debt_inventory.json --output prioritized_backlog.json
|
||||
python debt_prioritizer.py debt_inventory.json --team-size 6 --sprint-capacity 80
|
||||
python debt_prioritizer.py debt_inventory.json --framework wsjf --output results.json
|
||||
"""
|
||||
|
||||
import json
|
||||
import argparse
|
||||
import sys
|
||||
import math
|
||||
from collections import defaultdict, Counter
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Dict, List, Any, Optional, Tuple
|
||||
from dataclasses import dataclass, asdict
|
||||
|
||||
|
||||
@dataclass
class EffortEstimate:
    """Represents effort estimation for a debt item.

    Produced by DebtPrioritizer._estimate_effort and attached to each
    enriched item as a plain dict via dataclasses.asdict.
    """
    size_points: int  # story points, derived from hours (~6 hours per point)
    hours_estimate: float  # severity-adjusted expected hours of work
    risk_factor: float  # 1.0 = low risk, 1.5 = medium, 2.0+ = high
    skill_level_required: str  # junior, mid, senior, expert
    confidence: float  # 0.0-1.0
|
||||
|
||||
|
||||
@dataclass
class BusinessImpact:
    """Represents business impact assessment for a debt item.

    All fields are 1-10 scores produced by
    DebtPrioritizer._assess_business_impact from the item's type and severity.
    """
    customer_impact: int  # 1-10 scale
    revenue_impact: int  # 1-10 scale
    team_velocity_impact: int  # 1-10 scale
    quality_impact: int  # 1-10 scale
    security_impact: int  # 1-10 scale
|
||||
|
||||
|
||||
@dataclass
class InterestRate:
    """Represents the interest rate calculation for technical debt.

    Models the ongoing cost of leaving an item unfixed; the factors are
    multiplied together (with compounding) in
    DebtPrioritizer._calculate_cost_of_delay.
    """
    daily_cost: float  # cost per day if left unfixed
    frequency_multiplier: float  # how often this code is touched
    team_impact_multiplier: float  # how many developers affected
    compound_rate: float  # how quickly this debt makes other debt worse
|
||||
|
||||
|
||||
class DebtPrioritizer:
    """Prioritize technical debt items and produce a sprint-ready backlog.

    Debt items (plain dicts from a scanner or manual JSON) are enriched with
    effort estimates, business-impact scores, and an "interest rate" model of
    ongoing cost, then ranked with one of three frameworks: cost_of_delay,
    wsjf (Weighted Shortest Job First), or rice.
    """

    def __init__(self, team_size: int = 5, sprint_capacity_hours: int = 80):
        """Initialize with team size and per-sprint capacity in hours."""
        self.team_size = team_size
        self.sprint_capacity_hours = sprint_capacity_hours
        self.debt_items = []  # raw items loaded from the inventory
        self.prioritized_items = []  # enriched + scored items, sorted descending

        # Relative weights used by each prioritization framework.
        self.framework_weights = {
            "cost_of_delay": {
                "business_value": 0.3,
                "urgency": 0.3,
                "risk_reduction": 0.2,
                "team_productivity": 0.2
            },
            "wsjf": {
                "business_value": 0.25,
                "time_criticality": 0.25,
                "risk_reduction": 0.25,
                "effort": 0.25
            },
            "rice": {
                "reach": 0.25,
                "impact": 0.25,
                "confidence": 0.25,
                "effort": 0.25
            }
        }

    def load_debt_inventory(self, file_path: str) -> bool:
        """Load debt inventory from a JSON file.

        Accepts either a bare list of items or a dict carrying a
        'debt_items' key. Returns True on success, False (with a printed
        error) on any failure.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            # Handle different input formats
            if isinstance(data, dict) and 'debt_items' in data:
                self.debt_items = data['debt_items']
            elif isinstance(data, list):
                self.debt_items = data
            else:
                raise ValueError("Invalid debt inventory format")

            print(f"Loaded {len(self.debt_items)} debt items from {file_path}")
            return True

        except Exception as e:
            print(f"Error loading debt inventory: {e}")
            return False

    def analyze_and_prioritize(self, framework: str = "cost_of_delay") -> Dict[str, Any]:
        """
        Analyze debt items and create prioritized backlog.

        Args:
            framework: Prioritization framework to use
                ("cost_of_delay", "wsjf", or "rice").

        Returns:
            Dictionary containing prioritized backlog and analysis.

        Raises:
            ValueError: If an unknown framework name is given.
        """
        print(f"Analyzing {len(self.debt_items)} debt items...")
        print(f"Using {framework} prioritization framework")
        print("=" * 50)

        # Step 1: Enrich debt items with estimates
        enriched_items = []
        for item in self.debt_items:
            enriched_item = self._enrich_debt_item(item)
            enriched_items.append(enriched_item)

        # Step 2: Calculate prioritization scores
        for item in enriched_items:
            if framework == "cost_of_delay":
                item["priority_score"] = self._calculate_cost_of_delay_score(item)
            elif framework == "wsjf":
                item["priority_score"] = self._calculate_wsjf_score(item)
            elif framework == "rice":
                item["priority_score"] = self._calculate_rice_score(item)
            else:
                raise ValueError(f"Unknown prioritization framework: {framework}")

        # Step 3: Sort by priority score (highest first)
        self.prioritized_items = sorted(enriched_items,
                                        key=lambda x: x["priority_score"],
                                        reverse=True)

        # Step 4: Generate sprint allocation recommendations
        sprint_allocation = self._generate_sprint_allocation()

        # Step 5: Generate insights and recommendations
        insights = self._generate_insights()

        # Step 6: Create visualization data
        charts_data = self._generate_charts_data()

        return {
            "metadata": {
                "analysis_date": datetime.now().isoformat(),
                "framework_used": framework,
                "team_size": self.team_size,
                "sprint_capacity_hours": self.sprint_capacity_hours,
                "total_items_analyzed": len(self.debt_items)
            },
            "prioritized_backlog": self.prioritized_items,
            "sprint_allocation": sprint_allocation,
            "insights": insights,
            "charts_data": charts_data,
            "recommendations": self._generate_recommendations()
        }

    def _enrich_debt_item(self, item: Dict[str, Any]) -> Dict[str, Any]:
        """Enrich a debt item with estimates, impact, interest and tags."""
        enriched = item.copy()

        # Generate effort estimate
        effort = self._estimate_effort(item)
        enriched["effort_estimate"] = asdict(effort)

        # Generate business impact assessment
        business_impact = self._assess_business_impact(item)
        enriched["business_impact"] = asdict(business_impact)

        # Calculate interest rate
        interest_rate = self._calculate_interest_rate(item, business_impact)
        enriched["interest_rate"] = asdict(interest_rate)

        # Calculate cost of delay
        enriched["cost_of_delay"] = self._calculate_cost_of_delay(interest_rate, effort)

        # Assign categories and tags.
        enriched["category"] = self._categorize_debt_item(item)
        # BUG FIX: pass the enriched copy (which carries "effort_estimate") so
        # the effort-based tags see the computed hours. The raw `item` has no
        # estimate, which made effort_hours read as 0 and tagged every item
        # "quick-win" while never emitting "major-initiative".
        enriched["impact_tags"] = self._generate_impact_tags(enriched, business_impact)

        return enriched

    def _estimate_effort(self, item: Dict[str, Any]) -> EffortEstimate:
        """Estimate effort required to fix a debt item from its type/severity."""
        debt_type = item.get("type", "unknown")
        severity = item.get("severity", "medium")

        # Base (min_hours, max_hours) effort ranges by debt type.
        base_efforts = {
            "todo_comment": (1, 2),
            "missing_docstring": (1, 4),
            "long_line": (0.5, 1),
            "large_function": (4, 16),
            "high_complexity": (8, 32),
            "duplicate_code": (6, 24),
            "large_file": (16, 64),
            "syntax_error": (2, 8),
            "security_risk": (4, 40),
            "architecture_debt": (40, 160),
            "test_debt": (8, 40),
            "dependency_debt": (4, 24)
        }

        min_hours, max_hours = base_efforts.get(debt_type, (4, 16))

        # Adjust by severity
        severity_multipliers = {
            "low": 0.5,
            "medium": 1.0,
            "high": 1.5,
            "critical": 2.0
        }

        multiplier = severity_multipliers.get(severity, 1.0)
        hours_estimate = (min_hours + max_hours) / 2 * multiplier

        # Convert to story points (assuming 6 hours per point)
        size_points = max(1, round(hours_estimate / 6))

        # Determine risk factor
        risk_factor = 1.0
        if debt_type in ["architecture_debt", "security_risk", "large_file"]:
            risk_factor = 1.8
        elif debt_type in ["high_complexity", "duplicate_code"]:
            risk_factor = 1.4
        elif debt_type in ["syntax_error", "dependency_debt"]:
            risk_factor = 1.2

        # Determine skill level required
        skill_requirements = {
            "architecture_debt": "expert",
            "security_risk": "senior",
            "high_complexity": "senior",
            "large_function": "mid",
            "duplicate_code": "mid",
            "dependency_debt": "mid",
            "test_debt": "mid",
            "todo_comment": "junior",
            "missing_docstring": "junior",
            "long_line": "junior"
        }

        skill_level = skill_requirements.get(debt_type, "mid")

        # Confidence based on debt type clarity
        confidence_levels = {
            "todo_comment": 0.9,
            "missing_docstring": 0.9,
            "long_line": 0.95,
            "syntax_error": 0.8,
            "large_function": 0.7,
            "duplicate_code": 0.6,
            "high_complexity": 0.5,
            "architecture_debt": 0.3,
            "security_risk": 0.4
        }

        confidence = confidence_levels.get(debt_type, 0.6)

        return EffortEstimate(
            size_points=size_points,
            hours_estimate=hours_estimate,
            risk_factor=risk_factor,
            skill_level_required=skill_level,
            confidence=confidence
        )

    def _assess_business_impact(self, item: Dict[str, Any]) -> BusinessImpact:
        """Assess business impact of a debt item on a 1-10 scale per axis."""
        debt_type = item.get("type", "unknown")
        severity = item.get("severity", "medium")

        # Base impact scores by debt type (1-10 scale)
        impact_profiles = {
            "security_risk": (9, 8, 7, 9, 10),  # customer, revenue, velocity, quality, security
            "architecture_debt": (6, 7, 9, 8, 4),
            "large_function": (3, 4, 7, 6, 2),
            "high_complexity": (4, 5, 8, 7, 3),
            "duplicate_code": (3, 4, 6, 6, 2),
            "syntax_error": (7, 6, 8, 9, 3),
            "test_debt": (5, 5, 7, 8, 3),
            "dependency_debt": (6, 5, 6, 7, 7),
            "todo_comment": (1, 1, 2, 2, 1),
            "missing_docstring": (2, 2, 4, 3, 1)
        }

        base_impacts = impact_profiles.get(debt_type, (3, 3, 5, 5, 3))

        # Adjust by severity
        severity_adjustments = {
            "low": 0.6,
            "medium": 1.0,
            "high": 1.4,
            "critical": 1.8
        }

        adjustment = severity_adjustments.get(severity, 1.0)

        # Apply adjustment and clamp each score to the 1-10 range.
        adjusted_impacts = [min(10, max(1, round(impact * adjustment)))
                            for impact in base_impacts]

        return BusinessImpact(
            customer_impact=adjusted_impacts[0],
            revenue_impact=adjusted_impacts[1],
            team_velocity_impact=adjusted_impacts[2],
            quality_impact=adjusted_impacts[3],
            security_impact=adjusted_impacts[4]
        )

    def _calculate_interest_rate(self, item: Dict[str, Any],
                                 business_impact: BusinessImpact) -> InterestRate:
        """Calculate the interest-rate model for a debt item."""

        # Base daily cost calculation (in "developer hours lost" per day).
        velocity_impact = business_impact.team_velocity_impact
        quality_impact = business_impact.quality_impact

        daily_cost = (velocity_impact * 0.5) + (quality_impact * 0.3)

        # Frequency multiplier based on code location and type
        file_path = item.get("file_path", "")
        debt_type = item.get("type", "unknown")

        # Estimate touch frequency from path naming conventions.
        frequency_multiplier = 1.0
        if any(pattern in file_path.lower() for pattern in ["main", "core", "auth", "api"]):
            frequency_multiplier = 2.0
        elif any(pattern in file_path.lower() for pattern in ["util", "helper", "common"]):
            frequency_multiplier = 1.5
        elif any(pattern in file_path.lower() for pattern in ["test", "spec", "config"]):
            frequency_multiplier = 0.5

        # Team impact multiplier (normalized around a team of 5, capped at 8).
        team_impact_multiplier = min(self.team_size, 8) / 5.0

        # Compound rate - how this debt creates more debt
        compound_rates = {
            "architecture_debt": 0.1,  # Creates 10% more debt monthly
            "duplicate_code": 0.08,
            "high_complexity": 0.05,
            "large_function": 0.03,
            "test_debt": 0.04,
            "security_risk": 0.02,  # Doesn't compound much, but high initial impact
            "todo_comment": 0.01
        }

        compound_rate = compound_rates.get(debt_type, 0.02)

        return InterestRate(
            daily_cost=daily_cost,
            frequency_multiplier=frequency_multiplier,
            team_impact_multiplier=team_impact_multiplier,
            compound_rate=compound_rate
        )

    def _calculate_cost_of_delay(self, interest_rate: InterestRate,
                                 effort: EffortEstimate) -> float:
        """Calculate total cost of delay if the debt is not fixed."""

        # Estimate delay in days (assuming debt gets fixed eventually)
        estimated_delay_days = effort.hours_estimate / (self.sprint_capacity_hours / 14)  # 2-week sprints

        # Calculate cumulative daily cost from all interest-rate factors.
        daily_cost = (interest_rate.daily_cost *
                      interest_rate.frequency_multiplier *
                      interest_rate.team_impact_multiplier)

        # Add compound interest effect (monthly compounding).
        compound_effect = (1 + interest_rate.compound_rate) ** (estimated_delay_days / 30)

        total_cost = daily_cost * estimated_delay_days * compound_effect

        return round(total_cost, 2)

    def _categorize_debt_item(self, item: Dict[str, Any]) -> str:
        """Map a debt type onto a high-level category (or "other")."""
        debt_type = item.get("type", "unknown")

        categories = {
            "code_quality": ["large_function", "high_complexity", "duplicate_code",
                             "long_line", "missing_docstring"],
            "architecture": ["architecture_debt", "large_file"],
            "security": ["security_risk", "hardcoded_secrets"],
            "testing": ["test_debt", "missing_tests"],
            "maintenance": ["todo_comment", "commented_code"],
            "dependencies": ["dependency_debt", "outdated_packages"],
            "infrastructure": ["deployment_debt", "monitoring_gaps"],
            "documentation": ["missing_docstring", "outdated_docs"]
        }

        # First matching category wins (dict order is insertion order).
        for category, types in categories.items():
            if debt_type in types:
                return category

        return "other"

    def _generate_impact_tags(self, item: Dict[str, Any],
                              business_impact: BusinessImpact) -> List[str]:
        """Generate impact tags; expects an item enriched with "effort_estimate"."""
        tags = []

        if business_impact.security_impact >= 7:
            tags.append("security-critical")
        if business_impact.customer_impact >= 7:
            tags.append("customer-facing")
        if business_impact.revenue_impact >= 7:
            tags.append("revenue-impact")
        if business_impact.team_velocity_impact >= 7:
            tags.append("velocity-blocker")
        if business_impact.quality_impact >= 7:
            tags.append("quality-risk")

        # Add effort-based tags
        effort_hours = item.get("effort_estimate", {}).get("hours_estimate", 0)
        if effort_hours <= 4:
            tags.append("quick-win")
        elif effort_hours >= 40:
            tags.append("major-initiative")

        return tags

    def _calculate_cost_of_delay_score(self, item: Dict[str, Any]) -> float:
        """Calculate priority score using the cost-of-delay framework."""
        business_impact = item["business_impact"]
        effort = item["effort_estimate"]

        # Business value (weighted average of impacts)
        business_value = (
            business_impact["customer_impact"] * 0.3 +
            business_impact["revenue_impact"] * 0.3 +
            business_impact["quality_impact"] * 0.2 +
            business_impact["team_velocity_impact"] * 0.2
        )

        # Urgency (how quickly value decreases), scaled and clamped to 1-10.
        urgency = item["interest_rate"]["daily_cost"] * 10
        urgency = min(10, max(1, urgency))

        # Risk reduction
        risk_reduction = business_impact["security_impact"] * 0.6 + business_impact["quality_impact"] * 0.4

        # Team productivity impact
        team_productivity = business_impact["team_velocity_impact"]

        # Combine with framework weights
        weights = self.framework_weights["cost_of_delay"]
        numerator = (
            business_value * weights["business_value"] +
            urgency * weights["urgency"] +
            risk_reduction * weights["risk_reduction"] +
            team_productivity * weights["team_productivity"]
        )

        # Divide by effort (adjusted for risk), normalized to story points.
        effort_adjusted = effort["hours_estimate"] * effort["risk_factor"]
        denominator = max(1, effort_adjusted / 8)

        return round(numerator / denominator, 2)

    def _calculate_wsjf_score(self, item: Dict[str, Any]) -> float:
        """Calculate priority score using Weighted Shortest Job First (WSJF)."""
        business_impact = item["business_impact"]
        effort = item["effort_estimate"]

        # Business value
        business_value = (
            business_impact["customer_impact"] * 0.4 +
            business_impact["revenue_impact"] * 0.6
        )

        # Time criticality, normalized then clamped to 1-10.
        time_criticality = item["cost_of_delay"] / 10
        time_criticality = min(10, max(1, time_criticality))

        # Risk reduction
        risk_reduction = (
            business_impact["security_impact"] * 0.5 +
            business_impact["quality_impact"] * 0.5
        )

        # Job size (effort)
        job_size = effort["size_points"]

        # WSJF = (value + criticality + risk reduction) / job size
        numerator = business_value + time_criticality + risk_reduction
        denominator = max(1, job_size)

        return round(numerator / denominator, 2)

    def _calculate_rice_score(self, item: Dict[str, Any]) -> float:
        """Calculate priority score using the RICE framework."""
        business_impact = item["business_impact"]
        effort = item["effort_estimate"]

        # Reach (how many developers/users affected), capped at 10.
        reach = min(10, self.team_size * business_impact["team_velocity_impact"] / 5)

        # Impact
        impact = (
            business_impact["customer_impact"] * 0.3 +
            business_impact["revenue_impact"] * 0.3 +
            business_impact["quality_impact"] * 0.4
        )

        # Confidence (0-1 scaled to 0-10)
        confidence = effort["confidence"] * 10

        # Effort
        effort_score = effort["size_points"]

        # RICE = (reach * impact * confidence) / effort
        rice_score = (reach * impact * confidence) / max(1, effort_score)

        return round(rice_score, 2)

    def _generate_sprint_allocation(self) -> Dict[str, Any]:
        """Greedily pack prioritized items into sprints and summarize the plan."""
        # Calculate total effort needed
        total_effort_hours = sum(item["effort_estimate"]["hours_estimate"]
                                 for item in self.prioritized_items)

        # Assume 20% of sprint capacity goes to tech debt
        debt_capacity_per_sprint = self.sprint_capacity_hours * 0.2

        # Allocate items to sprints in priority order (first-fit, no lookahead).
        sprints = []
        current_sprint = {"sprint_number": 1, "items": [], "total_hours": 0, "capacity_used": 0}

        for item in self.prioritized_items:
            item_effort = item["effort_estimate"]["hours_estimate"]

            if current_sprint["total_hours"] + item_effort <= debt_capacity_per_sprint:
                current_sprint["items"].append(item)
                current_sprint["total_hours"] += item_effort
                current_sprint["capacity_used"] = current_sprint["total_hours"] / debt_capacity_per_sprint
            else:
                # Start new sprint
                sprints.append(current_sprint)
                current_sprint = {
                    "sprint_number": len(sprints) + 1,
                    "items": [item],
                    "total_hours": item_effort,
                    "capacity_used": item_effort / debt_capacity_per_sprint
                }

        # Add the last sprint
        if current_sprint["items"]:
            sprints.append(current_sprint)

        # Calculate summary statistics
        total_sprints_needed = len(sprints)
        high_priority_items = len([item for item in self.prioritized_items
                                   if item.get("priority", "medium") in ["high", "critical"]])

        return {
            "total_debt_hours": round(total_effort_hours, 1),
            "debt_capacity_per_sprint": debt_capacity_per_sprint,
            "total_sprints_needed": total_sprints_needed,
            "high_priority_items": high_priority_items,
            "sprint_plan": sprints[:6],  # Show first 6 sprints
            "recommendations": [
                f"Allocate {debt_capacity_per_sprint} hours per sprint to tech debt",
                f"Focus on {high_priority_items} high-priority items first",
                f"Estimated {total_sprints_needed} sprints to clear current backlog"
            ]
        }

    def _generate_insights(self) -> Dict[str, Any]:
        """Generate aggregate insights from the prioritized debt analysis."""

        # Category distribution
        categories = Counter(item["category"] for item in self.prioritized_items)

        # Effort distribution
        total_effort = sum(item["effort_estimate"]["hours_estimate"]
                           for item in self.prioritized_items)
        effort_by_category = defaultdict(float)
        for item in self.prioritized_items:
            effort_by_category[item["category"]] += item["effort_estimate"]["hours_estimate"]

        # Priority distribution bucketed by score thresholds.
        priorities = Counter()
        for item in self.prioritized_items:
            score = item["priority_score"]
            if score >= 8:
                priorities["critical"] += 1
            elif score >= 5:
                priorities["high"] += 1
            elif score >= 2:
                priorities["medium"] += 1
            else:
                priorities["low"] += 1

        # Risk analysis
        high_risk_items = [item for item in self.prioritized_items
                           if item["effort_estimate"]["risk_factor"] >= 1.5]

        # Quick wins identification (cheap and reasonably valuable).
        quick_wins = [item for item in self.prioritized_items
                      if (item["effort_estimate"]["hours_estimate"] <= 8 and
                          item["priority_score"] >= 3)]

        # Cost analysis
        total_cost_of_delay = sum(item["cost_of_delay"] for item in self.prioritized_items)
        # BUG FIX: guard the average against an empty backlog, which previously
        # raised ZeroDivisionError.
        if self.prioritized_items:
            avg_interest_rate = sum(item["interest_rate"]["daily_cost"]
                                    for item in self.prioritized_items) / len(self.prioritized_items)
        else:
            avg_interest_rate = 0.0

        return {
            "category_distribution": dict(categories),
            "total_effort_hours": round(total_effort, 1),
            "effort_by_category": {k: round(v, 1) for k, v in effort_by_category.items()},
            "priority_distribution": dict(priorities),
            "high_risk_items_count": len(high_risk_items),
            "quick_wins_count": len(quick_wins),
            "total_cost_of_delay": round(total_cost_of_delay, 1),
            "average_daily_interest_rate": round(avg_interest_rate, 2),
            "top_categories_by_effort": sorted(effort_by_category.items(),
                                               key=lambda x: x[1], reverse=True)[:3]
        }

    def _generate_charts_data(self) -> Dict[str, Any]:
        """Generate data structures for charts and visualizations."""

        # Priority vs Effort scatter plot data
        scatter_data = []
        for item in self.prioritized_items:
            scatter_data.append({
                "x": item["effort_estimate"]["hours_estimate"],
                "y": item["priority_score"],
                "label": item.get("description", "")[:50],
                "category": item["category"],
                "size": item["cost_of_delay"]
            })

        # Category effort distribution (pie chart)
        effort_by_category = defaultdict(float)
        for item in self.prioritized_items:
            effort_by_category[item["category"]] += item["effort_estimate"]["hours_estimate"]

        pie_data = [{"category": k, "effort": round(v, 1)}
                    for k, v in effort_by_category.items()]

        # Priority timeline (bar chart)
        timeline_data = []
        cumulative_effort = 0
        for i, item in enumerate(self.prioritized_items[:20]):  # Top 20 items
            cumulative_effort += item["effort_estimate"]["hours_estimate"]
            timeline_data.append({
                "item_rank": i + 1,
                "description": item.get("description", "")[:30],
                "effort": item["effort_estimate"]["hours_estimate"],
                "cumulative_effort": round(cumulative_effort, 1),
                "priority_score": item["priority_score"]
            })

        # Interest rate trend (line chart data structure)
        interest_trend_data = []
        for i, item in enumerate(self.prioritized_items):
            interest_trend_data.append({
                "item_index": i,
                "daily_cost": item["interest_rate"]["daily_cost"],
                "category": item["category"]
            })

        return {
            "priority_effort_scatter": scatter_data,
            "category_effort_distribution": pie_data,
            "priority_timeline": timeline_data,
            "interest_rate_trend": interest_trend_data[:50]  # Limit for performance
        }

    def _generate_recommendations(self) -> List[str]:
        """Generate actionable recommendations based on the analysis."""
        recommendations = []

        insights = self._generate_insights()

        # Quick wins recommendation
        if insights["quick_wins_count"] > 0:
            recommendations.append(
                f"Start with {insights['quick_wins_count']} quick wins to build momentum "
                "and demonstrate immediate value from tech debt reduction efforts."
            )

        # High-risk items
        if insights["high_risk_items_count"] > 5:
            recommendations.append(
                f"Plan careful execution for {insights['high_risk_items_count']} high-risk items. "
                "Consider pair programming, extra testing, and incremental approaches."
            )

        # Category focus. BUG FIX: guard against an empty backlog, which
        # previously raised IndexError on top_categories_by_effort[0].
        if insights["top_categories_by_effort"]:
            top_category, top_effort = insights["top_categories_by_effort"][0]
            recommendations.append(
                f"Focus initial efforts on '{top_category}' category debt, which represents "
                f"the largest effort investment ({top_effort:.1f} hours)."
            )

        # Cost of delay urgency
        if insights["average_daily_interest_rate"] > 5:
            recommendations.append(
                f"High average daily interest rate ({insights['average_daily_interest_rate']:.1f}) "
                "suggests urgent action needed. Consider increasing tech debt budget allocation."
            )

        # Sprint planning
        sprints_needed = len(self.prioritized_items) / 10  # Rough estimate
        if sprints_needed > 12:
            recommendations.append(
                "Large debt backlog detected. Consider dedicating entire sprints to debt reduction "
                "rather than trying to fit debt work around features."
            )

        # Team capacity
        total_effort = insights["total_effort_hours"]
        # NOTE(review): capacity here is per sprint (2 weeks), so this value is
        # arguably sprints rather than weeks; the message wording is kept as-is.
        weeks_needed = total_effort / (self.sprint_capacity_hours * 0.2)
        if weeks_needed > 26:  # Half a year
            recommendations.append(
                f"With current capacity allocation, debt backlog will take {weeks_needed:.0f} weeks. "
                "Consider increasing tech debt budget or focusing on highest-impact items only."
            )

        return recommendations
|
||||
|
||||
|
||||
def format_prioritized_report(analysis_result: Dict[str, Any]) -> str:
    """Render the prioritization analysis as a human-readable text report.

    Sections, in order: header, executive summary, sprint allocation plan
    (first three sprints, three items each), top-10 priority items, and
    numbered recommendations.
    """
    meta = analysis_result["metadata"]
    summary = analysis_result["insights"]
    allocation = analysis_result["sprint_allocation"]

    lines: List[str] = []
    add = lines.append

    # Report header
    add("=" * 60)
    add("TECHNICAL DEBT PRIORITIZATION REPORT")
    add("=" * 60)
    add(f"Analysis Date: {meta['analysis_date']}")
    add(f"Framework: {meta['framework_used'].upper()}")
    add(f"Team Size: {meta['team_size']}")
    add(f"Sprint Capacity: {meta['sprint_capacity_hours']} hours")
    add("")

    # Executive summary
    add("EXECUTIVE SUMMARY")
    add("-" * 30)
    add(f"Total Debt Items: {meta['total_items_analyzed']}")
    add(f"Total Effort Required: {summary['total_effort_hours']} hours")
    add(f"Total Cost of Delay: ${summary['total_cost_of_delay']:,.0f}")
    add(f"Quick Wins Available: {summary['quick_wins_count']}")
    add(f"High-Risk Items: {summary['high_risk_items_count']}")
    add("")

    # Sprint allocation plan
    add("SPRINT ALLOCATION PLAN")
    add("-" * 30)
    add(f"Sprints Needed: {allocation['total_sprints_needed']}")
    add(f"Hours per Sprint: {allocation['debt_capacity_per_sprint']}")
    add("")

    for sprint in allocation["sprint_plan"][:3]:  # Show first 3 sprints
        add(f"Sprint {sprint['sprint_number']} ({sprint['capacity_used']:.0%} capacity):")
        for entry in sprint["items"][:3]:  # Top 3 items per sprint
            add(f"  • {entry['description'][:50]}...")
            add(f"    Effort: {entry['effort_estimate']['hours_estimate']:.1f}h, "
                f"Priority: {entry['priority_score']}")
        add("")

    # Top priority items
    add("TOP 10 PRIORITY ITEMS")
    add("-" * 30)
    for rank, entry in enumerate(analysis_result["prioritized_backlog"][:10], 1):
        add(f"{rank}. [{entry['priority_score']:.1f}] {entry['description']}")
        add(f"   Category: {entry['category']}, "
            f"Effort: {entry['effort_estimate']['hours_estimate']:.1f}h, "
            f"Cost of Delay: ${entry['cost_of_delay']:.0f}")
        if entry["impact_tags"]:
            add(f"   Tags: {', '.join(entry['impact_tags'])}")
        add("")

    # Recommendations
    add("RECOMMENDATIONS")
    add("-" * 30)
    for rank, rec in enumerate(analysis_result["recommendations"], 1):
        add(f"{rank}. {rec}")
    add("")

    return "\n".join(lines)
|
||||
|
||||
|
||||
def main():
    """Command-line entry point for the debt prioritizer.

    Parses arguments, loads the debt inventory, runs the selected
    prioritization framework, and emits JSON and/or text reports.
    Exits with status 1 if the inventory cannot be loaded or the
    analysis raises.
    """
    parser = argparse.ArgumentParser(description="Prioritize technical debt backlog")
    parser.add_argument("inventory_file", help="Path to debt inventory JSON file")
    parser.add_argument("--output", help="Output file path")
    parser.add_argument("--format", choices=["json", "text", "both"],
                        default="both", help="Output format")
    parser.add_argument("--framework", choices=["cost_of_delay", "wsjf", "rice"],
                        default="cost_of_delay", help="Prioritization framework")
    parser.add_argument("--team-size", type=int, default=5, help="Team size")
    parser.add_argument("--sprint-capacity", type=int, default=80,
                        help="Sprint capacity in hours")

    args = parser.parse_args()

    # Initialize prioritizer with team parameters.
    prioritizer = DebtPrioritizer(args.team_size, args.sprint_capacity)

    # Load inventory; loader reports its own error, we just set exit status.
    if not prioritizer.load_debt_inventory(args.inventory_file):
        sys.exit(1)

    # Analyze and prioritize.
    try:
        analysis_result = prioritizer.analyze_and_prioritize(args.framework)
    except Exception as e:
        print(f"Analysis failed: {e}")
        sys.exit(1)

    # Output results (shared write-or-print logic lives in _emit_report;
    # previously this was duplicated for each format).
    if args.format in ["json", "both"]:
        _emit_report(json.dumps(analysis_result, indent=2, default=str),
                     args.output, ".json", "JSON", "JSON REPORT:")

    if args.format in ["text", "both"]:
        _emit_report(format_prioritized_report(analysis_result),
                     args.output, ".txt", "Text", "\nTEXT REPORT:")


def _emit_report(content: str, output: Optional[str], extension: str,
                 label: str, header: str):
    """Write *content* to *output* (appending *extension* if missing) or print it.

    *label* is used in the "written to" message; *header* is the banner
    printed before the content when no output path was given.
    """
    if output:
        output_path = output if output.endswith(extension) else f"{output}{extension}"
        with open(output_path, 'w') as f:
            f.write(content)
        print(f"{label} report written to: {output_path}")
    else:
        print(header)
        print("=" * 50)
        print(content)


if __name__ == "__main__":
    main()
|
||||
684
engineering/tech-debt-tracker/scripts/debt_scanner.py
Normal file
684
engineering/tech-debt-tracker/scripts/debt_scanner.py
Normal file
@@ -0,0 +1,684 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Tech Debt Scanner
|
||||
|
||||
Scans a codebase directory for tech debt signals using AST parsing (Python) and
|
||||
regex patterns (any language). Detects various forms of technical debt and generates
|
||||
both JSON inventory and human-readable reports.
|
||||
|
||||
Usage:
|
||||
python debt_scanner.py /path/to/codebase
|
||||
python debt_scanner.py /path/to/codebase --config config.json
|
||||
python debt_scanner.py /path/to/codebase --output report.json --format both
|
||||
"""
|
||||
|
||||
import argparse
import ast
import fnmatch
import json
import os
import re
import sys
from collections import Counter, defaultdict
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Set, Tuple
|
||||
|
||||
|
||||
class DebtScanner:
    """Scan a codebase directory for technical-debt signals.

    Python files are analyzed via the AST (see ``PythonASTAnalyzer``);
    all other languages get regex/heuristic checks.  Findings are
    collected as "debt items", prioritized, and summarized in a
    JSON-serializable report.
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Create a scanner, overlaying *config* (if any) on the defaults."""
        self.config = self._load_default_config()
        if config:
            self.config.update(config)

        self.debt_items: List[Dict[str, Any]] = []
        self.stats = defaultdict(int)
        self.file_stats: Dict[str, Dict[str, Any]] = {}

        # Compile regex patterns once up front for performance.
        self._compile_patterns()

    def _load_default_config(self) -> Dict[str, Any]:
        """Return the built-in detection thresholds and patterns."""
        return {
            "max_function_length": 50,
            "max_complexity": 10,
            "max_nesting_depth": 4,
            "max_file_size_lines": 500,
            "min_duplicate_lines": 3,
            "ignore_patterns": [
                "*.pyc", "__pycache__", ".git", ".svn", "node_modules",
                "build", "dist", "*.min.js", "*.map"
            ],
            "file_extensions": {
                "python": [".py"],
                "javascript": [".js", ".jsx", ".ts", ".tsx"],
                "java": [".java"],
                "csharp": [".cs"],
                "cpp": [".cpp", ".cc", ".cxx", ".c", ".h", ".hpp"],
                "ruby": [".rb"],
                "php": [".php"],
                "go": [".go"],
                "rust": [".rs"],
                "kotlin": [".kt"]
            },
            "comment_patterns": {
                "todo": r"(?i)(TODO|FIXME|HACK|XXX|BUG)[\s:]*(.+)",
                "commented_code": r"^\s*#.*[=(){}\[\];].*",
                "magic_numbers": r"\b\d{2,}\b",
                "long_strings": r'["\'](.{100,})["\']'
            },
            "severity_weights": {
                "critical": 10,
                "high": 7,
                "medium": 5,
                "low": 2,
                "info": 1
            }
        }

    def _compile_patterns(self):
        """Pre-compile every regex used during scanning."""
        self.comment_regexes = {}
        for name, pattern in self.config["comment_patterns"].items():
            self.comment_regexes[name] = re.compile(pattern)

        # Common code-smell patterns, applied to all languages.
        self.smell_patterns = {
            "empty_catch": re.compile(r"except[^:]*:\s*pass\s*$", re.MULTILINE),
            "print_debug": re.compile(r"print\s*\([^)]*debug[^)]*\)", re.IGNORECASE),
            "hardcoded_paths": re.compile(r'["\'][/\\][^"\']*[/\\][^"\']*["\']'),
            "sql_injection_risk": re.compile(r'["\'].*%s.*["\'].*execute', re.IGNORECASE),
        }

    def scan_directory(self, directory: str) -> Dict[str, Any]:
        """
        Scan a directory for tech debt.

        Args:
            directory: Path to the directory to scan

        Returns:
            Dictionary containing debt inventory and statistics

        Raises:
            ValueError: if *directory* does not exist.
        """
        directory_path = Path(directory)
        if not directory_path.exists():
            raise ValueError(f"Directory does not exist: {directory}")

        print(f"Scanning directory: {directory}")
        print("=" * 50)

        # Reset state so the scanner instance is reusable.
        self.debt_items = []
        self.stats = defaultdict(int)
        self.file_stats = {}

        # Walk through the tree, pruning ignored directories in place.
        for root, dirs, files in os.walk(directory):
            dirs[:] = [d for d in dirs if not self._should_ignore(d)]

            for file in files:
                if self._should_ignore(file):
                    continue

                file_path = os.path.join(root, file)
                relative_path = os.path.relpath(file_path, directory)

                try:
                    self._scan_file(file_path, relative_path)
                except Exception as e:
                    # A single unreadable/broken file must not abort the scan.
                    print(f"Error scanning {relative_path}: {e}")
                    self.stats["scan_errors"] += 1

        # Post-process results.
        self._detect_duplicates(directory)
        self._calculate_priorities()

        return self._generate_report(directory)

    def _should_ignore(self, name: str) -> bool:
        """Return True if *name* matches any configured ignore pattern.

        Patterns containing ``*`` are shell-style globs handled by
        ``fnmatch`` (fixes the old translation to a regex, which left
        ``.`` unescaped and matched only as a prefix); plain patterns
        match as substrings.
        """
        for pattern in self.config["ignore_patterns"]:
            if "*" in pattern:
                if fnmatch.fnmatch(name, pattern):
                    return True
            elif pattern in name:
                return True
        return False

    def _scan_file(self, file_path: str, relative_path: str):
        """Scan a single file and record any debt items it contributes."""
        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()
                lines = content.splitlines()
        except Exception as e:
            print(f"Cannot read {relative_path}: {e}")
            return

        file_ext = Path(file_path).suffix.lower()
        file_info = {
            "path": relative_path,
            "lines": len(lines),
            "size_kb": os.path.getsize(file_path) / 1024,
            "language": self._detect_language(file_ext),
            "debt_count": 0
        }

        self.stats["files_scanned"] += 1
        self.stats["total_lines"] += len(lines)

        # Count all items this file adds via a before/after delta (the old
        # code only incremented debt_count for the large-file check).
        items_before = len(self.debt_items)

        # File size debt.
        if len(lines) > self.config["max_file_size_lines"]:
            self._add_debt_item(
                "large_file",
                f"File is too large: {len(lines)} lines",
                relative_path,
                "medium",
                {"lines": len(lines), "recommended_max": self.config["max_file_size_lines"]}
            )

        # Language-specific analysis.
        if file_info["language"] == "python" and file_ext == ".py":
            self._scan_python_file(relative_path, content, lines)
        else:
            self._scan_generic_file(relative_path, content, lines, file_info["language"])

        # Common patterns for all languages.
        self._scan_common_patterns(relative_path, content, lines)

        file_info["debt_count"] = len(self.debt_items) - items_before
        self.file_stats[relative_path] = file_info

    def _detect_language(self, file_ext: str) -> str:
        """Map a (lowercase) file extension to a language name."""
        for lang, extensions in self.config["file_extensions"].items():
            if file_ext in extensions:
                return lang
        return "unknown"

    def _scan_python_file(self, file_path: str, content: str, lines: List[str]):
        """Scan a Python file via AST; record a debt item on syntax errors."""
        try:
            tree = ast.parse(content)
        except SyntaxError as e:
            self._add_debt_item(
                "syntax_error",
                f"Python syntax error: {e}",
                file_path,
                "high",
                {"line": e.lineno, "error": str(e)}
            )
            return

        analyzer = PythonASTAnalyzer(self.config)
        for item in analyzer.analyze(tree, file_path, lines):
            # Re-number so IDs stay unique across the whole scan; the
            # analyzer numbers per-file from DEBT-0001, which previously
            # produced duplicate IDs in the merged inventory.  Also keep
            # the per-type stats in sync (the old extend() skipped them).
            item["id"] = f"DEBT-{len(self.debt_items) + 1:04d}"
            self.debt_items.append(item)
            self.stats[f"debt_{item['type']}"] += 1
            self.stats["total_debt_items"] += 1
        self.stats["python_files"] += 1

    def _scan_generic_file(self, file_path: str, content: str, lines: List[str], language: str):
        """Heuristic checks for files not analyzed via the Python AST."""
        # Detect long lines.
        for i, line in enumerate(lines):
            if len(line) > 120:
                self._add_debt_item(
                    "long_line",
                    f"Line too long: {len(line)} characters",
                    file_path,
                    "low",
                    {"line_number": i + 1, "length": len(line)}
                )

        # Detect deep nesting (approximate).
        max_depth = self.config["max_nesting_depth"]
        if language == "python":
            # Indentation-based estimate, assuming 4-space indents.  (The
            # old code computed this value and then discarded it.)
            for i, line in enumerate(lines):
                if not line.strip():
                    continue
                depth = (len(line) - len(line.lstrip())) // 4
                if depth > max_depth:
                    self._add_debt_item(
                        "deep_nesting",
                        f"Deep nesting detected: {depth} levels",
                        file_path,
                        "medium",
                        {"line_number": i + 1, "nesting_level": depth}
                    )
        elif language in ["javascript", "java", "csharp", "cpp"]:
            # Running brace depth in one pass — the previous version
            # re-scanned the whole file content per line (O(n^2)).
            depth = 0
            for i, line in enumerate(lines):
                depth += line.count('{') - line.count('}')
                if depth > max_depth:
                    self._add_debt_item(
                        "deep_nesting",
                        f"Deep nesting detected: {depth} levels",
                        file_path,
                        "medium",
                        {"line_number": i + 1, "nesting_level": depth}
                    )

    def _scan_common_patterns(self, file_path: str, content: str, lines: List[str]):
        """Checks shared by every file type: TODO comments and code smells."""
        # TODO/FIXME comments.  Only the 'todo' pattern produces items;
        # the other compiled comment patterns are currently informational,
        # so we no longer run them per line for nothing.
        todo_regex = self.comment_regexes["todo"]
        for i, line in enumerate(lines):
            match = todo_regex.search(line)
            if match:
                self._add_debt_item(
                    "todo_comment",
                    f"TODO/FIXME comment: {match.group(0)}",
                    file_path,
                    "low",
                    {"line_number": i + 1, "comment": match.group(0).strip()}
                )

        # Code smells over the whole file content.
        for smell_name, pattern in self.smell_patterns.items():
            for match in pattern.finditer(content):
                line_num = content[:match.start()].count('\n') + 1
                self._add_debt_item(
                    smell_name,
                    f"Code smell detected: {smell_name}",
                    file_path,
                    "medium",
                    {"line_number": line_num, "pattern": match.group(0)[:100]}
                )

    def _detect_duplicates(self, directory: str):
        """Detect duplicate code blocks across all scanned files.

        Simple exact-match detection: hash every sliding window of
        ``min_duplicate_lines`` lines and report windows seen in more
        than one place.
        """
        line_hashes = defaultdict(list)

        for file_path, file_info in self.file_stats.items():
            try:
                full_path = os.path.join(directory, file_path)
                with open(full_path, 'r', encoding='utf-8', errors='ignore') as f:
                    lines = f.readlines()

                for i in range(len(lines) - self.config["min_duplicate_lines"] + 1):
                    block = ''.join(lines[i:i + self.config["min_duplicate_lines"]])
                    block_hash = hash(block.strip())
                    if len(block.strip()) > 50:  # Only consider substantial blocks
                        line_hashes[block_hash].append((file_path, i + 1, block))
            except Exception:
                # Best-effort: skip files that disappeared or can't be re-read.
                continue

        # Report each occurrence of every duplicated block.
        for block_hash, occurrences in line_hashes.items():
            if len(occurrences) > 1:
                for file_path, line_num, block in occurrences:
                    self._add_debt_item(
                        "duplicate_code",
                        f"Duplicate code block found in {len(occurrences)} files",
                        file_path,
                        "medium",
                        {
                            "line_number": line_num,
                            "duplicate_count": len(occurrences),
                            "other_files": [f[0] for f in occurrences if f[0] != file_path]
                        }
                    )

    def _calculate_priorities(self):
        """Assign a numeric priority_score and priority bucket to each item."""
        severity_weights = self.config["severity_weights"]

        # Type multipliers are loop-invariant; build the table once.
        type_multipliers = {
            "syntax_error": 2.0,
            "security_risk": 1.8,
            "large_function": 1.5,
            "high_complexity": 1.4,
            "duplicate_code": 1.3,
            "todo_comment": 0.5
        }

        for item in self.debt_items:
            base_score = severity_weights.get(item["severity"], 1)
            multiplier = type_multipliers.get(item["type"], 1.0)
            item["priority_score"] = int(base_score * multiplier)

            # Bucket by score thresholds.
            if item["priority_score"] >= 15:
                item["priority"] = "critical"
            elif item["priority_score"] >= 10:
                item["priority"] = "high"
            elif item["priority_score"] >= 5:
                item["priority"] = "medium"
            else:
                item["priority"] = "low"

    def _add_debt_item(self, debt_type: str, description: str, file_path: str,
                       severity: str, metadata: Dict[str, Any]):
        """Append a debt item to the inventory and update running stats."""
        item = {
            "id": f"DEBT-{len(self.debt_items) + 1:04d}",
            "type": debt_type,
            "description": description,
            "file_path": file_path,
            "severity": severity,
            "metadata": metadata,
            "detected_date": datetime.now().isoformat(),
            "status": "identified"
        }

        self.debt_items.append(item)
        self.stats[f"debt_{debt_type}"] += 1
        self.stats["total_debt_items"] += 1

        # Only true for items added after the file was registered
        # (e.g. duplicate detection); in-scan items are counted via the
        # delta in _scan_file.
        if file_path in self.file_stats:
            self.file_stats[file_path]["debt_count"] += 1

    def _generate_report(self, directory: str) -> Dict[str, Any]:
        """Build the final report dict: metadata, summary, items, recommendations."""
        # Sort debt items by priority score, highest first.
        self.debt_items.sort(key=lambda x: x.get("priority_score", 0), reverse=True)

        # Summary statistics.
        priority_counts = Counter(item["priority"] for item in self.debt_items)
        type_counts = Counter(item["type"] for item in self.debt_items)

        # Health score (0-100, higher is better), derived from items/file.
        total_files = self.stats.get("files_scanned", 1)
        debt_density = len(self.debt_items) / total_files
        health_score = max(0, 100 - (debt_density * 10))

        report = {
            "scan_metadata": {
                "directory": directory,
                "scan_date": datetime.now().isoformat(),
                "scanner_version": "1.0.0",
                "config": self.config
            },
            "summary": {
                "total_files_scanned": self.stats.get("files_scanned", 0),
                "total_lines_scanned": self.stats.get("total_lines", 0),
                "total_debt_items": len(self.debt_items),
                "health_score": round(health_score, 1),
                "debt_density": round(debt_density, 2),
                "priority_breakdown": dict(priority_counts),
                "type_breakdown": dict(type_counts)
            },
            "debt_items": self.debt_items,
            "file_statistics": self.file_stats,
            "recommendations": self._generate_recommendations()
        }

        return report

    def _generate_recommendations(self) -> List[str]:
        """Generate actionable recommendations based on findings."""
        recommendations = []

        # Priority-based recommendations.
        high_priority_count = len([item for item in self.debt_items
                                   if item.get("priority") in ["critical", "high"]])

        if high_priority_count > 10:
            recommendations.append(
                f"Address {high_priority_count} high-priority debt items immediately - "
                "they pose significant risk to code quality and maintainability."
            )

        # Type-specific recommendations.
        type_counts = Counter(item["type"] for item in self.debt_items)

        if type_counts.get("large_function", 0) > 5:
            recommendations.append(
                "Consider refactoring large functions into smaller, more focused units. "
                "This will improve readability and testability."
            )

        if type_counts.get("duplicate_code", 0) > 3:
            recommendations.append(
                "Extract duplicate code into reusable functions or modules. "
                "This reduces maintenance burden and potential for inconsistent changes."
            )

        if type_counts.get("todo_comment", 0) > 20:
            recommendations.append(
                "Review and address TODO/FIXME comments. Consider creating proper "
                "tickets for substantial work items."
            )

        # General recommendations.
        total_files = self.stats.get("files_scanned", 1)
        if len(self.debt_items) / total_files > 2:
            recommendations.append(
                "High debt density detected. Consider establishing coding standards "
                "and regular code review processes to prevent debt accumulation."
            )

        if not recommendations:
            recommendations.append("Code quality looks good! Continue current practices.")

        return recommendations
|
||||
|
||||
|
||||
class PythonASTAnalyzer(ast.NodeVisitor):
    """AST visitor that flags Python-specific debt.

    Checks functions (sync and async) for length, missing docstrings,
    cyclomatic complexity, and parameter count; checks classes for
    missing docstrings and method count.
    """

    def __init__(self, config: Dict[str, Any]):
        """Store thresholds from *config* and reset per-file state."""
        self.config = config
        self.debt_items: List[Dict[str, Any]] = []
        self.current_file = ""
        self.lines: List[str] = []
        # Names of the functions currently being visited (outermost first).
        self.function_stack: List[str] = []

    def analyze(self, tree: ast.AST, file_path: str, lines: List[str]) -> List[Dict[str, Any]]:
        """Walk *tree* and return the debt items found in *file_path*."""
        self.debt_items = []
        self.current_file = file_path
        self.lines = lines
        self.function_stack = []

        self.visit(tree)
        return self.debt_items

    def visit_FunctionDef(self, node: ast.FunctionDef):
        """Analyze a function definition (also used for async functions)."""
        self.function_stack.append(node.name)

        # Function length in source lines.
        func_length = node.end_lineno - node.lineno + 1
        if func_length > self.config["max_function_length"]:
            self._add_debt(
                "large_function",
                f"Function '{node.name}' is too long: {func_length} lines",
                node.lineno,
                "medium",
                {"function_name": node.name, "length": func_length}
            )

        # Missing docstring.
        if not ast.get_docstring(node):
            self._add_debt(
                "missing_docstring",
                f"Function '{node.name}' missing docstring",
                node.lineno,
                "low",
                {"function_name": node.name}
            )

        # Cyclomatic complexity.
        complexity = self._calculate_complexity(node)
        if complexity > self.config["max_complexity"]:
            self._add_debt(
                "high_complexity",
                f"Function '{node.name}' has high complexity: {complexity}",
                node.lineno,
                "high",
                {"function_name": node.name, "complexity": complexity}
            )

        # Parameter count (positional args only, matching original behavior).
        param_count = len(node.args.args)
        if param_count > 5:
            self._add_debt(
                "too_many_parameters",
                f"Function '{node.name}' has too many parameters: {param_count}",
                node.lineno,
                "medium",
                {"function_name": node.name, "parameter_count": param_count}
            )

        self.generic_visit(node)
        self.function_stack.pop()

    # Async functions previously escaped analysis entirely; route them
    # through the same checks (the node shape is identical for our needs).
    visit_AsyncFunctionDef = visit_FunctionDef

    def visit_ClassDef(self, node: ast.ClassDef):
        """Analyze class definitions for docstrings and method count."""
        if not ast.get_docstring(node):
            self._add_debt(
                "missing_docstring",
                f"Class '{node.name}' missing docstring",
                node.lineno,
                "low",
                {"class_name": node.name}
            )

        # Count both sync and async methods when judging class size.
        methods = [n for n in node.body
                   if isinstance(n, (ast.FunctionDef, ast.AsyncFunctionDef))]
        if len(methods) > 20:
            self._add_debt(
                "large_class",
                f"Class '{node.name}' has too many methods: {len(methods)}",
                node.lineno,
                "medium",
                {"class_name": node.name, "method_count": len(methods)}
            )

        self.generic_visit(node)

    def _calculate_complexity(self, node: ast.FunctionDef) -> int:
        """Calculate cyclomatic complexity: 1 + branches + boolean operands."""
        complexity = 1  # Base complexity

        for child in ast.walk(node):
            if isinstance(child, (ast.If, ast.While, ast.For, ast.AsyncFor)):
                complexity += 1
            elif isinstance(child, ast.ExceptHandler):
                complexity += 1
            elif isinstance(child, ast.BoolOp):
                # 'a and b and c' adds 2 decision points, not 1.
                complexity += len(child.values) - 1

        return complexity

    def _add_debt(self, debt_type: str, description: str, line_number: int,
                  severity: str, metadata: Dict[str, Any]):
        """Append a debt item for the current file.

        Note: IDs are per-file; DebtScanner re-numbers them when merging
        into the global inventory.
        """
        item = {
            "id": f"DEBT-{len(self.debt_items) + 1:04d}",
            "type": debt_type,
            "description": description,
            "file_path": self.current_file,
            "line_number": line_number,
            "severity": severity,
            "metadata": metadata,
            "detected_date": datetime.now().isoformat(),
            "status": "identified"
        }
        self.debt_items.append(item)
|
||||
|
||||
|
||||
def format_human_readable_report(report: Dict[str, Any]) -> str:
    """Render a scan *report* dict as a plain-text summary.

    Sections: banner header, summary statistics, priority breakdown,
    the ten highest-priority debt items, and recommendations.
    """
    divider = "-" * 30
    meta = report["scan_metadata"]
    summary = report["summary"]

    text_lines = [
        "=" * 60,
        "TECHNICAL DEBT SCAN REPORT",
        "=" * 60,
        f"Directory: {meta['directory']}",
        f"Scan Date: {meta['scan_date']}",
        f"Scanner Version: {meta['scanner_version']}",
        "",
        "SUMMARY",
        divider,
        f"Files Scanned: {summary['total_files_scanned']}",
        f"Lines Scanned: {summary['total_lines_scanned']:,}",
        f"Total Debt Items: {summary['total_debt_items']}",
        f"Health Score: {summary['health_score']}/100",
        f"Debt Density: {summary['debt_density']} items/file",
        "",
        "PRIORITY BREAKDOWN",
        divider,
    ]

    for priority, count in summary["priority_breakdown"].items():
        text_lines.append(f"{priority.capitalize()}: {count}")
    text_lines.append("")

    text_lines.extend(["TOP DEBT ITEMS", divider])
    for rank, item in enumerate(report["debt_items"][:10], start=1):
        text_lines.append(f"{rank}. [{item['priority'].upper()}] {item['description']}")
        text_lines.append(f"   File: {item['file_path']}")
        if 'line_number' in item:
            text_lines.append(f"   Line: {item['line_number']}")
        text_lines.append("")

    text_lines.extend(["RECOMMENDATIONS", divider])
    for rank, rec in enumerate(report["recommendations"], start=1):
        text_lines.append(f"{rank}. {rec}")
    text_lines.append("")

    return "\n".join(text_lines)
|
||||
|
||||
|
||||
def main():
    """Command-line entry point for the debt scanner.

    Parses arguments, optionally loads a JSON config, scans the target
    directory, and emits JSON and/or text reports.  Exits with status 1
    on config-load or scan failure.
    """
    parser = argparse.ArgumentParser(description="Scan codebase for technical debt")
    parser.add_argument("directory", help="Directory to scan")
    parser.add_argument("--config", help="Configuration file (JSON)")
    parser.add_argument("--output", help="Output file path")
    parser.add_argument("--format", choices=["json", "text", "both"],
                        default="both", help="Output format")

    args = parser.parse_args()

    # Load optional configuration overrides.
    config = None
    if args.config:
        try:
            with open(args.config, 'r') as f:
                config = json.load(f)
        except Exception as e:
            print(f"Error loading config: {e}")
            sys.exit(1)

    # Run scan.
    scanner = DebtScanner(config)
    try:
        report = scanner.scan_directory(args.directory)
    except Exception as e:
        print(f"Scan failed: {e}")
        sys.exit(1)

    # Output results (shared write-or-print logic in _write_or_print;
    # previously this was duplicated for each format).
    if args.format in ["json", "both"]:
        _write_or_print(json.dumps(report, indent=2, default=str),
                        args.output, ".json", "JSON")

    if args.format in ["text", "both"]:
        _write_or_print(format_human_readable_report(report),
                        args.output, ".txt", "Text")


def _write_or_print(content: str, output: Optional[str], extension: str, label: str):
    """Write *content* to *output* (appending *extension* if missing) or print it."""
    if output:
        output_path = output if output.endswith(extension) else f"{output}{extension}"
        with open(output_path, 'w') as f:
            f.write(content)
        print(f"{label} report written to: {output_path}")
    else:
        print(f"\n{label.upper()} REPORT:")
        print("=" * 50)
        print(content)


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user