129 lines
2.8 KiB
Python
Executable File
129 lines
2.8 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""Generate an operational runbook skeleton for a service."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
from datetime import date
|
|
from pathlib import Path
|
|
|
|
|
|
def build_runbook(
    service: str,
    owner: str,
    environment: str,
    *,
    verified_on: date | None = None,
) -> str:
    """Render the markdown runbook skeleton for one service.

    Args:
        service: Service name. Interpolated into the title, the metadata
            list, and the example shell commands (including the example
            health-check hostname).
        owner: Owner label. Shown in the metadata list and as the L2
            escalation contact.
        environment: Environment label shown in the metadata list.
        verified_on: Date written to the "Last verified" line. Defaults to
            the current local date when omitted.

    Returns:
        The complete markdown document, ending with a single newline.
    """
    # Resolve the date at call time rather than in a default argument, which
    # would be evaluated once at definition time and then go stale.
    stamp = (date.today() if verified_on is None else verified_on).isoformat()

    # The template lines start at column 0 on purpose: the text is emitted
    # verbatim, and leading spaces would change the rendered markdown.
    return f"""# Runbook - {service}

- Service: {service}
- Owner: {owner}
- Environment: {environment}
- Last verified: {stamp}

## Overview

Describe the service purpose, dependencies, and critical user impact.

## Preconditions

- Access to deployment platform
- Access to logs/metrics
- Access to secret/config manager

## Start Procedure

1. Pull latest config/secrets.
2. Start service process.
3. Confirm process is healthy.

```bash
# Example
# systemctl start {service}
```

## Stop Procedure

1. Drain traffic if applicable.
2. Stop service process.
3. Confirm no active workers remain.

```bash
# Example
# systemctl stop {service}
```

## Health Checks

- HTTP health endpoint
- Dependency connectivity checks
- Error-rate and latency checks

```bash
# Example
# curl -sf https://{service}.example.com/health
```

## Deployment Checklist

1. Verify CI status and artifact integrity.
2. Apply migrations (if required) in safe order.
3. Deploy service revision.
4. Run smoke checks.
5. Observe metrics for 10-15 minutes.

## Rollback

1. Identify last known good release.
2. Re-deploy previous version.
3. Re-run health checks.
4. Communicate rollback status to stakeholders.

```bash
# Example
# deployctl rollback --service {service}
```

## Incident Response

1. Classify severity.
2. Contain user impact.
3. Triage likely failing component.
4. Escalate if SLA risk is high.

## Escalation

- L1: On-call engineer
- L2: Service owner ({owner})
- L3: Platform/Engineering leadership

## Post-Incident

1. Write timeline and root cause.
2. Define corrective actions with owners.
3. Update this runbook with missing steps.
"""
|
|
|
|
|
|
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the command-line options of the runbook generator.

    Args:
        argv: Argument strings to parse, without the program name. When
            ``None`` (the default) argparse reads ``sys.argv[1:]``.

    Returns:
        Namespace with ``service`` (required positional), ``owner``
        (default ``"platform-team"``), ``environment`` (default
        ``"production"``) and ``output`` (``None`` unless ``--output`` is
        given).
    """
    parser = argparse.ArgumentParser(description="Generate a markdown runbook skeleton.")
    parser.add_argument("service", help="Service name")
    parser.add_argument("--owner", default="platform-team", help="Service owner label")
    parser.add_argument("--environment", default="production", help="Primary environment")
    parser.add_argument("--output", help="Optional output path (prints to stdout if omitted)")
    # Forwarding argv lets tests and other callers drive the parser without
    # mutating sys.argv; None keeps the normal command-line behaviour.
    return parser.parse_args(argv)
|
|
|
|
|
|
def main() -> int:
    """Generate the runbook and write it to a file or to stdout.

    With ``--output`` the markdown is written to that path as UTF-8 (missing
    parent directories are created) and a confirmation line is printed.
    Without it the markdown itself is printed to stdout.

    Returns:
        ``0`` in all cases that return normally.
    """
    args = parse_args()
    markdown = build_runbook(args.service, owner=args.owner, environment=args.environment)

    if args.output:
        path = Path(args.output)
        # Create missing parent directories so a nested --output path works.
        path.parent.mkdir(parents=True, exist_ok=True)
        path.write_text(markdown, encoding="utf-8")
        print(f"Wrote runbook skeleton to {path}")
    else:
        # The document already ends with a newline; suppress print's own so
        # stdout carries the same text that --output would have written.
        print(markdown, end="")
    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Raising SystemExit hands main()'s return value to the interpreter as
    # the process exit status.
    raise SystemExit(main())
|