Merge pull request #332 from alirezarezvani/dev
This commit is contained in:
@@ -271,7 +271,7 @@ from typing import List
|
||||
class Settings(BaseSettings):
|
||||
DATABASE_URL: str = "postgresql://user:pass@localhost:5432/db"
|
||||
ALLOWED_ORIGINS: List[str] = ["http://localhost:3000", "http://localhost:5173"]
|
||||
SECRET_KEY: str = "change-me-in-production"
|
||||
SECRET_KEY: str = "change-me-in-production" # ⚠️ SCAFFOLDING PLACEHOLDER — replace before deployment
|
||||
|
||||
class Config:
|
||||
env_file = ".env"
|
||||
@@ -627,7 +627,7 @@ export default config;
|
||||
module.exports = { reactStrictMode: true };
|
||||
''',
|
||||
".env.example": '''DATABASE_URL="postgresql://user:password@localhost:5432/dbname"
|
||||
SECRET_KEY="your-secret-here"
|
||||
SECRET_KEY="your-secret-here" # ⚠️ SCAFFOLDING PLACEHOLDER — replace before deployment
|
||||
''',
|
||||
".gitignore": '''node_modules/
|
||||
.next/
|
||||
|
||||
@@ -13,431 +13,71 @@ description: "Agent Workflow Designer"
|
||||
|
||||
## Overview
|
||||
|
||||
Design production-grade multi-agent orchestration systems. Covers five core patterns (sequential pipeline, parallel fan-out/fan-in, hierarchical delegation, event-driven, consensus), platform-specific implementations, handoff protocols, state management, error recovery, context window budgeting, and cost optimization.
|
||||
|
||||
---
|
||||
Design production-grade multi-agent workflows with clear pattern choice, handoff contracts, failure handling, and cost/context controls.
|
||||
|
||||
## Core Capabilities
|
||||
|
||||
- Pattern selection guide for any orchestration requirement
|
||||
- Handoff protocol templates (structured context passing)
|
||||
- State management patterns for multi-agent workflows
|
||||
- Error recovery and retry strategies
|
||||
- Context window budget management
|
||||
- Cost optimization strategies per platform
|
||||
- Platform-specific configs: Claude Code Agent Teams, OpenClaw, CrewAI, AutoGen
|
||||
- Workflow pattern selection for multi-step agent systems
|
||||
- Skeleton config generation for fast workflow bootstrapping
|
||||
- Context and cost discipline across long-running flows
|
||||
- Error recovery and retry strategy scaffolding
|
||||
- Documentation pointers for operational pattern tradeoffs
|
||||
|
||||
---
|
||||
|
||||
## When to Use
|
||||
|
||||
- Building a multi-step AI pipeline that exceeds one agent's context capacity
|
||||
- Parallelizing research, generation, or analysis tasks for speed
|
||||
- Creating specialist agents with defined roles and handoff contracts
|
||||
- Designing fault-tolerant AI workflows for production
|
||||
- A single prompt is insufficient for task complexity
|
||||
- You need specialist agents with explicit boundaries
|
||||
- You want deterministic workflow structure before implementation
|
||||
- You need validation loops for quality or safety gates
|
||||
|
||||
---
|
||||
|
||||
## Pattern Selection Guide
|
||||
## Quick Start
|
||||
|
||||
```
|
||||
Is the task sequential (each step needs previous output)?
|
||||
YES → Sequential Pipeline
|
||||
NO → Can tasks run in parallel?
|
||||
YES → Parallel Fan-out/Fan-in
|
||||
NO → Is there a hierarchy of decisions?
|
||||
YES → Hierarchical Delegation
|
||||
NO → Is it event-triggered?
|
||||
YES → Event-Driven
|
||||
NO → Need consensus/validation?
|
||||
YES → Consensus Pattern
|
||||
```bash
|
||||
# Generate a sequential workflow skeleton
|
||||
python3 scripts/workflow_scaffolder.py sequential --name content-pipeline
|
||||
|
||||
# Generate an orchestrator workflow and save it
|
||||
python3 scripts/workflow_scaffolder.py orchestrator --name incident-triage --output workflows/incident-triage.json
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Pattern 1: Sequential Pipeline
|
||||
## Pattern Map
|
||||
|
||||
**Use when:** Each step depends on the previous output. Research → Draft → Review → Polish.
|
||||
- `sequential`: strict step-by-step dependency chain
|
||||
- `parallel`: fan-out/fan-in for independent subtasks
|
||||
- `router`: dispatch by intent/type with fallback
|
||||
- `orchestrator`: planner coordinates specialists with dependencies
|
||||
- `evaluator`: generator + quality gate loop
|
||||
|
||||
```python
|
||||
# sequential_pipeline.py
|
||||
from dataclasses import dataclass
|
||||
from typing import Callable, Any
|
||||
import anthropic
|
||||
|
||||
@dataclass
class PipelineStage:
    """Configuration for one stage of a sequential agent pipeline."""

    name: str  # label used in log lines and in the "<name>_tokens" state key
    system_prompt: str  # system prompt sent to the model for this stage
    input_key: str  # what to take from state
    output_key: str  # what to write to state
    model: str = "claude-3-5-sonnet-20241022"
    max_tokens: int = 2048
|
||||
|
||||
class SequentialPipeline:
    """Run a fixed list of stages in order, threading results through one state dict."""

    def __init__(self, stages: list[PipelineStage]):
        self.stages = stages
        self.client = anthropic.Anthropic()

    def run(self, initial_input: str) -> dict:
        """Execute every stage in order and return the accumulated state.

        The state starts as ``{"input": initial_input}``. Each stage reads
        ``state[stage.input_key]``, sends it to the model as the user message,
        and stores the reply text under ``stage.output_key`` plus the combined
        input/output token count under ``"<stage.name>_tokens"``.
        """
        state = {"input": initial_input}

        for stage in self.stages:
            print(f"[{stage.name}] Processing...")

            # A missing input key falls back to an empty prompt instead of raising.
            stage_input = state.get(stage.input_key, "")

            response = self.client.messages.create(
                model=stage.model,
                max_tokens=stage.max_tokens,
                system=stage.system_prompt,
                messages=[{"role": "user", "content": stage_input}],
            )

            state[stage.output_key] = response.content[0].text
            state[f"{stage.name}_tokens"] = response.usage.input_tokens + response.usage.output_tokens

            print(f"[{stage.name}] Done. Tokens: {state[f'{stage.name}_tokens']}")

        return state
|
||||
|
||||
# Example: Blog post pipeline
|
||||
# Three stages chained through the state keys: input -> research -> draft -> final.
pipeline = SequentialPipeline([
    PipelineStage(
        name="researcher",
        system_prompt="You are a research specialist. Given a topic, produce a structured research brief with: key facts, statistics, expert perspectives, and controversy points.",
        input_key="input",
        output_key="research",
    ),
    PipelineStage(
        name="writer",
        system_prompt="You are a senior content writer. Using the research provided, write a compelling 800-word blog post with a clear hook, 3 main sections, and a strong CTA.",
        input_key="research",
        output_key="draft",
    ),
    PipelineStage(
        name="editor",
        system_prompt="You are a copy editor. Review the draft for: clarity, flow, grammar, and SEO. Return the improved version only, no commentary.",
        input_key="draft",
        output_key="final",
    ),
])
|
||||
```
|
||||
Detailed templates: `references/workflow-patterns.md`
|
||||
|
||||
---
|
||||
|
||||
## Pattern 2: Parallel Fan-out / Fan-in
|
||||
## Recommended Workflow
|
||||
|
||||
**Use when:** Independent tasks that can run concurrently. Research 5 competitors simultaneously.
|
||||
|
||||
```python
|
||||
# parallel_fanout.py
|
||||
import asyncio
|
||||
import anthropic
|
||||
from typing import Any
|
||||
|
||||
async def run_agent(
    client,
    task_name: str,
    system: str,
    user: str,
    model: str = "claude-3-5-sonnet-20241022",
) -> dict:
    """Single async agent call.

    The blocking ``client.messages.create`` call is pushed to the event loop's
    default executor so several agents can be awaited concurrently.

    Args:
        client: Object exposing ``messages.create(...)``.
        task_name: Label echoed back in the result under ``"task"``.
        system: System prompt for the call.
        user: Content of the single user message.
        model: Model identifier passed to the API.

    Returns:
        ``{"task", "output", "tokens"}`` where ``output`` is the text of the
        first content item and ``tokens`` is input plus output tokens.
    """
    # We are always inside a running loop here; get_running_loop() avoids the
    # deprecated implicit-loop behaviour of get_event_loop().
    loop = asyncio.get_running_loop()

    def _call():
        return client.messages.create(
            model=model,
            max_tokens=2048,
            system=system,
            messages=[{"role": "user", "content": user}],
        )

    response = await loop.run_in_executor(None, _call)
    return {
        "task": task_name,
        "output": response.content[0].text,
        "tokens": response.usage.input_tokens + response.usage.output_tokens,
    }
|
||||
|
||||
async def parallel_research(competitors: list[str], research_type: str) -> dict:
    """Fan-out: research all competitors in parallel. Fan-in: synthesize results.

    Returns:
        A dict with ``individual_analyses`` (the successful per-competitor
        results), ``synthesis`` (text of the synthesizer call) and
        ``total_tokens`` (sum over successful analyses plus the synthesis).
    """
    client = anthropic.Anthropic()

    # FAN-OUT: spawn parallel agent calls
    tasks = [
        run_agent(
            client,
            task_name=competitor,
            system=f"You are a competitive intelligence analyst. Research {competitor} and provide: pricing, key features, target market, and known weaknesses.",
            user=f"Analyze {competitor} for comparison with our product in the {research_type} market.",
        )
        for competitor in competitors
    ]

    results = await asyncio.gather(*tasks, return_exceptions=True)

    # Handle failures gracefully. gather(return_exceptions=True) can also hand
    # back BaseException instances (e.g. CancelledError), so test against
    # BaseException; otherwise those would be indexed like result dicts below.
    successful = [r for r in results if not isinstance(r, BaseException)]
    failed = [r for r in results if isinstance(r, BaseException)]

    if failed:
        print(f"Warning: {len(failed)} research tasks failed: {failed}")

    # FAN-IN: synthesize
    combined_research = "\n\n".join(
        f"## {r['task']}\n{r['output']}" for r in successful
    )

    synthesis = await run_agent(
        client,
        task_name="synthesizer",
        system="You are a strategic analyst. Synthesize competitor research into a concise comparison matrix and strategic recommendations.",
        user=f"Synthesize these competitor analyses:\n\n{combined_research}",
        model="claude-3-5-sonnet-20241022",
    )

    return {
        "individual_analyses": successful,
        "synthesis": synthesis["output"],
        "total_tokens": sum(r["tokens"] for r in successful) + synthesis["tokens"],
    }
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Pattern 3: Hierarchical Delegation
|
||||
|
||||
**Use when:** Complex tasks with subtask discovery. Orchestrator breaks down work, delegates to specialists.
|
||||
|
||||
```python
|
||||
# hierarchical_delegation.py
|
||||
import json
|
||||
import anthropic
|
||||
|
||||
# System prompt for the planning call; the model is asked to answer with the
# JSON plan shape shown at the end of the prompt.
ORCHESTRATOR_SYSTEM = """You are an orchestration agent. Your job is to:
1. Analyze the user's request
2. Break it into subtasks
3. Assign each to the appropriate specialist agent
4. Collect results and synthesize

Available specialists:
- researcher: finds facts, data, and information
- writer: creates content and documents
- coder: writes and reviews code
- analyst: analyzes data and produces insights

Respond with a JSON plan:
{
  "subtasks": [
    {"id": "1", "agent": "researcher", "task": "...", "depends_on": []},
    {"id": "2", "agent": "writer", "task": "...", "depends_on": ["1"]}
  ]
}"""

# Maps the "agent" field of a planned subtask to that specialist's system prompt.
SPECIALIST_SYSTEMS = {
    "researcher": "You are a research specialist. Find accurate, relevant information and cite sources when possible.",
    "writer": "You are a professional writer. Create clear, engaging content in the requested format.",
    "coder": "You are a senior software engineer. Write clean, well-commented code with error handling.",
    "analyst": "You are a data analyst. Provide structured analysis with evidence-backed conclusions.",
}
|
||||
|
||||
class HierarchicalOrchestrator:
    """Plan with an orchestrator call, run specialist subtasks, then synthesize."""

    def __init__(self):
        self.client = anthropic.Anthropic()

    def run(self, user_request: str) -> str:
        """Plan, execute and synthesize a response for ``user_request``.

        Raises:
            ValueError: If the plan's dependencies cannot be ordered (see
                ``_topological_sort``). ``json.loads`` and the specialist
                lookup may also raise if the plan is not valid JSON or names
                an unknown agent.
        """
        # 1. Orchestrator creates plan
        plan_response = self.client.messages.create(
            model="claude-3-5-sonnet-20241022",
            max_tokens=1024,
            system=ORCHESTRATOR_SYSTEM,
            messages=[{"role": "user", "content": user_request}],
        )

        plan = json.loads(plan_response.content[0].text)
        results = {}

        # 2. Execute subtasks respecting dependencies
        for subtask in self._topological_sort(plan["subtasks"]):
            context = self._build_context(subtask, results)
            specialist = SPECIALIST_SYSTEMS[subtask["agent"]]

            result = self.client.messages.create(
                model="claude-3-5-sonnet-20241022",
                max_tokens=2048,
                system=specialist,
                messages=[{"role": "user", "content": f"{context}\n\nTask: {subtask['task']}"}],
            )
            results[subtask["id"]] = result.content[0].text

        # 3. Final synthesis
        all_results = "\n\n".join([f"### {k}\n{v}" for k, v in results.items()])
        synthesis = self.client.messages.create(
            model="claude-3-5-sonnet-20241022",
            max_tokens=2048,
            system="Synthesize the specialist outputs into a coherent final response.",
            messages=[{"role": "user", "content": f"Original request: {user_request}\n\nSpecialist outputs:\n{all_results}"}],
        )
        return synthesis.content[0].text

    def _build_context(self, subtask: dict, results: dict) -> str:
        """Return the outputs of the subtask's completed dependencies, or ""."""
        if not subtask.get("depends_on"):
            return ""
        deps = [f"Output from task {dep}:\n{results[dep]}" for dep in subtask["depends_on"] if dep in results]
        return "Previous results:\n" + "\n\n".join(deps) if deps else ""

    def _topological_sort(self, subtasks: list) -> list:
        """Order subtasks so each one follows everything in its ``depends_on``.

        On every pass the first runnable task, in the original plan order, is
        taken next.

        Raises:
            ValueError: If no remaining task is runnable, i.e. the plan has a
                dependency cycle or references an unknown task id. Without
                this check the loop would never terminate.
        """
        ordered, remaining = [], list(subtasks)
        completed = set()
        while remaining:
            for task in remaining:
                if all(dep in completed for dep in task.get("depends_on", [])):
                    ordered.append(task)
                    completed.add(task["id"])
                    remaining.remove(task)
                    break
            else:
                # Full pass without progress: nothing left can ever run.
                stuck = [task["id"] for task in remaining]
                raise ValueError(
                    f"Unresolvable subtask dependencies (cycle or unknown id) among: {stuck}"
                )
        return ordered
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Handoff Protocol Template
|
||||
|
||||
```python
|
||||
# Standard handoff context format — use between all agents
|
||||
@dataclass
class AgentHandoff:
    """Structured context passed between agents in a workflow."""

    task_id: str
    workflow_id: str
    step_number: int
    total_steps: int

    # What was done
    previous_agent: str
    previous_output: str
    artifacts: dict  # {"filename": "content"} for any files produced

    # What to do next
    current_agent: str
    current_task: str
    constraints: list[str]  # hard rules for this step

    # Metadata
    context_budget_remaining: int  # tokens left for this agent
    cost_so_far_usd: float

    def to_prompt(self) -> str:
        """Render the handoff as a Markdown prompt for the receiving agent.

        Only the first 2000 characters of ``previous_output`` are included;
        longer outputs get a "... [truncated]" marker appended.
        """
        constraint_lines = chr(10).join(f'- {c}' for c in self.constraints)
        truncation_marker = "... [truncated]" if len(self.previous_output) > 2000 else ""
        return f"""
# Agent Handoff — Step {self.step_number}/{self.total_steps}

## Your Task
{self.current_task}

## Constraints
{constraint_lines}

## Context from Previous Step ({self.previous_agent})
{self.previous_output[:2000]}{truncation_marker}

## Context Budget
You have approximately {self.context_budget_remaining} tokens remaining. Be concise.
"""
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Error Recovery Patterns
|
||||
|
||||
```python
|
||||
import time
|
||||
from functools import wraps
|
||||
|
||||
def with_retry(max_attempts=3, backoff_seconds=2, fallback_model=None):
    """Decorator for agent calls with exponential backoff and model fallback.

    Args:
        max_attempts: Total number of calls to make before giving up (>= 1).
        backoff_seconds: Base delay; attempt ``k`` (0-based) waits
            ``backoff_seconds * 2 ** k`` before the next try.
        fallback_model: If set, and a failure message contains "rate_limit",
            later attempts are made with ``model`` replaced by this value.

    Raises:
        ValueError: If ``max_attempts`` is less than 1.
        Exception: The last error raised by the wrapped function once all
            attempts are exhausted.
    """
    import inspect  # local import: only needed to rebind the ``model`` argument

    if max_attempts < 1:
        # Otherwise the loop never runs and we would end up at ``raise None``.
        raise ValueError("max_attempts must be >= 1")

    def decorator(fn):
        try:
            signature = inspect.signature(fn)
        except (TypeError, ValueError):
            signature = None  # some callables expose no signature

        def _use_fallback(args, kwargs):
            """Return (args, kwargs) with ``model`` replaced by the fallback."""
            if signature is not None and "model" in signature.parameters:
                try:
                    bound = signature.bind(*args, **kwargs)
                except TypeError:
                    pass
                else:
                    # Rebinding covers ``model`` passed positionally; writing
                    # only kwargs["model"] would then raise "multiple values".
                    bound.arguments["model"] = fallback_model
                    return bound.args, bound.kwargs
            return args, {**kwargs, "model": fallback_model}

        @wraps(fn)
        def wrapper(*args, **kwargs):
            last_error = None
            for attempt in range(max_attempts):
                try:
                    return fn(*args, **kwargs)
                except Exception as e:
                    last_error = e
                    if attempt < max_attempts - 1:
                        wait = backoff_seconds * (2 ** attempt)
                        print(f"Attempt {attempt+1} failed: {e}. Retrying in {wait}s...")
                        time.sleep(wait)

                        # Fall back to cheaper/faster model on rate limit
                        if fallback_model and "rate_limit" in str(e).lower():
                            args, kwargs = _use_fallback(args, kwargs)
            raise last_error
        return wrapper
    return decorator


# Usage example: retry up to three times, switching to Haiku on rate limits.
@with_retry(max_attempts=3, fallback_model="claude-3-haiku-20240307")
def call_agent(model, system, user):
    ...
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Context Window Budgeting
|
||||
|
||||
```python
|
||||
# Budget context across a multi-step pipeline
|
||||
# Rule: never let any step consume more than 60% of remaining budget
|
||||
|
||||
# Context window size, in tokens, per model. Unknown models fall back to
# 128_000 in ContextBudget.__init__.
CONTEXT_LIMITS = {
    "claude-3-5-sonnet-20241022": 200_000,
    "gpt-4o": 128_000,
}


class ContextBudget:
    """Track token usage against a model's context window across pipeline steps."""

    def __init__(self, model: str, reserve_pct: float = 0.2):
        total = CONTEXT_LIMITS.get(model, 128_000)
        self.total = total
        self.reserve = int(total * reserve_pct)  # keep 20% as buffer
        self.used = 0

    @property
    def remaining(self):
        """Tokens still available after the reserve and recorded usage."""
        return self.total - self.reserve - self.used

    def allocate(self, step_name: str, requested: int) -> int:
        """Return the tokens granted to ``step_name``.

        The grant is capped at 60% of the remaining budget and never goes
        below zero, even once usage has exceeded the budget. Nothing is
        recorded as used here; call ``consume`` with the actual usage.
        """
        allocated = max(0, min(requested, int(self.remaining * 0.6)))  # max 60% of remaining
        print(f"[Budget] {step_name}: allocated {allocated:,} tokens (remaining: {self.remaining:,})")
        return allocated

    def consume(self, tokens_used: int):
        """Record ``tokens_used`` against the budget."""
        self.used += tokens_used
|
||||
|
||||
def truncate_to_budget(text: str, token_budget: int, chars_per_token: float = 4.0) -> str:
    """Rough truncation — use tiktoken for precision.

    Converts ``token_budget`` to a character budget using ``chars_per_token``
    and cuts ``text`` there, appending a truncation marker. Text that already
    fits is returned unchanged.
    """
    # Clamp at zero: a negative budget would become a negative slice bound and
    # keep nearly the whole text instead of dropping it.
    char_budget = max(0, int(token_budget * chars_per_token))
    if len(text) <= char_budget:
        return text
    return text[:char_budget] + "\n\n[... truncated to fit context budget ...]"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Cost Optimization Strategies
|
||||
|
||||
| Strategy | Savings | Tradeoff |
|
||||
|---|---|---|
|
||||
| Use Haiku for routing/classification | 85-90% | Slightly less nuanced judgment |
|
||||
| Cache repeated system prompts | 50-90% | Requires prompt caching setup |
|
||||
| Truncate intermediate outputs | 20-40% | May lose detail in handoffs |
|
||||
| Batch similar tasks | 50% | Latency increases |
|
||||
| Use Sonnet for most, Opus for final step only | 60-70% | Final quality may improve |
|
||||
| Short-circuit on confidence threshold | 30-50% | Need confidence scoring |
|
||||
1. Select pattern based on dependency shape and risk profile.
|
||||
2. Scaffold config via `scripts/workflow_scaffolder.py`.
|
||||
3. Define handoff contract fields for every edge.
|
||||
4. Add retry/timeouts and output validation gates.
|
||||
5. Dry-run with small context budgets before scaling.
|
||||
|
||||
---
|
||||
|
||||
## Common Pitfalls
|
||||
|
||||
- **Circular dependencies** — agents calling each other in loops; enforce DAG structure at design time
|
||||
- **Context bleed** — passing entire previous output to every step; summarize or extract only what's needed
|
||||
- **No timeout** — a stuck agent blocks the whole pipeline; always set max_tokens and wall-clock timeouts
|
||||
- **Silent failures** — agent returns plausible but wrong output; add validation steps for critical paths
|
||||
- **Ignoring cost** — 10 parallel Opus calls is $0.50 per workflow; model selection is a cost decision
|
||||
- **Over-orchestration** — if a single prompt can do it, it should; only add agents when genuinely needed
|
||||
- Over-orchestrating tasks solvable by one well-structured prompt
|
||||
- Missing timeout/retry policies for external-model calls
|
||||
- Passing full upstream context instead of targeted artifacts
|
||||
- Ignoring per-step cost accumulation
|
||||
|
||||
## Best Practices
|
||||
|
||||
1. Start with the smallest pattern that can satisfy requirements.
|
||||
2. Keep handoff payloads explicit and bounded.
|
||||
3. Validate intermediate outputs before fan-in synthesis.
|
||||
4. Enforce budget and timeout limits in every step.
|
||||
|
||||
@@ -0,0 +1,82 @@
|
||||
# Workflow Pattern Templates
|
||||
|
||||
## Sequential
|
||||
|
||||
Use when each step depends on prior output.
|
||||
|
||||
```json
|
||||
{
|
||||
"pattern": "sequential",
|
||||
"steps": ["research", "draft", "review"]
|
||||
}
|
||||
```
|
||||
|
||||
## Parallel
|
||||
|
||||
Use when independent tasks can fan out and then fan in.
|
||||
|
||||
```json
|
||||
{
|
||||
"pattern": "parallel",
|
||||
"fan_out": ["task_a", "task_b", "task_c"],
|
||||
"fan_in": "synthesizer"
|
||||
}
|
||||
```
|
||||
|
||||
## Router
|
||||
|
||||
Use when tasks must be routed to specialized handlers by intent.
|
||||
|
||||
```json
|
||||
{
|
||||
"pattern": "router",
|
||||
"router": "intent_router",
|
||||
"routes": ["sales", "support", "engineering"],
|
||||
"fallback": "generalist"
|
||||
}
|
||||
```
|
||||
|
||||
## Orchestrator
|
||||
|
||||
Use when dynamic planning and dependency management are required.
|
||||
|
||||
```json
|
||||
{
|
||||
"pattern": "orchestrator",
|
||||
"orchestrator": "planner",
|
||||
"specialists": ["researcher", "analyst", "coder"],
|
||||
"dependency_mode": "dag"
|
||||
}
|
||||
```
|
||||
|
||||
## Evaluator
|
||||
|
||||
Use when output quality gates are mandatory before finalization.
|
||||
|
||||
```json
|
||||
{
|
||||
"pattern": "evaluator",
|
||||
"generator": "content_agent",
|
||||
"evaluator": "quality_agent",
|
||||
"max_iterations": 3,
|
||||
"pass_threshold": 0.8
|
||||
}
|
||||
```
|
||||
|
||||
## Pattern Selection Heuristics
|
||||
|
||||
- Choose `sequential` for strict linear workflows.
|
||||
- Choose `parallel` for throughput and latency reduction.
|
||||
- Choose `router` for intent- or type-based branching.
|
||||
- Choose `orchestrator` for complex adaptive workflows.
|
||||
- Choose `evaluator` when correctness/quality loops are required.
|
||||
|
||||
## Handoff Minimum Contract
|
||||
|
||||
- `workflow_id`
|
||||
- `step_id`
|
||||
- `task`
|
||||
- `constraints`
|
||||
- `upstream_artifacts`
|
||||
- `budget_tokens`
|
||||
- `timeout_seconds`
|
||||
113
engineering/agent-workflow-designer/scripts/workflow_scaffolder.py
Executable file
113
engineering/agent-workflow-designer/scripts/workflow_scaffolder.py
Executable file
@@ -0,0 +1,113 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Generate workflow skeleton configs from common multi-agent patterns."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Dict, List
|
||||
|
||||
|
||||
def sequential_template(name: str) -> Dict:
    """Return a skeleton config for a three-step sequential workflow.

    Each step names its agent and the id of the next step; the last step's
    ``next`` is ``None``.
    """
    return {
        "name": name,
        "pattern": "sequential",
        "steps": [
            {"id": "research", "agent": "researcher", "next": "draft"},
            {"id": "draft", "agent": "writer", "next": "review"},
            {"id": "review", "agent": "reviewer", "next": None},
        ],
        "retry": {"max_attempts": 2, "backoff_seconds": 2},
    }
|
||||
|
||||
|
||||
def parallel_template(name: str) -> Dict:
    """Return a skeleton config for a fan-out/fan-in workflow with timeouts."""
    return {
        "name": name,
        "pattern": "parallel",
        "fan_out": {
            "tasks": ["research_a", "research_b", "research_c"],
            "agent": "analyst",
        },
        "fan_in": {"agent": "synthesizer", "output": "combined_report"},
        "timeouts": {"per_task_seconds": 180, "fan_in_seconds": 120},
    }
|
||||
|
||||
|
||||
def router_template(name: str) -> Dict:
    """Return a skeleton config for a router workflow with a fallback handler."""
    return {
        "name": name,
        "pattern": "router",
        "router": {"agent": "router", "routes": ["sales", "support", "engineering"]},
        "handlers": {
            "sales": {"agent": "sales_specialist"},
            "support": {"agent": "support_specialist"},
            "engineering": {"agent": "engineering_specialist"},
        },
        "fallback": {"agent": "generalist"},
    }
|
||||
|
||||
|
||||
def orchestrator_template(name: str) -> Dict:
    """Return a skeleton config for an orchestrator workflow with DAG execution settings."""
    return {
        "name": name,
        "pattern": "orchestrator",
        "orchestrator": {"agent": "orchestrator", "planning": "dynamic"},
        "specialists": ["researcher", "coder", "analyst", "writer"],
        "execution": {
            "dependency_mode": "dag",
            "max_parallel": 3,
            "completion_policy": "all_required",
        },
    }
|
||||
|
||||
|
||||
def evaluator_template(name: str) -> Dict:
    """Return a skeleton config for a generator/evaluator loop with a pass threshold."""
    return {
        "name": name,
        "pattern": "evaluator",
        "generator": {"agent": "generator"},
        "evaluator": {"agent": "evaluator", "criteria": ["accuracy", "format", "safety"]},
        "loop": {
            "max_iterations": 3,
            "pass_threshold": 0.8,
            "on_fail": "revise_and_retry",
        },
    }
|
||||
|
||||
|
||||
# Pattern name (also the CLI choice) -> function that builds its skeleton config.
PATTERNS = {
    "sequential": sequential_template,
    "parallel": parallel_template,
    "router": router_template,
    "orchestrator": orchestrator_template,
    "evaluator": evaluator_template,
}
|
||||
|
||||
|
||||
def parse_args(argv=None) -> argparse.Namespace:
    """Parse the scaffolder's command-line arguments.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default),
            argparse reads ``sys.argv[1:]``, so existing callers are unaffected.

    Returns:
        Namespace with ``pattern`` (one of the ``PATTERNS`` keys), ``name``
        (default "new-workflow") and ``output`` (path string or ``None``).
    """
    parser = argparse.ArgumentParser(description="Generate a workflow skeleton config from a pattern.")
    parser.add_argument("pattern", choices=sorted(PATTERNS.keys()), help="Workflow pattern")
    parser.add_argument("--name", default="new-workflow", help="Workflow name")
    parser.add_argument("--output", help="Optional output path for JSON config")
    return parser.parse_args(argv)
|
||||
|
||||
|
||||
def main() -> int:
    """Build the selected pattern's config and print it or write it to ``--output``.

    Returns:
        0, used as the process exit code.
    """
    args = parse_args()
    config = PATTERNS[args.pattern](args.name)
    payload = json.dumps(config, indent=2)

    if args.output:
        out = Path(args.output)
        # Create missing parent directories so nested output paths work.
        out.parent.mkdir(parents=True, exist_ok=True)
        out.write_text(payload + "\n", encoding="utf-8")
        print(f"Wrote workflow config to {out}")
    else:
        print(payload)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
||||
@@ -7,459 +7,78 @@ description: "Codebase Onboarding"
|
||||
|
||||
**Tier:** POWERFUL
|
||||
**Category:** Engineering
|
||||
**Domain:** Documentation / Developer Experience
|
||||
**Domain:** Documentation / Developer Experience
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
Analyze a codebase and generate comprehensive onboarding documentation tailored to your audience. Produces architecture overviews, key file maps, local setup guides, common task runbooks, debugging guides, and contribution guidelines. Outputs to Markdown, Notion, or Confluence.
|
||||
Analyze a codebase and generate onboarding documentation for engineers, tech leads, and contractors. This skill is optimized for fast fact-gathering and repeatable onboarding outputs.
|
||||
|
||||
## Core Capabilities
|
||||
|
||||
- **Architecture overview** — tech stack, system boundaries, data flow diagrams
|
||||
- **Key file map** — what's important and why, with annotations
|
||||
- **Local setup guide** — step-by-step from clone to running tests
|
||||
- **Common developer tasks** — how to add a route, run migrations, create a component
|
||||
- **Debugging guide** — common errors, log locations, useful queries
|
||||
- **Contribution guidelines** — branch strategy, PR process, code style
|
||||
- **Audience-aware output** — junior, senior, or contractor mode
|
||||
- Architecture and stack discovery from repository signals
|
||||
- Key file and config inventory for new contributors
|
||||
- Local setup and common-task guidance generation
|
||||
- Audience-aware documentation framing
|
||||
- Debugging and contribution checklist scaffolding
|
||||
|
||||
---
|
||||
|
||||
## When to Use
|
||||
|
||||
- Onboarding a new team member or contractor
|
||||
- After a major refactor that made existing docs stale
|
||||
- Before open-sourcing a project
|
||||
- Creating a team wiki page for a service
|
||||
- Self-documenting before a long vacation
|
||||
|
||||
---
|
||||
|
||||
## Codebase Analysis Commands
|
||||
|
||||
Run these before generating docs to gather facts:
|
||||
|
||||
```bash
|
||||
# Project overview
|
||||
cat package.json | jq '{name, version, scripts, dependencies: (.dependencies | keys), devDependencies: (.devDependencies | keys)}'
|
||||
|
||||
# Directory structure (top 2 levels)
|
||||
find . -maxdepth 2 -not -path '*/node_modules/*' -not -path '*/.git/*' -not -path '*/.next/*' | sort | head -60
|
||||
|
||||
# Largest files (often core modules)
|
||||
find src/ -name "*.ts" -not -path "*/test*" -exec wc -l {} + | sort -rn | head -20
|
||||
|
||||
# All routes (Next.js App Router)
|
||||
find app/ -name "route.ts" -o -name "page.tsx" | sort
|
||||
|
||||
# All routes (Express)
|
||||
grep -rn "router\.\(get\|post\|put\|patch\|delete\)" src/routes/ --include="*.ts"
|
||||
|
||||
# Recent major changes
|
||||
git log --oneline --since="90 days ago" | grep -E "feat|refactor|breaking"
|
||||
|
||||
# Top contributors
|
||||
git shortlog -sn --no-merges | head -10
|
||||
|
||||
# Test coverage summary
|
||||
pnpm test:ci --coverage 2>&1 | tail -20
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Generated Documentation Template
|
||||
|
||||
### README.md — Full Template
|
||||
|
||||
```markdown
|
||||
# [Project Name]
|
||||
|
||||
> One-sentence description of what this does and who uses it.
|
||||
|
||||
[![CI](https://github.com/org/repo/actions/workflows/ci.yml/badge.svg)](https://github.com/org/repo/actions/workflows/ci.yml)
|
||||
[![Coverage](https://codecov.io/gh/org/repo/branch/main/graph/badge.svg)](https://codecov.io/gh/org/repo)
|
||||
|
||||
## What is this?
|
||||
|
||||
[2-3 sentences: problem it solves, who uses it, current state]
|
||||
|
||||
**Live:** https://myapp.com
|
||||
**Staging:** https://staging.myapp.com
|
||||
**Docs:** https://docs.myapp.com
|
||||
- Rebuilding stale project docs after large refactors
|
||||
- Preparing internal handoff documentation
|
||||
- Creating a standardized onboarding packet for services
|
||||
|
||||
---
|
||||
|
||||
## Quick Start
|
||||
|
||||
### Prerequisites
|
||||
|
||||
| Tool | Version | Install |
|
||||
|------|---------|---------|
|
||||
| Node.js | 20+ | `nvm install 20` |
|
||||
| pnpm | 8+ | `npm i -g pnpm` |
|
||||
| Docker | 24+ | [docker.com](https://docker.com) |
|
||||
| PostgreSQL | 16+ | via Docker (see below) |
|
||||
|
||||
### Setup (5 minutes)
|
||||
|
||||
```bash
|
||||
# 1. Clone
|
||||
git clone https://github.com/org/repo
|
||||
cd repo
|
||||
# 1) Gather codebase facts
|
||||
python3 scripts/codebase_analyzer.py /path/to/repo
|
||||
|
||||
# 2. Install dependencies
|
||||
pnpm install
|
||||
# 2) Export machine-readable output
|
||||
python3 scripts/codebase_analyzer.py /path/to/repo --json
|
||||
|
||||
# 3. Start infrastructure
|
||||
docker compose up -d # Starts Postgres, Redis
|
||||
|
||||
# 4. Environment
|
||||
cp .env.example .env
|
||||
# Edit .env — ask a teammate for real values or see Vault
|
||||
|
||||
# 5. Database setup
|
||||
pnpm db:migrate # Run migrations
|
||||
pnpm db:seed # Optional: load test data
|
||||
|
||||
# 6. Start dev server
|
||||
pnpm dev # → http://localhost:3000
|
||||
|
||||
# 7. Verify
|
||||
pnpm test # Should be all green
|
||||
```
|
||||
|
||||
### Verify it works
|
||||
|
||||
- [ ] `http://localhost:3000` loads the app
|
||||
- [ ] `http://localhost:3000/api/health` returns `{"status":"ok"}`
|
||||
- [ ] `pnpm test` passes
|
||||
|
||||
---
|
||||
|
||||
## Architecture
|
||||
|
||||
### System Overview
|
||||
|
||||
```
|
||||
Browser / Mobile
|
||||
│
|
||||
▼
|
||||
[Next.js App] ←──── [Auth: NextAuth]
|
||||
│
|
||||
├──→ [PostgreSQL] (primary data store)
|
||||
├──→ [Redis] (sessions, job queue)
|
||||
└──→ [S3] (file uploads)
|
||||
|
||||
Background:
|
||||
[BullMQ workers] ←── Redis queue
|
||||
└──→ [External APIs: Stripe, SendGrid]
|
||||
```
|
||||
|
||||
### Tech Stack
|
||||
|
||||
| Layer | Technology | Why |
|
||||
|-------|-----------|-----|
|
||||
| Frontend | Next.js 14 (App Router) | SSR, file-based routing |
|
||||
| Styling | Tailwind CSS + shadcn/ui | Rapid UI development |
|
||||
| API | Next.js Route Handlers | Co-located with frontend |
|
||||
| Database | PostgreSQL 16 | Relational, RLS for multi-tenancy |
|
||||
| ORM | Drizzle ORM | Type-safe, lightweight |
|
||||
| Auth | NextAuth v5 | OAuth + email/password |
|
||||
| Queue | BullMQ + Redis | Background jobs |
|
||||
| Storage | AWS S3 | File uploads |
|
||||
| Email | SendGrid | Transactional email |
|
||||
| Payments | Stripe | Subscriptions |
|
||||
| Deployment | Vercel (app) + Railway (workers) | |
|
||||
| Monitoring | Sentry + Datadog | |
|
||||
|
||||
---
|
||||
|
||||
## Key Files
|
||||
|
||||
| Path | Purpose |
|
||||
|------|---------|
|
||||
| `app/` | Next.js App Router — pages and API routes |
|
||||
| `app/api/` | API route handlers |
|
||||
| `app/(auth)/` | Auth pages (login, register, reset) |
|
||||
| `app/(app)/` | Protected app pages |
|
||||
| `src/db/` | Database schema, migrations, client |
|
||||
| `src/db/schema.ts` | **Drizzle schema — single source of truth** |
|
||||
| `src/lib/` | Shared utilities (auth, email, stripe) |
|
||||
| `src/lib/auth.ts` | **Auth configuration — read this first** |
|
||||
| `src/components/` | Reusable React components |
|
||||
| `src/hooks/` | Custom React hooks |
|
||||
| `src/types/` | Shared TypeScript types |
|
||||
| `workers/` | BullMQ background job processors |
|
||||
| `emails/` | React Email templates |
|
||||
| `tests/` | Test helpers, factories, integration tests |
|
||||
| `.env.example` | All env vars with descriptions |
|
||||
| `docker-compose.yml` | Local infrastructure |
|
||||
|
||||
---
|
||||
|
||||
## Common Developer Tasks
|
||||
|
||||
### Add a new API endpoint
|
||||
|
||||
```bash
|
||||
# 1. Create route handler
|
||||
touch app/api/my-resource/route.ts
|
||||
```
|
||||
|
||||
```typescript
|
||||
// app/api/my-resource/route.ts
|
||||
import { NextRequest, NextResponse } from 'next/server'
|
||||
import { auth } from '@/lib/auth'
|
||||
import { db } from '@/db/client'
|
||||
|
||||
export async function GET(req: NextRequest) {
|
||||
const session = await auth()
|
||||
if (!session) {
|
||||
return NextResponse.json({ error: 'Unauthorized' }, { status: 401 })
|
||||
}
|
||||
|
||||
const data = await db.query.myResource.findMany({
|
||||
where: (r, { eq }) => eq(r.userId, session.user.id),
|
||||
})
|
||||
|
||||
return NextResponse.json({ data })
|
||||
}
|
||||
```
|
||||
|
||||
```bash
|
||||
# 2. Add tests
|
||||
touch tests/api/my-resource.test.ts
|
||||
|
||||
# 3. Add to OpenAPI spec (if applicable)
|
||||
pnpm generate:openapi
|
||||
```
|
||||
|
||||
### Run a database migration
|
||||
|
||||
```bash
|
||||
# Create migration
|
||||
pnpm db:generate # Generates SQL from schema changes
|
||||
|
||||
# Review the generated SQL
|
||||
cat drizzle/migrations/0001_my_change.sql
|
||||
|
||||
# Apply
|
||||
pnpm db:migrate
|
||||
|
||||
# Roll back (manual — inspect generated SQL and revert)
|
||||
psql $DATABASE_URL -f scripts/rollback_0001.sql
|
||||
```
|
||||
|
||||
### Add a new email template
|
||||
|
||||
```bash
|
||||
# 1. Create template
|
||||
touch emails/my-email.tsx
|
||||
|
||||
# 2. Preview in browser
|
||||
pnpm email:preview
|
||||
|
||||
# 3. Send in code
|
||||
import { sendEmail } from '@/lib/email'
|
||||
await sendEmail({
|
||||
to: user.email,
|
||||
subject: 'Subject line',
|
||||
template: 'my-email',
|
||||
props: { name: "username" },
|
||||
})
|
||||
```
|
||||
|
||||
### Add a background job
|
||||
|
||||
```typescript
|
||||
// 1. Define job in workers/jobs/my-job.ts
|
||||
import { Queue, Worker } from 'bullmq'
|
||||
import { redis } from '@/lib/redis'
|
||||
|
||||
export const myJobQueue = new Queue('my-job', { connection: redis })
|
||||
|
||||
export const myJobWorker = new Worker('my-job', async (job) => {
|
||||
const { userId, data } = job.data
|
||||
// do work
|
||||
}, { connection: redis })
|
||||
|
||||
// 2. Enqueue
|
||||
await myJobQueue.add('process', { userId, data }, {
|
||||
attempts: 3,
|
||||
backoff: { type: 'exponential', delay: 1000 },
|
||||
})
|
||||
# 3) Use the template to draft onboarding docs
|
||||
# See references/onboarding-template.md
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Debugging Guide
|
||||
## Recommended Workflow
|
||||
|
||||
### Common Errors
|
||||
|
||||
**`Error: DATABASE_URL is not set`**
|
||||
```bash
|
||||
# Check your .env file exists and has the var
|
||||
cat .env | grep DATABASE_URL
|
||||
|
||||
# Start Postgres if not running
|
||||
docker compose up -d postgres
|
||||
```
|
||||
|
||||
**`PrismaClientKnownRequestError: P2002 Unique constraint failed`**
|
||||
```
|
||||
User already exists with that email. Check: is this a duplicate registration?
|
||||
Run: SELECT * FROM users WHERE email = 'test@example.com';
|
||||
```
|
||||
|
||||
**`Error: JWT expired`**
|
||||
```bash
|
||||
# Dev: extend token TTL in .env
|
||||
JWT_EXPIRES_IN=30d
|
||||
|
||||
# Check clock skew between server and client
|
||||
date && docker exec postgres date
|
||||
```
|
||||
|
||||
**`500 on /api/*` in local dev**
|
||||
```bash
|
||||
# 1. Check terminal for stack trace
|
||||
# 2. Check database connectivity
|
||||
psql $DATABASE_URL -c "SELECT 1"
|
||||
# 3. Check Redis
|
||||
redis-cli ping
|
||||
# 4. Check logs
|
||||
pnpm dev 2>&1 | grep -E "error|Error|ERROR"
|
||||
```
|
||||
|
||||
### Useful SQL Queries
|
||||
|
||||
```sql
|
||||
-- Find slow queries (requires pg_stat_statements)
|
||||
SELECT query, mean_exec_time, calls, total_exec_time
|
||||
FROM pg_stat_statements
|
||||
ORDER BY mean_exec_time DESC
|
||||
LIMIT 20;
|
||||
|
||||
-- Check active connections
|
||||
SELECT count(*), state FROM pg_stat_activity GROUP BY state;
|
||||
|
||||
-- Find bloated tables
|
||||
SELECT relname, n_dead_tup, n_live_tup,
|
||||
round(n_dead_tup::numeric/nullif(n_live_tup,0)*100, 2) AS dead_pct
|
||||
FROM pg_stat_user_tables
|
||||
ORDER BY n_dead_tup DESC;
|
||||
```
|
||||
|
||||
### Debug Authentication
|
||||
|
||||
```bash
|
||||
# Decode a JWT (no secret needed for header/payload)
|
||||
echo "YOUR_JWT" | cut -d. -f2 | base64 -d | jq .
|
||||
|
||||
# Check session in DB
|
||||
psql $DATABASE_URL -c "SELECT * FROM sessions WHERE user_id = 'usr_...' ORDER BY expires_at DESC LIMIT 5;"
|
||||
```
|
||||
|
||||
### Log Locations
|
||||
|
||||
| Environment | Logs |
|
||||
|-------------|------|
|
||||
| Local dev | Terminal running `pnpm dev` |
|
||||
| Vercel production | Vercel dashboard → Logs |
|
||||
| Workers (Railway) | Railway dashboard → Deployments → Logs |
|
||||
| Database | `docker logs postgres` (local) |
|
||||
| Background jobs | `pnpm worker:dev` terminal |
|
||||
1. Run `scripts/codebase_analyzer.py` against the target repository.
|
||||
2. Capture key signals: file counts, detected languages, config files, top-level structure.
|
||||
3. Fill the onboarding template in `references/onboarding-template.md`.
|
||||
4. Tailor output depth by audience:
|
||||
- Junior: setup + guardrails
|
||||
- Senior: architecture + operational concerns
|
||||
- Contractor: scoped ownership + integration boundaries
|
||||
|
||||
---
|
||||
|
||||
## Contribution Guidelines
|
||||
## Onboarding Document Template
|
||||
|
||||
### Branch Strategy
|
||||
|
||||
```
|
||||
main → production (protected, requires PR + CI)
|
||||
└── feature/PROJ-123-short-desc
|
||||
└── fix/PROJ-456-bug-description
|
||||
└── chore/update-dependencies
|
||||
```
|
||||
|
||||
### PR Requirements
|
||||
|
||||
- [ ] Branch name includes ticket ID (e.g., `feature/PROJ-123-...`)
|
||||
- [ ] PR description explains the why
|
||||
- [ ] All CI checks pass
|
||||
- [ ] Test coverage doesn't decrease
|
||||
- [ ] Self-reviewed (read your own diff before requesting review)
|
||||
- [ ] Screenshots/video for UI changes
|
||||
|
||||
### Commit Convention
|
||||
|
||||
```
|
||||
feat(scope): short description → new feature
|
||||
fix(scope): short description → bug fix
|
||||
chore: update dependencies → maintenance
|
||||
docs: update API reference → documentation
|
||||
```
|
||||
|
||||
### Code Style
|
||||
|
||||
```bash
|
||||
# Lint + format
|
||||
pnpm lint
|
||||
pnpm format
|
||||
|
||||
# Type check
|
||||
pnpm typecheck
|
||||
|
||||
# All checks (run before pushing)
|
||||
pnpm validate
|
||||
```
|
||||
Detailed template and section examples live in:
|
||||
- `references/onboarding-template.md`
|
||||
- `references/output-format-templates.md`
|
||||
|
||||
---
|
||||
|
||||
## Audience-Specific Notes
|
||||
|
||||
### For Junior Developers
|
||||
- Start with `src/lib/auth.ts` to understand authentication
|
||||
- Read existing tests in `tests/api/` — they document expected behavior
|
||||
- Ask before touching anything in `src/db/schema.ts` — schema changes affect everyone
|
||||
- Use `pnpm db:seed` to get realistic local data
|
||||
|
||||
### For Senior Engineers / Tech Leads
|
||||
- Architecture decisions are documented in `docs/adr/` (Architecture Decision Records)
|
||||
- Performance benchmarks: `pnpm bench` — baseline is in `tests/benchmarks/baseline.json`
|
||||
- Security model: RLS policies in `src/db/rls.sql`, enforced at DB level
|
||||
- Scaling notes: `docs/scaling.md`
|
||||
|
||||
### For Contractors
|
||||
- Scope is limited to `src/features/[your-feature]/` unless discussed
|
||||
- Never push directly to `main`
|
||||
- All external API calls go through `src/lib/` wrappers (for mocking in tests)
|
||||
- Time estimates: log in Linear ticket comments daily
|
||||
|
||||
---
|
||||
|
||||
## Output Formats
|
||||
→ See references/output-format-templates.md for details
|
||||
|
||||
## Common Pitfalls
|
||||
|
||||
- **Docs written once, never updated** — add doc updates to PR checklist
|
||||
- **Missing local setup step** — test setup instructions on a fresh machine quarterly
|
||||
- **No error troubleshooting** — debugging section is the most valuable part for new hires
|
||||
- **Too much detail for contractors** — they need task-specific, not architecture-deep docs
|
||||
- **No screenshots** — UI flows need screenshots; they go stale but are still valuable
|
||||
- **Skipping the "why"** — document why decisions were made, not just what was decided
|
||||
|
||||
---
|
||||
- Writing docs without validating setup commands on a clean environment
|
||||
- Mixing architecture deep-dives into contractor-oriented docs
|
||||
- Omitting troubleshooting and verification steps
|
||||
- Letting onboarding docs drift from current repo state
|
||||
|
||||
## Best Practices
|
||||
|
||||
1. **Keep setup under 10 minutes** — if it takes longer, fix the setup, not the docs
|
||||
2. **Test the docs** — have a new hire follow them literally, fix every gap they hit
|
||||
3. **Link, don't repeat** — link to ADRs, issues, and external docs instead of duplicating
|
||||
4. **Update in the same PR** — docs changes alongside code changes
|
||||
5. **Version-specific notes** — call out things that changed in recent versions
|
||||
6. **Runbooks over theory** — "run this command" beats "the system uses Redis for..."
|
||||
1. Keep setup instructions executable and time-bounded.
|
||||
2. Document the "why" for key architectural decisions.
|
||||
3. Update docs in the same PR as behavior changes.
|
||||
4. Treat onboarding docs as living operational assets, not one-time deliverables.
|
||||
|
||||
@@ -0,0 +1,186 @@
|
||||
# Onboarding Document Template
|
||||
|
||||
## README.md - Full Template
|
||||
|
||||
```markdown
|
||||
# [Project Name]
|
||||
|
||||
> One-sentence description of what this does and who uses it.
|
||||
|
||||
[![CI](https://github.com/org/repo/actions/workflows/ci.yml/badge.svg)](https://github.com/org/repo/actions/workflows/ci.yml)
|
||||
[![Coverage](https://codecov.io/gh/org/repo/branch/main/graph/badge.svg)](https://codecov.io/gh/org/repo)
|
||||
|
||||
## What is this?
|
||||
|
||||
[2-3 sentences: problem it solves, who uses it, current state]
|
||||
|
||||
**Live:** https://myapp.com
|
||||
**Staging:** https://staging.myapp.com
|
||||
**Docs:** https://docs.myapp.com
|
||||
|
||||
---
|
||||
|
||||
## Quick Start
|
||||
|
||||
### Prerequisites
|
||||
|
||||
| Tool | Version | Install |
|
||||
|------|---------|---------|
|
||||
| Node.js | 20+ | `nvm install 20` |
|
||||
| pnpm | 8+ | `npm i -g pnpm` |
|
||||
| Docker | 24+ | [docker.com](https://docker.com) |
|
||||
| PostgreSQL | 16+ | via Docker (see below) |
|
||||
|
||||
### Setup (5 minutes)
|
||||
|
||||
```bash
|
||||
git clone https://github.com/org/repo
|
||||
cd repo
|
||||
pnpm install
|
||||
docker compose up -d
|
||||
cp .env.example .env
|
||||
pnpm db:migrate
|
||||
pnpm db:seed
|
||||
pnpm dev
|
||||
pnpm test
|
||||
```
|
||||
|
||||
### Verify it works
|
||||
|
||||
- [ ] App loads on localhost
|
||||
- [ ] Health endpoint returns ok
|
||||
- [ ] Tests pass
|
||||
|
||||
---
|
||||
|
||||
## Architecture
|
||||
|
||||
### System Overview
|
||||
|
||||
```
|
||||
Browser / Mobile
|
||||
|
|
||||
v
|
||||
[Next.js App] <- [Auth]
|
||||
|
|
||||
+-> [PostgreSQL]
|
||||
+-> [Redis]
|
||||
+-> [S3]
|
||||
```
|
||||
|
||||
### Tech Stack
|
||||
|
||||
| Layer | Technology | Why |
|
||||
|-------|-----------|-----|
|
||||
| Frontend | Next.js | SSR + routing |
|
||||
| Styling | Tailwind + shadcn/ui | Rapid UI |
|
||||
| API | Route handlers | Co-location |
|
||||
| Database | PostgreSQL | Relational |
|
||||
| Queue | BullMQ + Redis | Background jobs |
|
||||
|
||||
---
|
||||
|
||||
## Key Files
|
||||
|
||||
| Path | Purpose |
|
||||
|------|---------|
|
||||
| `app/` | Pages and route handlers |
|
||||
| `src/db/` | Schema and migrations |
|
||||
| `src/lib/` | Shared utilities |
|
||||
| `tests/` | Test suites and helpers |
|
||||
| `.env.example` | Required variables |
|
||||
|
||||
---
|
||||
|
||||
## Common Developer Tasks
|
||||
|
||||
### Add a new API endpoint
|
||||
|
||||
```bash
|
||||
touch app/api/my-resource/route.ts
|
||||
touch tests/api/my-resource.test.ts
|
||||
```
|
||||
|
||||
### Run a database migration
|
||||
|
||||
```bash
|
||||
pnpm db:generate
|
||||
pnpm db:migrate
|
||||
```
|
||||
|
||||
### Add a background job
|
||||
|
||||
```bash
|
||||
# Create worker module and enqueue path
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Debugging Guide
|
||||
|
||||
### Common Errors
|
||||
|
||||
- Missing environment variable
|
||||
- Database connectivity failure
|
||||
- Expired auth token
|
||||
- Generic 500 in local dev
|
||||
|
||||
### Useful SQL Queries
|
||||
|
||||
- Slow query checks
|
||||
- Connection status
|
||||
- Table bloat checks
|
||||
|
||||
### Log Locations
|
||||
|
||||
| Environment | Logs |
|
||||
|-------------|------|
|
||||
| Local dev | local terminal |
|
||||
| Production | platform logs |
|
||||
| Worker | worker process logs |
|
||||
|
||||
---
|
||||
|
||||
## Contribution Guidelines
|
||||
|
||||
### Branch Strategy
|
||||
|
||||
- `main` protected
|
||||
- feature/fix branches with ticket IDs
|
||||
|
||||
### PR Requirements
|
||||
|
||||
- CI green
|
||||
- Tests updated
|
||||
- Why documented
|
||||
- Self-review completed
|
||||
|
||||
### Commit Convention
|
||||
|
||||
- `feat(scope): ...`
|
||||
- `fix(scope): ...`
|
||||
- `docs: ...`
|
||||
|
||||
---
|
||||
|
||||
## Audience-Specific Notes
|
||||
|
||||
### Junior Developers
|
||||
- Start with core auth/data modules
|
||||
- Follow tests as executable examples
|
||||
|
||||
### Senior Engineers
|
||||
- Read ADRs and scaling notes first
|
||||
- Validate performance/security assumptions early
|
||||
|
||||
### Contractors
|
||||
- Stay within scoped feature boundaries
|
||||
- Use wrappers for external integrations
|
||||
```
|
||||
|
||||
## Usage Notes
|
||||
|
||||
- Keep onboarding setup under 10 minutes where possible.
|
||||
- Include executable verification checks after each setup phase.
|
||||
- Prefer links to canonical docs instead of duplicating long content.
|
||||
- Update this template when stack conventions or tooling change.
|
||||
205
engineering/codebase-onboarding/scripts/codebase_analyzer.py
Executable file
205
engineering/codebase-onboarding/scripts/codebase_analyzer.py
Executable file
@@ -0,0 +1,205 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Generate a compact onboarding summary for a codebase (stdlib only)."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
from collections import Counter
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterable, List
|
||||
|
||||
IGNORED_DIRS = {
|
||||
".git",
|
||||
"node_modules",
|
||||
".next",
|
||||
"dist",
|
||||
"build",
|
||||
"coverage",
|
||||
"venv",
|
||||
".venv",
|
||||
"__pycache__",
|
||||
}
|
||||
|
||||
EXT_TO_LANG = {
|
||||
".py": "Python",
|
||||
".ts": "TypeScript",
|
||||
".tsx": "TypeScript",
|
||||
".js": "JavaScript",
|
||||
".jsx": "JavaScript",
|
||||
".go": "Go",
|
||||
".rs": "Rust",
|
||||
".java": "Java",
|
||||
".kt": "Kotlin",
|
||||
".rb": "Ruby",
|
||||
".php": "PHP",
|
||||
".cs": "C#",
|
||||
".c": "C",
|
||||
".cpp": "C++",
|
||||
".h": "C/C++",
|
||||
".swift": "Swift",
|
||||
".sql": "SQL",
|
||||
".sh": "Shell",
|
||||
}
|
||||
|
||||
KEY_CONFIG_FILES = [
|
||||
"package.json",
|
||||
"pnpm-workspace.yaml",
|
||||
"turbo.json",
|
||||
"nx.json",
|
||||
"lerna.json",
|
||||
"tsconfig.json",
|
||||
"next.config.js",
|
||||
"next.config.mjs",
|
||||
"pyproject.toml",
|
||||
"requirements.txt",
|
||||
"go.mod",
|
||||
"Cargo.toml",
|
||||
"docker-compose.yml",
|
||||
"Dockerfile",
|
||||
".github/workflows",
|
||||
]
|
||||
|
||||
|
||||
def iter_files(root: Path) -> Iterable[Path]:
    """Lazily yield every regular file found under *root*.

    Directories whose name appears in ``IGNORED_DIRS`` are removed from the
    walk, so nothing beneath them is visited.

    Args:
        root: Directory at which the walk starts.

    Yields:
        The path of each entry that ``Path.is_file()`` reports as a file.
    """
    for current_dir, subdirs, names in os.walk(root):
        # Assigning through the slice edits the list os.walk itself holds,
        # which is what stops it from descending into ignored directories.
        subdirs[:] = [entry for entry in subdirs if entry not in IGNORED_DIRS]
        base = Path(current_dir)
        candidates = (base / name for name in names)
        yield from (candidate for candidate in candidates if candidate.is_file())
|
||||
|
||||
|
||||
def detect_languages(paths: Iterable[Path]) -> Dict[str, int]:
    """Count files per language, keyed off each path's lower-cased suffix.

    Args:
        paths: File paths to classify through ``EXT_TO_LANG``.

    Returns:
        A mapping of language name to file count, ordered by descending
        count and then alphabetically by name. Paths whose suffix is not in
        ``EXT_TO_LANG`` are left out.
    """
    looked_up = (EXT_TO_LANG.get(entry.suffix.lower()) for entry in paths)
    tally = Counter(name for name in looked_up if name)
    # Negating the count sorts largest-first while names still break ties A-Z.
    ordered = sorted(tally.items(), key=lambda pair: (-pair[1], pair[0]))
    return dict(ordered)
|
||||
|
||||
|
||||
def find_key_configs(root: Path) -> List[str]:
    """List the entries of ``KEY_CONFIG_FILES`` that exist directly under *root*.

    Args:
        root: Project directory against which each relative entry is resolved.

    Returns:
        The matching relative paths, in the same order as ``KEY_CONFIG_FILES``.
    """
    return [candidate for candidate in KEY_CONFIG_FILES if (root / candidate).exists()]
|
||||
|
||||
|
||||
def top_level_structure(root: Path, max_depth: int) -> List[str]:
    """Render the directory tree under *root* as indented text lines.

    Directories are walked top-down in sorted order. Every directory below
    the root contributes a ``name/`` line, followed by at most ten of its
    files in sorted order; files whose name starts with ``.`` are omitted.
    Directories named in ``IGNORED_DIRS`` are never entered.

    Args:
        root: Directory to describe. It gets no line of its own, but its
            files are listed.
        max_depth: Deepest directory level to list, with the root at level 0.

    Returns:
        The rendered lines, indented according to directory depth.
    """
    lines: List[str] = []
    for dirpath, dirnames, filenames in os.walk(root):
        rel = Path(dirpath).relative_to(root)
        # The root is relative to itself as ".", which has no parts.
        depth = len(rel.parts)
        if depth > max_depth:
            # Descent stops at max_depth below, so this only triggers for the
            # root itself when max_depth is negative.
            dirnames[:] = []
            continue

        indent = " " * depth
        if rel.parts:
            lines.append(f"{indent}{rel.name}/")

        visible_files = [f for f in sorted(filenames) if not f.startswith(".")]
        lines.extend(f"{indent} {filename}" for filename in visible_files[:10])

        if depth == max_depth:
            # Children would be too deep to list; pruning here avoids reading
            # a whole extra level of directories only to discard it.
            dirnames[:] = []
        else:
            # In-place assignment both prunes ignored directories and fixes
            # the order in which os.walk visits the remaining ones.
            dirnames[:] = sorted(d for d in dirnames if d not in IGNORED_DIRS)
    return lines
|
||||
|
||||
|
||||
def build_report(root: Path, max_depth: int) -> Dict[str, object]:
    """Collect the onboarding facts for the project rooted at *root*.

    Args:
        root: Project directory to scan.
        max_depth: Depth limit handed to ``top_level_structure``.

    Returns:
        A dict with the keys ``root``, ``file_count``, ``languages``,
        ``key_config_files``, ``top_extensions`` (the 12 most common
        suffixes), ``largest_files`` (up to 20 ``(relative path, size)``
        pairs, biggest first) and ``directory_structure``.
    """
    files = list(iter_files(root))
    languages = detect_languages(files)

    # Files without a suffix are grouped under a readable placeholder.
    extension_tally = Counter()
    for entry in files:
        extension_tally[entry.suffix.lower() or "<no-ext>"] += 1

    sized = [(str(entry.relative_to(root)), entry.stat().st_size) for entry in files]
    sized.sort(key=lambda pair: pair[1], reverse=True)
    largest = sized[:20]

    return {
        "root": str(root),
        "file_count": len(files),
        "languages": languages,
        "key_config_files": find_key_configs(root),
        "top_extensions": dict(extension_tally.most_common(12)),
        "largest_files": largest,
        "directory_structure": top_level_structure(root, max_depth),
    }
|
||||
|
||||
|
||||
def format_size(num_bytes: int) -> str:
    """Format a byte count with one decimal place and a B/KB/MB/GB unit.

    The value is divided by 1024 until it drops below 1024 or the largest
    unit is reached, so anything of 1024 GB or more is still reported in GB.

    Args:
        num_bytes: Size in bytes.

    Returns:
        The formatted size, for example ``"1.5KB"``.
    """
    units = ("B", "KB", "MB", "GB")
    value = float(num_bytes)
    for unit in units[:-1]:
        if value < 1024:
            return f"{value:.1f}{unit}"
        value /= 1024
    # Whatever is left is expressed in the largest unit, however big it is.
    return f"{value:.1f}{units[-1]}"
|
||||
|
||||
|
||||
def print_text(report: Dict[str, object]) -> None:
    """Print *report* to stdout as a plain-text summary.

    Sections appear in a fixed order: header, languages, key config files,
    largest files (first 10) and directory structure (first 200 lines).

    Args:
        report: Mapping that provides the keys read below: ``root``,
            ``file_count``, ``languages``, ``key_config_files``,
            ``largest_files`` and ``directory_structure``.
    """
    print("Codebase Onboarding Summary")
    print(f"Root: {report['root']}")
    print(f"Total files: {report['file_count']}")
    print("")

    print("Languages detected")
    if not report["languages"]:
        print("- No recognized source file extensions")
    else:
        for name, total in report["languages"].items():
            print(f"- {name}: {total}")
    print("")

    print("Key config files")
    found_configs = report["key_config_files"]
    if not found_configs:
        print("- None found from default checklist")
    else:
        for config_name in found_configs:
            print(f"- {config_name}")
    print("")

    print("Largest files")
    biggest = report["largest_files"][:10]
    for rel_path, byte_count in biggest:
        print(f"- {rel_path}: {format_size(byte_count)}")
    print("")

    print("Directory structure")
    tree_lines = report["directory_structure"][:200]
    for tree_line in tree_lines:
        print(tree_line)
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse the command-line arguments of the analyzer.

    Returns:
        A namespace with ``path`` (positional), ``max_depth`` (int,
        default 2) and ``json`` (True when ``--json`` is given).
    """
    cli = argparse.ArgumentParser(
        description="Scan a repository and generate onboarding summary facts.",
    )
    # Each entry holds the flag names and the keyword options for one argument.
    argument_specs = (
        (("path",), {"help": "Path to project directory"}),
        (
            ("--max-depth",),
            {"type": int, "default": 2, "help": "Max depth for structure output (default: 2)"},
        ),
        (("--json",), {"action": "store_true", "help": "Print JSON output"}),
    )
    for flags, options in argument_specs:
        cli.add_argument(*flags, **options)
    return cli.parse_args()
|
||||
|
||||
|
||||
def main() -> int:
    """Run the analyzer: parse arguments, build the report and print it.

    Returns:
        ``0`` once the report has been printed.

    Raises:
        SystemExit: If the given path does not resolve to a directory.
    """
    options = parse_args()
    target = Path(options.path).expanduser().resolve()
    # is_dir() is already False for a path that does not exist.
    if not target.is_dir():
        raise SystemExit(f"Path is not a directory: {target}")

    # The depth is clamped so at least one directory level is listed.
    depth_limit = max(1, options.max_depth)
    summary = build_report(target, max_depth=depth_limit)

    if not options.json:
        print_text(summary)
        return 0
    print(json.dumps(summary, indent=2))
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
@@ -13,325 +13,66 @@ description: "Env & Secrets Manager"
|
||||
|
||||
## Overview
|
||||
|
||||
Complete environment and secrets management workflow: .env file lifecycle across dev/staging/prod,
|
||||
.env.example auto-generation, required-var validation, secret leak detection in git history, and
|
||||
credential rotation playbook. Integrates with HashiCorp Vault, AWS SSM, 1Password CLI, and Doppler.
|
||||
|
||||
---
|
||||
Manage environment-variable hygiene and secrets safety across local development and production workflows. This skill focuses on practical auditing, drift awareness, and rotation readiness.
|
||||
|
||||
## Core Capabilities
|
||||
|
||||
- **.env lifecycle** — create, validate, sync across environments
|
||||
- **.env.example generation** — strip values, preserve keys and comments
|
||||
- **Validation script** — fail-fast on missing required vars at startup
|
||||
- **Secret leak detection** — regex scan of git history and working tree
|
||||
- **Rotation workflow** — detect → scope → rotate → deploy → verify
|
||||
- **Secret manager integrations** — Vault KV v2, AWS SSM, 1Password, Doppler
|
||||
- `.env` and `.env.example` lifecycle guidance
|
||||
- Secret leak detection for repository working trees
|
||||
- Severity-based findings for likely credentials
|
||||
- Operational pointers for rotation and containment
|
||||
- Integration-ready outputs for CI checks
|
||||
|
||||
---
|
||||
|
||||
## When to Use
|
||||
|
||||
- Setting up a new project — scaffold .env.example and validation
|
||||
- Before every commit — scan for accidentally staged secrets
|
||||
- Post-incident response — leaked credential rotation procedure
|
||||
- Onboarding new developers — they need all vars, not just some
|
||||
- Environment drift investigation — prod behaving differently from staging
|
||||
- Before pushing commits that touched env/config files
|
||||
- During security audits and incident triage
|
||||
- When onboarding contributors who need safe env conventions
|
||||
- When validating that no obvious secrets are hardcoded
|
||||
|
||||
---
|
||||
|
||||
## .env File Structure
|
||||
## Quick Start
|
||||
|
||||
### Canonical Layout
|
||||
```bash
|
||||
# .env.example — committed to git (no values)
|
||||
# .env.local — developer machine (gitignored)
|
||||
# .env.staging — CI/CD or secret manager reference
|
||||
# .env.prod — never on disk; pulled from secret manager at runtime
|
||||
# Scan a repository for likely secret leaks
|
||||
python3 scripts/env_auditor.py /path/to/repo
|
||||
|
||||
# Application
|
||||
APP_NAME=
|
||||
APP_ENV= # dev | staging | prod
|
||||
APP_PORT=3000 # default port if not set
|
||||
APP_SECRET= # REQUIRED: JWT signing secret (min 32 chars)
|
||||
APP_URL= # REQUIRED: public base URL
|
||||
|
||||
# Database
|
||||
DATABASE_URL= # REQUIRED: full connection string
|
||||
DATABASE_POOL_MIN=2
|
||||
DATABASE_POOL_MAX=10
|
||||
|
||||
# Auth
|
||||
AUTH_JWT_SECRET= # REQUIRED
|
||||
AUTH_JWT_EXPIRY=3600 # seconds
|
||||
AUTH_REFRESH_SECRET= # REQUIRED
|
||||
|
||||
# Third-party APIs
|
||||
STRIPE_SECRET_KEY= # REQUIRED in prod
|
||||
STRIPE_WEBHOOK_SECRET= # REQUIRED in prod
|
||||
SENDGRID_API_KEY=
|
||||
|
||||
# Storage
|
||||
AWS_ACCESS_KEY_ID=
|
||||
AWS_SECRET_ACCESS_KEY=
|
||||
AWS_REGION=eu-central-1
|
||||
AWS_S3_BUCKET=
|
||||
|
||||
# Monitoring
|
||||
SENTRY_DSN=
|
||||
DD_API_KEY=
|
||||
# JSON output for CI pipelines
|
||||
python3 scripts/env_auditor.py /path/to/repo --json
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## .gitignore Patterns
|
||||
## Recommended Workflow
|
||||
|
||||
Add to your project's `.gitignore`:
|
||||
|
||||
```gitignore
|
||||
# Environment files — NEVER commit these
|
||||
.env
|
||||
.env.local
|
||||
.env.development
|
||||
.env.development.local
|
||||
.env.test.local
|
||||
.env.staging
|
||||
.env.staging.local
|
||||
.env.production
|
||||
.env.production.local
|
||||
.env.prod
|
||||
.env.*.local
|
||||
|
||||
# Secret files
|
||||
*.pem
|
||||
*.key
|
||||
*.p12
|
||||
*.pfx
|
||||
secrets.json
|
||||
secrets.yaml
|
||||
secrets.yml
|
||||
credentials.json
|
||||
service-account.json
|
||||
|
||||
# AWS
|
||||
.aws/credentials
|
||||
|
||||
# Terraform state (may contain secrets)
|
||||
*.tfstate
|
||||
*.tfstate.backup
|
||||
.terraform/
|
||||
|
||||
# Kubernetes secrets
|
||||
*-secret.yaml
|
||||
*-secrets.yaml
|
||||
```
|
||||
1. Run `scripts/env_auditor.py` on the repository root.
|
||||
2. Prioritize `critical` and `high` findings first.
|
||||
3. Rotate real credentials and remove exposed values.
|
||||
4. Update `.env.example` and `.gitignore` as needed.
|
||||
5. Add or tighten pre-commit/CI secret scanning gates.
|
||||
|
||||
---
|
||||
|
||||
## .env.example Auto-Generation
|
||||
## Reference Docs
|
||||
|
||||
```bash
|
||||
#!/bin/bash
|
||||
# scripts/gen-env-example.sh
|
||||
# Strips values from .env, preserves keys, defaults, and comments
|
||||
|
||||
INPUT="${1:-.env}"
|
||||
OUTPUT="${2:-.env.example}"
|
||||
|
||||
if [ ! -f "$INPUT" ]; then
|
||||
echo "ERROR: $INPUT not found"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
python3 - "$INPUT" "$OUTPUT" << 'PYEOF'
|
||||
import sys, re
|
||||
|
||||
input_file = sys.argv[1]
|
||||
output_file = sys.argv[2]
|
||||
lines = []
|
||||
|
||||
with open(input_file) as f:
|
||||
for line in f:
|
||||
stripped = line.rstrip('\n')
|
||||
# Keep blank lines and comments as-is
|
||||
if stripped == '' or stripped.startswith('#'):
|
||||
lines.append(stripped)
|
||||
continue
|
||||
# Match KEY=VALUE or KEY="VALUE"
|
||||
m = re.match(r'^([A-Z_][A-Z0-9_]*)=(.*)$', stripped)
|
||||
if m:
|
||||
key = m.group(1)
|
||||
value = m.group(2).strip('"\'')
|
||||
# Keep non-sensitive defaults (ports, regions, feature flags)
|
||||
safe_defaults = re.compile(
|
||||
r'^(APP_PORT|APP_ENV|APP_NAME|AWS_REGION|DATABASE_POOL_|LOG_LEVEL|'
|
||||
r'FEATURE_|CACHE_TTL|RATE_LIMIT_|PAGINATION_|TIMEOUT_)',
|
||||
re.I
|
||||
)
|
||||
sensitive = re.compile(
|
||||
r'(SECRET|KEY|TOKEN|PASSWORD|PASS|CREDENTIAL|DSN|AUTH|PRIVATE|CERT)',
|
||||
re.I
|
||||
)
|
||||
if safe_defaults.match(key) and value:
|
||||
lines.append(f"{key}={value} # default")
|
||||
else:
|
||||
lines.append(f"{key}=")
|
||||
else:
|
||||
lines.append(stripped)
|
||||
|
||||
with open(output_file, 'w') as f:
|
||||
f.write('\n'.join(lines) + '\n')
|
||||
|
||||
print(f"Generated {output_file} from {input_file}")
|
||||
PYEOF
|
||||
```
|
||||
|
||||
Usage:
|
||||
```bash
|
||||
bash scripts/gen-env-example.sh .env .env.example
|
||||
# Commit .env.example, never .env
|
||||
git add .env.example
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Required Variable Validation Script
|
||||
→ See references/validation-detection-rotation.md for details
|
||||
|
||||
## Secret Manager Integrations
|
||||
|
||||
### HashiCorp Vault KV v2
|
||||
```bash
|
||||
# Setup
|
||||
export VAULT_ADDR="https://vault.internal.company.com"
|
||||
export VAULT_TOKEN="$(vault login -method=oidc -format=json | jq -r '.auth.client_token')"
|
||||
|
||||
# Write secrets
|
||||
vault kv put secret/myapp/prod \
|
||||
DATABASE_URL="postgres://user:pass@host/db" \
|
||||
APP_SECRET="$(openssl rand -base64 32)"
|
||||
|
||||
# Read secrets into env
|
||||
eval $(vault kv get -format=json secret/myapp/prod | \
|
||||
jq -r '.data.data | to_entries[] | "export \(.key)=\(.value)"')
|
||||
|
||||
# In CI/CD (GitHub Actions)
|
||||
# Use vault-action: hashicorp/vault-action@v2
|
||||
```
|
||||
|
||||
### AWS SSM Parameter Store
|
||||
```bash
|
||||
# Write (SecureString = encrypted with KMS)
|
||||
aws ssm put-parameter \
|
||||
--name "/myapp/prod/DATABASE_URL" \
|
||||
--value "postgres://..." \
|
||||
--type "SecureString" \
|
||||
--key-id "alias/myapp-secrets"
|
||||
|
||||
# Read all params for an app/env into shell
|
||||
eval $(aws ssm get-parameters-by-path \
|
||||
--path "/myapp/prod/" \
|
||||
--with-decryption \
|
||||
--query "Parameters[*].[Name,Value]" \
|
||||
--output text | \
|
||||
awk '{split($1,a,"/"); print "export " a[length(a)] "=\"" $2 "\""}')
|
||||
|
||||
# In Node.js at startup
|
||||
# Use @aws-sdk/client-ssm to pull params before server starts
|
||||
```
|
||||
|
||||
### 1Password CLI
|
||||
```bash
|
||||
# Authenticate
|
||||
eval $(op signin)
|
||||
|
||||
# Get a specific field
|
||||
op read "op://MyVault/MyApp Prod/STRIPE_SECRET_KEY"
|
||||
|
||||
# Export all fields from an item as env vars
|
||||
op item get "MyApp Prod" --format json | \
|
||||
jq -r '.fields[] | select(.value != null) | "export \(.label)=\"\(.value)\""' | \
|
||||
grep -E "^export [A-Z_]+" | source /dev/stdin
|
||||
|
||||
# .env injection
|
||||
op inject -i .env.tpl -o .env
|
||||
# .env.tpl uses {{ op://Vault/Item/field }} syntax
|
||||
```
|
||||
|
||||
### Doppler
|
||||
```bash
|
||||
# Setup
|
||||
doppler setup # interactive: select project + config
|
||||
|
||||
# Run any command with secrets injected
|
||||
doppler run -- node server.js
|
||||
doppler run -- npm run dev
|
||||
|
||||
# Export to .env (local dev only — never commit output)
|
||||
doppler secrets download --no-file --format env > .env.local
|
||||
|
||||
# Pull specific secret
|
||||
doppler secrets get DATABASE_URL --plain
|
||||
|
||||
# Sync to another environment
|
||||
doppler secrets upload --project myapp --config staging < .env.staging.example
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Environment Drift Detection
|
||||
|
||||
Check if staging and prod have the same set of keys (values may differ):
|
||||
|
||||
```bash
|
||||
#!/bin/bash
|
||||
# scripts/check-env-drift.sh
|
||||
|
||||
# Pull key names from both environments (not values)
|
||||
STAGING_KEYS=$(doppler secrets --project myapp --config staging --format json 2>/dev/null | \
|
||||
jq -r 'keys[]' | sort)
|
||||
PROD_KEYS=$(doppler secrets --project myapp --config prod --format json 2>/dev/null | \
|
||||
jq -r 'keys[]' | sort)
|
||||
|
||||
ONLY_IN_STAGING=$(comm -23 <(echo "$STAGING_KEYS") <(echo "$PROD_KEYS"))
|
||||
ONLY_IN_PROD=$(comm -13 <(echo "$STAGING_KEYS") <(echo "$PROD_KEYS"))
|
||||
|
||||
if [ -n "$ONLY_IN_STAGING" ]; then
|
||||
echo "Keys in STAGING but NOT in PROD:"
|
||||
echo "$ONLY_IN_STAGING" | sed 's/^/ /'
|
||||
fi
|
||||
|
||||
if [ -n "$ONLY_IN_PROD" ]; then
|
||||
echo "Keys in PROD but NOT in STAGING:"
|
||||
echo "$ONLY_IN_PROD" | sed 's/^/ /'
|
||||
fi
|
||||
|
||||
if [ -z "$ONLY_IN_STAGING" ] && [ -z "$ONLY_IN_PROD" ]; then
|
||||
echo "✅ No env drift detected — staging and prod have identical key sets"
|
||||
fi
|
||||
```
|
||||
- `references/validation-detection-rotation.md`
|
||||
- `references/secret-patterns.md`
|
||||
|
||||
---
|
||||
|
||||
## Common Pitfalls
|
||||
|
||||
- **Committing .env instead of .env.example** — add `.env` to .gitignore on day 1; use pre-commit hooks
|
||||
- **Storing secrets in CI/CD logs** — never `echo $SECRET`; mask vars in CI settings
|
||||
- **Rotating only one place** — secrets often appear in Heroku, Vercel, Docker, K8s, CI — update ALL
|
||||
- **Forgetting to invalidate sessions after JWT secret rotation** — all users will be logged out; communicate this
|
||||
- **Using .env.example with real values** — example files are public; strip everything sensitive
|
||||
- **Not monitoring after rotation** — watch audit logs for 24h after rotation to catch unauthorized old-credential use
|
||||
- **Weak secrets** — `APP_SECRET=mysecret` is not a secret. Use `openssl rand -base64 32`
|
||||
|
||||
---
|
||||
- Committing real values in `.env.example`
|
||||
- Rotating one system but missing downstream consumers
|
||||
- Logging secrets during debugging or incident response
|
||||
- Treating suspected leaks as low urgency without validation
|
||||
|
||||
## Best Practices
|
||||
|
||||
1. **Secret manager is source of truth** — .env files are for local dev only; never in prod
|
||||
2. **Rotate on a schedule**, not just after incidents — quarterly minimum for long-lived keys
|
||||
3. **Principle of least privilege** — each service gets its own API key with minimal permissions
|
||||
4. **Audit access** — log every secret read in Vault/SSM; alert on anomalous access
|
||||
5. **Never log secrets** — add log scrubbing middleware that redacts known secret patterns
|
||||
6. **Use short-lived credentials** — prefer OIDC/instance roles over long-lived access keys
|
||||
7. **Separate secrets per environment** — never share a key between dev and prod
|
||||
8. **Document rotation runbooks** — before an incident, not during one
|
||||
1. Use a secret manager as the production source of truth.
|
||||
2. Keep dev env files local and gitignored.
|
||||
3. Enforce detection in CI before merge.
|
||||
4. Re-test application paths immediately after credential rotation.
|
||||
|
||||
@@ -0,0 +1,41 @@
|
||||
# Secret Pattern Reference
|
||||
|
||||
## Detection Categories
|
||||
|
||||
### Critical
|
||||
|
||||
- OpenAI-like keys (`sk-...`)
|
||||
- GitHub personal access tokens (`ghp_...`)
|
||||
- AWS access key IDs (`AKIA...`)
|
||||
|
||||
### High
|
||||
|
||||
- Slack tokens (`xox...`)
|
||||
- Private key PEM blocks
|
||||
- Hardcoded assignments to `secret`, `token`, `password`, `api_key`
|
||||
|
||||
### Medium
|
||||
|
||||
- JWT-like tokens in plaintext
|
||||
- Suspected credentials in docs/scripts that should be redacted
|
||||
|
||||
## Severity Guidance
|
||||
|
||||
- `critical`: immediate rotation required; treat as active incident
|
||||
- `high`: likely sensitive; investigate, and rotate if it is a real credential
|
||||
- `medium`: possible exposure; verify context and sanitize where needed
|
||||
|
||||
## Response Playbook
|
||||
|
||||
1. Revoke or rotate exposed credential.
|
||||
2. Identify blast radius (services, environments, users).
|
||||
3. Remove from code/history where possible.
|
||||
4. Add preventive controls (pre-commit hooks, CI secret scans).
|
||||
5. Verify monitoring and access logs for abuse.
|
||||
|
||||
## Preventive Baseline
|
||||
|
||||
- Commit only `.env.example`, never `.env`.
|
||||
- Keep `.gitignore` patterns for env and key material.
|
||||
- Use secret managers for staging/prod.
|
||||
- Redact sensitive values from logs and debug output.
|
||||
145
engineering/env-secrets-manager/scripts/env_auditor.py
Executable file
145
engineering/env-secrets-manager/scripts/env_auditor.py
Executable file
@@ -0,0 +1,145 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Scan env files and source code for likely secret exposure patterns."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterable, List
|
||||
|
||||
# Directory names that iter_files prunes from the os.walk traversal, so
# nothing beneath them is ever yielded or scanned.
IGNORED_DIRS = {
    ".git", "node_modules", ".next",
    "dist", "build", "coverage",
    "venv", ".venv", "__pycache__",
}
|
||||
|
||||
# Lower-cased file suffixes that is_candidate accepts for scanning. Files whose
# *name* starts with ".env" are accepted separately by is_candidate itself.
SOURCE_EXTS = {
    ".env",
    ".py", ".ts", ".tsx", ".js", ".jsx",
    ".json", ".yaml", ".yml", ".toml", ".ini",
    ".sh", ".md",
}
|
||||
|
||||
# Detection rules as (severity, rule name, compiled regex) triples. scan_file
# applies every rule to every line independently, so one line can yield
# several findings.
PATTERNS = [
    ("critical", "openai_key", re.compile(r"\bsk-[A-Za-z0-9]{20,}\b")),
    ("critical", "github_pat", re.compile(r"\bghp_[A-Za-z0-9]{20,}\b")),
    ("critical", "aws_access_key_id", re.compile(r"\bAKIA[0-9A-Z]{16}\b")),
    ("high", "slack_token", re.compile(r"\bxox[baprs]-[A-Za-z0-9-]{10,}\b")),
    # DSA and ENCRYPTED (PKCS#8) labels are accepted alongside the original
    # RSA / EC / OPENSSH / unlabelled forms.
    (
        "high",
        "private_key_block",
        re.compile(r"-----BEGIN (RSA |EC |DSA |OPENSSH |ENCRYPTED )?PRIVATE KEY-----"),
    ),
    # The keyword must be a whole "_"/"-" delimited segment of the identifier.
    # A plain \b...\b does not work here: "_" is a regex word character, so
    # names such as APP_SECRET, DB_PASSWORD or SECRET_KEY would never match.
    # Segment matching still rejects look-alikes such as "tokenizer" or
    # "secretary". The value must be at least 8 characters of the allowed set.
    (
        "high",
        "generic_secret_assignment",
        re.compile(
            r"(?i)(?<![A-Za-z0-9])(?:[A-Za-z0-9]+[_-])*"
            r"(secret|token|password|passwd|api[_-]?key)"
            r"(?:[_-][A-Za-z0-9]+)*"
            r"\s*[:=]\s*['\"]?[A-Za-z0-9_\-\/.+=]{8,}"
        ),
    ),
    (
        "medium",
        "jwt_like",
        re.compile(r"\beyJ[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}\b"),
    ),
]
|
||||
|
||||
|
||||
def iter_files(root: Path) -> Iterable[Path]:
    """Yield each regular file beneath ``root``, skipping ignored directories.

    Directories whose name is in ``IGNORED_DIRS`` are removed from the walk,
    so nothing inside them is visited.
    """
    for current_dir, subdirs, file_names in os.walk(root):
        # Slice-assign so os.walk sees the pruned list and does not descend.
        subdirs[:] = [sub for sub in subdirs if sub not in IGNORED_DIRS]
        base = Path(current_dir)
        for file_name in file_names:
            candidate = base / file_name
            if not candidate.is_file():
                continue
            yield candidate
|
||||
|
||||
|
||||
def is_candidate(path: Path) -> bool:
    """Return True when ``path`` should be scanned.

    A file qualifies if its name starts with ``.env`` or its lower-cased
    suffix is listed in ``SOURCE_EXTS``.
    """
    named_like_env = path.name.startswith(".env")
    return named_like_env or path.suffix.lower() in SOURCE_EXTS
|
||||
|
||||
|
||||
def scan_file(path: Path, max_bytes: int, root: Path) -> List[Dict[str, object]]:
    """Scan one file line by line and report every rule in ``PATTERNS`` that matches.

    Args:
        path: File to scan.
        max_bytes: Files larger than this many bytes are skipped.
        root: Directory that reported file paths are made relative to.

    Returns:
        One dict per (line, rule) match with the keys ``severity``,
        ``pattern``, ``file``, ``line`` (1-based) and ``snippet`` (the
        stripped line, truncated to 180 characters). The list is empty when
        the file is too large, cannot be read, or nothing matches.

    Raises:
        ValueError: If a match is found and ``path`` is not located under
            ``root`` (raised by ``Path.relative_to``).
    """
    findings: List[Dict[str, object]] = []
    try:
        if path.stat().st_size > max_bytes:
            return findings
        # errors="ignore" drops undecodable bytes, so decoding cannot fail.
        text = path.read_text(encoding="utf-8", errors="ignore")
    except OSError:
        # Best-effort scan: a file that is unreadable or has disappeared is
        # skipped. Only filesystem errors are swallowed, so genuine
        # programming errors still surface.
        return findings

    for lineno, line in enumerate(text.splitlines(), start=1):
        for severity, kind, pattern in PATTERNS:
            if pattern.search(line):
                findings.append(
                    {
                        "severity": severity,
                        "pattern": kind,
                        "file": str(path.relative_to(root)),
                        "line": lineno,
                        "snippet": line.strip()[:180],
                    }
                )
    return findings
|
||||
|
||||
|
||||
def severity_counts(findings: List[Dict[str, object]]) -> Dict[str, int]:
    """Tally findings by their ``severity`` value.

    The result always contains ``critical``, ``high``, ``medium`` and ``low``
    (in that order). A finding without a ``severity`` key counts as ``low``;
    any other severity value gets its own key appended.
    """
    tally = dict.fromkeys(("critical", "high", "medium", "low"), 0)
    for entry in findings:
        level = str(entry.get("severity", "low"))
        tally[level] = tally.get(level, 0) + 1
    return tally
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse the command line: a repository path, a size cap in KB, and ``--json``."""
    cli = argparse.ArgumentParser(
        description="Audit a repository for likely secret leaks in env files and source."
    )
    cli.add_argument("path", help="Path to repository root")
    cli.add_argument(
        "--max-file-size-kb",
        type=int,
        default=512,
        help="Skip files larger than this size (default: 512)",
    )
    cli.add_argument("--json", action="store_true", help="Output JSON")
    return cli.parse_args()
|
||||
|
||||
|
||||
def main() -> int:
    """Run the audit over the path given on the command line and print a report.

    Exits via ``SystemExit`` with a message when the path is not an existing
    directory. Otherwise prints either a JSON document or a text report (the
    text report lists at most the first 200 findings) and returns 0.
    """
    args = parse_args()
    root = Path(args.path).expanduser().resolve()
    if not (root.exists() and root.is_dir()):
        raise SystemExit(f"Path is not a directory: {root}")

    # Clamp to at least 1 KB so a zero or negative flag value still scans.
    size_limit = max(1, args.max_file_size_kb) * 1024

    findings: List[Dict[str, object]] = []
    for candidate in filter(is_candidate, iter_files(root)):
        findings.extend(scan_file(candidate, max_bytes=size_limit, root=root))

    report = {
        "root": str(root),
        "total_findings": len(findings),
        "severity_counts": severity_counts(findings),
        "findings": findings,
    }

    if args.json:
        print(json.dumps(report, indent=2))
        return 0

    print("Env/Secrets Audit Report")
    print(f"Root: {report['root']}")
    print(f"Total findings: {report['total_findings']}")
    print("Severity:")
    for sev, count in report["severity_counts"].items():
        print(f"- {sev}: {count}")
    print("")
    for item in findings[:200]:
        print(f"[{item['severity'].upper()}] {item['file']}:{item['line']} ({item['pattern']})")
        print(f"  {item['snippet']}")

    return 0
|
||||
|
||||
|
||||
# Script entry point: the value returned by main() becomes the exit status.
if __name__ == "__main__":
    exit_status = main()
    raise SystemExit(exit_status)
|
||||
@@ -5,454 +5,55 @@ description: This skill should be used when the user asks to "design interview p
|
||||
|
||||
# Interview System Designer
|
||||
|
||||
Comprehensive interview system design, competency assessment, and hiring process optimization.
|
||||
Comprehensive interview loop planning and calibration support for role-based hiring systems.
|
||||
|
||||
## Table of Contents
|
||||
## Overview
|
||||
|
||||
- [Quick Start](#quick-start)
|
||||
- [Tools Overview](#tools-overview)
|
||||
- [Interview Loop Designer](#1-interview-loop-designer)
|
||||
- [Question Bank Generator](#2-question-bank-generator)
|
||||
- [Hiring Calibrator](#3-hiring-calibrator)
|
||||
- [Interview System Workflows](#interview-system-workflows)
|
||||
- [Role-Specific Loop Design](#role-specific-loop-design)
|
||||
- [Competency Matrix Development](#competency-matrix-development)
|
||||
- [Question Bank Creation](#question-bank-creation)
|
||||
- [Bias Mitigation Framework](#bias-mitigation-framework)
|
||||
- [Hiring Bar Calibration](#hiring-bar-calibration)
|
||||
- [Competency Frameworks](#competency-frameworks)
|
||||
- [Scoring & Calibration](#scoring--calibration)
|
||||
- [Reference Documentation](#reference-documentation)
|
||||
- [Industry Standards](#industry-standards)
|
||||
Use this skill to create structured interview loops, standardize question quality, and keep hiring signal consistent across interviewers.
|
||||
|
||||
---
|
||||
## Core Capabilities
|
||||
|
||||
- Interview loop planning by role and level
|
||||
- Round-by-round focus and timing recommendations
|
||||
- Suggested question sets by round type
|
||||
- Framework support for scoring and calibration
|
||||
- Bias-reduction and process consistency guidance
|
||||
|
||||
## Quick Start
|
||||
|
||||
```bash
|
||||
# Design a complete interview loop for a senior software engineer role
|
||||
python loop_designer.py --role "Senior Software Engineer" --level senior --team platform --output loops/
|
||||
# Generate a loop plan for a role and level
|
||||
python3 scripts/interview_planner.py --role "Senior Software Engineer" --level senior
|
||||
|
||||
# Generate a comprehensive question bank for a product manager position
|
||||
python question_bank_generator.py --role "Product Manager" --level senior --competencies leadership,strategy,analytics --output questions/
|
||||
|
||||
# Analyze interview calibration across multiple candidates and interviewers
|
||||
python hiring_calibrator.py --input interview_data.json --output calibration_report.json --analysis-type full
|
||||
# JSON output for integration with internal tooling
|
||||
python3 scripts/interview_planner.py --role "Product Manager" --level mid --json
|
||||
```
|
||||
|
||||
---
|
||||
## Recommended Workflow
|
||||
|
||||
## Tools Overview
|
||||
1. Run `scripts/interview_planner.py` to generate a baseline loop.
|
||||
2. Align rounds to role-specific competencies.
|
||||
3. Validate scoring rubric consistency with interview panel leads.
|
||||
4. Review for bias controls before rollout.
|
||||
5. Recalibrate quarterly using hiring outcome data.
|
||||
|
||||
### 1. Interview Loop Designer
|
||||
## References
|
||||
|
||||
Generates calibrated interview loops tailored to specific roles, levels, and teams.
|
||||
- `references/interview-frameworks.md`
|
||||
- `references/bias_mitigation_checklist.md`
|
||||
- `references/competency_matrix_templates.md`
|
||||
- `references/debrief_facilitation_guide.md`
|
||||
|
||||
**Input:** Role definition (title, level, team, competency requirements)
|
||||
**Output:** Complete interview loop with rounds, focus areas, time allocation, scorecard templates
|
||||
## Common Pitfalls
|
||||
|
||||
**Key Features:**
|
||||
- Role-specific competency mapping
|
||||
- Level-appropriate question difficulty
|
||||
- Interviewer skill requirements
|
||||
- Time-optimized scheduling
|
||||
- Standardized scorecards
|
||||
- Overweighting one round while ignoring other competency signals
|
||||
- Using unstructured interviews without standardized scoring
|
||||
- Skipping calibration sessions for interviewers
|
||||
- Changing hiring bar without documenting rationale
|
||||
|
||||
**Usage:**
|
||||
```bash
|
||||
# Design loop for a specific role
|
||||
python loop_designer.py --role "Staff Data Scientist" --level staff --team ml-platform
|
||||
## Best Practices
|
||||
|
||||
# Generate loop with specific focus areas
|
||||
python loop_designer.py --role "Engineering Manager" --level senior --competencies leadership,technical,strategy
|
||||
|
||||
# Create loop for multiple levels
|
||||
python loop_designer.py --role "Backend Engineer" --levels junior,mid,senior --output loops/backend/
|
||||
```
|
||||
|
||||
### 2. Question Bank Generator
|
||||
|
||||
Creates comprehensive, competency-based interview questions with detailed scoring criteria.
|
||||
|
||||
**Input:** Role requirements, competency areas, experience level
|
||||
**Output:** Structured question bank with scoring rubrics, follow-up probes, and calibration examples
|
||||
|
||||
**Key Features:**
|
||||
- Competency-based question organization
|
||||
- Level-appropriate difficulty progression
|
||||
- Behavioral and technical question types
|
||||
- Anti-bias question design
|
||||
- Calibration examples (poor/good/great answers)
|
||||
|
||||
**Usage:**
|
||||
```bash
|
||||
# Generate questions for technical competencies
|
||||
python question_bank_generator.py --role "Frontend Engineer" --competencies react,typescript,system-design
|
||||
|
||||
# Create behavioral question bank
|
||||
python question_bank_generator.py --role "Product Manager" --question-types behavioral,leadership --output pm_questions/
|
||||
|
||||
# Generate questions for all levels
|
||||
python question_bank_generator.py --role "DevOps Engineer" --levels junior,mid,senior,staff
|
||||
```
|
||||
|
||||
### 3. Hiring Calibrator
|
||||
|
||||
Analyzes interview scores to detect bias and calibration issues, and recommends improvements.
|
||||
|
||||
**Input:** Interview results data (candidate scores, interviewer feedback, demographics)
|
||||
**Output:** Calibration analysis, bias detection report, interviewer coaching recommendations
|
||||
|
||||
**Key Features:**
|
||||
- Statistical bias detection
|
||||
- Interviewer calibration analysis
|
||||
- Score distribution analysis
|
||||
- Recommendation engine
|
||||
- Trend tracking over time
|
||||
|
||||
**Usage:**
|
||||
```bash
|
||||
# Analyze calibration across all interviews
|
||||
python hiring_calibrator.py --input interview_results.json --analysis-type comprehensive
|
||||
|
||||
# Focus on specific competency areas
|
||||
python hiring_calibrator.py --input data.json --competencies technical,leadership --output bias_report.json
|
||||
|
||||
# Track calibration trends over time
|
||||
python hiring_calibrator.py --input historical_data.json --trend-analysis --period quarterly
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Interview System Workflows
|
||||
|
||||
### Role-Specific Loop Design
|
||||
|
||||
#### Software Engineering Roles
|
||||
|
||||
**Junior/Mid Software Engineer (2-4 years)**
|
||||
- **Duration:** 3-4 hours across 3-4 rounds
|
||||
- **Focus Areas:** Coding fundamentals, debugging, system understanding, growth mindset
|
||||
- **Rounds:**
|
||||
1. Technical Phone Screen (45min) - Coding fundamentals, algorithms
|
||||
2. Coding Deep Dive (60min) - Problem-solving, code quality, testing
|
||||
3. System Design Basics (45min) - Component interaction, basic scalability
|
||||
4. Behavioral & Values (30min) - Team collaboration, learning agility
|
||||
|
||||
**Senior Software Engineer (5-8 years)**
|
||||
- **Duration:** 4-5 hours across 4-5 rounds
|
||||
- **Focus Areas:** System design, technical leadership, mentoring capability, domain expertise
|
||||
- **Rounds:**
|
||||
1. Technical Phone Screen (45min) - Advanced algorithms, optimization
|
||||
2. System Design (60min) - Scalability, trade-offs, architectural decisions
|
||||
3. Coding Excellence (60min) - Code quality, testing strategies, refactoring
|
||||
4. Technical Leadership (45min) - Mentoring, technical decisions, cross-team collaboration
|
||||
5. Behavioral & Culture (30min) - Leadership examples, conflict resolution
|
||||
|
||||
**Staff+ Engineer (8+ years)**
|
||||
- **Duration:** 5-6 hours across 5-6 rounds
|
||||
- **Focus Areas:** Architectural vision, organizational impact, technical strategy, cross-functional leadership
|
||||
- **Rounds:**
|
||||
1. Technical Phone Screen (45min) - System architecture, complex problem-solving
|
||||
2. Architecture Design (90min) - Large-scale systems, technology choices, evolution patterns
|
||||
3. Technical Strategy (60min) - Technical roadmaps, technology adoption, risk assessment
|
||||
4. Leadership & Influence (60min) - Cross-team impact, technical vision, stakeholder management
|
||||
5. Coding & Best Practices (45min) - Code quality standards, development processes
|
||||
6. Cultural & Strategic Fit (30min) - Company values, strategic thinking
|
||||
|
||||
#### Product Management Roles
|
||||
|
||||
**Product Manager (3-6 years)**
|
||||
- **Duration:** 3-4 hours across 4 rounds
|
||||
- **Focus Areas:** Product sense, analytical thinking, stakeholder management, execution
|
||||
- **Rounds:**
|
||||
1. Product Sense (60min) - Feature prioritization, user empathy, market understanding
|
||||
2. Analytical Thinking (45min) - Data interpretation, metrics design, experimentation
|
||||
3. Execution & Process (45min) - Project management, cross-functional collaboration
|
||||
4. Behavioral & Leadership (30min) - Stakeholder management, conflict resolution
|
||||
|
||||
**Senior Product Manager (6-10 years)**
|
||||
- **Duration:** 4-5 hours across 4-5 rounds
|
||||
- **Focus Areas:** Product strategy, team leadership, business impact, market analysis
|
||||
- **Rounds:**
|
||||
1. Product Strategy (75min) - Market analysis, competitive positioning, roadmap planning
|
||||
2. Leadership & Influence (60min) - Team building, stakeholder management, decision-making
|
||||
3. Data & Analytics (45min) - Advanced metrics, experimentation design, business intelligence
|
||||
4. Technical Collaboration (45min) - Technical trade-offs, engineering partnership
|
||||
5. Case Study Presentation (45min) - Past impact, lessons learned, strategic thinking
|
||||
|
||||
#### Design Roles
|
||||
|
||||
**UX Designer (2-5 years)**
|
||||
- **Duration:** 3-4 hours across 3-4 rounds
|
||||
- **Focus Areas:** Design process, user research, visual design, collaboration
|
||||
- **Rounds:**
|
||||
1. Portfolio Review (60min) - Design process, problem-solving approach, visual skills
|
||||
2. Design Challenge (90min) - User-centered design, wireframing, iteration
|
||||
3. Collaboration & Process (45min) - Cross-functional work, feedback incorporation
|
||||
4. Behavioral & Values (30min) - User advocacy, creative problem-solving
|
||||
|
||||
**Senior UX Designer (5+ years)**
|
||||
- **Duration:** 4-5 hours across 4-5 rounds
|
||||
- **Focus Areas:** Design leadership, system thinking, research methodology, business impact
|
||||
- **Rounds:**
|
||||
1. Portfolio Deep Dive (75min) - Design impact, methodology, leadership examples
|
||||
2. Design System Challenge (90min) - Systems thinking, scalability, consistency
|
||||
3. Research & Strategy (60min) - User research methods, data-driven design decisions
|
||||
4. Leadership & Mentoring (45min) - Design team leadership, process improvement
|
||||
5. Business & Strategy (30min) - Design's business impact, stakeholder management
|
||||
|
||||
### Competency Matrix Development
|
||||
|
||||
#### Technical Competencies
|
||||
|
||||
**Software Engineering**
|
||||
- **Coding Proficiency:** Algorithm design, data structures, language expertise
|
||||
- **System Design:** Architecture patterns, scalability, performance optimization
|
||||
- **Testing & Quality:** Unit testing, integration testing, code review practices
|
||||
- **DevOps & Tools:** CI/CD, monitoring, debugging, development workflows
|
||||
|
||||
**Data Science & Analytics**
|
||||
- **Statistical Analysis:** Statistical methods, hypothesis testing, experimental design
|
||||
- **Machine Learning:** Algorithm selection, model evaluation, feature engineering
|
||||
- **Data Engineering:** ETL processes, data pipeline design, data quality
|
||||
- **Business Intelligence:** Metrics design, dashboard creation, stakeholder communication
|
||||
|
||||
**Product Management**
|
||||
- **Product Strategy:** Market analysis, competitive research, roadmap planning
|
||||
- **User Research:** User interviews, usability testing, persona development
|
||||
- **Data Analysis:** Metrics interpretation, A/B testing, cohort analysis
|
||||
- **Technical Understanding:** API design, database concepts, system architecture
|
||||
|
||||
#### Behavioral Competencies
|
||||
|
||||
**Leadership & Influence**
|
||||
- **Team Building:** Hiring, onboarding, team culture development
|
||||
- **Mentoring & Coaching:** Skill development, career guidance, feedback delivery
|
||||
- **Strategic Thinking:** Long-term planning, vision setting, decision-making frameworks
|
||||
- **Change Management:** Process improvement, organizational change, resistance handling
|
||||
|
||||
**Communication & Collaboration**
|
||||
- **Stakeholder Management:** Expectation setting, conflict resolution, alignment building
|
||||
- **Cross-Functional Partnership:** Engineering-Product-Design collaboration
|
||||
- **Presentation Skills:** Technical communication, executive briefings, documentation
|
||||
- **Active Listening:** Empathy, question asking, perspective taking
|
||||
|
||||
**Problem-Solving & Innovation**
|
||||
- **Analytical Thinking:** Problem decomposition, root cause analysis, hypothesis formation
|
||||
- **Creative Problem-Solving:** Alternative solution generation, constraint navigation
|
||||
- **Learning Agility:** Skill acquisition, adaptation to change, knowledge transfer
|
||||
- **Risk Assessment:** Uncertainty navigation, trade-off analysis, mitigation planning
|
||||
|
||||
### Question Bank Creation
|
||||
|
||||
#### Technical Questions by Level
|
||||
|
||||
**Junior Level Questions**
|
||||
- **Coding:** "Implement a function to find the second largest element in an array"
|
||||
- **System Design:** "How would you design a simple URL shortener for 1000 users?"
|
||||
- **Debugging:** "Walk through how you would debug a slow-loading web page"
|
||||
|
||||
**Senior Level Questions**
|
||||
- **Architecture:** "Design a real-time chat system supporting 1M concurrent users"
|
||||
- **Leadership:** "Describe how you would onboard a new team member in your area"
|
||||
- **Trade-offs:** "Compare microservices vs monolith for a rapidly scaling startup"
|
||||
|
||||
**Staff+ Level Questions**
|
||||
- **Strategy:** "How would you evaluate and introduce a new programming language to the organization?"
|
||||
- **Influence:** "Describe a time you drove technical consensus across multiple teams"
|
||||
- **Vision:** "How do you balance technical debt against feature development?"
|
||||
|
||||
#### Behavioral Questions Framework
|
||||
|
||||
**STAR Method Implementation**
|
||||
- **Situation:** Context and background of the scenario
|
||||
- **Task:** Specific challenge or goal that needed to be addressed
|
||||
- **Action:** Concrete steps taken to address the challenge
|
||||
- **Result:** Measurable outcomes and lessons learned
|
||||
|
||||
**Sample Questions:**
|
||||
- "Tell me about a time you had to influence a decision without formal authority"
|
||||
- "Describe a situation where you had to deliver difficult feedback to a colleague"
|
||||
- "Give an example of when you had to adapt your communication style for different audiences"
|
||||
- "Walk me through a time when you had to make a decision with incomplete information"
|
||||
|
||||
### Bias Mitigation Framework
|
||||
|
||||
#### Structural Bias Prevention
|
||||
|
||||
**Interview Panel Composition**
|
||||
- Diverse interviewer panels (gender, ethnicity, experience level)
|
||||
- Rotating panel assignments to prevent pattern bias
|
||||
- Anonymous resume screening for initial phone screens
|
||||
- Standardized question sets to ensure consistency
|
||||
|
||||
**Process Standardization**
|
||||
- Structured interview guides with required probing questions
|
||||
- Consistent time allocation across all candidates
|
||||
- Standardized evaluation criteria and scoring rubrics
|
||||
- Required justification for all scoring decisions
|
||||
|
||||
#### Cognitive Bias Recognition
|
||||
|
||||
**Common Interview Biases**
|
||||
- **Halo Effect:** One strong impression influences overall assessment
|
||||
- **Confirmation Bias:** Seeking information that confirms initial impressions
|
||||
- **Similarity Bias:** Favoring candidates with similar backgrounds/experiences
|
||||
- **Contrast Effect:** Comparing candidates against each other rather than against a consistent standard
|
||||
- **Anchoring Bias:** Over-relying on first piece of information received
|
||||
|
||||
**Mitigation Strategies**
|
||||
- Pre-interview bias awareness training for all interviewers
|
||||
- Structured debrief sessions with independent score recording
|
||||
- Regular calibration sessions with example candidate discussions
|
||||
- Statistical monitoring of scoring patterns by interviewer and demographic
|
||||
|
||||
### Hiring Bar Calibration
|
||||
|
||||
#### Calibration Methodology
|
||||
|
||||
**Regular Calibration Sessions**
|
||||
- Monthly interviewer calibration meetings
|
||||
- Shadow interviewing for new interviewers (minimum 5 sessions)
|
||||
- Quarterly cross-team calibration reviews
|
||||
- Annual hiring bar review and adjustment process
|
||||
|
||||
**Performance Tracking**
|
||||
- New hire performance correlation with interview scores
|
||||
- Interviewer accuracy tracking (prediction vs actual performance)
|
||||
- False positive/negative analysis
|
||||
- Offer acceptance rate analysis by interviewer
|
||||
|
||||
**Feedback Loops**
|
||||
- Six-month new hire performance reviews
|
||||
- Manager feedback on interview process effectiveness
|
||||
- Candidate experience surveys and feedback integration
|
||||
- Continuous process improvement based on data analysis
|
||||
|
||||
---
|
||||
|
||||
## Competency Frameworks
|
||||
|
||||
### Engineering Competency Levels
|
||||
|
||||
#### Level 1-2: Individual Contributor (Junior/Mid)
|
||||
- **Technical Skills:** Language proficiency, testing basics, code review participation
|
||||
- **Problem Solving:** Structured approach to debugging, logical thinking
|
||||
- **Communication:** Clear status updates, effective question asking
|
||||
- **Learning:** Proactive skill development, mentorship seeking
|
||||
|
||||
#### Level 3-4: Senior Individual Contributor
|
||||
- **Technical Leadership:** Architecture decisions, code quality advocacy
|
||||
- **Mentoring:** Junior developer guidance, knowledge sharing
|
||||
- **Project Ownership:** End-to-end feature delivery, stakeholder communication
|
||||
- **Innovation:** Process improvement, technology evaluation
|
||||
|
||||
#### Level 5-6: Staff+ Engineer
|
||||
- **Organizational Impact:** Cross-team technical leadership, strategic planning
|
||||
- **Technical Vision:** Long-term architectural planning, technology roadmap
|
||||
- **People Development:** Team growth, hiring contribution, culture building
|
||||
- **External Influence:** Industry contribution, thought leadership
|
||||
|
||||
### Product Management Competency Levels
|
||||
|
||||
#### Level 1-2: Associate/Product Manager
|
||||
- **Product Execution:** Feature specification, requirements gathering
|
||||
- **User Focus:** User research participation, feedback collection
|
||||
- **Data Analysis:** Basic metrics analysis, experiment interpretation
|
||||
- **Stakeholder Management:** Cross-functional collaboration, communication
|
||||
|
||||
#### Level 3-4: Senior Product Manager
|
||||
- **Strategic Thinking:** Market analysis, competitive positioning
|
||||
- **Leadership:** Cross-functional team leadership, decision making
|
||||
- **Business Impact:** Revenue impact, market share growth
|
||||
- **Process Innovation:** Product development process improvement
|
||||
|
||||
#### Level 5-6: Principal Product Manager
|
||||
- **Vision Setting:** Product strategy, market direction
|
||||
- **Organizational Influence:** Executive communication, team building
|
||||
- **Innovation Leadership:** New market creation, disruptive thinking
|
||||
- **Talent Development:** PM team growth, hiring leadership
|
||||
|
||||
---
|
||||
|
||||
## Scoring & Calibration
|
||||
|
||||
### Scoring Rubric Framework
|
||||
|
||||
#### 4-Point Scoring Scale
|
||||
- **4 - Exceeds Expectations:** Demonstrates mastery beyond required level
|
||||
- **3 - Meets Expectations:** Solid performance meeting all requirements
|
||||
- **2 - Partially Meets:** Shows potential but has development areas
|
||||
- **1 - Does Not Meet:** Significant gaps in required competencies
|
||||
|
||||
#### Competency-Specific Scoring
|
||||
|
||||
**Technical Competencies**
|
||||
- Code Quality (4): Clean, maintainable, well-tested code with excellent documentation
|
||||
- Code Quality (3): Functional code with good structure and basic testing
|
||||
- Code Quality (2): Working code with some structural issues or missing tests
|
||||
- Code Quality (1): Non-functional or poorly structured code with significant issues
|
||||
|
||||
**Leadership Competencies**
|
||||
- Team Influence (4): Drives team success, develops others, creates lasting positive change
|
||||
- Team Influence (3): Contributes positively to team dynamics and outcomes
|
||||
- Team Influence (2): Shows leadership potential with some effective examples
|
||||
- Team Influence (1): Limited evidence of leadership ability or negative team impact
|
||||
|
||||
### Calibration Standards
|
||||
|
||||
#### Statistical Benchmarks
|
||||
- Target score distribution: 20% (4s), 40% (3s), 30% (2s), 10% (1s)
|
||||
- Interviewer consistency target: <0.5 standard deviation from team average
|
||||
- Pass rate target: 15-25% for most roles (varies by level and market conditions)
|
||||
- Time to hire target: 2-3 weeks from first interview to offer
|
||||
|
||||
#### Quality Metrics
|
||||
- New hire 6-month performance correlation: >0.6 with interview scores
|
||||
- Interviewer agreement rate: >80% within 1 point on final recommendations
|
||||
- Candidate experience satisfaction: >4.0/5.0 average rating
|
||||
- Offer acceptance rate: >85% for preferred candidates
|
||||
|
||||
---
|
||||
|
||||
## Reference Documentation
|
||||
|
||||
### Interview Templates
|
||||
- Role-specific interview guides and question banks
|
||||
- Scorecard templates for consistent evaluation
|
||||
- Debrief facilitation guides for effective team discussions
|
||||
|
||||
### Bias Mitigation Resources
|
||||
- Unconscious bias training materials and exercises
|
||||
- Structured interviewing best practices checklist
|
||||
- Demographic diversity tracking and reporting templates
|
||||
|
||||
### Calibration Tools
|
||||
- Interview performance correlation analysis templates
|
||||
- Interviewer coaching and development frameworks
|
||||
- Hiring pipeline metrics and dashboard specifications
|
||||
|
||||
---
|
||||
|
||||
## Industry Standards
|
||||
|
||||
### Best Practices Integration
|
||||
- Google's structured interviewing methodology
|
||||
- Amazon's Leadership Principles assessment framework
|
||||
- Microsoft's competency-based evaluation system
|
||||
- Netflix's culture fit assessment approach
|
||||
|
||||
### Compliance & Legal Considerations
|
||||
- EEOC compliance requirements and documentation
|
||||
- ADA accommodation procedures and guidelines
|
||||
- International hiring law considerations
|
||||
- Privacy and data protection requirements (GDPR, CCPA)
|
||||
|
||||
### Continuous Improvement Framework
|
||||
- Regular process auditing and refinement cycles
|
||||
- Industry benchmarking and comparative analysis
|
||||
- Technology integration for interview optimization
|
||||
- Candidate experience enhancement initiatives
|
||||
|
||||
This comprehensive interview system design framework provides the structure and tools necessary to build fair, effective, and scalable hiring processes that consistently identify top talent while minimizing bias and maximizing candidate experience.
|
||||
1. Keep round objectives explicit and non-overlapping.
|
||||
2. Require evidence for each score recommendation.
|
||||
3. Use the same baseline rubric across comparable roles.
|
||||
4. Revisit loop design based on quality-of-hire outcomes.
|
||||
|
||||
@@ -0,0 +1,46 @@
|
||||
# Interview Frameworks
|
||||
|
||||
## Loop Design by Level
|
||||
|
||||
### Junior/Mid
|
||||
|
||||
- Emphasize fundamentals, debugging, and growth potential.
|
||||
- Keep loops concise with coding + behavioral validation.
|
||||
|
||||
### Senior
|
||||
|
||||
- Add system design and leadership rounds.
|
||||
- Evaluate tradeoff quality, mentoring, and cross-team collaboration.
|
||||
|
||||
### Staff+
|
||||
|
||||
- Focus on architecture direction and organizational impact.
|
||||
- Assess strategy, influence, and long-term technical judgment.
|
||||
|
||||
## Competency Areas
|
||||
|
||||
- Technical depth (implementation, design, quality)
|
||||
- Problem solving (ambiguity handling, prioritization)
|
||||
- Collaboration (communication, stakeholder alignment)
|
||||
- Leadership (ownership, mentoring, influence)
|
||||
|
||||
## Scoring Rubric Baseline
|
||||
|
||||
- `4`: exceeds level expectations with strong evidence
|
||||
- `3`: meets expectations consistently
|
||||
- `2`: partial signal with notable gaps
|
||||
- `1`: does not meet baseline requirements
|
||||
|
||||
## Calibration Guidelines
|
||||
|
||||
- Run recurring interviewer calibration sessions.
|
||||
- Compare interviewer scoring variance across rounds.
|
||||
- Track interview signal against new-hire outcomes.
|
||||
- Use structured debriefs with independent scoring before discussion.
|
||||
|
||||
## Bias-Reduction Baseline
|
||||
|
||||
- Standardize question banks per competency area.
|
||||
- Keep scorecards evidence-based and behavior-specific.
|
||||
- Use diverse interviewer panels where possible.
|
||||
- Require written rationale for strong yes/no recommendations.
|
||||
133
engineering/interview-system-designer/scripts/interview_planner.py
Executable file
133
engineering/interview-system-designer/scripts/interview_planner.py
Executable file
@@ -0,0 +1,133 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Generate an interview loop plan by role and level."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
from typing import Dict, List
|
||||
|
||||
# Interview loop template for each supported seniority level.
# Every round is a (name, duration in minutes, focus) tuple; list order is the
# order in which generate_plan() numbers the rounds.
BASE_ROUNDS = {
    "junior": [
        ("Screen", 45, "Fundamentals and communication"),
        ("Coding", 60, "Problem solving and code quality"),
        ("Behavioral", 45, "Collaboration and growth mindset"),
    ],
    "mid": [
        ("Screen", 45, "Fundamentals and ownership"),
        ("Coding", 60, "Implementation quality"),
        ("System Design", 60, "Service/component design"),
        ("Behavioral", 45, "Stakeholder collaboration"),
    ],
    "senior": [
        ("Screen", 45, "Depth and tradeoff reasoning"),
        ("Coding", 60, "Code quality and testing"),
        ("System Design", 75, "Scalability and reliability"),
        ("Leadership", 60, "Mentoring and decision making"),
        ("Behavioral", 45, "Cross-functional influence"),
    ],
    "staff": [
        ("Screen", 45, "Strategic and technical depth"),
        ("Architecture", 90, "Org-level design decisions"),
        ("Technical Strategy", 60, "Long-term tradeoffs"),
        ("Influence", 60, "Cross-team leadership"),
        ("Behavioral", 45, "Values and executive communication"),
    ],
}
|
||||
|
||||
# Suggested interview questions, keyed by the category that
# suggested_questions() selects from a round name.
QUESTION_BANK = {
    "coding": [
        "Walk through your approach before coding and identify tradeoffs.",
        "How would you test this implementation for edge cases?",
        "What would you refactor if this code became a shared library?",
    ],
    "system": [
        "Design this system for 10x traffic growth in 12 months.",
        "Where are the main failure modes and how would you detect them?",
        "What components would you scale first and why?",
    ],
    "leadership": [
        "Describe a time you changed technical direction with incomplete information.",
        "How do you raise the bar for code quality across a team?",
        "How do you handle disagreement between product and engineering priorities?",
    ],
    "behavioral": [
        "Tell me about a high-stakes mistake and what changed afterward.",
        "Describe a conflict where you had to influence without authority.",
        "How do you support underperforming teammates?",
    ],
}
|
||||
|
||||
|
||||
def normalize_level(level: str) -> str:
    """Map a user-supplied level onto one of the BASE_ROUNDS keys.

    The input is trimmed and lower-cased first. "staff+", "principal" and
    "lead" are accepted as aliases for "staff".

    Raises:
        ValueError: if the cleaned value is neither an alias nor a key of
            BASE_ROUNDS.
    """
    cleaned = level.strip().lower()
    staff_aliases = ("staff+", "principal", "lead")
    if cleaned in staff_aliases:
        return "staff"
    if cleaned in BASE_ROUNDS:
        return cleaned
    raise ValueError(f"Unsupported level: {cleaned}")
|
||||
|
||||
|
||||
def suggested_questions(round_name: str) -> List[str]:
    """Pick the QUESTION_BANK list that matches a round name.

    Matching is a case-insensitive substring test, tried in priority order:
    "coding", then "system"/"architecture", then "lead"/"influence"/
    "strategy". Any other name falls back to the behavioral questions.
    The returned list is the QUESTION_BANK entry itself, not a copy.
    """
    lowered = round_name.lower()
    # (keywords, bank key) pairs; the first pair with a matching keyword wins.
    routing = (
        (("coding",), "coding"),
        (("system", "architecture"), "system"),
        (("lead", "influence", "strategy"), "leadership"),
    )
    for keywords, bank_key in routing:
        if any(keyword in lowered for keyword in keywords):
            return QUESTION_BANK[bank_key]
    return QUESTION_BANK["behavioral"]
|
||||
|
||||
|
||||
def generate_plan(role: str, level: str) -> Dict[str, object]:
    """Build the interview plan for *role* at *level*.

    Returns a dict with the role as given, the normalized level, the number
    of rounds, the summed duration in minutes, and one dict per round
    (1-based "round" number, name, duration, focus, suggested questions).

    Raises:
        ValueError: propagated from normalize_level() for unknown levels.
    """
    normalized = normalize_level(level)
    rounds = [
        {
            "round": number,
            "name": round_name,
            "duration_minutes": duration,
            "focus": focus_area,
            "suggested_questions": suggested_questions(round_name),
        }
        for number, (round_name, duration, focus_area) in enumerate(BASE_ROUNDS[normalized], start=1)
    ]
    total_minutes = sum(entry["duration_minutes"] for entry in rounds)
    return {
        "role": role,
        "level": normalized,
        "total_rounds": len(rounds),
        "total_minutes": total_minutes,
        "rounds": rounds,
    }
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse the command line: required --role and --level, optional --json."""
    cli = argparse.ArgumentParser(description="Generate an interview loop plan for a role and level.")
    required_options = (
        ("--role", "Role name (e.g., Senior Software Engineer)"),
        ("--level", "Level: junior|mid|senior|staff"),
    )
    for flag, help_text in required_options:
        cli.add_argument(flag, required=True, help=help_text)
    cli.add_argument("--json", action="store_true", help="Output as JSON")
    return cli.parse_args()
|
||||
|
||||
|
||||
def main() -> int:
    """CLI entry point: build the plan and print it as JSON or plain text.

    Returns:
        0 on success.

    Raises:
        SystemExit: with a one-line message when --level is not supported.
    """
    args = parse_args()
    try:
        plan = generate_plan(args.role, args.level)
    except ValueError as exc:
        # An unknown --level is a user input error; report it as a CLI
        # error on stderr instead of letting a traceback escape.
        raise SystemExit(f"error: {exc}") from None

    if args.json:
        print(json.dumps(plan, indent=2))
        return 0

    print(f"Interview Plan: {plan['role']} ({plan['level']})")
    print(f"Total rounds: {plan['total_rounds']} | Total time: {plan['total_minutes']} minutes")
    print("")
    for r in plan["rounds"]:
        print(f"Round {r['round']}: {r['name']} ({r['duration_minutes']} min)")
        print(f"Focus: {r['focus']}")
        for q in r["suggested_questions"]:
            print(f"- {q}")
        print("")

    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
||||
@@ -62,6 +62,15 @@ Most modern setups: **pnpm workspaces + Turborepo + Changesets**
|
||||
## Turborepo
|
||||
→ See references/monorepo-tooling-reference.md for details
|
||||
|
||||
## Workspace Analyzer
|
||||
|
||||
```bash
|
||||
python3 scripts/monorepo_analyzer.py /path/to/monorepo
|
||||
python3 scripts/monorepo_analyzer.py /path/to/monorepo --json
|
||||
```
|
||||
|
||||
Also see `references/monorepo-patterns.md` for common architecture and CI patterns.
|
||||
|
||||
## Common Pitfalls
|
||||
|
||||
| Pitfall | Fix |
|
||||
|
||||
@@ -0,0 +1,37 @@
|
||||
# Monorepo Patterns
|
||||
|
||||
## Common Layouts
|
||||
|
||||
### apps + packages
|
||||
|
||||
- `apps/*`: deployable applications
|
||||
- `packages/*`: shared libraries, UI kits, utilities
|
||||
- `tooling/*`: lint/build config packages
|
||||
|
||||
### domains + shared
|
||||
|
||||
- `domains/*`: bounded-context product areas
|
||||
- `shared/*`: cross-domain code with strict API contracts
|
||||
|
||||
### service monorepo
|
||||
|
||||
- `services/*`: backend services
|
||||
- `libs/*`: shared service contracts and SDKs
|
||||
|
||||
## Dependency Rules
|
||||
|
||||
- Prefer one-way dependencies from apps/services to packages/libs.
|
||||
- Keep cross-app imports disallowed unless explicitly approved.
|
||||
- Keep `types` packages runtime-free to avoid unexpected coupling.
|
||||
|
||||
## Build/CI Patterns
|
||||
|
||||
- Use affected-only CI (`--filter` or equivalent).
|
||||
- Enable remote cache for build and test tasks.
|
||||
- Split lint/typecheck/test tasks to isolate failures quickly.
|
||||
|
||||
## Release Patterns
|
||||
|
||||
- Use Changesets or equivalent for versioning.
|
||||
- Keep package publishing automated and reproducible.
|
||||
- Use prerelease channels for unstable shared package changes.
|
||||
168
engineering/monorepo-navigator/scripts/monorepo_analyzer.py
Executable file
168
engineering/monorepo-navigator/scripts/monorepo_analyzer.py
Executable file
@@ -0,0 +1,168 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Detect monorepo tooling, workspaces, and internal dependency graph."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import glob
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Set
|
||||
|
||||
|
||||
def load_json(path: Path) -> Dict:
    """Read *path* as UTF-8 JSON and return its top-level object.

    Best-effort by design: returns an empty dict when the file cannot be
    read, is not valid UTF-8/JSON, or holds a top-level value that is not a
    JSON object. Callers can therefore use `.get()` and `in` on the result
    without further checks.
    """
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError, RecursionError):
        # OSError: missing or unreadable file. ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError. RecursionError can be
        # raised for pathologically nested documents.
        return {}
    return data if isinstance(data, dict) else {}
|
||||
|
||||
|
||||
def detect_repo_type(root: Path) -> List[str]:
    """List the monorepo tools whose marker files exist directly in *root*.

    Checks, in this order, turbo.json, nx.json, pnpm-workspace.yaml and
    lerna.json, then appends "npm-workspaces" when the root package.json has
    a "workspaces" key.
    """
    marker_files = (
        ("turbo.json", "Turborepo"),
        ("nx.json", "Nx"),
        ("pnpm-workspace.yaml", "pnpm-workspaces"),
        ("lerna.json", "Lerna"),
    )
    found: List[str] = [label for filename, label in marker_files if (root / filename).exists()]

    manifest = load_json(root / "package.json")
    if "workspaces" in manifest:
        found.append("npm-workspaces")
    return found
|
||||
|
||||
|
||||
def parse_pnpm_workspace(root: Path) -> List[str]:
    """Extract the `packages:` entries from pnpm-workspace.yaml in *root*.

    This is a line-based scan, not a YAML parser: after a line starting with
    "packages:", every "- item" line contributes one pattern (surrounding
    quotes removed) until a non-blank, non-comment, non-list line is seen.
    Returns an empty list when the file does not exist.
    """
    manifest = root / "pnpm-workspace.yaml"
    if not manifest.exists():
        return []

    globs: List[str] = []
    inside_packages = False
    for raw_line in manifest.read_text(encoding="utf-8", errors="ignore").splitlines():
        entry = raw_line.strip()
        if entry.startswith("packages:"):
            inside_packages = True
        elif not inside_packages or not entry or entry.startswith("#"):
            # Outside the list, or a blank/comment line inside it: no effect.
            continue
        elif entry.startswith("-"):
            value = entry[1:].strip().strip('"').strip("'")
            if value:
                globs.append(value)
        else:
            # Any other content ends the packages list.
            inside_packages = False
    return globs
|
||||
|
||||
|
||||
def parse_package_workspaces(root: Path) -> List[str]:
    """Return the workspace patterns declared in *root*/package.json.

    Accepts both forms of "workspaces": a plain list, or an object whose
    "packages" value is a list. Anything else yields an empty list. Items
    are converted with str().
    """
    declared = load_json(root / "package.json").get("workspaces")
    if isinstance(declared, dict):
        # Object form: the patterns live under "packages".
        declared = declared.get("packages")
    if not isinstance(declared, list):
        return []
    return [str(entry) for entry in declared]
|
||||
|
||||
|
||||
def expand_workspace_patterns(root: Path, patterns: List[str]) -> List[Path]:
    """Expand workspace glob patterns into workspace directories.

    A match counts as a workspace when it is a directory containing a
    package.json. `**` matches nested directories at any depth; matches that
    sit below a `node_modules` directory (relative to *root*) are skipped so
    installed dependencies are not reported as workspaces.

    Returns:
        The resolved, de-duplicated workspace paths in sorted order.
    """
    found: Set[Path] = set()
    # Escape the root so glob metacharacters in the checkout path ("[", "*")
    # are taken literally; only the workspace pattern is a glob.
    base = glob.escape(str(root))
    root_depth = len(root.parts)
    for pattern in patterns:
        # recursive=True is required for "**" to span directory levels;
        # without it glob treats "**" exactly like "*".
        for match in glob.glob(os.path.join(base, pattern), recursive=True):
            candidate = Path(match)
            if "node_modules" in candidate.parts[root_depth:]:
                continue
            if candidate.is_dir() and (candidate / "package.json").exists():
                found.add(candidate.resolve())
    return sorted(found)
|
||||
|
||||
|
||||
def load_workspace_packages(workspaces: List[Path]) -> Dict[str, Dict]:
    """Index workspace manifests by package name.

    Each value holds the workspace path (as a string) and the manifest's
    dependencies, devDependencies and peerDependencies (an empty dict when a
    section is absent). A workspace without a "name" is keyed by its
    directory name. A later workspace with the same name replaces an earlier
    one.
    """
    sections = ("dependencies", "devDependencies", "peerDependencies")
    registry: Dict[str, Dict] = {}
    for workspace_dir in workspaces:
        manifest = load_json(workspace_dir / "package.json")
        entry: Dict = {"path": str(workspace_dir)}
        for section in sections:
            entry[section] = manifest.get(section, {})
        registry[manifest.get("name") or workspace_dir.name] = entry
    return registry
|
||||
|
||||
|
||||
def build_dependency_graph(packages: Dict[str, Dict]) -> Dict[str, List[str]]:
    """Map each package to the sorted names of the workspace packages it uses.

    A dependency counts as internal when its name is itself a key of
    *packages*. All three sections (dependencies, devDependencies,
    peerDependencies) are considered; a section that is not a dict is
    ignored.
    """
    internal_names = set(packages)
    sections = ("dependencies", "devDependencies", "peerDependencies")
    graph: Dict[str, List[str]] = {}
    for pkg_name, info in packages.items():
        internal: Set[str] = set()
        for dep_map in (info.get(section, {}) for section in sections):
            if not isinstance(dep_map, dict):
                continue
            # Intersect the declared dependency names with the workspace names.
            internal.update(dep for dep in dep_map if dep in internal_names)
        graph[pkg_name] = sorted(internal)
    return graph
|
||||
|
||||
|
||||
def format_tree_paths(root: Path, workspaces: List[Path]) -> List[str]:
    """Render each workspace path relative to *root*, in input order.

    A workspace that does not lie under *root* (for example a symlinked
    workspace whose resolved target is elsewhere) is rendered as its full
    path rather than raising.
    """
    out: List[str] = []
    for ws in workspaces:
        try:
            out.append(str(ws.relative_to(root)))
        except ValueError:
            # relative_to() raises when ws is not inside root.
            out.append(str(ws))
    return out
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse the command line: a positional root path and an optional --json flag."""
    cli = argparse.ArgumentParser(
        description="Analyze monorepo type, workspaces, and internal dependency graph.",
    )
    cli.add_argument("path", help="Monorepo root path")
    cli.add_argument("--json", help="Output JSON", action="store_true")
    parsed = cli.parse_args()
    return parsed
|
||||
|
||||
|
||||
def main() -> int:
    """CLI entry point: analyze the given monorepo and print a report.

    Workspace patterns come from pnpm-workspace.yaml when it yields any,
    otherwise from package.json. Exits with a message when the path is not a
    directory; returns 0 otherwise.
    """
    args = parse_args()
    root = Path(args.path).expanduser().resolve()
    if not (root.exists() and root.is_dir()):
        raise SystemExit(f"Path is not a directory: {root}")

    types = detect_repo_type(root)
    patterns = parse_pnpm_workspace(root) or parse_package_workspaces(root)

    workspaces = expand_workspace_patterns(root, patterns)
    packages = load_workspace_packages(workspaces)
    graph = build_dependency_graph(packages)

    report = {
        "root": str(root),
        "detected_types": types,
        "workspace_patterns": patterns,
        "workspace_paths": format_tree_paths(root, workspaces),
        "package_count": len(packages),
        "dependency_graph": graph,
    }

    if args.json:
        print(json.dumps(report, indent=2))
        return 0

    detected_text = ", ".join(types) if types else "none"
    patterns_text = ", ".join(patterns) if patterns else "none"
    print("Monorepo Analysis")
    print(f"Root: {report['root']}")
    print(f"Detected: {detected_text}")
    print(f"Workspace patterns: {patterns_text}")
    print("")

    print("Workspaces")
    workspace_paths = report["workspace_paths"]
    for ws in workspace_paths:
        print(f"- {ws}")
    if not workspace_paths:
        print("- none detected")
    print("")

    print("Internal dependency graph")
    for pkg, deps in graph.items():
        deps_text = ", ".join(deps) if deps else "(no internal deps)"
        print(f"- {pkg} -> {deps_text}")

    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
||||
@@ -37,6 +37,21 @@ Systematic performance profiling for Node.js, Python, and Go applications. Ident
|
||||
|
||||
---
|
||||
|
||||
## Quick Start
|
||||
|
||||
```bash
|
||||
# Analyze a project for performance risk indicators
|
||||
python3 scripts/performance_profiler.py /path/to/project
|
||||
|
||||
# JSON output for CI integration
|
||||
python3 scripts/performance_profiler.py /path/to/project --json
|
||||
|
||||
# Custom large-file threshold
|
||||
python3 scripts/performance_profiler.py /path/to/project --large-file-threshold-kb 256
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Golden Rule: Measure First
|
||||
|
||||
```bash
|
||||
|
||||
192
engineering/performance-profiler/scripts/performance_profiler.py
Executable file
192
engineering/performance-profiler/scripts/performance_profiler.py
Executable file
@@ -0,0 +1,192 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Lightweight repo performance profiling helper (stdlib only)."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterable, List, Tuple
|
||||
|
||||
# Multiplier applied to a file's size (in KB) by bundle_indicators(), keyed by
# lower-case file extension. Extensions not listed here are not counted.
EXT_WEIGHTS = {
    **dict.fromkeys((".js", ".jsx", ".ts", ".tsx"), 1.0),
    ".css": 0.7,
    ".map": 2.0,
}
|
||||
|
||||
|
||||
def iter_files(root: Path) -> Iterable[Path]:
    """Yield every file under *root*, skipping common vendor/build directories.

    Directories named .git, node_modules, .next, dist, build, coverage or
    __pycache__ are pruned from the walk at any depth. Only entries for
    which Path.is_file() is true are yielded.
    """
    pruned = {".git", "node_modules", ".next", "dist", "build", "coverage", "__pycache__"}
    for current_dir, subdirs, names in os.walk(root):
        # Editing the list in place is how os.walk is told not to descend.
        subdirs[:] = [name for name in subdirs if name not in pruned]
        base = Path(current_dir)
        yield from (
            candidate
            for candidate in (base / name for name in names)
            if candidate.is_file()
        )
|
||||
|
||||
|
||||
def get_large_files(root: Path, threshold_bytes: int) -> List[Tuple[str, int]]:
    """Find files of at least *threshold_bytes* bytes under *root*.

    Returns:
        (path relative to root, size in bytes) pairs, largest first.
    """
    sized = ((path, path.stat().st_size) for path in iter_files(root))
    hits: List[Tuple[str, int]] = [
        (str(path.relative_to(root)), size)
        for path, size in sized
        if size >= threshold_bytes
    ]
    hits.sort(key=lambda pair: pair[1], reverse=True)
    return hits
|
||||
|
||||
|
||||
def count_dependencies(root: Path) -> Dict[str, int]:
    """Count declared dependencies in the manifests directly under *root*.

    Looks at package.json (dependencies + devDependencies),
    requirements.txt and go.mod. A manifest that is absent, or a
    package.json that cannot be read or parsed, contributes 0.

    Returns:
        A dict with the keys "node_dependencies", "python_dependencies" and
        "go_dependencies".
    """
    counts = {"node_dependencies": 0, "python_dependencies": 0, "go_dependencies": 0}

    package_json = root / "package.json"
    if package_json.exists():
        try:
            data = json.loads(package_json.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            # Unreadable or malformed manifest: keep the count at 0.
            # ValueError covers JSONDecodeError and UnicodeDecodeError.
            data = {}
        if isinstance(data, dict):
            # Count each section on its own so a missing or null section does
            # not discard the other one.
            counts["node_dependencies"] = sum(
                len(section)
                for section in (data.get("dependencies"), data.get("devDependencies"))
                if isinstance(section, dict)
            )

    requirements = root / "requirements.txt"
    if requirements.exists():
        python_count = 0
        for ln in requirements.read_text(encoding="utf-8", errors="ignore").splitlines():
            s = ln.strip()
            if not s or s.startswith("#"):
                continue
            # Lines starting with "-" are pip options (-r, -c, --index-url,
            # ...), not requirements. The exception is -e/--editable, which
            # installs a package.
            if s.startswith("-") and not s.startswith(("-e", "--editable")):
                continue
            python_count += 1
        counts["python_dependencies"] = python_count

    go_mod = root / "go.mod"
    if go_mod.exists():
        in_require_block = False
        go_count = 0
        for ln in go_mod.read_text(encoding="utf-8", errors="ignore").splitlines():
            s = ln.strip()
            if s.startswith("require ("):
                in_require_block = True
                continue
            if in_require_block and s == ")":
                in_require_block = False
                continue
            if in_require_block and s and not s.startswith("//"):
                # One module per non-comment line inside "require ( ... )".
                go_count += 1
            elif s.startswith("require ") and not s.endswith("("):
                # Single-line form: "require module version".
                go_count += 1
        counts["go_dependencies"] = go_count

    return counts
|
||||
|
||||
|
||||
def bundle_indicators(root: Path) -> Dict[str, object]:
    """Summarize front-end bundle signals for the project at *root*.

    Returns a dict with:
      - "build_dirs_present": which of dist, build, .next, out exist in root
      - "bundle_like_files": number of files from iter_files() whose
        extension is listed in EXT_WEIGHTS
      - "estimated_bundle_weight": sum of size-in-KB times the extension
        weight over those files, rounded to 2 decimals
    """
    present = [name for name in ("dist", "build", ".next", "out") if (root / name).exists()]

    file_count = 0
    weighted_kb = 0.0
    for candidate in iter_files(root):
        factor = EXT_WEIGHTS.get(candidate.suffix.lower())
        if factor is None:
            continue
        file_count += 1
        weighted_kb += candidate.stat().st_size / 1024.0 * factor

    return {
        "build_dirs_present": present,
        "bundle_like_files": file_count,
        "estimated_bundle_weight": round(weighted_kb, 2),
    }
|
||||
|
||||
|
||||
def format_size(num_bytes: int) -> str:
    """Format a byte count with one decimal and a B/KB/MB/GB suffix.

    The value is divided by 1024 until it drops below 1024 or the GB unit is
    reached; larger values stay in GB.
    """
    remaining = float(num_bytes)
    for label in ("B", "KB", "MB"):
        if remaining < 1024.0:
            return f"{remaining:.1f}{label}"
        remaining /= 1024.0
    # GB is the largest unit, so it is used no matter how big the value is.
    return f"{remaining:.1f}GB"
|
||||
|
||||
|
||||
def build_report(root: Path, threshold_bytes: int) -> Dict[str, object]:
    """Assemble the full report for *root*.

    Combines the large-file scan (using *threshold_bytes*), the dependency
    counts and the bundle indicators with the root path and threshold.
    """
    report: Dict[str, object] = {
        "root": str(root),
        "large_file_threshold_bytes": threshold_bytes,
    }
    report["large_files"] = get_large_files(root, threshold_bytes)
    report["dependency_counts"] = count_dependencies(root)
    report["bundle_indicators"] = bundle_indicators(root)
    return report
|
||||
|
||||
|
||||
def print_text(report: Dict[str, object]) -> None:
    """Print *report* (as produced by build_report) in plain text.

    Sections appear in this order: header, dependency counts, bundle
    indicators, then at most the first 20 large files.
    """
    threshold_text = format_size(int(report["large_file_threshold_bytes"]))
    print("Performance Profile Report")
    print(f"Root: {report['root']}")
    print(f"Large-file threshold: {threshold_text}")
    print("")

    dep_counts = report["dependency_counts"]
    print("Dependency Counts")
    for label, key in (("Node", "node_dependencies"), ("Python", "python_dependencies"), ("Go", "go_dependencies")):
        print(f"- {label}: {dep_counts[key]}")
    print("")

    bundle = report["bundle_indicators"]
    build_dirs_text = ", ".join(bundle["build_dirs_present"]) or "none"
    print("Bundle Indicators")
    print(f"- Build directories present: {build_dirs_text}")
    print(f"- Bundle-like files: {bundle['bundle_like_files']}")
    print(f"- Estimated weighted bundle size: {bundle['estimated_bundle_weight']} KB")
    print("")

    print("Large Files")
    large_files = report["large_files"]
    if large_files:
        # Cap the listing so a large repository does not flood the terminal.
        for rel_path, size in large_files[:20]:
            print(f"- {rel_path}: {format_size(size)}")
    else:
        print("- None above threshold")
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse the command line.

    Options: a positional directory, --large-file-threshold-kb (int,
    default 512) and the --json flag.
    """
    cli = argparse.ArgumentParser(description="Analyze a project directory for common performance risk indicators.")
    cli.add_argument("path", help="Directory to analyze")
    threshold_option = {
        "type": int,
        "default": 512,
        "help": "Threshold in KB for reporting large files (default: 512)",
    }
    cli.add_argument("--large-file-threshold-kb", **threshold_option)
    cli.add_argument("--json", action="store_true", help="Print JSON output instead of text")
    return cli.parse_args()
|
||||
|
||||
|
||||
def main() -> int:
    """CLI entry point: profile the given directory and print the report.

    Exits with a message when the path is not a directory; returns 0
    otherwise. The threshold is clamped to at least 1 KB.
    """
    args = parse_args()
    root = Path(args.path).expanduser().resolve()
    if not (root.exists() and root.is_dir()):
        raise SystemExit(f"Path is not a directory: {root}")

    threshold_kb = max(1, args.large_file_threshold_kb)
    report = build_report(root, threshold_kb * 1024)

    if not args.json:
        print_text(report)
        return 0
    print(json.dumps(report, indent=2))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
||||
@@ -7,409 +7,70 @@ description: "Runbook Generator"
|
||||
|
||||
**Tier:** POWERFUL
|
||||
**Category:** Engineering
|
||||
**Domain:** DevOps / Site Reliability Engineering
|
||||
**Domain:** DevOps / Site Reliability Engineering
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
Analyze a codebase and generate production-grade operational runbooks. Detects your stack (CI/CD, database, hosting, containers), then produces step-by-step runbooks with copy-paste commands, verification checks, rollback procedures, escalation paths, and time estimates. Keeps runbooks fresh with staleness detection linked to config file modification dates.
|
||||
|
||||
---
|
||||
Generate operational runbooks quickly from a service name, then customize for deployment, incident response, maintenance, and rollback workflows.
|
||||
|
||||
## Core Capabilities
|
||||
|
||||
- **Stack detection** — auto-identify CI/CD, database, hosting, orchestration from repo files
|
||||
- **Runbook types** — deployment, incident response, database maintenance, scaling, monitoring setup
|
||||
- **Format discipline** — numbered steps, copy-paste commands, ✅ verification checks, time estimates
|
||||
- **Escalation paths** — L1 → L2 → L3 with contact info and decision criteria
|
||||
- **Rollback procedures** — every deployment step has a corresponding undo
|
||||
- **Staleness detection** — runbook sections reference config files; flag when source changes
|
||||
- **Testing methodology** — dry-run framework for staging validation, quarterly review cadence
|
||||
- Runbook skeleton generation from a CLI
|
||||
- Standard sections for start/stop/health/rollback
|
||||
- Structured escalation and incident handling placeholders
|
||||
- Reference templates for deployment and incident playbooks
|
||||
|
||||
---
|
||||
|
||||
## When to Use
|
||||
|
||||
Use when:
|
||||
- A codebase has no runbooks and you need to bootstrap them fast
|
||||
- Existing runbooks are outdated or incomplete (point at the repo, regenerate)
|
||||
- Onboarding a new engineer who needs clear operational procedures
|
||||
- Preparing for an incident response drill or audit
|
||||
- Setting up monitoring and on-call rotation from scratch
|
||||
|
||||
Skip when:
|
||||
- The system is too early-stage to have stable operational patterns
|
||||
- Runbooks already exist and only need minor updates (edit directly)
|
||||
- A service has no runbook and needs a baseline immediately
|
||||
- Existing runbooks are inconsistent across teams
|
||||
- On-call onboarding requires standardized operations docs
|
||||
- You need repeatable runbook scaffolding for new services
|
||||
|
||||
---
|
||||
|
||||
## Stack Detection
|
||||
|
||||
When given a repo, scan for these signals before writing a single runbook line:
|
||||
## Quick Start
|
||||
|
||||
```bash
|
||||
# CI/CD
|
||||
ls .github/workflows/ → GitHub Actions
|
||||
ls .gitlab-ci.yml → GitLab CI
|
||||
ls Jenkinsfile → Jenkins
|
||||
ls .circleci/ → CircleCI
|
||||
ls bitbucket-pipelines.yml → Bitbucket Pipelines
|
||||
# Print runbook to stdout
|
||||
python3 scripts/runbook_generator.py payments-api
|
||||
|
||||
# Database
|
||||
grep -r "postgresql\|postgres\|pg" package.json pyproject.toml → PostgreSQL
|
||||
grep -r "mysql\|mariadb" package.json → MySQL
|
||||
grep -r "mongodb\|mongoose" package.json → MongoDB
|
||||
grep -r "redis" package.json → Redis
|
||||
ls prisma/schema.prisma → Prisma ORM (check provider field)
|
||||
ls drizzle.config.* → Drizzle ORM
|
||||
|
||||
# Hosting
|
||||
ls vercel.json → Vercel
|
||||
ls railway.toml → Railway
|
||||
ls fly.toml → Fly.io
|
||||
ls .ebextensions/ → AWS Elastic Beanstalk
|
||||
ls terraform/ ls *.tf → Custom AWS/GCP/Azure (check provider)
|
||||
ls kubernetes/ ls k8s/ → Kubernetes
|
||||
ls docker-compose.yml → Docker Compose
|
||||
|
||||
# Framework
|
||||
ls next.config.* → Next.js
|
||||
ls nuxt.config.* → Nuxt
|
||||
ls svelte.config.* → SvelteKit
|
||||
cat package.json | jq '.scripts' → Check build/start commands
|
||||
```
|
||||
|
||||
Map detected stack → runbook templates. A Next.js + PostgreSQL + Vercel + GitHub Actions repo needs:
|
||||
- Deployment runbook (Vercel + GitHub Actions)
|
||||
- Database runbook (PostgreSQL backup, migration, vacuum)
|
||||
- Incident response (with Vercel logs + pg query debugging)
|
||||
- Monitoring setup (Vercel Analytics, pg_stat, alerting)
|
||||
|
||||
---
|
||||
|
||||
## Runbook Types
|
||||
|
||||
### 1. Deployment Runbook
|
||||
|
||||
```markdown
|
||||
# Deployment Runbook — [App Name]
|
||||
**Stack:** Next.js 14 + PostgreSQL 15 + Vercel
|
||||
**Last verified:** 2025-03-01
|
||||
**Source configs:** vercel.json (modified: git log -1 --format=%ci -- vercel.json)
|
||||
**Owner:** Platform Team
|
||||
**Est. total time:** 15–25 min
|
||||
|
||||
---
|
||||
|
||||
## Pre-deployment Checklist
|
||||
- [ ] All PRs merged to main
|
||||
- [ ] CI passing on main (GitHub Actions green)
|
||||
- [ ] Database migrations tested in staging
|
||||
- [ ] Rollback plan confirmed
|
||||
|
||||
## Steps
|
||||
|
||||
### Step 1 — Run CI checks locally (3 min)
|
||||
```bash
|
||||
pnpm test
|
||||
pnpm lint
|
||||
pnpm build
|
||||
```
|
||||
✅ Expected: All pass with 0 errors. Build output in `.next/`
|
||||
|
||||
### Step 2 — Apply database migrations (5 min)
|
||||
```bash
|
||||
# Staging first
|
||||
DATABASE_URL=$STAGING_DATABASE_URL npx prisma migrate deploy
|
||||
```
|
||||
✅ Expected: `All migrations have been successfully applied.`
|
||||
|
||||
```bash
|
||||
# Verify migration applied
|
||||
psql $STAGING_DATABASE_URL -c "\d" | grep -i migration
|
||||
```
|
||||
✅ Expected: Migration table shows new entry with today's date
|
||||
|
||||
### Step 3 — Deploy to production (5 min)
|
||||
```bash
|
||||
git push origin main
|
||||
# OR trigger manually:
|
||||
vercel --prod
|
||||
```
|
||||
✅ Expected: Vercel dashboard shows deployment in progress. URL format:
|
||||
`https://app-name-<hash>-team.vercel.app`
|
||||
|
||||
### Step 4 — Smoke test production (5 min)
|
||||
```bash
|
||||
# Health check
|
||||
curl -sf https://your-app.vercel.app/api/health | jq .
|
||||
|
||||
# Critical path
|
||||
curl -sf https://your-app.vercel.app/api/users/me \
|
||||
-H "Authorization: Bearer $TEST_TOKEN" | jq '.id'
|
||||
```
|
||||
✅ Expected: health returns `{"status":"ok","db":"connected"}`. Users API returns valid ID.
|
||||
|
||||
### Step 5 — Monitor for 10 min
|
||||
- Check Vercel Functions log for errors: `vercel logs --since=10m`
|
||||
- Check error rate in Vercel Analytics: < 1% 5xx
|
||||
- Check DB connection pool: `SELECT count(*) FROM pg_stat_activity;` (< 80% of max_connections)
|
||||
|
||||
---
|
||||
|
||||
## Rollback
|
||||
|
||||
If smoke tests fail or error rate spikes:
|
||||
|
||||
```bash
|
||||
# Instant rollback via Vercel (preferred — < 30 sec)
|
||||
vercel rollback [previous-deployment-url]
|
||||
|
||||
# Database rollback (only if migration was applied)
|
||||
DATABASE_URL=$PROD_DATABASE_URL npx prisma migrate reset --skip-seed
|
||||
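# WARNING: `prisma migrate reset` drops and recreates the database, then re-applies every migration — ALL DATA IS LOST. It does not roll back only the latest migration. Confirm data impact first.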
|
||||
```
|
||||
|
||||
✅ Expected after rollback: Previous deployment URL becomes active. Verify with smoke test.
|
||||
|
||||
---
|
||||
|
||||
## Escalation
|
||||
- **L1 (on-call engineer):** Check Vercel logs, run smoke tests, attempt rollback
|
||||
- **L2 (platform lead):** DB issues, data loss risk, rollback failed — Slack: @platform-lead
|
||||
- **L3 (CTO):** Production down > 30 min, data breach — PagerDuty: #critical-incidents
|
||||
# Write runbook file
|
||||
python3 scripts/runbook_generator.py payments-api --owner platform --output docs/runbooks/payments-api.md
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 2. Incident Response Runbook
|
||||
## Recommended Workflow
|
||||
|
||||
```markdown
|
||||
# Incident Response Runbook
|
||||
**Severity levels:** P1 (down), P2 (degraded), P3 (minor)
|
||||
**Est. total time:** P1: 30–60 min, P2: 1–4 hours
|
||||
|
||||
## Phase 1 — Triage (5 min)
|
||||
|
||||
### Confirm the incident
|
||||
```bash
|
||||
# Is the app responding?
|
||||
curl -sw "%{http_code}" https://your-app.vercel.app/api/health -o /dev/null
|
||||
|
||||
# Check Vercel function errors (last 15 min)
|
||||
vercel logs --since=15m | grep -i "error\|exception\|5[0-9][0-9]"
|
||||
```
|
||||
✅ 200 = app up. 5xx or timeout = incident confirmed.
|
||||
|
||||
Declare severity:
|
||||
- Site completely down → P1 — page L2/L3 immediately
|
||||
- Partial degradation / slow responses → P2 — notify team channel
|
||||
- Single feature broken → P3 — create ticket, fix in business hours
|
||||
1. Generate the initial skeleton with `scripts/runbook_generator.py`.
|
||||
2. Fill in service-specific commands and URLs.
|
||||
3. Add verification checks and rollback triggers.
|
||||
4. Dry-run in staging.
|
||||
5. Store runbook in version control near service code.
|
||||
|
||||
---
|
||||
|
||||
## Phase 2 — Diagnose (10–15 min)
|
||||
## Reference Docs
|
||||
|
||||
```bash
|
||||
# Recent deployments — did something just ship?
|
||||
vercel ls --limit=5
|
||||
|
||||
# Database health
|
||||
psql $DATABASE_URL -c "SELECT pid, state, wait_event, query FROM pg_stat_activity WHERE state != 'idle' LIMIT 20;"
|
||||
|
||||
# Long-running queries (> 30 sec)
|
||||
psql $DATABASE_URL -c "SELECT pid, now() - pg_stat_activity.query_start AS duration, query FROM pg_stat_activity WHERE state = 'active' AND now() - pg_stat_activity.query_start > interval '30 seconds';"
|
||||
|
||||
# Connection pool saturation
|
||||
psql $DATABASE_URL -c "SELECT count(*), max_conn FROM pg_stat_activity, (SELECT setting::int AS max_conn FROM pg_settings WHERE name='max_connections') t GROUP BY max_conn;"
|
||||
```
|
||||
|
||||
Diagnostic decision tree:
|
||||
- Recent deploy + new errors → rollback (see Deployment Runbook)
|
||||
- DB query timeout / pool saturation → kill long queries, scale connections
|
||||
- External dependency failing → check status pages, add circuit breaker
|
||||
- Memory/CPU spike → check Vercel function logs for infinite loops
|
||||
|
||||
---
|
||||
|
||||
## Phase 3 — Mitigate (variable)
|
||||
|
||||
```bash
|
||||
# Kill a runaway DB query
|
||||
psql $DATABASE_URL -c "SELECT pg_terminate_backend(<pid>);"
|
||||
|
||||
# Scale DB connections (Supabase/Neon — adjust pool size)
|
||||
# Vercel → Settings → Environment Variables → update DATABASE_POOL_MAX
|
||||
|
||||
# Enable maintenance mode (if you have a feature flag)
|
||||
vercel env add MAINTENANCE_MODE true production
|
||||
vercel --prod # redeploy with flag
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Phase 4 — Resolve & Postmortem
|
||||
|
||||
Within 24 hours after the incident is resolved:
|
||||
|
||||
1. Write incident timeline (what happened, when, who noticed, what fixed it)
|
||||
2. Identify root cause (5-Whys)
|
||||
3. Define action items with owners and due dates
|
||||
4. Update this runbook if a step was missing or wrong
|
||||
5. Add monitoring/alert that would have caught this earlier
|
||||
|
||||
**Postmortem template:** `docs/postmortems/YYYY-MM-DD-incident-title.md`
|
||||
|
||||
---
|
||||
|
||||
## Escalation Path
|
||||
|
||||
| Level | Who | When | Contact |
|
||||
|-------|-----|------|---------|
|
||||
| L1 | On-call engineer | Always first | PagerDuty rotation |
|
||||
| L2 | Platform lead | DB issues, rollback needed | Slack @platform-lead |
|
||||
| L3 | CTO/VP Eng | P1 > 30 min, data loss | Phone + PagerDuty |
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 3. Database Maintenance Runbook
|
||||
|
||||
```markdown
|
||||
# Database Maintenance Runbook — PostgreSQL
|
||||
**Schedule:** Weekly vacuum (automated), monthly manual review
|
||||
|
||||
## Backup
|
||||
|
||||
```bash
|
||||
# Full backup
|
||||
pg_dump $DATABASE_URL \
|
||||
--format=custom \
|
||||
--compress=9 \
|
||||
--file="backup-$(date +%Y%m%d-%H%M%S).dump"
|
||||
```
|
||||
✅ Expected: File created, size > 0. `pg_restore --list backup.dump | head -20` shows tables.
|
||||
|
||||
Verify backup is restorable (test monthly):
|
||||
```bash
|
||||
pg_restore --dbname=$STAGING_DATABASE_URL backup.dump
|
||||
psql $STAGING_DATABASE_URL -c "SELECT count(*) FROM users;"
|
||||
```
|
||||
✅ Expected: Row count matches production.
|
||||
|
||||
## Migration
|
||||
|
||||
```bash
|
||||
# Always test in staging first
|
||||
DATABASE_URL=$STAGING_DATABASE_URL npx prisma migrate deploy
|
||||
# Verify, then:
|
||||
DATABASE_URL=$PROD_DATABASE_URL npx prisma migrate deploy
|
||||
```
|
||||
✅ Expected: `All migrations have been successfully applied.`
|
||||
|
||||
⚠️ For large table migrations (> 1M rows), use `pg_repack`, or add the column first and set its DEFAULT in a separate step, to avoid long table locks.
|
||||
|
||||
## Vacuum & Reindex
|
||||
|
||||
```bash
|
||||
# Check bloat before deciding
|
||||
psql $DATABASE_URL -c "
|
||||
SELECT schemaname, tablename,
|
||||
pg_size_pretty(pg_total_relation_size(schemaname||'.'||tablename)) AS total_size,
|
||||
n_dead_tup, n_live_tup,
|
||||
ROUND(n_dead_tup::numeric / NULLIF(n_live_tup + n_dead_tup, 0) * 100, 1) AS dead_ratio
|
||||
FROM pg_stat_user_tables
|
||||
ORDER BY n_dead_tup DESC LIMIT 10;"
|
||||
|
||||
# Vacuum high-bloat tables (non-blocking)
|
||||
psql $DATABASE_URL -c "VACUUM ANALYZE users;"
|
||||
psql $DATABASE_URL -c "VACUUM ANALYZE events;"
|
||||
|
||||
# Reindex (use CONCURRENTLY to avoid locks)
|
||||
psql $DATABASE_URL -c "REINDEX INDEX CONCURRENTLY users_email_idx;"
|
||||
```
|
||||
✅ Expected: dead_ratio drops below 5% after vacuum.
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Staleness Detection
|
||||
|
||||
Add a staleness header to every runbook:
|
||||
|
||||
```markdown
|
||||
## Staleness Check
|
||||
This runbook references the following config files. If they've changed since the
|
||||
"Last verified" date, review the affected steps.
|
||||
|
||||
| Config File | Last Modified | Affects Steps |
|
||||
|-------------|--------------|---------------|
|
||||
| vercel.json | `git log -1 --format=%ci -- vercel.json` | Step 3, Rollback |
|
||||
| prisma/schema.prisma | `git log -1 --format=%ci -- prisma/schema.prisma` | Step 2, DB Maintenance |
|
||||
| .github/workflows/deploy.yml | `git log -1 --format=%ci -- .github/workflows/deploy.yml` | Step 1, Step 3 |
|
||||
| docker-compose.yml | `git log -1 --format=%ci -- docker-compose.yml` | All scaling steps |
|
||||
```
|
||||
|
||||
**Automation:** Add a CI job that runs weekly and comments on the runbook doc if any referenced file was modified more recently than the runbook's "Last verified" date.
|
||||
|
||||
---
|
||||
|
||||
## Runbook Testing Methodology
|
||||
|
||||
### Dry-Run in Staging
|
||||
|
||||
Before trusting a runbook in production, validate every step in staging:
|
||||
|
||||
```bash
|
||||
# 1. Create a staging environment mirror
|
||||
vercel env pull .env.staging
|
||||
source .env.staging
|
||||
|
||||
# 2. Run each step with staging credentials
|
||||
# Replace all $DATABASE_URL with $STAGING_DATABASE_URL
|
||||
# Replace all production URLs with staging URLs
|
||||
|
||||
# 3. Verify expected outputs match
|
||||
# Document any discrepancies and update the runbook
|
||||
|
||||
# 4. Time each step — update estimates in the runbook
|
||||
time npx prisma migrate deploy
|
||||
```
|
||||
|
||||
### Quarterly Review Cadence
|
||||
|
||||
Schedule a 1-hour review every quarter:
|
||||
|
||||
1. **Run each command** in staging — does it still work?
|
||||
2. **Check config drift** — compare "Last Modified" dates vs "Last verified"
|
||||
3. **Test rollback procedures** — actually roll back in staging
|
||||
4. **Update contact info** — L1/L2/L3 may have changed
|
||||
5. **Add new failure modes** discovered in the past quarter
|
||||
6. **Update "Last verified" date** at top of runbook
|
||||
- `references/runbook-templates.md`
|
||||
|
||||
---
|
||||
|
||||
## Common Pitfalls
|
||||
|
||||
| Pitfall | Fix |
|
||||
|---|---|
|
||||
| Commands that require manual copy of dynamic values | Use env vars — `$DATABASE_URL` not `postgres://user:pass@host/db` |
|
||||
| No expected output specified | Add ✅ with exact expected string after every verification step |
|
||||
| Rollback steps missing | Every destructive step needs a corresponding undo |
|
||||
| Runbooks that never get tested | Schedule quarterly staging dry-runs in team calendar |
|
||||
| L3 escalation contact is the former CTO | Review contact info every quarter |
|
||||
| Migration runbook doesn't mention table locks | Call out lock risk for large table operations explicitly |
|
||||
|
||||
---
|
||||
- Missing rollback triggers or rollback commands
|
||||
- Steps without expected output checks
|
||||
- Stale ownership/escalation contacts
|
||||
- Runbooks never tested outside of incidents
|
||||
|
||||
## Best Practices
|
||||
|
||||
1. **Every command must be copy-pasteable** — no placeholder text, use env vars
|
||||
2. **✅ after every step** — explicit expected output, not "it should work"
|
||||
3. **Time estimates are mandatory** — engineers need to know if they have time to fix before SLA breach
|
||||
4. **Plan the rollback before you deploy** — write down the undo steps before executing anything
|
||||
5. **Runbooks live in the repo** — `docs/runbooks/`, versioned with the code they describe
|
||||
6. **Postmortem → runbook update** — every incident should improve a runbook
|
||||
7. **Link, don't duplicate** — reference the canonical config file, don't copy its contents into the runbook
|
||||
8. **Test runbooks like you test code** — untested runbooks are worse than no runbooks (false confidence)
|
||||
1. Keep every command copy-pasteable.
|
||||
2. Include health checks after every critical step.
|
||||
3. Validate runbooks on a fixed review cadence.
|
||||
4. Update runbook content after incidents and postmortems.
|
||||
|
||||
@@ -0,0 +1,40 @@
|
||||
# Runbook Templates
|
||||
|
||||
## Deployment Runbook Template
|
||||
|
||||
- Pre-deployment checks
|
||||
- Deploy steps with expected output
|
||||
- Smoke tests
|
||||
- Rollback plan with explicit triggers
|
||||
- Escalation and communication notes
|
||||
|
||||
## Incident Response Template
|
||||
|
||||
- Triage phase (first 5 minutes)
|
||||
- Diagnosis phase (logs, metrics, recent deploys)
|
||||
- Mitigation phase (containment and restoration)
|
||||
- Resolution and postmortem actions
|
||||
|
||||
## Database Maintenance Template
|
||||
|
||||
- Backup and restore verification
|
||||
- Migration sequencing and lock-risk notes
|
||||
- Vacuum/reindex routines
|
||||
- Verification queries and performance checks
|
||||
|
||||
## Staleness Detection Template
|
||||
|
||||
Track referenced config files and update runbooks whenever these change:
|
||||
|
||||
- deployment config (`vercel.json`, Helm charts, Terraform)
|
||||
- CI pipelines (`.github/workflows/*`, `.gitlab-ci.yml`)
|
||||
- data schema/migration definitions
|
||||
- service runtime/env configuration
|
||||
|
||||
## Quarterly Validation Checklist
|
||||
|
||||
1. Execute commands in staging.
|
||||
2. Validate expected outputs.
|
||||
3. Test rollback paths.
|
||||
4. Confirm contact/escalation ownership.
|
||||
5. Update `Last verified` date.
|
||||
128
engineering/runbook-generator/scripts/runbook_generator.py
Executable file
128
engineering/runbook-generator/scripts/runbook_generator.py
Executable file
@@ -0,0 +1,128 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Generate an operational runbook skeleton for a service."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
from datetime import date
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def build_runbook(
    service: str,
    owner: str,
    environment: str,
    *,
    verified_on: date | None = None,
) -> str:
    """Render a markdown runbook skeleton for one service.

    Args:
        service: Service name; used in the title, the header, and the
            commented-out example commands (start, stop, health, rollback).
        owner: Owner label; shown in the header and as the L2 escalation
            contact.
        environment: Primary environment shown in the header.
        verified_on: Date stamped on the "Last verified" line. ``None``
            (the default) uses today's date, which keeps the original
            behaviour; passing a date makes the output reproducible.

    Returns:
        The runbook as a markdown string that ends with a single newline.
    """
    # Explicit ``is None`` check rather than ``or`` so the default only
    # applies when the caller really omitted the argument.
    stamp = date.today() if verified_on is None else verified_on
    today = stamp.isoformat()
    return f"""# Runbook - {service}

- Service: {service}
- Owner: {owner}
- Environment: {environment}
- Last verified: {today}

## Overview

Describe the service purpose, dependencies, and critical user impact.

## Preconditions

- Access to deployment platform
- Access to logs/metrics
- Access to secret/config manager

## Start Procedure

1. Pull latest config/secrets.
2. Start service process.
3. Confirm process is healthy.

```bash
# Example
# systemctl start {service}
```

## Stop Procedure

1. Drain traffic if applicable.
2. Stop service process.
3. Confirm no active workers remain.

```bash
# Example
# systemctl stop {service}
```

## Health Checks

- HTTP health endpoint
- Dependency connectivity checks
- Error-rate and latency checks

```bash
# Example
# curl -sf https://{service}.example.com/health
```

## Deployment Checklist

1. Verify CI status and artifact integrity.
2. Apply migrations (if required) in safe order.
3. Deploy service revision.
4. Run smoke checks.
5. Observe metrics for 10-15 minutes.

## Rollback

1. Identify last known good release.
2. Re-deploy previous version.
3. Re-run health checks.
4. Communicate rollback status to stakeholders.

```bash
# Example
# deployctl rollback --service {service}
```

## Incident Response

1. Classify severity.
2. Contain user impact.
3. Triage likely failing component.
4. Escalate if SLA risk is high.

## Escalation

- L1: On-call engineer
- L2: Service owner ({owner})
- L3: Platform/Engineering leadership

## Post-Incident

1. Write timeline and root cause.
2. Define corrective actions with owners.
3. Update this runbook with missing steps.
"""
|
||||
|
||||
|
||||
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the command-line options of the runbook generator.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read the process arguments, which is the original behaviour; an
            explicit list lets callers drive the parser programmatically.

    Returns:
        Namespace with ``service`` (positional), ``owner`` (default
        ``"platform-team"``), ``environment`` (default ``"production"``) and
        ``output`` (``None`` when omitted).
    """
    parser = argparse.ArgumentParser(description="Generate a markdown runbook skeleton.")
    parser.add_argument("service", help="Service name")
    parser.add_argument("--owner", default="platform-team", help="Service owner label")
    parser.add_argument("--environment", default="production", help="Primary environment")
    parser.add_argument("--output", help="Optional output path (prints to stdout if omitted)")
    return parser.parse_args(argv)
|
||||
|
||||
|
||||
def main() -> int:
    """Generate a runbook skeleton from the command-line arguments.

    Writes the markdown to ``--output`` when given (creating parent
    directories as needed), otherwise prints it to stdout.

    Returns:
        ``0``, used as the process exit status.
    """
    args = parse_args()
    markdown = build_runbook(args.service, owner=args.owner, environment=args.environment)

    if args.output:
        path = Path(args.output)
        path.parent.mkdir(parents=True, exist_ok=True)
        path.write_text(markdown, encoding="utf-8")
        print(f"Wrote runbook skeleton to {path}")
    else:
        # The generated markdown already ends with a newline; suppress
        # print's own so stdout matches what --output writes instead of
        # gaining an extra blank line.
        print(markdown, end="")
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: hand main()'s return value to the interpreter as
    # the process exit status.
    exit_status = main()
    raise SystemExit(exit_status)
|
||||
@@ -1,5 +1,11 @@
|
||||
"""
|
||||
Payment processing module - contains various technical debt examples
|
||||
Payment processing module - contains various technical debt examples.
|
||||
|
||||
⚠️ DISCLAIMER: This is an INTENTIONAL example of bad code patterns for
|
||||
tech debt detection training. The hardcoded credentials, missing error
|
||||
handling, and other issues are deliberate anti-patterns used by the
|
||||
tech-debt-tracker skill to demonstrate detection capabilities.
|
||||
DO NOT use this code in production.
|
||||
"""
|
||||
|
||||
import json
|
||||
@@ -13,9 +19,10 @@ class PaymentProcessor:
|
||||
|
||||
def __init__(self):
|
||||
# TODO: These should come from environment or config
|
||||
self.stripe_key = "sk_test_1234567890"
|
||||
self.paypal_key = "paypal_secret_key_here"
|
||||
self.square_key = "square_api_key"
|
||||
# ⚠️ INTENTIONAL BAD PATTERN — hardcoded keys for tech debt detection demo
|
||||
self.stripe_key = "sk_test_EXAMPLE_NOT_REAL"
|
||||
self.paypal_key = "paypal_EXAMPLE_NOT_REAL"
|
||||
self.square_key = "square_EXAMPLE_NOT_REAL"
|
||||
|
||||
def process_payment(self, amount, currency, payment_method, customer_data, billing_address, shipping_address, items, discount_code, tax_rate, processing_fee, metadata):
|
||||
"""
|
||||
|
||||
Reference in New Issue
Block a user